diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b3af739..092da72 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -14,6 +14,6 @@ jobs: run: | sudo apt update make install-tools - curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.60.3 + curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.61.0 - name: Test run: make check diff --git a/go.mod b/go.mod index 1ab1c7b..36ce252 100644 --- a/go.mod +++ b/go.mod @@ -1,65 +1,63 @@ module github.com/skycoin/skycoin-service-discovery -go 1.21 +go 1.22.0 -toolchain go1.21.5 +toolchain go1.23.1 require ( + github.com/chen3feng/safecast v0.0.0-20220908170618-81b2ecd47937 github.com/go-chi/chi v4.1.2+incompatible github.com/go-chi/cors v1.2.1 github.com/go-redis/redis v6.15.9+incompatible github.com/ivanpirog/coloredcobra v1.0.1 github.com/lib/pq v1.10.9 github.com/sirupsen/logrus v1.9.3 - github.com/skycoin/dmsg v1.3.28 + github.com/skycoin/dmsg v1.3.29-0.20241019182716-022283c93835 github.com/skycoin/skycoin v0.28.0 github.com/skycoin/skywire v1.3.28 github.com/skycoin/skywire-utilities v1.3.25 - github.com/spf13/cobra v1.7.0 + github.com/spf13/cobra v1.8.1 github.com/stretchr/testify v1.9.0 - gorm.io/driver/postgres v1.3.8 - gorm.io/gorm v1.23.8 + gorm.io/driver/postgres v1.5.9 + gorm.io/gorm v1.25.12 ) require ( github.com/AudriusButkevicius/pfilter v0.0.11 // indirect - github.com/VictoriaMetrics/metrics v1.24.0 // indirect + github.com/VictoriaMetrics/metrics v1.35.1 // indirect github.com/ccding/go-stun/stun v0.0.0-20200514191101-4dc67bcdb029 // indirect - github.com/cespare/xxhash/v2 v2.1.2 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect - github.com/fatih/color v1.15.0 // indirect - github.com/go-chi/chi/v5 v5.0.11 // indirect + github.com/fatih/color v1.17.0 // indirect + github.com/go-chi/chi/v5 v5.1.0 // indirect github.com/go-redis/redis/v8 v8.11.5 // indirect - github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect - github.com/gocarina/gocsv v0.0.0-20230616125104-99d496ca653d // indirect - github.com/google/pprof v0.0.0-20230821062121-407c9e7a662f // indirect - github.com/google/uuid v1.3.1 // indirect - github.com/hashicorp/yamux v0.1.1 // indirect + github.com/go-task/slim-sprig/v3 v3.0.0 // indirect + github.com/gocarina/gocsv v0.0.0-20240520201108-78e41c74b4b1 // indirect + github.com/google/pprof v0.0.0-20241017200806-017d972448fc // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/hashicorp/yamux v0.1.2 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect - github.com/jackc/chunkreader/v2 v2.0.1 // indirect - github.com/jackc/pgconn v1.14.3 // indirect - github.com/jackc/pgio v1.0.0 // indirect github.com/jackc/pgpassfile v1.0.0 // indirect - github.com/jackc/pgproto3/v2 v2.3.3 // indirect - github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a // indirect - github.com/jackc/pgtype v1.14.0 // indirect - github.com/jackc/pgx/v4 v4.18.2 // indirect + github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect + github.com/jackc/pgx/v5 v5.7.1 // indirect + github.com/jackc/puddle/v2 v2.2.2 // indirect github.com/jinzhu/inflection v1.0.0 // indirect github.com/jinzhu/now v1.1.5 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/klauspost/cpuid/v2 v2.2.5 // indirect - github.com/klauspost/reedsolomon v1.11.8 // indirect + github.com/klauspost/cpuid/v2 v2.2.8 // indirect + github.com/klauspost/reedsolomon v1.12.4 // indirect github.com/mattn/go-colorable v0.1.13 // indirect - github.com/mattn/go-isatty v0.0.19 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect - github.com/onsi/ginkgo/v2 v2.12.0 // indirect - github.com/pires/go-proxyproto v0.6.2 // indirect + github.com/onsi/ginkgo/v2 v2.20.2 // indirect + github.com/pires/go-proxyproto v0.8.0 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/quic-go/quic-go v0.42.0 // indirect + github.com/quic-go/quic-go v0.48.0 // indirect + github.com/rogpeppe/go-internal v1.13.1 // indirect github.com/skycoin/noise v0.0.0-20180327030543-2492fe189ae6 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/stretchr/objx v0.5.2 // indirect @@ -69,16 +67,17 @@ require ( github.com/valyala/fastrand v1.1.0 // indirect github.com/valyala/histogram v1.2.0 // indirect github.com/xtaci/kcp-go v5.4.20+incompatible // indirect - go.etcd.io/bbolt v1.3.7 // indirect - go.uber.org/mock v0.4.0 // indirect - golang.org/x/crypto v0.20.0 // indirect - golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 // indirect - golang.org/x/mod v0.12.0 // indirect - golang.org/x/net v0.21.0 // indirect - golang.org/x/sys v0.20.0 // indirect - golang.org/x/term v0.17.0 // indirect - golang.org/x/text v0.14.0 // indirect - golang.org/x/tools v0.12.1-0.20230815132531-74c255bcf846 // indirect + go.etcd.io/bbolt v1.3.11 // indirect + go.uber.org/mock v0.5.0 // indirect + golang.org/x/crypto v0.28.0 // indirect + golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c // indirect + golang.org/x/mod v0.21.0 // indirect + golang.org/x/net v0.30.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/sys v0.26.0 // indirect + golang.org/x/term v0.25.0 // indirect + golang.org/x/text v0.19.0 // indirect + golang.org/x/tools v0.26.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 742ac43..33b1496 100644 --- a/go.sum +++ b/go.sum @@ -2,10 +2,8 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT github.com/AudriusButkevicius/pfilter v0.0.11 h1:6emuvqNeH1gGlqkML35pEizyPcaxdAN4JO9sdgwcx78= github.com/AudriusButkevicius/pfilter v0.0.11/go.mod h1:4eF1UYuEhoycTlr9IOP1sb0lL9u4nfAIouRqt2xJbzM= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= -github.com/Masterminds/semver/v3 v3.1.1 h1:hLg3sBzpNErnxhQtUy/mmLR2I9foDujNK030IGemrRc= -github.com/Masterminds/semver/v3 v3.1.1/go.mod h1:VPu/7SZ7ePZ3QOrcuXROw5FAcLl4a0cBrbBpGY/8hQs= -github.com/VictoriaMetrics/metrics v1.24.0 h1:ILavebReOjYctAGY5QU2F9X0MYvkcrG3aEn2RKa1Zkw= -github.com/VictoriaMetrics/metrics v1.24.0/go.mod h1:eFT25kvsTidQFHb6U0oa0rTrDRdz4xTYjpL8+UPohys= +github.com/VictoriaMetrics/metrics v1.35.1 h1:o84wtBKQbzLdDy14XeskkCZih6anG+veZ1SwJHFGwrU= +github.com/VictoriaMetrics/metrics v1.35.1/go.mod h1:r7hveu6xMdUACXvB8TYdAj8WEsKzWB0EkpJN+RDtOf8= github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8= github.com/blang/semver v3.5.1+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk= github.com/boltdb/bolt v1.3.1/go.mod h1:clJnj/oiGkjum5o1McbSZDSLxVThjynRyGBgiAx27Ps= @@ -13,17 +11,14 @@ github.com/ccding/go-stun/stun v0.0.0-20200514191101-4dc67bcdb029 h1:POmUHfxXdey github.com/ccding/go-stun/stun v0.0.0-20200514191101-4dc67bcdb029/go.mod h1:Rpr5n9cGHYdM3S3IK8ROSUUUYjQOu+MSUCZDcJbYWi8= github.com/cenkalti/backoff v1.1.0/go.mod h1:90ReRw6GdpyfrHakVjL/QHaoyV4aDUVVkXQJJJ3NXXM= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= -github.com/cespare/xxhash/v2 v2.1.2 h1:YRXhKfTDauu4ajMg1TPgFO5jnlC2HCbmLXMcTG5cbYE= -github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/chen3feng/safecast v0.0.0-20220908170618-81b2ecd47937 h1:gJMTUTnqa+f2GzdU1p3UVa3E39hogZdiRHEgBBnqtVc= +github.com/chen3feng/safecast v0.0.0-20220908170618-81b2ecd47937/go.mod h1:HPBMB1GC+eBfIUWhh9IJKdL/mVhIBZbJzjvijHxG3F0= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= -github.com/cockroachdb/apd v1.1.0 h1:3LFP3629v+1aKXU5Q37mxmRxX/pIu1nijXydLShEq5I= -github.com/cockroachdb/apd v1.1.0/go.mod h1:8Sl8LxpKi29FqWXR16WEFZRNSz3SoPzUzeMeY4+DwBQ= -github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= -github.com/coreos/go-systemd v0.0.0-20190719114852-fd7a80b32e1f/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= github.com/cpuguy83/go-md2man/v2 v2.0.1/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= -github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= -github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7DoTY= +github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -33,32 +28,27 @@ github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymF github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/fatih/color v1.13.0/go.mod h1:kLAiJbzzSOZDVNGyDpeOxJ47H46qBXwg5ILebYFFOfk= -github.com/fatih/color v1.15.0 h1:kOqh6YHBtK8aywxGerMG2Eq3H6Qgoqeo13Bk2Mv/nBs= -github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw= +github.com/fatih/color v1.17.0 h1:GlRw1BRJxkpqUCBKzKOw098ed57fEsKeNjpTe3cSjK4= +github.com/fatih/color v1.17.0/go.mod h1:YZ7TlrGPkiz6ku9fK3TLD/pl3CpsiFyu8N92HLgmosI= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= github.com/go-chi/chi v4.1.2+incompatible h1:fGFk2Gmi/YKXk0OmGfBh0WgmN3XB8lVnEyNz34tQRec= github.com/go-chi/chi v4.1.2+incompatible/go.mod h1:eB3wogJHnLi3x/kFX2A+IbTBlXxmMeXJVKy9tTv1XzQ= -github.com/go-chi/chi/v5 v5.0.11 h1:BnpYbFZ3T3S1WMpD79r7R5ThWX40TaFB7L31Y8xqSwA= -github.com/go-chi/chi/v5 v5.0.11/go.mod h1:DslCQbL2OYiznFReuXYUmQ2hGd1aDpCnlMNITLSKoi8= +github.com/go-chi/chi/v5 v5.1.0 h1:acVI1TYaD+hhedDJ3r54HyA6sExp3HfXq7QWEEY/xMw= +github.com/go-chi/chi/v5 v5.1.0/go.mod h1:DslCQbL2OYiznFReuXYUmQ2hGd1aDpCnlMNITLSKoi8= github.com/go-chi/cors v1.2.1 h1:xEC8UT3Rlp2QuWNEr4Fs/c2EAGVKBwy/1vHx3bppil4= github.com/go-chi/cors v1.2.1/go.mod h1:sSbTewc+6wYHBBCW7ytsFSn836hqM7JxpglAy2Vzc58= -github.com/go-kit/log v0.1.0/go.mod h1:zbhenjAZHb184qTLMA9ZjW7ThYL0H2mk7Q6pNt4vbaY= -github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A= -github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ= -github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-redis/redis v6.15.9+incompatible h1:K0pv1D7EQUjfyoMql+r/jZqCLizCGKFlFgcHWWmHQjg= github.com/go-redis/redis v6.15.9+incompatible/go.mod h1:NAIEuMOZ/fxfXJIrKDQDz8wamY7mA7PouImQ2Jvg6kA= github.com/go-redis/redis/v8 v8.11.5 h1:AcZZR7igkdvfVmQTPnu9WE37LRrO/YrBH5zWyjDC0oI= github.com/go-redis/redis/v8 v8.11.5/go.mod h1:gREzHqY1hg6oD9ngVRbLStwAWKhA0FEgq8Jd4h5lpwo= -github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= -github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= -github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= -github.com/gocarina/gocsv v0.0.0-20230616125104-99d496ca653d h1:KbPOUXFUDJxwZ04vbmDOc3yuruGvVO+LOa7cVER3yWw= -github.com/gocarina/gocsv v0.0.0-20230616125104-99d496ca653d/go.mod h1:5YoVOkjYAQumqlV356Hj3xeYh4BdZuLE0/nRkf2NKkI= -github.com/gofrs/uuid v4.0.0+incompatible h1:1SD/1F5pU8p29ybwgQSwpQk+mwdRrXCYuPhW6m+TnJw= -github.com/gofrs/uuid v4.0.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/gocarina/gocsv v0.0.0-20240520201108-78e41c74b4b1 h1:FWNFq4fM1wPfcK40yHE5UO3RUdSNPaBC+j3PokzA6OQ= +github.com/gocarina/gocsv v0.0.0-20240520201108-78e41c74b4b1/go.mod h1:5YoVOkjYAQumqlV356Hj3xeYh4BdZuLE0/nRkf2NKkI= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -70,8 +60,6 @@ github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrU github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= -github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= -github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= @@ -79,114 +67,55 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20230821062121-407c9e7a662f h1:pDhu5sgp8yJlEF/g6osliIIpF9K4F5jvkULXa4daRDQ= -github.com/google/pprof v0.0.0-20230821062121-407c9e7a662f/go.mod h1:czg5+yv1E0ZGTi6S6vVK1mke0fV+FaUhNGcd6VRS9Ik= -github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= -github.com/google/uuid v1.3.1 h1:KjJaJ9iWZ3jOFZIf1Lqf4laDRCasjl0BCmnEGxkdLb4= -github.com/google/uuid v1.3.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/pprof v0.0.0-20241017200806-017d972448fc h1:NGyrhhFhwvRAZg02jnYVg3GBQy0qGBKmFQJwaPmpmxs= +github.com/google/pprof v0.0.0-20241017200806-017d972448fc/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= -github.com/hashicorp/yamux v0.1.1 h1:yrQxtgseBDrq9Y652vSRDvsKCJKOUD+GzTS4Y0Y8pvE= -github.com/hashicorp/yamux v0.1.1/go.mod h1:CtWFDAQgb7dxtzFs4tWbplKIe2jSi3+5vKbgIO0SLnQ= +github.com/hashicorp/yamux v0.1.2 h1:XtB8kyFOyHXYVFnwT5C3+Bdo8gArse7j2AQ0DA0Uey8= +github.com/hashicorp/yamux v0.1.2/go.mod h1:C+zze2n6e/7wshOZep2A70/aQU6QBRWJO/G6FT1wIns= github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/ivanpirog/coloredcobra v1.0.1 h1:aURSdEmlR90/tSiWS0dMjdwOvCVUeYLfltLfbgNxrN4= github.com/ivanpirog/coloredcobra v1.0.1/go.mod h1:iho4nEKcnwZFiniGSdcgdvRgZNjxm+h20acv8vqmN6Q= -github.com/jackc/chunkreader v1.0.0/go.mod h1:RT6O25fNZIuasFJRyZ4R/Y2BbhasbmZXF9QQ7T3kePo= -github.com/jackc/chunkreader/v2 v2.0.0/go.mod h1:odVSm741yZoC3dpHEUXIqA9tQRhFrgOHwnPIn9lDKlk= -github.com/jackc/chunkreader/v2 v2.0.1 h1:i+RDz65UE+mmpjTfyz0MoVTnzeYxroil2G82ki7MGG8= -github.com/jackc/chunkreader/v2 v2.0.1/go.mod h1:odVSm741yZoC3dpHEUXIqA9tQRhFrgOHwnPIn9lDKlk= -github.com/jackc/pgconn v0.0.0-20190420214824-7e0022ef6ba3/go.mod h1:jkELnwuX+w9qN5YIfX0fl88Ehu4XC3keFuOJJk9pcnA= -github.com/jackc/pgconn v0.0.0-20190824142844-760dd75542eb/go.mod h1:lLjNuW/+OfW9/pnVKPazfWOgNfH2aPem8YQ7ilXGvJE= -github.com/jackc/pgconn v0.0.0-20190831204454-2fabfa3c18b7/go.mod h1:ZJKsE/KZfsUgOEh9hBm+xYTstcNHg7UPMVJqRfQxq4s= -github.com/jackc/pgconn v1.8.0/go.mod h1:1C2Pb36bGIP9QHGBYCjnyhqu7Rv3sGshaQUvmfGIB/o= -github.com/jackc/pgconn v1.9.0/go.mod h1:YctiPyvzfU11JFxoXokUOOKQXQmDMoJL9vJzHH8/2JY= -github.com/jackc/pgconn v1.9.1-0.20210724152538-d89c8390a530/go.mod h1:4z2w8XhRbP1hYxkpTuBjTS3ne3J48K83+u0zoyvg2pI= -github.com/jackc/pgconn v1.12.1/go.mod h1:ZkhRC59Llhrq3oSfrikvwQ5NaxYExr6twkdkMLaKono= -github.com/jackc/pgconn v1.14.3 h1:bVoTr12EGANZz66nZPkMInAV/KHD2TxH9npjXXgiB3w= -github.com/jackc/pgconn v1.14.3/go.mod h1:RZbme4uasqzybK2RK5c65VsHxoyaml09lx3tXOcO/VM= -github.com/jackc/pgio v1.0.0 h1:g12B9UwVnzGhueNavwioyEEpAmqMe1E/BN9ES+8ovkE= -github.com/jackc/pgio v1.0.0/go.mod h1:oP+2QK2wFfUWgr+gxjoBH9KGBb31Eio69xUb0w5bYf8= -github.com/jackc/pgmock v0.0.0-20190831213851-13a1b77aafa2/go.mod h1:fGZlG77KXmcq05nJLRkk0+p82V8B8Dw8KN2/V9c/OAE= -github.com/jackc/pgmock v0.0.0-20201204152224-4fe30f7445fd/go.mod h1:hrBW0Enj2AZTNpt/7Y5rr2xe/9Mn757Wtb2xeBzPv2c= -github.com/jackc/pgmock v0.0.0-20210724152146-4ad1a8207f65 h1:DadwsjnMwFjfWc9y5Wi/+Zz7xoE5ALHsRQlOctkOiHc= -github.com/jackc/pgmock v0.0.0-20210724152146-4ad1a8207f65/go.mod h1:5R2h2EEX+qri8jOWMbJCtaPWkrrNc7OHwsp2TCqp7ak= github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= -github.com/jackc/pgproto3 v1.1.0/go.mod h1:eR5FA3leWg7p9aeAqi37XOTgTIbkABlvcPB3E5rlc78= -github.com/jackc/pgproto3/v2 v2.0.0-alpha1.0.20190420180111-c116219b62db/go.mod h1:bhq50y+xrl9n5mRYyCBFKkpRVTLYJVWeCc+mEAI3yXA= -github.com/jackc/pgproto3/v2 v2.0.0-alpha1.0.20190609003834-432c2951c711/go.mod h1:uH0AWtUmuShn0bcesswc4aBTWGvw0cAxIJp+6OB//Wg= -github.com/jackc/pgproto3/v2 v2.0.0-rc3/go.mod h1:ryONWYqW6dqSg1Lw6vXNMXoBJhpzvWKnT95C46ckYeM= -github.com/jackc/pgproto3/v2 v2.0.0-rc3.0.20190831210041-4c03ce451f29/go.mod h1:ryONWYqW6dqSg1Lw6vXNMXoBJhpzvWKnT95C46ckYeM= -github.com/jackc/pgproto3/v2 v2.0.6/go.mod h1:WfJCnwN3HIg9Ish/j3sgWXnAfK8A9Y0bwXYU5xKaEdA= -github.com/jackc/pgproto3/v2 v2.1.1/go.mod h1:WfJCnwN3HIg9Ish/j3sgWXnAfK8A9Y0bwXYU5xKaEdA= -github.com/jackc/pgproto3/v2 v2.3.0/go.mod h1:WfJCnwN3HIg9Ish/j3sgWXnAfK8A9Y0bwXYU5xKaEdA= -github.com/jackc/pgproto3/v2 v2.3.3 h1:1HLSx5H+tXR9pW3in3zaztoEwQYRC9SQaYUHjTSUOag= -github.com/jackc/pgproto3/v2 v2.3.3/go.mod h1:WfJCnwN3HIg9Ish/j3sgWXnAfK8A9Y0bwXYU5xKaEdA= -github.com/jackc/pgservicefile v0.0.0-20200714003250-2b9c44734f2b/go.mod h1:vsD4gTJCa9TptPL8sPkXrLZ+hDuNrZCnj29CQpr4X1E= -github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a h1:bbPeKD0xmW/Y25WS6cokEszi5g+S0QxI/d45PkRi7Nk= -github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM= -github.com/jackc/pgtype v0.0.0-20190421001408-4ed0de4755e0/go.mod h1:hdSHsc1V01CGwFsrv11mJRHWJ6aifDLfdV3aVjFF0zg= -github.com/jackc/pgtype v0.0.0-20190824184912-ab885b375b90/go.mod h1:KcahbBH1nCMSo2DXpzsoWOAfFkdEtEJpPbVLq8eE+mc= -github.com/jackc/pgtype v0.0.0-20190828014616-a8802b16cc59/go.mod h1:MWlu30kVJrUS8lot6TQqcg7mtthZ9T0EoIBFiJcmcyw= -github.com/jackc/pgtype v1.8.1-0.20210724151600-32e20a603178/go.mod h1:C516IlIV9NKqfsMCXTdChteoXmwgUceqaLfjg2e3NlM= -github.com/jackc/pgtype v1.11.0/go.mod h1:LUMuVrfsFfdKGLw+AFFVv6KtHOFMwRgDDzBt76IqCA4= -github.com/jackc/pgtype v1.14.0 h1:y+xUdabmyMkJLyApYuPj38mW+aAIqCe5uuBB51rH3Vw= -github.com/jackc/pgtype v1.14.0/go.mod h1:LUMuVrfsFfdKGLw+AFFVv6KtHOFMwRgDDzBt76IqCA4= -github.com/jackc/pgx/v4 v4.0.0-20190420224344-cc3461e65d96/go.mod h1:mdxmSJJuR08CZQyj1PVQBHy9XOp5p8/SHH6a0psbY9Y= -github.com/jackc/pgx/v4 v4.0.0-20190421002000-1b8f0016e912/go.mod h1:no/Y67Jkk/9WuGR0JG/JseM9irFbnEPbuWV2EELPNuM= -github.com/jackc/pgx/v4 v4.0.0-pre1.0.20190824185557-6972a5742186/go.mod h1:X+GQnOEnf1dqHGpw7JmHqHc1NxDoalibchSk9/RWuDc= -github.com/jackc/pgx/v4 v4.12.1-0.20210724153913-640aa07df17c/go.mod h1:1QD0+tgSXP7iUjYm9C1NxKhny7lq6ee99u/z+IHFcgs= -github.com/jackc/pgx/v4 v4.16.1/go.mod h1:SIhx0D5hoADaiXZVyv+3gSm3LCIIINTVO0PficsvWGQ= -github.com/jackc/pgx/v4 v4.18.2 h1:xVpYkNR5pk5bMCZGfClbO962UIqVABcAGt7ha1s/FeU= -github.com/jackc/pgx/v4 v4.18.2/go.mod h1:Ey4Oru5tH5sB6tV7hDmfWFahwF15Eb7DNXlRKx2CkVw= -github.com/jackc/puddle v0.0.0-20190413234325-e4ced69a3a2b/go.mod h1:m4B5Dj62Y0fbyuIc15OsIqK0+JU8nkqQjsgx7dvjSWk= -github.com/jackc/puddle v0.0.0-20190608224051-11cab39313c9/go.mod h1:m4B5Dj62Y0fbyuIc15OsIqK0+JU8nkqQjsgx7dvjSWk= -github.com/jackc/puddle v1.1.3/go.mod h1:m4B5Dj62Y0fbyuIc15OsIqK0+JU8nkqQjsgx7dvjSWk= -github.com/jackc/puddle v1.2.1/go.mod h1:m4B5Dj62Y0fbyuIc15OsIqK0+JU8nkqQjsgx7dvjSWk= +github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo= +github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM= +github.com/jackc/pgx/v5 v5.7.1 h1:x7SYsPBYDkHDksogeSmZZ5xzThcTgRz++I5E+ePFUcs= +github.com/jackc/pgx/v5 v5.7.1/go.mod h1:e7O26IywZZ+naJtWWos6i6fvWK+29etgITqrqHLfoZA= +github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo= +github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E= github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc= -github.com/jinzhu/now v1.1.4/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8= github.com/jinzhu/now v1.1.5 h1:/o9tlHleP7gOFmsnYNz3RGnqzefHA47wQpKrrdTIwXQ= github.com/jinzhu/now v1.1.5/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/klauspost/cpuid/v2 v2.2.5 h1:0E5MSMDEoAulmXNFquVs//DdoomxaoTY1kUhbc/qbZg= -github.com/klauspost/cpuid/v2 v2.2.5/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= -github.com/klauspost/reedsolomon v1.11.8 h1:s8RpUW5TK4hjr+djiOpbZJB4ksx+TdYbRH7vHQpwPOY= -github.com/klauspost/reedsolomon v1.11.8/go.mod h1:4bXRN+cVzMdml6ti7qLouuYi32KHJ5MGv0Qd8a47h6A= +github.com/klauspost/cpuid/v2 v2.2.8 h1:+StwCXwm9PdpiEkPyzBXIy+M9KUb4ODm0Zarf1kS5BM= +github.com/klauspost/cpuid/v2 v2.2.8/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= +github.com/klauspost/reedsolomon v1.12.4 h1:5aDr3ZGoJbgu/8+j45KtUJxzYm8k08JGtB9Wx1VQ4OA= +github.com/klauspost/reedsolomon v1.12.4/go.mod h1:d3CzOMOt0JXGIFZm1StgkyF14EYr3xneR2rNWo7NcMU= github.com/konsorten/go-windows-terminal-sequences v0.0.0-20180402223658-b729f2633dfe/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= -github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= -github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= -github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/pty v1.1.8/go.mod h1:O1sed60cT9XZ5uDucP5qwvh+TE3NnUj51EiZO/lmSfw= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= +github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/lib/pq v1.0.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= -github.com/lib/pq v1.1.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= -github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= -github.com/lib/pq v1.10.2/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= -github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ= -github.com/mattn/go-colorable v0.1.6/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= github.com/mattn/go-colorable v0.1.9/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-isatty v0.0.4/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= -github.com/mattn/go-isatty v0.0.5/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= -github.com/mattn/go-isatty v0.0.7/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94= github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= -github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= -github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mgutz/ansi v0.0.0-20170206155736-9520e82c474b/go.mod h1:01TrycV0kFyexm33Z7vhZRXopbI8J3TDReVlkTgMUxE= github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d h1:5PJl274Y63IEHC+7izoQE9x6ikvDFZS2mDVS3drnohI= github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d/go.mod h1:01TrycV0kFyexm33Z7vhZRXopbI8J3TDReVlkTgMUxE= @@ -201,39 +130,31 @@ github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= -github.com/onsi/ginkgo/v2 v2.12.0 h1:UIVDowFPwpg6yMUpPjGkYvf06K3RAiJXUhCxEwQVHRI= -github.com/onsi/ginkgo/v2 v2.12.0/go.mod h1:ZNEzXISYlqpb8S36iN71ifqLi3vVD1rVJGvWRCJOUpQ= -github.com/onsi/gomega v1.27.10 h1:naR28SdDFlqrG6kScpT8VWpu1xWY5nJRCF3XaYyBjhI= -github.com/onsi/gomega v1.27.10/go.mod h1:RsS8tutOdbdgzbPtzzATp12yT7kM5I5aElG3evPbQ0M= +github.com/onsi/ginkgo/v2 v2.20.2 h1:7NVCeyIWROIAheY21RLS+3j2bb52W0W82tkberYytp4= +github.com/onsi/ginkgo/v2 v2.20.2/go.mod h1:K9gyxPIlb+aIvnZ8bd9Ak+YP18w3APlR+5coaZoE2ag= +github.com/onsi/gomega v1.34.1 h1:EUMJIKUjM8sKjYbtxQI9A4z2o+rruxnzNvpknOXie6k= +github.com/onsi/gomega v1.34.1/go.mod h1:kU1QgUvBDLXBJq618Xvm2LUX6rSAfRaFRTcdOeDLwwY= github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= -github.com/pires/go-proxyproto v0.6.2 h1:KAZ7UteSOt6urjme6ZldyFm4wDe/z0ZUP0Yv0Dos0d8= -github.com/pires/go-proxyproto v0.6.2/go.mod h1:Odh9VFOZJCf9G8cLW5o435Xf1J95Jw9Gw5rnCjcwzAY= -github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pires/go-proxyproto v0.8.0 h1:5unRmEAPbHXHuLjDg01CxJWf91cw3lKHc/0xzKpXEe0= +github.com/pires/go-proxyproto v0.8.0/go.mod h1:iknsfgnH8EkjrMeMyvfKByp9TiBZCKZM0jx2xmKqnVY= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/quic-go/quic-go v0.42.0 h1:uSfdap0eveIl8KXnipv9K7nlwZ5IqLlYOpJ58u5utpM= -github.com/quic-go/quic-go v0.42.0/go.mod h1:132kz4kL3F9vxhW3CtQJLDVwcFe5wdWeJXXijhsO57M= -github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= +github.com/quic-go/quic-go v0.48.0 h1:2TCyvBrMu1Z25rvIAlnp2dPT4lgh/uTqLqiXVpp5AeU= +github.com/quic-go/quic-go v0.48.0/go.mod h1:yBgs3rWBOADpga7F+jJsb6Ybg1LSYiQvwWlLX+/6HMs= +github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= +github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/rs/cors v1.6.0/go.mod h1:gFx+x8UowdsKA9AchylcLynDq+nNFfI8FkUZdN/jGCU= -github.com/rs/xid v1.2.1/go.mod h1:+uKXf+4Djp6Md1KODXJxgGQPKngRmWyn10oCKFzNHOQ= -github.com/rs/zerolog v1.13.0/go.mod h1:YbFCdg8HfsridGWAh22vktObvhZbQsZXe4/zB0OKkWU= -github.com/rs/zerolog v1.15.0/go.mod h1:xYTKnLHcpfU2225ny5qZjxnj9NvkumZYjJHlAThCjNc= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= github.com/shopspring/decimal v0.0.0-20180709203117-cd690d0c9e24/go.mod h1:M+9NzErvs504Cn4c5DxATwIqPbtswREoFCre64PpcG4= -github.com/shopspring/decimal v1.2.0 h1:abSATXmQEYyShuxI4/vyW3tV1MrKAJzCZ/0zLUXYbsQ= -github.com/shopspring/decimal v1.2.0/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= github.com/sirupsen/logrus v1.1.1/go.mod h1:zrgwTnHtNr00buQ1vSptGe8m1f/BbgsPukg8qsT7A+A= -github.com/sirupsen/logrus v1.4.1/go.mod h1:ni0Sbl8bgC9z8RoU9G6nDWqqs/fq4eDPysMBDgk/93Q= -github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= -github.com/skycoin/dmsg v1.3.28 h1:zpQwkCJ0/hmlVb9wTQZDOTl/bC9e1Hsxh7Q6/jHG+4I= -github.com/skycoin/dmsg v1.3.28/go.mod h1:AxsiUhS7FpllJv/gHO/OlI3mGyeznIp9zitXwKWQXeQ= +github.com/skycoin/dmsg v1.3.29-0.20241019182716-022283c93835 h1:29jU+cxqQGJCWoxGsrgk3nIF+JDipvJsQU89x6FKSZw= +github.com/skycoin/dmsg v1.3.29-0.20241019182716-022283c93835/go.mod h1:hFJtHnX+pDC6v5Z81QPbpVktMMfMNzL/kgcZmCKu3Bg= github.com/skycoin/encodertest v0.0.0-20190217072920-14c2e31898b9/go.mod h1:OQz8NXVJUWEw7PWYASZ/1BIw5GXgVMTGvrCGDlZa9+k= github.com/skycoin/noise v0.0.0-20180327030543-2492fe189ae6 h1:1Nc5EBY6pjfw1kwW0duwyG+7WliWz5u9kgk1h5MnLuA= github.com/skycoin/noise v0.0.0-20180327030543-2492fe189ae6/go.mod h1:UXghlricA7J3aRD/k7p/zBObQfmBawwCxIVPVjz2Q3o= @@ -247,8 +168,8 @@ github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B github.com/spf13/cast v1.2.0/go.mod h1:r2rcYCSwa1IExKTDiTfzaxqT2FNHs8hODu4LnUfgKEg= github.com/spf13/cobra v0.0.3/go.mod h1:1l0Ry5zgKvJasoi3XT1TypsSe7PqH0Sj9dhYf7v3XqQ= github.com/spf13/cobra v1.4.0/go.mod h1:Wo4iy3BUC+X2Fybo0PDqwJIv3dNRiZLHQymsfxlB84g= -github.com/spf13/cobra v1.7.0 h1:hyqWnYt1ZQShIddO5kBpj3vu05/++x6tJ6dg8EC572I= -github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0= +github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= +github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= github.com/spf13/pflag v1.0.2/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= @@ -257,14 +178,10 @@ github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An github.com/spf13/viper v1.2.1/go.mod h1:P4AexN0a+C9tGAnUFNwDMYYZv3pjFuvmeiMyKRaNVlI= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= -github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= -github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= @@ -284,120 +201,72 @@ github.com/xtaci/kcp-go v5.4.20+incompatible h1:TN1uey3Raw0sTz0Fg8GkfM0uH3YwzhnZ github.com/xtaci/kcp-go v5.4.20+incompatible/go.mod h1:bN6vIwHQbfHaHtFpEssmWsN45a+AZwO7eyRCmEIbtvE= github.com/xtaci/lossyconn v0.0.0-20200209145036-adba10fffc37 h1:EWU6Pktpas0n8lLQwDsRyZfmkPeRbdgPtW609es+/9E= github.com/xtaci/lossyconn v0.0.0-20200209145036-adba10fffc37/go.mod h1:HpMP7DB2CyokmAh4lp0EQnnWhmycP/TvwBGzvuie+H0= -github.com/zenazn/goji v0.9.0/go.mod h1:7S9M489iMyHBNxwZnk9/EHS098H4/F6TATF2mIxtB1Q= -go.etcd.io/bbolt v1.3.7 h1:j+zJOnnEjF/kyHlDDgGnVL/AIqIJPq8UoB2GSNfkUfQ= -go.etcd.io/bbolt v1.3.7/go.mod h1:N9Mkw9X8x5fupy0IKsmuqVtoGDyxsaDlbk4Rd05IAQw= -go.uber.org/atomic v1.3.2/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= -go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= -go.uber.org/atomic v1.5.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ= -go.uber.org/atomic v1.6.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ= -go.uber.org/mock v0.4.0 h1:VcM4ZOtdbR4f6VXfiOpwpVJDL6lCReaZ6mw31wqh7KU= -go.uber.org/mock v0.4.0/go.mod h1:a6FSlNadKUHUa9IP5Vyt1zh4fC7uAwxMutEAscFbkZc= -go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= -go.uber.org/multierr v1.3.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4= -go.uber.org/multierr v1.5.0/go.mod h1:FeouvMocqHpRaaGuG9EjoKcStLC43Zu/fmqdUMPcKYU= -go.uber.org/tools v0.0.0-20190618225709-2cfd321de3ee/go.mod h1:vJERXedbb3MVM5f9Ejo0C68/HhF8uaILCdgjnY+goOA= -go.uber.org/zap v1.9.1/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= -go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= -go.uber.org/zap v1.13.0/go.mod h1:zwrFLgMcdUuIBviXEYEH1YKNaOBnKXsx2IPda5bBwHM= +go.etcd.io/bbolt v1.3.11 h1:yGEzV1wPz2yVCLsD8ZAiGHhHVlczyC9d1rP43/VCRJ0= +go.etcd.io/bbolt v1.3.11/go.mod h1:dksAq7YMXoljX0xu6VF5DMZGbhYYoLUalEiSySYAS4I= +go.uber.org/mock v0.5.0 h1:KAMbZvZPyBPWgD14IrIQ38QCyjwpvVVV6K/bHl1IwQU= +go.uber.org/mock v0.5.0/go.mod h1:ge71pBPLYDk7QIi1LupWxdAykm7KIEFchiOqd6z7qMM= golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20181015023909-0c41d7ab0a0e/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20190411191339-88737f569e3a/go.mod h1:WFFai1msRO1wXaEeE5yQxYXgSfI8pQAWXbQop6sCtWE= -golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20190820162420-60c769a6c586/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20201012173705-84dcc777aaee/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20201203163018-be400aefbc4c/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I= -golang.org/x/crypto v0.0.0-20210616213533-5ff15b29337e/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.20.0 h1:jmAMJJZXr5KiCw05dfYK9QnqaqKLYXijU23lsEdcQqg= -golang.org/x/crypto v0.20.0/go.mod h1:Xwo95rrVNIoSMx9wa1JroENMToLWn3RNVrTBpLHgZPQ= +golang.org/x/crypto v0.28.0 h1:GBDwsMXVQi34v5CCYUm2jkJvu4cbtru2U4TN2PSyQnw= +golang.org/x/crypto v0.28.0/go.mod h1:rmgy+3RHxRZMyY0jjAJShp2zgEdOqj2AO7U0pYmeQ7U= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ= -golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8= +golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c h1:7dEasQXItcW1xKJ2+gg5VOiBnqWrJc+rq0DPKyvvdbY= +golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c/go.mod h1:NQtJDoLvd6faHhE7m4T/1IY708gDefGGjR/iUW8yQQ8= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= -golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= -golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= -golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= -golang.org/x/mod v0.12.0 h1:rmsUpXtvNzj340zd98LZ4KntptpfRHwpFOHG188oHXc= -golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0= +golang.org/x/mod v0.21.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181023162649-9b4f9f5ad519/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20190813141303-74dc4d7220e7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201010224723-4f7140c49acb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= -golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.30.0 h1:AcW1SDZMkb8IpzCdQUaIq2sP4sZ4zw+55h6ynffypl4= +golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180906133057-8cf3aee42992/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181023152157-44b849a8bc13/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190403152447-81d4e9dc473e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= -golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.17.0 h1:mkTF7LCd6WGJNL3K1Ad7kwxNfYAW6a8a8QqtMblp/4U= -golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= +golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.25.0 h1:WtHI/ltw4NvSUig5KARz9h521QvRC8RmF/cuYqifU24= +golang.org/x/term v0.25.0/go.mod h1:RPyXicDX+6vLxogjjRxjgD2TKtmAO6NZBsBRfrOLu7M= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= -golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= +golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20190425163242-31fd60d6bfdc/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= -golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= -golang.org/x/tools v0.0.0-20190823170909-c4a336ef6a2f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191029041327-9cc4af7d6b2c/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191029190741-b9c20aec41a5/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20200103221440-774c71fcf114/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.12.1-0.20230815132531-74c255bcf846 h1:Vve/L0v7CXXuxUmaMGIEK/dEeq7uiqb5qBgQrZzIE7E= -golang.org/x/tools v0.12.1-0.20230815132531-74c255bcf846/go.mod h1:Sc0INKfu04TlqNoRA1hgpFZbhYXHPr4V5DzpSBTPqQM= -golang.org/x/xerrors v0.0.0-20190410155217-1f06c39b4373/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20190513163551-3ee3066db522/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/tools v0.26.0 h1:v/60pFQmzmT9ExmjDv2gGIfi3OqfKoEP6I5+umXlbnQ= +golang.org/x/tools v0.26.0/go.mod h1:TPVVj70c7JJ3WCazhD8OdXcZg/og+b9+tH/KxylGwH0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= @@ -412,27 +281,21 @@ google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQ google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= -google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= -google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA= +google.golang.org/protobuf v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= -gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= -gopkg.in/inconshreveable/log15.v2 v2.0.0-20180818164646-67afb5ed74ec/go.mod h1:aPpfJ7XW+gOuirDoZ8gHhLh3kZ1B08FtV2bbmy7Jv3s= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gorm.io/driver/postgres v1.3.8 h1:8bEphSAB69t3odsCR4NDzt581iZEWQuRM27Cg6KgfPY= -gorm.io/driver/postgres v1.3.8/go.mod h1:qB98Aj6AhRO/oyu/jmZsi/YM9g6UzVCjMxO/6frFvcA= -gorm.io/gorm v1.23.6/go.mod h1:l2lP/RyAtc1ynaTjFksBde/O8v9oOGIApu2/xRitmZk= -gorm.io/gorm v1.23.8 h1:h8sGJ+biDgBA1AD1Ha9gFCx7h8npU7AsLdlkX0n2TpE= -gorm.io/gorm v1.23.8/go.mod h1:l2lP/RyAtc1ynaTjFksBde/O8v9oOGIApu2/xRitmZk= +gorm.io/driver/postgres v1.5.9 h1:DkegyItji119OlcaLjqN11kHoUgZ/j13E0jkJZgD6A8= +gorm.io/driver/postgres v1.5.9/go.mod h1:DX3GReXH+3FPWGrrgffdvCk3DQ1dwDPdmbenSkweRGI= +gorm.io/gorm v1.25.12 h1:I0u8i2hWQItBq1WfE0o2+WuL9+8L21K9e2HHSTE/0f8= +gorm.io/gorm v1.25.12/go.mod h1:xh7N7RHfYlNc5EmcI/El95gXusucDrQnHXe0+CgWcLQ= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= diff --git a/pkg/service-discovery/store/postgres_store.go b/pkg/service-discovery/store/postgres_store.go index ff2aa68..d112b11 100644 --- a/pkg/service-discovery/store/postgres_store.go +++ b/pkg/service-discovery/store/postgres_store.go @@ -7,6 +7,7 @@ import ( "net/http" "sync" + "github.com/chen3feng/safecast" "github.com/lib/pq" "github.com/sirupsen/logrus" "github.com/skycoin/skywire-utilities/pkg/geo" @@ -123,7 +124,12 @@ func (s *postgresStore) CountServiceTypes(_ context.Context) (uint64, error) { return uint64(0), fmt.Errorf("Postgres command returned unexpected error: %w", err) } - return uint64(countTypes), nil + u, ok := safecast.To[uint64](countTypes) + if ok { + return u, nil + } + return uint64(0), fmt.Errorf("uint64 conversion overflow") + } func (s *postgresStore) CountServices(ctx context.Context, serviceType string) (uint64, error) { diff --git a/vendor/github.com/VictoriaMetrics/metrics/README.md b/vendor/github.com/VictoriaMetrics/metrics/README.md index e1a2537..b01d81e 100644 --- a/vendor/github.com/VictoriaMetrics/metrics/README.md +++ b/vendor/github.com/VictoriaMetrics/metrics/README.md @@ -73,8 +73,11 @@ http.HandleFunc("/metrics", func(w http.ResponseWriter, req *http.Request) { metrics.InitPush("http://victoria-metrics:8428/api/v1/import/prometheus", 10*time.Second, `instance="foobar"`, true) ``` -See [docs](http://godoc.org/github.com/VictoriaMetrics/metrics) for more info. +By default, exposed metrics [do not have](https://github.com/VictoriaMetrics/metrics/issues/48#issuecomment-1620765811) +`TYPE` or `HELP` meta information. Call [`ExposeMetadata(true)`](https://pkg.go.dev/github.com/VictoriaMetrics/metrics#ExposeMetadata) +in order to generate `TYPE` and `HELP` meta information per each metric. +See [docs](https://pkg.go.dev/github.com/VictoriaMetrics/metrics) for more info. ### Users diff --git a/vendor/github.com/VictoriaMetrics/metrics/counter.go b/vendor/github.com/VictoriaMetrics/metrics/counter.go index dfe9477..1076e80 100644 --- a/vendor/github.com/VictoriaMetrics/metrics/counter.go +++ b/vendor/github.com/VictoriaMetrics/metrics/counter.go @@ -42,6 +42,11 @@ func (c *Counter) Add(n int) { atomic.AddUint64(&c.n, uint64(n)) } +// AddInt64 adds n to c. +func (c *Counter) AddInt64(n int64) { + atomic.AddUint64(&c.n, uint64(n)) +} + // Get returns the current value for c. func (c *Counter) Get() uint64 { return atomic.LoadUint64(&c.n) @@ -58,6 +63,10 @@ func (c *Counter) marshalTo(prefix string, w io.Writer) { fmt.Fprintf(w, "%s %d\n", prefix, v) } +func (c *Counter) metricType() string { + return "counter" +} + // GetOrCreateCounter returns registered counter with the given name // or creates new counter if the registry doesn't contain counter with // the given name. diff --git a/vendor/github.com/VictoriaMetrics/metrics/floatcounter.go b/vendor/github.com/VictoriaMetrics/metrics/floatcounter.go index f898790..8bd9fa6 100644 --- a/vendor/github.com/VictoriaMetrics/metrics/floatcounter.go +++ b/vendor/github.com/VictoriaMetrics/metrics/floatcounter.go @@ -63,6 +63,10 @@ func (fc *FloatCounter) marshalTo(prefix string, w io.Writer) { fmt.Fprintf(w, "%s %g\n", prefix, v) } +func (fc *FloatCounter) metricType() string { + return "counter" +} + // GetOrCreateFloatCounter returns registered FloatCounter with the given name // or creates new FloatCounter if the registry doesn't contain FloatCounter with // the given name. diff --git a/vendor/github.com/VictoriaMetrics/metrics/gauge.go b/vendor/github.com/VictoriaMetrics/metrics/gauge.go index 9084fc4..3573e14 100644 --- a/vendor/github.com/VictoriaMetrics/metrics/gauge.go +++ b/vendor/github.com/VictoriaMetrics/metrics/gauge.go @@ -3,10 +3,11 @@ package metrics import ( "fmt" "io" + "math" + "sync/atomic" ) -// NewGauge registers and returns gauge with the given name, which calls f -// to obtain gauge value. +// NewGauge registers and returns gauge with the given name, which calls f to obtain gauge value. // // name must be valid Prometheus-compatible metric with possible labels. // For instance, @@ -16,6 +17,7 @@ import ( // - foo{bar="baz",aaa="b"} // // f must be safe for concurrent calls. +// if f is nil, then it is expected that the gauge value is changed via Set(), Inc(), Dec() and Add() calls. // // The returned gauge is safe to use from concurrent goroutines. // @@ -25,19 +27,68 @@ func NewGauge(name string, f func() float64) *Gauge { } // Gauge is a float64 gauge. -// -// See also Counter, which could be used as a gauge with Set and Dec calls. type Gauge struct { + // valueBits contains uint64 representation of float64 passed to Gauge.Set. + valueBits uint64 + + // f is a callback, which is called for returning the gauge value. f func() float64 } // Get returns the current value for g. func (g *Gauge) Get() float64 { - return g.f() + if f := g.f; f != nil { + return f() + } + n := atomic.LoadUint64(&g.valueBits) + return math.Float64frombits(n) +} + +// Set sets g value to v. +// +// The g must be created with nil callback in order to be able to call this function. +func (g *Gauge) Set(v float64) { + if g.f != nil { + panic(fmt.Errorf("cannot call Set on gauge created with non-nil callback")) + } + n := math.Float64bits(v) + atomic.StoreUint64(&g.valueBits, n) +} + +// Inc increments g by 1. +// +// The g must be created with nil callback in order to be able to call this function. +func (g *Gauge) Inc() { + g.Add(1) +} + +// Dec decrements g by 1. +// +// The g must be created with nil callback in order to be able to call this function. +func (g *Gauge) Dec() { + g.Add(-1) +} + +// Add adds fAdd to g. fAdd may be positive and negative. +// +// The g must be created with nil callback in order to be able to call this function. +func (g *Gauge) Add(fAdd float64) { + if g.f != nil { + panic(fmt.Errorf("cannot call Set on gauge created with non-nil callback")) + } + for { + n := atomic.LoadUint64(&g.valueBits) + f := math.Float64frombits(n) + fNew := f + fAdd + nNew := math.Float64bits(fNew) + if atomic.CompareAndSwapUint64(&g.valueBits, n, nNew) { + break + } + } } func (g *Gauge) marshalTo(prefix string, w io.Writer) { - v := g.f() + v := g.Get() if float64(int64(v)) == v { // Marshal integer values without scientific notation fmt.Fprintf(w, "%s %d\n", prefix, int64(v)) @@ -46,6 +97,10 @@ func (g *Gauge) marshalTo(prefix string, w io.Writer) { } } +func (g *Gauge) metricType() string { + return "gauge" +} + // GetOrCreateGauge returns registered gauge with the given name // or creates new gauge if the registry doesn't contain gauge with // the given name. diff --git a/vendor/github.com/VictoriaMetrics/metrics/go_metrics.go b/vendor/github.com/VictoriaMetrics/metrics/go_metrics.go index f8b6067..d8b949d 100644 --- a/vendor/github.com/VictoriaMetrics/metrics/go_metrics.go +++ b/vendor/github.com/VictoriaMetrics/metrics/go_metrics.go @@ -3,41 +3,78 @@ package metrics import ( "fmt" "io" + "log" + "math" "runtime" + runtimemetrics "runtime/metrics" + "strings" "github.com/valyala/histogram" ) +// See https://pkg.go.dev/runtime/metrics#hdr-Supported_metrics +var runtimeMetrics = [][2]string{ + {"/sched/latencies:seconds", "go_sched_latencies_seconds"}, + {"/sync/mutex/wait/total:seconds", "go_mutex_wait_seconds_total"}, + {"/cpu/classes/gc/mark/assist:cpu-seconds", "go_gc_mark_assist_cpu_seconds_total"}, + {"/cpu/classes/gc/total:cpu-seconds", "go_gc_cpu_seconds_total"}, + {"/gc/pauses:seconds", "go_gc_pauses_seconds"}, + {"/cpu/classes/scavenge/total:cpu-seconds", "go_scavenge_cpu_seconds_total"}, + {"/gc/gomemlimit:bytes", "go_memlimit_bytes"}, +} + +var supportedRuntimeMetrics = initSupportedRuntimeMetrics(runtimeMetrics) + +func initSupportedRuntimeMetrics(rms [][2]string) [][2]string { + exposedMetrics := make(map[string]struct{}) + for _, d := range runtimemetrics.All() { + exposedMetrics[d.Name] = struct{}{} + } + var supportedMetrics [][2]string + for _, rm := range rms { + metricName := rm[0] + if _, ok := exposedMetrics[metricName]; ok { + supportedMetrics = append(supportedMetrics, rm) + } else { + log.Printf("github.com/VictoriaMetrics/metrics: do not expose %s metric, since the corresponding metric %s isn't supported in the current Go runtime", rm[1], metricName) + } + } + return supportedMetrics +} + func writeGoMetrics(w io.Writer) { + writeRuntimeMetrics(w) + var ms runtime.MemStats runtime.ReadMemStats(&ms) - fmt.Fprintf(w, "go_memstats_alloc_bytes %d\n", ms.Alloc) - fmt.Fprintf(w, "go_memstats_alloc_bytes_total %d\n", ms.TotalAlloc) - fmt.Fprintf(w, "go_memstats_buck_hash_sys_bytes %d\n", ms.BuckHashSys) - fmt.Fprintf(w, "go_memstats_frees_total %d\n", ms.Frees) - fmt.Fprintf(w, "go_memstats_gc_cpu_fraction %g\n", ms.GCCPUFraction) - fmt.Fprintf(w, "go_memstats_gc_sys_bytes %d\n", ms.GCSys) - fmt.Fprintf(w, "go_memstats_heap_alloc_bytes %d\n", ms.HeapAlloc) - fmt.Fprintf(w, "go_memstats_heap_idle_bytes %d\n", ms.HeapIdle) - fmt.Fprintf(w, "go_memstats_heap_inuse_bytes %d\n", ms.HeapInuse) - fmt.Fprintf(w, "go_memstats_heap_objects %d\n", ms.HeapObjects) - fmt.Fprintf(w, "go_memstats_heap_released_bytes %d\n", ms.HeapReleased) - fmt.Fprintf(w, "go_memstats_heap_sys_bytes %d\n", ms.HeapSys) - fmt.Fprintf(w, "go_memstats_last_gc_time_seconds %g\n", float64(ms.LastGC)/1e9) - fmt.Fprintf(w, "go_memstats_lookups_total %d\n", ms.Lookups) - fmt.Fprintf(w, "go_memstats_mallocs_total %d\n", ms.Mallocs) - fmt.Fprintf(w, "go_memstats_mcache_inuse_bytes %d\n", ms.MCacheInuse) - fmt.Fprintf(w, "go_memstats_mcache_sys_bytes %d\n", ms.MCacheSys) - fmt.Fprintf(w, "go_memstats_mspan_inuse_bytes %d\n", ms.MSpanInuse) - fmt.Fprintf(w, "go_memstats_mspan_sys_bytes %d\n", ms.MSpanSys) - fmt.Fprintf(w, "go_memstats_next_gc_bytes %d\n", ms.NextGC) - fmt.Fprintf(w, "go_memstats_other_sys_bytes %d\n", ms.OtherSys) - fmt.Fprintf(w, "go_memstats_stack_inuse_bytes %d\n", ms.StackInuse) - fmt.Fprintf(w, "go_memstats_stack_sys_bytes %d\n", ms.StackSys) - fmt.Fprintf(w, "go_memstats_sys_bytes %d\n", ms.Sys) - - fmt.Fprintf(w, "go_cgo_calls_count %d\n", runtime.NumCgoCall()) - fmt.Fprintf(w, "go_cpu_count %d\n", runtime.NumCPU()) + WriteGaugeUint64(w, "go_memstats_alloc_bytes", ms.Alloc) + WriteCounterUint64(w, "go_memstats_alloc_bytes_total", ms.TotalAlloc) + WriteGaugeUint64(w, "go_memstats_buck_hash_sys_bytes", ms.BuckHashSys) + WriteCounterUint64(w, "go_memstats_frees_total", ms.Frees) + WriteGaugeFloat64(w, "go_memstats_gc_cpu_fraction", ms.GCCPUFraction) + WriteGaugeUint64(w, "go_memstats_gc_sys_bytes", ms.GCSys) + + WriteGaugeUint64(w, "go_memstats_heap_alloc_bytes", ms.HeapAlloc) + WriteGaugeUint64(w, "go_memstats_heap_idle_bytes", ms.HeapIdle) + WriteGaugeUint64(w, "go_memstats_heap_inuse_bytes", ms.HeapInuse) + WriteGaugeUint64(w, "go_memstats_heap_objects", ms.HeapObjects) + WriteGaugeUint64(w, "go_memstats_heap_released_bytes", ms.HeapReleased) + WriteGaugeUint64(w, "go_memstats_heap_sys_bytes", ms.HeapSys) + WriteGaugeFloat64(w, "go_memstats_last_gc_time_seconds", float64(ms.LastGC)/1e9) + WriteCounterUint64(w, "go_memstats_lookups_total", ms.Lookups) + WriteCounterUint64(w, "go_memstats_mallocs_total", ms.Mallocs) + WriteGaugeUint64(w, "go_memstats_mcache_inuse_bytes", ms.MCacheInuse) + WriteGaugeUint64(w, "go_memstats_mcache_sys_bytes", ms.MCacheSys) + WriteGaugeUint64(w, "go_memstats_mspan_inuse_bytes", ms.MSpanInuse) + WriteGaugeUint64(w, "go_memstats_mspan_sys_bytes", ms.MSpanSys) + WriteGaugeUint64(w, "go_memstats_next_gc_bytes", ms.NextGC) + WriteGaugeUint64(w, "go_memstats_other_sys_bytes", ms.OtherSys) + WriteGaugeUint64(w, "go_memstats_stack_inuse_bytes", ms.StackInuse) + WriteGaugeUint64(w, "go_memstats_stack_sys_bytes", ms.StackSys) + WriteGaugeUint64(w, "go_memstats_sys_bytes", ms.Sys) + + WriteCounterUint64(w, "go_cgo_calls_count", uint64(runtime.NumCgoCall())) + WriteGaugeUint64(w, "go_cpu_count", uint64(runtime.NumCPU())) gcPauses := histogram.NewFast() for _, pauseNs := range ms.PauseNs[:] { @@ -45,20 +82,103 @@ func writeGoMetrics(w io.Writer) { } phis := []float64{0, 0.25, 0.5, 0.75, 1} quantiles := make([]float64, 0, len(phis)) + WriteMetadataIfNeeded(w, "go_gc_duration_seconds", "summary") for i, q := range gcPauses.Quantiles(quantiles[:0], phis) { fmt.Fprintf(w, `go_gc_duration_seconds{quantile="%g"} %g`+"\n", phis[i], q) } - fmt.Fprintf(w, `go_gc_duration_seconds_sum %g`+"\n", float64(ms.PauseTotalNs)/1e9) - fmt.Fprintf(w, `go_gc_duration_seconds_count %d`+"\n", ms.NumGC) - fmt.Fprintf(w, `go_gc_forced_count %d`+"\n", ms.NumForcedGC) + fmt.Fprintf(w, "go_gc_duration_seconds_sum %g\n", float64(ms.PauseTotalNs)/1e9) + fmt.Fprintf(w, "go_gc_duration_seconds_count %d\n", ms.NumGC) + + WriteCounterUint64(w, "go_gc_forced_count", uint64(ms.NumForcedGC)) - fmt.Fprintf(w, `go_gomaxprocs %d`+"\n", runtime.GOMAXPROCS(0)) - fmt.Fprintf(w, `go_goroutines %d`+"\n", runtime.NumGoroutine()) + WriteGaugeUint64(w, "go_gomaxprocs", uint64(runtime.GOMAXPROCS(0))) + WriteGaugeUint64(w, "go_goroutines", uint64(runtime.NumGoroutine())) numThread, _ := runtime.ThreadCreateProfile(nil) - fmt.Fprintf(w, `go_threads %d`+"\n", numThread) + WriteGaugeUint64(w, "go_threads", uint64(numThread)) // Export build details. + WriteMetadataIfNeeded(w, "go_info", "gauge") fmt.Fprintf(w, "go_info{version=%q} 1\n", runtime.Version()) + + WriteMetadataIfNeeded(w, "go_info_ext", "gauge") fmt.Fprintf(w, "go_info_ext{compiler=%q, GOARCH=%q, GOOS=%q, GOROOT=%q} 1\n", runtime.Compiler, runtime.GOARCH, runtime.GOOS, runtime.GOROOT()) } + +func writeRuntimeMetrics(w io.Writer) { + samples := make([]runtimemetrics.Sample, len(supportedRuntimeMetrics)) + for i, rm := range supportedRuntimeMetrics { + samples[i].Name = rm[0] + } + runtimemetrics.Read(samples) + for i, rm := range supportedRuntimeMetrics { + writeRuntimeMetric(w, rm[1], &samples[i]) + } +} + +func writeRuntimeMetric(w io.Writer, name string, sample *runtimemetrics.Sample) { + kind := sample.Value.Kind() + switch kind { + case runtimemetrics.KindBad: + panic(fmt.Errorf("BUG: unexpected runtimemetrics.KindBad for sample.Name=%q", sample.Name)) + case runtimemetrics.KindUint64: + v := sample.Value.Uint64() + if strings.HasSuffix(name, "_total") { + WriteCounterUint64(w, name, v) + } else { + WriteGaugeUint64(w, name, v) + } + case runtimemetrics.KindFloat64: + v := sample.Value.Float64() + if isCounterName(name) { + WriteCounterFloat64(w, name, v) + } else { + WriteGaugeFloat64(w, name, v) + } + case runtimemetrics.KindFloat64Histogram: + h := sample.Value.Float64Histogram() + writeRuntimeHistogramMetric(w, name, h) + default: + panic(fmt.Errorf("unexpected metric kind=%d", kind)) + } +} + +func writeRuntimeHistogramMetric(w io.Writer, name string, h *runtimemetrics.Float64Histogram) { + buckets := h.Buckets + counts := h.Counts + if len(buckets) != len(counts)+1 { + panic(fmt.Errorf("the number of buckets must be bigger than the number of counts by 1 in histogram %s; got buckets=%d, counts=%d", name, len(buckets), len(counts))) + } + tailCount := uint64(0) + if strings.HasSuffix(name, "_seconds") { + // Limit the maximum bucket to 1 second, since Go runtime exposes buckets with 10K seconds, + // which have little sense. At the same time such buckets may lead to high cardinality issues + // at the scraper side. + for len(buckets) > 0 && buckets[len(buckets)-1] > 1 { + buckets = buckets[:len(buckets)-1] + tailCount += counts[len(counts)-1] + counts = counts[:len(counts)-1] + } + } + + iStep := float64(len(buckets)) / maxRuntimeHistogramBuckets + + totalCount := uint64(0) + iNext := 0.0 + WriteMetadataIfNeeded(w, name, "histogram") + for i, count := range counts { + totalCount += count + if float64(i) >= iNext { + iNext += iStep + le := buckets[i+1] + if !math.IsInf(le, 1) { + fmt.Fprintf(w, `%s_bucket{le="%g"} %d`+"\n", name, le, totalCount) + } + } + } + totalCount += tailCount + fmt.Fprintf(w, `%s_bucket{le="+Inf"} %d`+"\n", name, totalCount) +} + +// Limit the number of buckets for Go runtime histograms in order to prevent from high cardinality issues at scraper side. +const maxRuntimeHistogramBuckets = 30 diff --git a/vendor/github.com/VictoriaMetrics/metrics/histogram.go b/vendor/github.com/VictoriaMetrics/metrics/histogram.go index a576681..d703ae8 100644 --- a/vendor/github.com/VictoriaMetrics/metrics/histogram.go +++ b/vendor/github.com/VictoriaMetrics/metrics/histogram.go @@ -36,7 +36,7 @@ var bucketMultiplier = math.Pow(10, 1.0/bucketsPerDecimal) // // Histogram buckets can be converted to Prometheus-like buckets with `le` labels // with `prometheus_buckets(_bucket)` function from PromQL extensions in VictoriaMetrics. -// (see https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL ): +// (see https://docs.victoriametrics.com/metricsql/ ): // // prometheus_buckets(request_duration_bucket) // @@ -47,13 +47,21 @@ var bucketMultiplier = math.Pow(10, 1.0/bucketsPerDecimal) // Zero histogram is usable. type Histogram struct { // Mu gurantees synchronous update for all the counters and sum. + // + // Do not use sync.RWMutex, since it has zero sense from performance PoV. + // It only complicates the code. mu sync.Mutex + // decimalBuckets contains counters for histogram buckets decimalBuckets [decimalBucketsCount]*[bucketsPerDecimal]uint64 + // lower is the number of values, which hit the lower bucket lower uint64 + + // upper is the number of values, which hit the upper bucket upper uint64 + // sum is the sum of all the values put into Histogram sum float64 } @@ -109,6 +117,34 @@ func (h *Histogram) Update(v float64) { h.mu.Unlock() } +// Merge merges src to h +func (h *Histogram) Merge(src *Histogram) { + h.mu.Lock() + defer h.mu.Unlock() + + src.mu.Lock() + defer src.mu.Unlock() + + h.lower += src.lower + h.upper += src.upper + h.sum += src.sum + + for i, dbSrc := range src.decimalBuckets { + if dbSrc == nil { + continue + } + dbDst := h.decimalBuckets[i] + if dbDst == nil { + var b [bucketsPerDecimal]uint64 + dbDst = &b + h.decimalBuckets[i] = dbDst + } + for j := range dbSrc { + dbDst[j] += dbSrc[j] + } + } +} + // VisitNonZeroBuckets calls f for all buckets with non-zero counters. // // vmrange contains "..." string with bucket bounds. The lower bound @@ -228,3 +264,7 @@ func (h *Histogram) getSum() float64 { h.mu.Unlock() return sum } + +func (h *Histogram) metricType() string { + return "histogram" +} diff --git a/vendor/github.com/VictoriaMetrics/metrics/metrics.go b/vendor/github.com/VictoriaMetrics/metrics/metrics.go index 7dfa972..74e9735 100644 --- a/vendor/github.com/VictoriaMetrics/metrics/metrics.go +++ b/vendor/github.com/VictoriaMetrics/metrics/metrics.go @@ -13,9 +13,12 @@ package metrics import ( + "fmt" "io" "sort" + "strings" "sync" + "sync/atomic" "unsafe" ) @@ -27,6 +30,7 @@ type namedMetric struct { type metric interface { marshalTo(prefix string, w io.Writer) + metricType() string } var defaultSet = NewSet() @@ -51,16 +55,33 @@ func RegisterSet(s *Set) { // UnregisterSet stops exporting metrics for the given s via global WritePrometheus() call. // -// Call s.UnregisterAllMetrics() after unregistering s if it is no longer used. -func UnregisterSet(s *Set) { +// If destroySet is set to true, then s.UnregisterAllMetrics() is called on s after unregistering it, +// so s becomes destroyed. Otherwise the s can be registered again in the set by passing it to RegisterSet(). +func UnregisterSet(s *Set, destroySet bool) { registeredSetsLock.Lock() delete(registeredSets, s) registeredSetsLock.Unlock() + + if destroySet { + s.UnregisterAllMetrics() + } } -// WritePrometheus writes all the metrics from default set and all the registered sets in Prometheus format to w. +// RegisterMetricsWriter registers writeMetrics callback for including metrics in the output generated by WritePrometheus. +// +// The writeMetrics callback must write metrics to w in Prometheus text exposition format without timestamps and trailing comments. +// The last line generated by writeMetrics must end with \n. +// See https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md#text-based-format +// +// It is OK to register multiple writeMetrics callbacks - all of them will be called sequentially for gererating the output at WritePrometheus. +func RegisterMetricsWriter(writeMetrics func(w io.Writer)) { + defaultSet.RegisterMetricsWriter(writeMetrics) +} + +// WritePrometheus writes all the metrics in Prometheus format from the default set, all the added sets and metrics writers to w. // // Additional sets can be registered via RegisterSet() call. +// Additional metric writers can be registered via RegisterMetricsWriter() call. // // If exposeProcessMetrics is true, then various `go_*` and `process_*` metrics // are exposed for the current process. @@ -134,10 +155,26 @@ func WritePrometheus(w io.Writer, exposeProcessMetrics bool) { // // - process_io_storage_written_bytes_total - the number of bytes actually written to disk // +// - go_sched_latencies_seconds - time spent by goroutines in ready state before they start execution +// +// - go_mutex_wait_seconds_total - summary time spent by all the goroutines while waiting for locked mutex +// +// - go_gc_mark_assist_cpu_seconds_total - summary CPU time spent by goroutines in GC mark assist state +// +// - go_gc_cpu_seconds_total - summary time spent in GC +// +// - go_gc_pauses_seconds - duration of GC pauses +// +// - go_scavenge_cpu_seconds_total - CPU time spent on returning the memory to OS +// +// - go_memlimit_bytes - the GOMEMLIMIT env var value +// // - go_memstats_alloc_bytes - memory usage for Go objects in the heap // // - go_memstats_alloc_bytes_total - the cumulative counter for total size of allocated Go objects // +// - go_memstats_buck_hash_sys_bytes - bytes of memory in profiling bucket hash tables +// // - go_memstats_frees_total - the cumulative counter for number of freed Go objects // // - go_memstats_gc_cpu_fraction - the fraction of CPU spent in Go garbage collector @@ -148,20 +185,42 @@ func WritePrometheus(w io.Writer, exposeProcessMetrics bool) { // // - go_memstats_heap_idle_bytes - idle memory ready for new Go object allocations // +// - go_memstats_heap_inuse_bytes - bytes in in-use spans +// // - go_memstats_heap_objects - the number of Go objects in the heap // +// - go_memstats_heap_released_bytes - bytes of physical memory returned to the OS +// // - go_memstats_heap_sys_bytes - memory requested for Go objects from the OS // +// - go_memstats_last_gc_time_seconds - unix timestamp the last garbage collection finished +// +// - go_memstats_lookups_total - the number of pointer lookups performed by the runtime +// // - go_memstats_mallocs_total - the number of allocations for Go objects // +// - go_memstats_mcache_inuse_bytes - bytes of allocated mcache structures +// +// - go_memstats_mcache_sys_bytes - bytes of memory obtained from the OS for mcache structures +// +// - go_memstats_mspan_inuse_bytes - bytes of allocated mspan structures +// +// - go_memstats_mspan_sys_bytes - bytes of memory obtained from the OS for mspan structures +// // - go_memstats_next_gc_bytes - the target heap size when the next garbage collection should start // +// - go_memstats_other_sys_bytes - bytes of memory in miscellaneous off-heap runtime allocations +// // - go_memstats_stack_inuse_bytes - memory used for goroutine stacks // // - go_memstats_stack_sys_bytes - memory requested fromthe OS for goroutine stacks // // - go_memstats_sys_bytes - memory requested by Go runtime from the OS // +// - go_cgo_calls_count - the total number of CGO calls +// +// - go_cpu_count - the number of CPU cores on the host where the app runs +// // The WriteProcessMetrics func is usually called in combination with writing Set metrics // inside "/metrics" handler: // @@ -170,7 +229,7 @@ func WritePrometheus(w io.Writer, exposeProcessMetrics bool) { // metrics.WriteProcessMetrics(w) // }) // -// See also WrteFDMetrics. +// See also WriteFDMetrics. func WriteProcessMetrics(w io.Writer) { writeGoMetrics(w) writeProcessMetrics(w) @@ -190,6 +249,8 @@ func UnregisterMetric(name string) bool { } // UnregisterAllMetrics unregisters all the metrics from default set. +// +// It also unregisters writeMetrics callbacks passed to RegisterMetricsWriter. func UnregisterAllMetrics() { defaultSet.UnregisterAllMetrics() } @@ -203,3 +264,76 @@ func ListMetricNames() []string { func GetDefaultSet() *Set { return defaultSet } + +// ExposeMetadata allows enabling adding TYPE and HELP metadata to the exposed metrics globally. +// +// It is safe to call this method multiple times. It is allowed to change it in runtime. +// ExposeMetadata is set to false by default. +func ExposeMetadata(v bool) { + n := 0 + if v { + n = 1 + } + atomic.StoreUint32(&exposeMetadata, uint32(n)) +} + +func isMetadataEnabled() bool { + n := atomic.LoadUint32(&exposeMetadata) + return n != 0 +} + +var exposeMetadata uint32 + +func isCounterName(name string) bool { + return strings.HasSuffix(name, "_total") +} + +// WriteGaugeUint64 writes gauge metric with the given name and value to w in Prometheus text exposition format. +func WriteGaugeUint64(w io.Writer, name string, value uint64) { + writeMetricUint64(w, name, "gauge", value) +} + +// WriteGaugeFloat64 writes gauge metric with the given name and value to w in Prometheus text exposition format. +func WriteGaugeFloat64(w io.Writer, name string, value float64) { + writeMetricFloat64(w, name, "gauge", value) +} + +// WriteCounterUint64 writes counter metric with the given name and value to w in Prometheus text exposition format. +func WriteCounterUint64(w io.Writer, name string, value uint64) { + writeMetricUint64(w, name, "counter", value) +} + +// WriteCounterFloat64 writes counter metric with the given name and value to w in Prometheus text exposition format. +func WriteCounterFloat64(w io.Writer, name string, value float64) { + writeMetricFloat64(w, name, "counter", value) +} + +func writeMetricUint64(w io.Writer, metricName, metricType string, value uint64) { + WriteMetadataIfNeeded(w, metricName, metricType) + fmt.Fprintf(w, "%s %d\n", metricName, value) +} + +func writeMetricFloat64(w io.Writer, metricName, metricType string, value float64) { + WriteMetadataIfNeeded(w, metricName, metricType) + fmt.Fprintf(w, "%s %g\n", metricName, value) +} + +// WriteMetadataIfNeeded writes HELP and TYPE metadata for the given metricName and metricType if this is globally enabled via ExposeMetadata(). +// +// If the metadata exposition isn't enabled, then this function is no-op. +func WriteMetadataIfNeeded(w io.Writer, metricName, metricType string) { + if !isMetadataEnabled() { + return + } + metricFamily := getMetricFamily(metricName) + fmt.Fprintf(w, "# HELP %s\n", metricFamily) + fmt.Fprintf(w, "# TYPE %s %s\n", metricFamily, metricType) +} + +func getMetricFamily(metricName string) string { + n := strings.IndexByte(metricName, '{') + if n < 0 { + return metricName + } + return metricName[:n] +} diff --git a/vendor/github.com/VictoriaMetrics/metrics/process_metrics_linux.go b/vendor/github.com/VictoriaMetrics/metrics/process_metrics_linux.go index 48def1c..e4587b7 100644 --- a/vendor/github.com/VictoriaMetrics/metrics/process_metrics_linux.go +++ b/vendor/github.com/VictoriaMetrics/metrics/process_metrics_linux.go @@ -16,6 +16,11 @@ import ( // See https://github.com/prometheus/procfs/blob/a4ac0826abceb44c40fc71daed2b301db498b93e/proc_stat.go#L40 . const userHZ = 100 +// Different environments may have different page size. +// +// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6457 +var pageSizeBytes = uint64(os.Getpagesize()) + // See http://man7.org/linux/man-pages/man5/proc.5.html type procStat struct { State byte @@ -74,15 +79,15 @@ func writeProcessMetrics(w io.Writer) { utime := float64(p.Utime) / userHZ stime := float64(p.Stime) / userHZ - fmt.Fprintf(w, "process_cpu_seconds_system_total %g\n", stime) - fmt.Fprintf(w, "process_cpu_seconds_total %g\n", utime+stime) - fmt.Fprintf(w, "process_cpu_seconds_user_total %g\n", utime) - fmt.Fprintf(w, "process_major_pagefaults_total %d\n", p.Majflt) - fmt.Fprintf(w, "process_minor_pagefaults_total %d\n", p.Minflt) - fmt.Fprintf(w, "process_num_threads %d\n", p.NumThreads) - fmt.Fprintf(w, "process_resident_memory_bytes %d\n", p.Rss*4096) - fmt.Fprintf(w, "process_start_time_seconds %d\n", startTimeSeconds) - fmt.Fprintf(w, "process_virtual_memory_bytes %d\n", p.Vsize) + WriteCounterFloat64(w, "process_cpu_seconds_system_total", stime) + WriteCounterFloat64(w, "process_cpu_seconds_total", utime+stime) + WriteCounterFloat64(w, "process_cpu_seconds_user_total", utime) + WriteCounterUint64(w, "process_major_pagefaults_total", uint64(p.Majflt)) + WriteCounterUint64(w, "process_minor_pagefaults_total", uint64(p.Minflt)) + WriteGaugeUint64(w, "process_num_threads", uint64(p.NumThreads)) + WriteGaugeUint64(w, "process_resident_memory_bytes", uint64(p.Rss)*pageSizeBytes) + WriteGaugeUint64(w, "process_start_time_seconds", uint64(startTimeSeconds)) + WriteGaugeUint64(w, "process_virtual_memory_bytes", uint64(p.Vsize)) writeProcessMemMetrics(w) writeIOMetrics(w) } @@ -133,12 +138,12 @@ func writeIOMetrics(w io.Writer) { writeBytes = getInt(s) } } - fmt.Fprintf(w, "process_io_read_bytes_total %d\n", rchar) - fmt.Fprintf(w, "process_io_written_bytes_total %d\n", wchar) - fmt.Fprintf(w, "process_io_read_syscalls_total %d\n", syscr) - fmt.Fprintf(w, "process_io_write_syscalls_total %d\n", syscw) - fmt.Fprintf(w, "process_io_storage_read_bytes_total %d\n", readBytes) - fmt.Fprintf(w, "process_io_storage_written_bytes_total %d\n", writeBytes) + WriteGaugeUint64(w, "process_io_read_bytes_total", uint64(rchar)) + WriteGaugeUint64(w, "process_io_written_bytes_total", uint64(wchar)) + WriteGaugeUint64(w, "process_io_read_syscalls_total", uint64(syscr)) + WriteGaugeUint64(w, "process_io_write_syscalls_total", uint64(syscw)) + WriteGaugeUint64(w, "process_io_storage_read_bytes_total", uint64(readBytes)) + WriteGaugeUint64(w, "process_io_storage_written_bytes_total", uint64(writeBytes)) } var startTimeSeconds = time.Now().Unix() @@ -155,8 +160,8 @@ func writeFDMetrics(w io.Writer) { log.Printf("ERROR: metrics: cannot determine the limit on open file descritors: %s", err) return } - fmt.Fprintf(w, "process_max_fds %d\n", maxOpenFDs) - fmt.Fprintf(w, "process_open_fds %d\n", totalOpenFDs) + WriteGaugeUint64(w, "process_max_fds", maxOpenFDs) + WriteGaugeUint64(w, "process_open_fds", totalOpenFDs) } func getOpenFDsCount(path string) (uint64, error) { @@ -224,11 +229,11 @@ func writeProcessMemMetrics(w io.Writer) { log.Printf("ERROR: metrics: cannot determine memory status: %s", err) return } - fmt.Fprintf(w, "process_virtual_memory_peak_bytes %d\n", ms.vmPeak) - fmt.Fprintf(w, "process_resident_memory_peak_bytes %d\n", ms.rssPeak) - fmt.Fprintf(w, "process_resident_memory_anon_bytes %d\n", ms.rssAnon) - fmt.Fprintf(w, "process_resident_memory_file_bytes %d\n", ms.rssFile) - fmt.Fprintf(w, "process_resident_memory_shared_bytes %d\n", ms.rssShmem) + WriteGaugeUint64(w, "process_virtual_memory_peak_bytes", ms.vmPeak) + WriteGaugeUint64(w, "process_resident_memory_peak_bytes", ms.rssPeak) + WriteGaugeUint64(w, "process_resident_memory_anon_bytes", ms.rssAnon) + WriteGaugeUint64(w, "process_resident_memory_file_bytes", ms.rssFile) + WriteGaugeUint64(w, "process_resident_memory_shared_bytes", ms.rssShmem) } diff --git a/vendor/github.com/VictoriaMetrics/metrics/process_metrics_windows.go b/vendor/github.com/VictoriaMetrics/metrics/process_metrics_windows.go index e824ada..bda7c82 100644 --- a/vendor/github.com/VictoriaMetrics/metrics/process_metrics_windows.go +++ b/vendor/github.com/VictoriaMetrics/metrics/process_metrics_windows.go @@ -4,7 +4,6 @@ package metrics import ( - "fmt" "io" "log" "syscall" @@ -55,16 +54,16 @@ func writeProcessMetrics(w io.Writer) { log.Printf("ERROR: metrics: cannot read process memory information: %s", err) return } - stimeSeconds := (uint64(stime.HighDateTime)<<32 + uint64(stime.LowDateTime)) / 1e7 - utimeSeconds := (uint64(utime.HighDateTime)<<32 + uint64(utime.LowDateTime)) / 1e7 - fmt.Fprintf(w, "process_cpu_seconds_system_total %d\n", stimeSeconds) - fmt.Fprintf(w, "process_cpu_seconds_total %d\n", stimeSeconds+utimeSeconds) - fmt.Fprintf(w, "process_cpu_seconds_user_total %d\n", stimeSeconds) - fmt.Fprintf(w, "process_pagefaults_total %d\n", mc.PageFaultCount) - fmt.Fprintf(w, "process_start_time_seconds %d\n", startTime.Nanoseconds()/1e9) - fmt.Fprintf(w, "process_virtual_memory_bytes %d\n", mc.PrivateUsage) - fmt.Fprintf(w, "process_resident_memory_peak_bytes %d\n", mc.PeakWorkingSetSize) - fmt.Fprintf(w, "process_resident_memory_bytes %d\n", mc.WorkingSetSize) + stimeSeconds := float64(uint64(stime.HighDateTime)<<32+uint64(stime.LowDateTime)) / 1e7 + utimeSeconds := float64(uint64(utime.HighDateTime)<<32+uint64(utime.LowDateTime)) / 1e7 + WriteCounterFloat64(w, "process_cpu_seconds_system_total", stimeSeconds) + WriteCounterFloat64(w, "process_cpu_seconds_total", stimeSeconds+utimeSeconds) + WriteCounterFloat64(w, "process_cpu_seconds_user_total", stimeSeconds) + WriteCounterUint64(w, "process_pagefaults_total", uint64(mc.PageFaultCount)) + WriteGaugeUint64(w, "process_start_time_seconds", uint64(startTime.Nanoseconds())/1e9) + WriteGaugeUint64(w, "process_virtual_memory_bytes", uint64(mc.PrivateUsage)) + WriteGaugeUint64(w, "process_resident_memory_peak_bytes", uint64(mc.PeakWorkingSetSize)) + WriteGaugeUint64(w, "process_resident_memory_bytes", uint64(mc.WorkingSetSize)) } func writeFDMetrics(w io.Writer) { @@ -80,6 +79,6 @@ func writeFDMetrics(w io.Writer) { } // it seems to be hard-coded limit for 64-bit systems // https://learn.microsoft.com/en-us/archive/blogs/markrussinovich/pushing-the-limits-of-windows-handles#maximum-number-of-handles - fmt.Fprintf(w, "process_max_fds %d\n", 16777216) - fmt.Fprintf(w, "process_open_fds %d\n", count) + WriteGaugeUint64(w, "process_max_fds", 16777216) + WriteGaugeUint64(w, "process_open_fds", uint64(count)) } diff --git a/vendor/github.com/VictoriaMetrics/metrics/push.go b/vendor/github.com/VictoriaMetrics/metrics/push.go index 4215f48..f33886f 100644 --- a/vendor/github.com/VictoriaMetrics/metrics/push.go +++ b/vendor/github.com/VictoriaMetrics/metrics/push.go @@ -2,17 +2,69 @@ package metrics import ( "bytes" + "context" + "errors" "fmt" "io" "io/ioutil" "log" "net/http" "net/url" + "strings" + "sync" "time" "compress/gzip" ) +// PushOptions is the list of options, which may be applied to InitPushWithOptions(). +type PushOptions struct { + // ExtraLabels is an optional comma-separated list of `label="value"` labels, which must be added to all the metrics before pushing them to pushURL. + ExtraLabels string + + // Headers is an optional list of HTTP headers to add to every push request to pushURL. + // + // Every item in the list must have the form `Header: value`. For example, `Authorization: Custom my-top-secret`. + Headers []string + + // Whether to disable HTTP request body compression before sending the metrics to pushURL. + // + // By default the compression is enabled. + DisableCompression bool + + // Method is HTTP request method to use when pushing metrics to pushURL. + // + // By default the Method is GET. + Method string + + // Optional WaitGroup for waiting until all the push workers created with this WaitGroup are stopped. + WaitGroup *sync.WaitGroup +} + +// InitPushWithOptions sets up periodic push for globally registered metrics to the given pushURL with the given interval. +// +// The periodic push is stopped when ctx is canceled. +// It is possible to wait until the background metrics push worker is stopped on a WaitGroup passed via opts.WaitGroup. +// +// If pushProcessMetrics is set to true, then 'process_*' and `go_*` metrics are also pushed to pushURL. +// +// opts may contain additional configuration options if non-nil. +// +// The metrics are pushed to pushURL in Prometheus text exposition format. +// See https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md#text-based-format +// +// It is recommended pushing metrics to /api/v1/import/prometheus endpoint according to +// https://docs.victoriametrics.com/#how-to-import-data-in-prometheus-exposition-format +// +// It is OK calling InitPushWithOptions multiple times with different pushURL - +// in this case metrics are pushed to all the provided pushURL urls. +func InitPushWithOptions(ctx context.Context, pushURL string, interval time.Duration, pushProcessMetrics bool, opts *PushOptions) error { + writeMetrics := func(w io.Writer) { + WritePrometheus(w, pushProcessMetrics) + } + return InitPushExtWithOptions(ctx, pushURL, interval, writeMetrics, opts) +} + // InitPushProcessMetrics sets up periodic push for 'process_*' metrics to the given pushURL with the given interval. // // extraLabels may contain comma-separated list of `label="value"` labels, which will be added @@ -27,10 +79,7 @@ import ( // It is OK calling InitPushProcessMetrics multiple times with different pushURL - // in this case metrics are pushed to all the provided pushURL urls. func InitPushProcessMetrics(pushURL string, interval time.Duration, extraLabels string) error { - writeMetrics := func(w io.Writer) { - WriteProcessMetrics(w) - } - return InitPushExt(pushURL, interval, extraLabels, writeMetrics) + return InitPushExt(pushURL, interval, extraLabels, WriteProcessMetrics) } // InitPush sets up periodic push for globally registered metrics to the given pushURL with the given interval. @@ -38,7 +87,7 @@ func InitPushProcessMetrics(pushURL string, interval time.Duration, extraLabels // extraLabels may contain comma-separated list of `label="value"` labels, which will be added // to all the metrics before pushing them to pushURL. // -// If pushProcessMetrics is set to true, then 'process_*' metrics are also pushed to pushURL. +// If pushProcessMetrics is set to true, then 'process_*' and `go_*` metrics are also pushed to pushURL. // // The metrics are pushed to pushURL in Prometheus text exposition format. // See https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md#text-based-format @@ -55,12 +104,46 @@ func InitPush(pushURL string, interval time.Duration, extraLabels string, pushPr return InitPushExt(pushURL, interval, extraLabels, writeMetrics) } +// PushMetrics pushes globally registered metrics to pushURL. +// +// If pushProcessMetrics is set to true, then 'process_*' and `go_*` metrics are also pushed to pushURL. +// +// opts may contain additional configuration options if non-nil. +// +// It is recommended pushing metrics to /api/v1/import/prometheus endpoint according to +// https://docs.victoriametrics.com/#how-to-import-data-in-prometheus-exposition-format +func PushMetrics(ctx context.Context, pushURL string, pushProcessMetrics bool, opts *PushOptions) error { + writeMetrics := func(w io.Writer) { + WritePrometheus(w, pushProcessMetrics) + } + return PushMetricsExt(ctx, pushURL, writeMetrics, opts) +} + +// InitPushWithOptions sets up periodic push for metrics from s to the given pushURL with the given interval. +// +// The periodic push is stopped when the ctx is canceled. +// It is possible to wait until the background metrics push worker is stopped on a WaitGroup passed via opts.WaitGroup. +// +// opts may contain additional configuration options if non-nil. +// +// The metrics are pushed to pushURL in Prometheus text exposition format. +// See https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md#text-based-format +// +// It is recommended pushing metrics to /api/v1/import/prometheus endpoint according to +// https://docs.victoriametrics.com/#how-to-import-data-in-prometheus-exposition-format +// +// It is OK calling InitPushWithOptions multiple times with different pushURL - +// in this case metrics are pushed to all the provided pushURL urls. +func (s *Set) InitPushWithOptions(ctx context.Context, pushURL string, interval time.Duration, opts *PushOptions) error { + return InitPushExtWithOptions(ctx, pushURL, interval, s.WritePrometheus, opts) +} + // InitPush sets up periodic push for metrics from s to the given pushURL with the given interval. // // extraLabels may contain comma-separated list of `label="value"` labels, which will be added // to all the metrics before pushing them to pushURL. // -// / The metrics are pushed to pushURL in Prometheus text exposition format. +// The metrics are pushed to pushURL in Prometheus text exposition format. // See https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md#text-based-format // // It is recommended pushing metrics to /api/v1/import/prometheus endpoint according to @@ -69,10 +152,17 @@ func InitPush(pushURL string, interval time.Duration, extraLabels string, pushPr // It is OK calling InitPush multiple times with different pushURL - // in this case metrics are pushed to all the provided pushURL urls. func (s *Set) InitPush(pushURL string, interval time.Duration, extraLabels string) error { - writeMetrics := func(w io.Writer) { - s.WritePrometheus(w) - } - return InitPushExt(pushURL, interval, extraLabels, writeMetrics) + return InitPushExt(pushURL, interval, extraLabels, s.WritePrometheus) +} + +// PushMetrics pushes s metrics to pushURL. +// +// opts may contain additional configuration options if non-nil. +// +// It is recommended pushing metrics to /api/v1/import/prometheus endpoint according to +// https://docs.victoriametrics.com/#how-to-import-data-in-prometheus-exposition-format +func (s *Set) PushMetrics(ctx context.Context, pushURL string, opts *PushOptions) error { + return PushMetricsExt(ctx, pushURL, s.WritePrometheus, opts) } // InitPushExt sets up periodic push for metrics obtained by calling writeMetrics with the given interval. @@ -90,94 +180,246 @@ func (s *Set) InitPush(pushURL string, interval time.Duration, extraLabels strin // in this case metrics are pushed to all the provided pushURL urls. // // It is OK calling InitPushExt multiple times with different writeMetrics - -// in this case all the metrics generated by writeMetrics callbacks are writte to pushURL. +// in this case all the metrics generated by writeMetrics callbacks are written to pushURL. func InitPushExt(pushURL string, interval time.Duration, extraLabels string, writeMetrics func(w io.Writer)) error { + opts := &PushOptions{ + ExtraLabels: extraLabels, + } + return InitPushExtWithOptions(context.Background(), pushURL, interval, writeMetrics, opts) +} + +// InitPushExtWithOptions sets up periodic push for metrics obtained by calling writeMetrics with the given interval. +// +// The writeMetrics callback must write metrics to w in Prometheus text exposition format without timestamps and trailing comments. +// See https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md#text-based-format +// +// The periodic push is stopped when the ctx is canceled. +// It is possible to wait until the background metrics push worker is stopped on a WaitGroup passed via opts.WaitGroup. +// +// opts may contain additional configuration options if non-nil. +// +// It is recommended pushing metrics to /api/v1/import/prometheus endpoint according to +// https://docs.victoriametrics.com/#how-to-import-data-in-prometheus-exposition-format +// +// It is OK calling InitPushExtWithOptions multiple times with different pushURL - +// in this case metrics are pushed to all the provided pushURL urls. +// +// It is OK calling InitPushExtWithOptions multiple times with different writeMetrics - +// in this case all the metrics generated by writeMetrics callbacks are written to pushURL. +func InitPushExtWithOptions(ctx context.Context, pushURL string, interval time.Duration, writeMetrics func(w io.Writer), opts *PushOptions) error { + pc, err := newPushContext(pushURL, opts) + if err != nil { + return err + } + + // validate interval if interval <= 0 { return fmt.Errorf("interval must be positive; got %s", interval) } - if err := validateTags(extraLabels); err != nil { - return fmt.Errorf("invalid extraLabels=%q: %w", extraLabels, err) + pushMetricsSet.GetOrCreateFloatCounter(fmt.Sprintf(`metrics_push_interval_seconds{url=%q}`, pc.pushURLRedacted)).Set(interval.Seconds()) + + var wg *sync.WaitGroup + if opts != nil { + wg = opts.WaitGroup + if wg != nil { + wg.Add(1) + } + } + go func() { + ticker := time.NewTicker(interval) + defer ticker.Stop() + stopCh := ctx.Done() + for { + select { + case <-ticker.C: + ctxLocal, cancel := context.WithTimeout(ctx, interval+time.Second) + err := pc.pushMetrics(ctxLocal, writeMetrics) + cancel() + if err != nil { + log.Printf("ERROR: metrics.push: %s", err) + } + case <-stopCh: + if wg != nil { + wg.Done() + } + return + } + } + }() + + return nil +} + +// PushMetricsExt pushes metrics generated by wirteMetrics to pushURL. +// +// The writeMetrics callback must write metrics to w in Prometheus text exposition format without timestamps and trailing comments. +// See https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md#text-based-format +// +// opts may contain additional configuration options if non-nil. +// +// It is recommended pushing metrics to /api/v1/import/prometheus endpoint according to +// https://docs.victoriametrics.com/#how-to-import-data-in-prometheus-exposition-format +func PushMetricsExt(ctx context.Context, pushURL string, writeMetrics func(w io.Writer), opts *PushOptions) error { + pc, err := newPushContext(pushURL, opts) + if err != nil { + return err } + return pc.pushMetrics(ctx, writeMetrics) +} + +type pushContext struct { + pushURL *url.URL + method string + pushURLRedacted string + extraLabels string + headers http.Header + disableCompression bool + + client *http.Client + + pushesTotal *Counter + bytesPushedTotal *Counter + pushBlockSize *Histogram + pushDuration *Histogram + pushErrors *Counter +} + +func newPushContext(pushURL string, opts *PushOptions) (*pushContext, error) { + if opts == nil { + opts = &PushOptions{} + } + + // validate pushURL pu, err := url.Parse(pushURL) if err != nil { - return fmt.Errorf("cannot parse pushURL=%q: %w", pushURL, err) + return nil, fmt.Errorf("cannot parse pushURL=%q: %w", pushURL, err) } if pu.Scheme != "http" && pu.Scheme != "https" { - return fmt.Errorf("unsupported scheme in pushURL=%q; expecting 'http' or 'https'", pushURL) + return nil, fmt.Errorf("unsupported scheme in pushURL=%q; expecting 'http' or 'https'", pushURL) } if pu.Host == "" { - return fmt.Errorf("missing host in pushURL=%q", pushURL) + return nil, fmt.Errorf("missing host in pushURL=%q", pushURL) + } + + method := opts.Method + if method == "" { + method = http.MethodGet + } + + // validate ExtraLabels + extraLabels := opts.ExtraLabels + if err := validateTags(extraLabels); err != nil { + return nil, fmt.Errorf("invalid extraLabels=%q: %w", extraLabels, err) + } + + // validate Headers + headers := make(http.Header) + for _, h := range opts.Headers { + n := strings.IndexByte(h, ':') + if n < 0 { + return nil, fmt.Errorf("missing `:` delimiter in the header %q", h) + } + name := strings.TrimSpace(h[:n]) + value := strings.TrimSpace(h[n+1:]) + headers.Add(name, value) } + pushURLRedacted := pu.Redacted() - c := &http.Client{ - Timeout: interval, - } - pushesTotal := pushMetrics.GetOrCreateCounter(fmt.Sprintf(`metrics_push_total{url=%q}`, pushURLRedacted)) - pushErrorsTotal := pushMetrics.GetOrCreateCounter(fmt.Sprintf(`metrics_push_errors_total{url=%q}`, pushURLRedacted)) - bytesPushedTotal := pushMetrics.GetOrCreateCounter(fmt.Sprintf(`metrics_push_bytes_pushed_total{url=%q}`, pushURLRedacted)) - pushDuration := pushMetrics.GetOrCreateHistogram(fmt.Sprintf(`metrics_push_duration_seconds{url=%q}`, pushURLRedacted)) - pushBlockSize := pushMetrics.GetOrCreateHistogram(fmt.Sprintf(`metrics_push_block_size_bytes{url=%q}`, pushURLRedacted)) - pushMetrics.GetOrCreateFloatCounter(fmt.Sprintf(`metrics_push_interval_seconds{url=%q}`, pushURLRedacted)).Set(interval.Seconds()) - go func() { - ticker := time.NewTicker(interval) - var bb bytes.Buffer - var tmpBuf []byte - zw := gzip.NewWriter(&bb) - for range ticker.C { - bb.Reset() - writeMetrics(&bb) - if len(extraLabels) > 0 { - tmpBuf = addExtraLabels(tmpBuf[:0], bb.Bytes(), extraLabels) - bb.Reset() - if _, err := bb.Write(tmpBuf); err != nil { - panic(fmt.Errorf("BUG: cannot write %d bytes to bytes.Buffer: %s", len(tmpBuf), err)) - } - } - tmpBuf = append(tmpBuf[:0], bb.Bytes()...) - bb.Reset() - zw.Reset(&bb) - if _, err := zw.Write(tmpBuf); err != nil { - panic(fmt.Errorf("BUG: cannot write %d bytes to gzip writer: %s", len(tmpBuf), err)) - } - if err := zw.Close(); err != nil { - panic(fmt.Errorf("BUG: cannot flush metrics to gzip writer: %s", err)) - } - pushesTotal.Inc() - blockLen := bb.Len() - bytesPushedTotal.Add(blockLen) - pushBlockSize.Update(float64(blockLen)) - req, err := http.NewRequest("GET", pushURL, &bb) - if err != nil { - panic(fmt.Errorf("BUG: metrics.push: cannot initialize request for metrics push to %q: %w", pushURLRedacted, err)) - } - req.Header.Set("Content-Type", "text/plain") - req.Header.Set("Content-Encoding", "gzip") - startTime := time.Now() - resp, err := c.Do(req) - pushDuration.UpdateDuration(startTime) - if err != nil { - log.Printf("ERROR: metrics.push: cannot push metrics to %q: %s", pushURLRedacted, err) - pushErrorsTotal.Inc() - continue - } - if resp.StatusCode/100 != 2 { - body, _ := ioutil.ReadAll(resp.Body) - _ = resp.Body.Close() - log.Printf("ERROR: metrics.push: unexpected status code in response from %q: %d; expecting 2xx; response body: %q", - pushURLRedacted, resp.StatusCode, body) - pushErrorsTotal.Inc() - continue - } - _ = resp.Body.Close() + client := &http.Client{} + return &pushContext{ + pushURL: pu, + method: method, + pushURLRedacted: pushURLRedacted, + extraLabels: extraLabels, + headers: headers, + disableCompression: opts.DisableCompression, + + client: client, + + pushesTotal: pushMetricsSet.GetOrCreateCounter(fmt.Sprintf(`metrics_push_total{url=%q}`, pushURLRedacted)), + bytesPushedTotal: pushMetricsSet.GetOrCreateCounter(fmt.Sprintf(`metrics_push_bytes_pushed_total{url=%q}`, pushURLRedacted)), + pushBlockSize: pushMetricsSet.GetOrCreateHistogram(fmt.Sprintf(`metrics_push_block_size_bytes{url=%q}`, pushURLRedacted)), + pushDuration: pushMetricsSet.GetOrCreateHistogram(fmt.Sprintf(`metrics_push_duration_seconds{url=%q}`, pushURLRedacted)), + pushErrors: pushMetricsSet.GetOrCreateCounter(fmt.Sprintf(`metrics_push_errors_total{url=%q}`, pushURLRedacted)), + }, nil +} + +func (pc *pushContext) pushMetrics(ctx context.Context, writeMetrics func(w io.Writer)) error { + bb := getBytesBuffer() + defer putBytesBuffer(bb) + + writeMetrics(bb) + + if len(pc.extraLabels) > 0 { + bbTmp := getBytesBuffer() + bbTmp.B = append(bbTmp.B[:0], bb.B...) + bb.B = addExtraLabels(bb.B[:0], bbTmp.B, pc.extraLabels) + putBytesBuffer(bbTmp) + } + if !pc.disableCompression { + bbTmp := getBytesBuffer() + bbTmp.B = append(bbTmp.B[:0], bb.B...) + bb.B = bb.B[:0] + zw := getGzipWriter(bb) + if _, err := zw.Write(bbTmp.B); err != nil { + panic(fmt.Errorf("BUG: cannot write %d bytes to gzip writer: %s", len(bbTmp.B), err)) } - }() + if err := zw.Close(); err != nil { + panic(fmt.Errorf("BUG: cannot flush metrics to gzip writer: %s", err)) + } + putGzipWriter(zw) + putBytesBuffer(bbTmp) + } + + // Update metrics + pc.pushesTotal.Inc() + blockLen := len(bb.B) + pc.bytesPushedTotal.Add(blockLen) + pc.pushBlockSize.Update(float64(blockLen)) + + // Prepare the request to sent to pc.pushURL + reqBody := bytes.NewReader(bb.B) + req, err := http.NewRequestWithContext(ctx, pc.method, pc.pushURL.String(), reqBody) + if err != nil { + panic(fmt.Errorf("BUG: metrics.push: cannot initialize request for metrics push to %q: %w", pc.pushURLRedacted, err)) + } + + req.Header.Set("Content-Type", "text/plain") + // Set the needed headers, and `Content-Type` allowed be overwrited. + for name, values := range pc.headers { + for _, value := range values { + req.Header.Add(name, value) + } + } + if !pc.disableCompression { + req.Header.Set("Content-Encoding", "gzip") + } + + // Perform the request + startTime := time.Now() + resp, err := pc.client.Do(req) + pc.pushDuration.UpdateDuration(startTime) + if err != nil { + if errors.Is(err, context.Canceled) { + return nil + } + pc.pushErrors.Inc() + return fmt.Errorf("cannot push metrics to %q: %s", pc.pushURLRedacted, err) + } + if resp.StatusCode/100 != 2 { + body, _ := ioutil.ReadAll(resp.Body) + _ = resp.Body.Close() + pc.pushErrors.Inc() + return fmt.Errorf("unexpected status code in response from %q: %d; expecting 2xx; response body: %q", pc.pushURLRedacted, resp.StatusCode, body) + } + _ = resp.Body.Close() return nil } -var pushMetrics = NewSet() +var pushMetricsSet = NewSet() func writePushMetrics(w io.Writer) { - pushMetrics.WritePrometheus(w) + pushMetricsSet.WritePrometheus(w) } func addExtraLabels(dst, src []byte, extraLabels string) []byte { @@ -225,3 +467,44 @@ func addExtraLabels(dst, src []byte, extraLabels string) []byte { } var bashBytes = []byte("#") + +func getBytesBuffer() *bytesBuffer { + v := bytesBufferPool.Get() + if v == nil { + return &bytesBuffer{} + } + return v.(*bytesBuffer) +} + +func putBytesBuffer(bb *bytesBuffer) { + bb.B = bb.B[:0] + bytesBufferPool.Put(bb) +} + +var bytesBufferPool sync.Pool + +type bytesBuffer struct { + B []byte +} + +func (bb *bytesBuffer) Write(p []byte) (int, error) { + bb.B = append(bb.B, p...) + return len(p), nil +} + +func getGzipWriter(w io.Writer) *gzip.Writer { + v := gzipWriterPool.Get() + if v == nil { + return gzip.NewWriter(w) + } + zw := v.(*gzip.Writer) + zw.Reset(w) + return zw +} + +func putGzipWriter(zw *gzip.Writer) { + zw.Reset(io.Discard) + gzipWriterPool.Put(zw) +} + +var gzipWriterPool sync.Pool diff --git a/vendor/github.com/VictoriaMetrics/metrics/set.go b/vendor/github.com/VictoriaMetrics/metrics/set.go index 79355ea..868a01c 100644 --- a/vendor/github.com/VictoriaMetrics/metrics/set.go +++ b/vendor/github.com/VictoriaMetrics/metrics/set.go @@ -19,6 +19,8 @@ type Set struct { a []*namedMetric m map[string]*namedMetric summaries []*Summary + + metricsWriters []func(w io.Writer) } // NewSet creates new set of metrics. @@ -45,14 +47,27 @@ func (s *Set) WritePrometheus(w io.Writer) { sort.Slice(s.a, lessFunc) } sa := append([]*namedMetric(nil), s.a...) + metricsWriters := s.metricsWriters s.mu.Unlock() - // Call marshalTo without the global lock, since certain metric types such as Gauge - // can call a callback, which, in turn, can try calling s.mu.Lock again. + prevMetricFamily := "" for _, nm := range sa { + metricFamily := getMetricFamily(nm.name) + if metricFamily != prevMetricFamily { + // write meta info only once per metric family + metricType := nm.metric.metricType() + WriteMetadataIfNeeded(&bb, nm.name, metricType) + prevMetricFamily = metricFamily + } + // Call marshalTo without the global lock, since certain metric types such as Gauge + // can call a callback, which, in turn, can try calling s.mu.Lock again. nm.metric.marshalTo(nm.name, &bb) } w.Write(bb.Bytes()) + + for _, writeMetrics := range metricsWriters { + writeMetrics(w) + } } // NewHistogram creates and returns new histogram in s with the given name. @@ -243,9 +258,6 @@ func (s *Set) GetOrCreateFloatCounter(name string) *FloatCounter { // // The returned gauge is safe to use from concurrent goroutines. func (s *Set) NewGauge(name string, f func() float64) *Gauge { - if f == nil { - panic(fmt.Errorf("BUG: f cannot be nil")) - } g := &Gauge{ f: f, } @@ -272,9 +284,6 @@ func (s *Set) GetOrCreateGauge(name string, f func() float64) *Gauge { s.mu.Unlock() if nm == nil { // Slow path - create and register missing gauge. - if f == nil { - panic(fmt.Errorf("BUG: f cannot be nil")) - } if err := validateMetric(name); err != nil { panic(fmt.Errorf("BUG: invalid metric name %q: %s", name, err)) } @@ -521,14 +530,22 @@ func (s *Set) unregisterMetricLocked(nm *namedMetric) bool { } // UnregisterAllMetrics de-registers all metrics registered in s. +// +// It also de-registers writeMetrics callbacks passed to RegisterMetricsWriter. func (s *Set) UnregisterAllMetrics() { metricNames := s.ListMetricNames() for _, name := range metricNames { s.UnregisterMetric(name) } + + s.mu.Lock() + s.metricsWriters = nil + s.mu.Unlock() } // ListMetricNames returns sorted list of all the metrics in s. +// +// The returned list doesn't include metrics generated by metricsWriter passed to RegisterMetricsWriter. func (s *Set) ListMetricNames() []string { s.mu.Lock() defer s.mu.Unlock() @@ -542,3 +559,17 @@ func (s *Set) ListMetricNames() []string { sort.Strings(metricNames) return metricNames } + +// RegisterMetricsWriter registers writeMetrics callback for including metrics in the output generated by s.WritePrometheus. +// +// The writeMetrics callback must write metrics to w in Prometheus text exposition format without timestamps and trailing comments. +// The last line generated by writeMetrics must end with \n. +// See https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md#text-based-format +// +// It is OK to reguster multiple writeMetrics callbacks - all of them will be called sequentially for gererating the output at s.WritePrometheus. +func (s *Set) RegisterMetricsWriter(writeMetrics func(w io.Writer)) { + s.mu.Lock() + defer s.mu.Unlock() + + s.metricsWriters = append(s.metricsWriters, writeMetrics) +} diff --git a/vendor/github.com/VictoriaMetrics/metrics/summary.go b/vendor/github.com/VictoriaMetrics/metrics/summary.go index 52183d2..057b67b 100644 --- a/vendor/github.com/VictoriaMetrics/metrics/summary.go +++ b/vendor/github.com/VictoriaMetrics/metrics/summary.go @@ -119,6 +119,10 @@ func (sm *Summary) marshalTo(prefix string, w io.Writer) { } } +func (sm *Summary) metricType() string { + return "summary" +} + func splitMetricName(name string) (string, string) { n := strings.IndexByte(name, '{') if n < 0 { @@ -196,6 +200,10 @@ func (qv *quantileValue) marshalTo(prefix string, w io.Writer) { } } +func (qv *quantileValue) metricType() string { + return "unsupported" +} + func addTag(name, tag string) string { if len(name) == 0 || name[len(name)-1] != '}' { return fmt.Sprintf("%s{%s}", name, tag) diff --git a/vendor/github.com/cespare/xxhash/v2/README.md b/vendor/github.com/cespare/xxhash/v2/README.md index 792b4a6..33c8830 100644 --- a/vendor/github.com/cespare/xxhash/v2/README.md +++ b/vendor/github.com/cespare/xxhash/v2/README.md @@ -3,8 +3,7 @@ [![Go Reference](https://pkg.go.dev/badge/github.com/cespare/xxhash/v2.svg)](https://pkg.go.dev/github.com/cespare/xxhash/v2) [![Test](https://github.com/cespare/xxhash/actions/workflows/test.yml/badge.svg)](https://github.com/cespare/xxhash/actions/workflows/test.yml) -xxhash is a Go implementation of the 64-bit -[xxHash](http://cyan4973.github.io/xxHash/) algorithm, XXH64. This is a +xxhash is a Go implementation of the 64-bit [xxHash] algorithm, XXH64. This is a high-quality hashing algorithm that is much faster than anything in the Go standard library. @@ -25,8 +24,11 @@ func (*Digest) WriteString(string) (int, error) func (*Digest) Sum64() uint64 ``` -This implementation provides a fast pure-Go implementation and an even faster -assembly implementation for amd64. +The package is written with optimized pure Go and also contains even faster +assembly implementations for amd64 and arm64. If desired, the `purego` build tag +opts into using the Go code even on those architectures. + +[xxHash]: http://cyan4973.github.io/xxHash/ ## Compatibility @@ -45,19 +47,20 @@ I recommend using the latest release of Go. Here are some quick benchmarks comparing the pure-Go and assembly implementations of Sum64. -| input size | purego | asm | -| --- | --- | --- | -| 5 B | 979.66 MB/s | 1291.17 MB/s | -| 100 B | 7475.26 MB/s | 7973.40 MB/s | -| 4 KB | 17573.46 MB/s | 17602.65 MB/s | -| 10 MB | 17131.46 MB/s | 17142.16 MB/s | +| input size | purego | asm | +| ---------- | --------- | --------- | +| 4 B | 1.3 GB/s | 1.2 GB/s | +| 16 B | 2.9 GB/s | 3.5 GB/s | +| 100 B | 6.9 GB/s | 8.1 GB/s | +| 4 KB | 11.7 GB/s | 16.7 GB/s | +| 10 MB | 12.0 GB/s | 17.3 GB/s | -These numbers were generated on Ubuntu 18.04 with an Intel i7-8700K CPU using -the following commands under Go 1.11.2: +These numbers were generated on Ubuntu 20.04 with an Intel Xeon Platinum 8252C +CPU using the following commands under Go 1.19.2: ``` -$ go test -tags purego -benchtime 10s -bench '/xxhash,direct,bytes' -$ go test -benchtime 10s -bench '/xxhash,direct,bytes' +benchstat <(go test -tags purego -benchtime 500ms -count 15 -bench 'Sum64$') +benchstat <(go test -benchtime 500ms -count 15 -bench 'Sum64$') ``` ## Projects using this package @@ -67,3 +70,5 @@ $ go test -benchtime 10s -bench '/xxhash,direct,bytes' - [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics) - [FreeCache](https://github.com/coocood/freecache) - [FastCache](https://github.com/VictoriaMetrics/fastcache) +- [Ristretto](https://github.com/dgraph-io/ristretto) +- [Badger](https://github.com/dgraph-io/badger) diff --git a/vendor/github.com/cespare/xxhash/v2/testall.sh b/vendor/github.com/cespare/xxhash/v2/testall.sh new file mode 100644 index 0000000..94b9c44 --- /dev/null +++ b/vendor/github.com/cespare/xxhash/v2/testall.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -eu -o pipefail + +# Small convenience script for running the tests with various combinations of +# arch/tags. This assumes we're running on amd64 and have qemu available. + +go test ./... +go test -tags purego ./... +GOARCH=arm64 go test +GOARCH=arm64 go test -tags purego diff --git a/vendor/github.com/cespare/xxhash/v2/xxhash.go b/vendor/github.com/cespare/xxhash/v2/xxhash.go index 15c835d..78bddf1 100644 --- a/vendor/github.com/cespare/xxhash/v2/xxhash.go +++ b/vendor/github.com/cespare/xxhash/v2/xxhash.go @@ -16,21 +16,16 @@ const ( prime5 uint64 = 2870177450012600261 ) -// NOTE(caleb): I'm using both consts and vars of the primes. Using consts where -// possible in the Go code is worth a small (but measurable) performance boost -// by avoiding some MOVQs. Vars are needed for the asm and also are useful for -// convenience in the Go code in a few places where we need to intentionally -// avoid constant arithmetic (e.g., v1 := prime1 + prime2 fails because the -// result overflows a uint64). -var ( - prime1v = prime1 - prime2v = prime2 - prime3v = prime3 - prime4v = prime4 - prime5v = prime5 -) +// Store the primes in an array as well. +// +// The consts are used when possible in Go code to avoid MOVs but we need a +// contiguous array for the assembly code. +var primes = [...]uint64{prime1, prime2, prime3, prime4, prime5} // Digest implements hash.Hash64. +// +// Note that a zero-valued Digest is not ready to receive writes. +// Call Reset or create a Digest using New before calling other methods. type Digest struct { v1 uint64 v2 uint64 @@ -41,19 +36,31 @@ type Digest struct { n int // how much of mem is used } -// New creates a new Digest that computes the 64-bit xxHash algorithm. +// New creates a new Digest with a zero seed. func New() *Digest { + return NewWithSeed(0) +} + +// NewWithSeed creates a new Digest with the given seed. +func NewWithSeed(seed uint64) *Digest { var d Digest - d.Reset() + d.ResetWithSeed(seed) return &d } // Reset clears the Digest's state so that it can be reused. +// It uses a seed value of zero. func (d *Digest) Reset() { - d.v1 = prime1v + prime2 - d.v2 = prime2 - d.v3 = 0 - d.v4 = -prime1v + d.ResetWithSeed(0) +} + +// ResetWithSeed clears the Digest's state so that it can be reused. +// It uses the given seed to initialize the state. +func (d *Digest) ResetWithSeed(seed uint64) { + d.v1 = seed + prime1 + prime2 + d.v2 = seed + prime2 + d.v3 = seed + d.v4 = seed - prime1 d.total = 0 d.n = 0 } @@ -69,21 +76,23 @@ func (d *Digest) Write(b []byte) (n int, err error) { n = len(b) d.total += uint64(n) + memleft := d.mem[d.n&(len(d.mem)-1):] + if d.n+n < 32 { // This new data doesn't even fill the current block. - copy(d.mem[d.n:], b) + copy(memleft, b) d.n += n return } if d.n > 0 { // Finish off the partial block. - copy(d.mem[d.n:], b) + c := copy(memleft, b) d.v1 = round(d.v1, u64(d.mem[0:8])) d.v2 = round(d.v2, u64(d.mem[8:16])) d.v3 = round(d.v3, u64(d.mem[16:24])) d.v4 = round(d.v4, u64(d.mem[24:32])) - b = b[32-d.n:] + b = b[c:] d.n = 0 } @@ -133,21 +142,20 @@ func (d *Digest) Sum64() uint64 { h += d.total - i, end := 0, d.n - for ; i+8 <= end; i += 8 { - k1 := round(0, u64(d.mem[i:i+8])) + b := d.mem[:d.n&(len(d.mem)-1)] + for ; len(b) >= 8; b = b[8:] { + k1 := round(0, u64(b[:8])) h ^= k1 h = rol27(h)*prime1 + prime4 } - if i+4 <= end { - h ^= uint64(u32(d.mem[i:i+4])) * prime1 + if len(b) >= 4 { + h ^= uint64(u32(b[:4])) * prime1 h = rol23(h)*prime2 + prime3 - i += 4 + b = b[4:] } - for i < end { - h ^= uint64(d.mem[i]) * prime5 + for ; len(b) > 0; b = b[1:] { + h ^= uint64(b[0]) * prime5 h = rol11(h) * prime1 - i++ } h ^= h >> 33 diff --git a/vendor/github.com/cespare/xxhash/v2/xxhash_amd64.s b/vendor/github.com/cespare/xxhash/v2/xxhash_amd64.s index be8db5b..3e8b132 100644 --- a/vendor/github.com/cespare/xxhash/v2/xxhash_amd64.s +++ b/vendor/github.com/cespare/xxhash/v2/xxhash_amd64.s @@ -1,215 +1,209 @@ +//go:build !appengine && gc && !purego // +build !appengine // +build gc // +build !purego #include "textflag.h" -// Register allocation: -// AX h -// SI pointer to advance through b -// DX n -// BX loop end -// R8 v1, k1 -// R9 v2 -// R10 v3 -// R11 v4 -// R12 tmp -// R13 prime1v -// R14 prime2v -// DI prime4v - -// round reads from and advances the buffer pointer in SI. -// It assumes that R13 has prime1v and R14 has prime2v. -#define round(r) \ - MOVQ (SI), R12 \ - ADDQ $8, SI \ - IMULQ R14, R12 \ - ADDQ R12, r \ - ROLQ $31, r \ - IMULQ R13, r - -// mergeRound applies a merge round on the two registers acc and val. -// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v. -#define mergeRound(acc, val) \ - IMULQ R14, val \ - ROLQ $31, val \ - IMULQ R13, val \ - XORQ val, acc \ - IMULQ R13, acc \ - ADDQ DI, acc +// Registers: +#define h AX +#define d AX +#define p SI // pointer to advance through b +#define n DX +#define end BX // loop end +#define v1 R8 +#define v2 R9 +#define v3 R10 +#define v4 R11 +#define x R12 +#define prime1 R13 +#define prime2 R14 +#define prime4 DI + +#define round(acc, x) \ + IMULQ prime2, x \ + ADDQ x, acc \ + ROLQ $31, acc \ + IMULQ prime1, acc + +// round0 performs the operation x = round(0, x). +#define round0(x) \ + IMULQ prime2, x \ + ROLQ $31, x \ + IMULQ prime1, x + +// mergeRound applies a merge round on the two registers acc and x. +// It assumes that prime1, prime2, and prime4 have been loaded. +#define mergeRound(acc, x) \ + round0(x) \ + XORQ x, acc \ + IMULQ prime1, acc \ + ADDQ prime4, acc + +// blockLoop processes as many 32-byte blocks as possible, +// updating v1, v2, v3, and v4. It assumes that there is at least one block +// to process. +#define blockLoop() \ +loop: \ + MOVQ +0(p), x \ + round(v1, x) \ + MOVQ +8(p), x \ + round(v2, x) \ + MOVQ +16(p), x \ + round(v3, x) \ + MOVQ +24(p), x \ + round(v4, x) \ + ADDQ $32, p \ + CMPQ p, end \ + JLE loop // func Sum64(b []byte) uint64 -TEXT ·Sum64(SB), NOSPLIT, $0-32 +TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32 // Load fixed primes. - MOVQ ·prime1v(SB), R13 - MOVQ ·prime2v(SB), R14 - MOVQ ·prime4v(SB), DI + MOVQ ·primes+0(SB), prime1 + MOVQ ·primes+8(SB), prime2 + MOVQ ·primes+24(SB), prime4 // Load slice. - MOVQ b_base+0(FP), SI - MOVQ b_len+8(FP), DX - LEAQ (SI)(DX*1), BX + MOVQ b_base+0(FP), p + MOVQ b_len+8(FP), n + LEAQ (p)(n*1), end // The first loop limit will be len(b)-32. - SUBQ $32, BX + SUBQ $32, end // Check whether we have at least one block. - CMPQ DX, $32 + CMPQ n, $32 JLT noBlocks // Set up initial state (v1, v2, v3, v4). - MOVQ R13, R8 - ADDQ R14, R8 - MOVQ R14, R9 - XORQ R10, R10 - XORQ R11, R11 - SUBQ R13, R11 - - // Loop until SI > BX. -blockLoop: - round(R8) - round(R9) - round(R10) - round(R11) - - CMPQ SI, BX - JLE blockLoop - - MOVQ R8, AX - ROLQ $1, AX - MOVQ R9, R12 - ROLQ $7, R12 - ADDQ R12, AX - MOVQ R10, R12 - ROLQ $12, R12 - ADDQ R12, AX - MOVQ R11, R12 - ROLQ $18, R12 - ADDQ R12, AX - - mergeRound(AX, R8) - mergeRound(AX, R9) - mergeRound(AX, R10) - mergeRound(AX, R11) + MOVQ prime1, v1 + ADDQ prime2, v1 + MOVQ prime2, v2 + XORQ v3, v3 + XORQ v4, v4 + SUBQ prime1, v4 + + blockLoop() + + MOVQ v1, h + ROLQ $1, h + MOVQ v2, x + ROLQ $7, x + ADDQ x, h + MOVQ v3, x + ROLQ $12, x + ADDQ x, h + MOVQ v4, x + ROLQ $18, x + ADDQ x, h + + mergeRound(h, v1) + mergeRound(h, v2) + mergeRound(h, v3) + mergeRound(h, v4) JMP afterBlocks noBlocks: - MOVQ ·prime5v(SB), AX + MOVQ ·primes+32(SB), h afterBlocks: - ADDQ DX, AX - - // Right now BX has len(b)-32, and we want to loop until SI > len(b)-8. - ADDQ $24, BX - - CMPQ SI, BX - JG fourByte - -wordLoop: - // Calculate k1. - MOVQ (SI), R8 - ADDQ $8, SI - IMULQ R14, R8 - ROLQ $31, R8 - IMULQ R13, R8 - - XORQ R8, AX - ROLQ $27, AX - IMULQ R13, AX - ADDQ DI, AX - - CMPQ SI, BX - JLE wordLoop - -fourByte: - ADDQ $4, BX - CMPQ SI, BX - JG singles - - MOVL (SI), R8 - ADDQ $4, SI - IMULQ R13, R8 - XORQ R8, AX - - ROLQ $23, AX - IMULQ R14, AX - ADDQ ·prime3v(SB), AX - -singles: - ADDQ $4, BX - CMPQ SI, BX + ADDQ n, h + + ADDQ $24, end + CMPQ p, end + JG try4 + +loop8: + MOVQ (p), x + ADDQ $8, p + round0(x) + XORQ x, h + ROLQ $27, h + IMULQ prime1, h + ADDQ prime4, h + + CMPQ p, end + JLE loop8 + +try4: + ADDQ $4, end + CMPQ p, end + JG try1 + + MOVL (p), x + ADDQ $4, p + IMULQ prime1, x + XORQ x, h + + ROLQ $23, h + IMULQ prime2, h + ADDQ ·primes+16(SB), h + +try1: + ADDQ $4, end + CMPQ p, end JGE finalize -singlesLoop: - MOVBQZX (SI), R12 - ADDQ $1, SI - IMULQ ·prime5v(SB), R12 - XORQ R12, AX +loop1: + MOVBQZX (p), x + ADDQ $1, p + IMULQ ·primes+32(SB), x + XORQ x, h + ROLQ $11, h + IMULQ prime1, h - ROLQ $11, AX - IMULQ R13, AX - - CMPQ SI, BX - JL singlesLoop + CMPQ p, end + JL loop1 finalize: - MOVQ AX, R12 - SHRQ $33, R12 - XORQ R12, AX - IMULQ R14, AX - MOVQ AX, R12 - SHRQ $29, R12 - XORQ R12, AX - IMULQ ·prime3v(SB), AX - MOVQ AX, R12 - SHRQ $32, R12 - XORQ R12, AX - - MOVQ AX, ret+24(FP) + MOVQ h, x + SHRQ $33, x + XORQ x, h + IMULQ prime2, h + MOVQ h, x + SHRQ $29, x + XORQ x, h + IMULQ ·primes+16(SB), h + MOVQ h, x + SHRQ $32, x + XORQ x, h + + MOVQ h, ret+24(FP) RET -// writeBlocks uses the same registers as above except that it uses AX to store -// the d pointer. - // func writeBlocks(d *Digest, b []byte) int -TEXT ·writeBlocks(SB), NOSPLIT, $0-40 +TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40 // Load fixed primes needed for round. - MOVQ ·prime1v(SB), R13 - MOVQ ·prime2v(SB), R14 + MOVQ ·primes+0(SB), prime1 + MOVQ ·primes+8(SB), prime2 // Load slice. - MOVQ b_base+8(FP), SI - MOVQ b_len+16(FP), DX - LEAQ (SI)(DX*1), BX - SUBQ $32, BX + MOVQ b_base+8(FP), p + MOVQ b_len+16(FP), n + LEAQ (p)(n*1), end + SUBQ $32, end // Load vN from d. - MOVQ d+0(FP), AX - MOVQ 0(AX), R8 // v1 - MOVQ 8(AX), R9 // v2 - MOVQ 16(AX), R10 // v3 - MOVQ 24(AX), R11 // v4 + MOVQ s+0(FP), d + MOVQ 0(d), v1 + MOVQ 8(d), v2 + MOVQ 16(d), v3 + MOVQ 24(d), v4 // We don't need to check the loop condition here; this function is // always called with at least one block of data to process. -blockLoop: - round(R8) - round(R9) - round(R10) - round(R11) - - CMPQ SI, BX - JLE blockLoop + blockLoop() // Copy vN back to d. - MOVQ R8, 0(AX) - MOVQ R9, 8(AX) - MOVQ R10, 16(AX) - MOVQ R11, 24(AX) - - // The number of bytes written is SI minus the old base pointer. - SUBQ b_base+8(FP), SI - MOVQ SI, ret+32(FP) + MOVQ v1, 0(d) + MOVQ v2, 8(d) + MOVQ v3, 16(d) + MOVQ v4, 24(d) + + // The number of bytes written is p minus the old base pointer. + SUBQ b_base+8(FP), p + MOVQ p, ret+32(FP) RET diff --git a/vendor/github.com/cespare/xxhash/v2/xxhash_arm64.s b/vendor/github.com/cespare/xxhash/v2/xxhash_arm64.s new file mode 100644 index 0000000..7e3145a --- /dev/null +++ b/vendor/github.com/cespare/xxhash/v2/xxhash_arm64.s @@ -0,0 +1,183 @@ +//go:build !appengine && gc && !purego +// +build !appengine +// +build gc +// +build !purego + +#include "textflag.h" + +// Registers: +#define digest R1 +#define h R2 // return value +#define p R3 // input pointer +#define n R4 // input length +#define nblocks R5 // n / 32 +#define prime1 R7 +#define prime2 R8 +#define prime3 R9 +#define prime4 R10 +#define prime5 R11 +#define v1 R12 +#define v2 R13 +#define v3 R14 +#define v4 R15 +#define x1 R20 +#define x2 R21 +#define x3 R22 +#define x4 R23 + +#define round(acc, x) \ + MADD prime2, acc, x, acc \ + ROR $64-31, acc \ + MUL prime1, acc + +// round0 performs the operation x = round(0, x). +#define round0(x) \ + MUL prime2, x \ + ROR $64-31, x \ + MUL prime1, x + +#define mergeRound(acc, x) \ + round0(x) \ + EOR x, acc \ + MADD acc, prime4, prime1, acc + +// blockLoop processes as many 32-byte blocks as possible, +// updating v1, v2, v3, and v4. It assumes that n >= 32. +#define blockLoop() \ + LSR $5, n, nblocks \ + PCALIGN $16 \ + loop: \ + LDP.P 16(p), (x1, x2) \ + LDP.P 16(p), (x3, x4) \ + round(v1, x1) \ + round(v2, x2) \ + round(v3, x3) \ + round(v4, x4) \ + SUB $1, nblocks \ + CBNZ nblocks, loop + +// func Sum64(b []byte) uint64 +TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32 + LDP b_base+0(FP), (p, n) + + LDP ·primes+0(SB), (prime1, prime2) + LDP ·primes+16(SB), (prime3, prime4) + MOVD ·primes+32(SB), prime5 + + CMP $32, n + CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 } + BLT afterLoop + + ADD prime1, prime2, v1 + MOVD prime2, v2 + MOVD $0, v3 + NEG prime1, v4 + + blockLoop() + + ROR $64-1, v1, x1 + ROR $64-7, v2, x2 + ADD x1, x2 + ROR $64-12, v3, x3 + ROR $64-18, v4, x4 + ADD x3, x4 + ADD x2, x4, h + + mergeRound(h, v1) + mergeRound(h, v2) + mergeRound(h, v3) + mergeRound(h, v4) + +afterLoop: + ADD n, h + + TBZ $4, n, try8 + LDP.P 16(p), (x1, x2) + + round0(x1) + + // NOTE: here and below, sequencing the EOR after the ROR (using a + // rotated register) is worth a small but measurable speedup for small + // inputs. + ROR $64-27, h + EOR x1 @> 64-27, h, h + MADD h, prime4, prime1, h + + round0(x2) + ROR $64-27, h + EOR x2 @> 64-27, h, h + MADD h, prime4, prime1, h + +try8: + TBZ $3, n, try4 + MOVD.P 8(p), x1 + + round0(x1) + ROR $64-27, h + EOR x1 @> 64-27, h, h + MADD h, prime4, prime1, h + +try4: + TBZ $2, n, try2 + MOVWU.P 4(p), x2 + + MUL prime1, x2 + ROR $64-23, h + EOR x2 @> 64-23, h, h + MADD h, prime3, prime2, h + +try2: + TBZ $1, n, try1 + MOVHU.P 2(p), x3 + AND $255, x3, x1 + LSR $8, x3, x2 + + MUL prime5, x1 + ROR $64-11, h + EOR x1 @> 64-11, h, h + MUL prime1, h + + MUL prime5, x2 + ROR $64-11, h + EOR x2 @> 64-11, h, h + MUL prime1, h + +try1: + TBZ $0, n, finalize + MOVBU (p), x4 + + MUL prime5, x4 + ROR $64-11, h + EOR x4 @> 64-11, h, h + MUL prime1, h + +finalize: + EOR h >> 33, h + MUL prime2, h + EOR h >> 29, h + MUL prime3, h + EOR h >> 32, h + + MOVD h, ret+24(FP) + RET + +// func writeBlocks(d *Digest, b []byte) int +TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40 + LDP ·primes+0(SB), (prime1, prime2) + + // Load state. Assume v[1-4] are stored contiguously. + MOVD d+0(FP), digest + LDP 0(digest), (v1, v2) + LDP 16(digest), (v3, v4) + + LDP b_base+8(FP), (p, n) + + blockLoop() + + // Store updated state. + STP (v1, v2), 0(digest) + STP (v3, v4), 16(digest) + + BIC $31, n + MOVD n, ret+32(FP) + RET diff --git a/vendor/github.com/cespare/xxhash/v2/xxhash_amd64.go b/vendor/github.com/cespare/xxhash/v2/xxhash_asm.go similarity index 53% rename from vendor/github.com/cespare/xxhash/v2/xxhash_amd64.go rename to vendor/github.com/cespare/xxhash/v2/xxhash_asm.go index ad14b80..78f95f2 100644 --- a/vendor/github.com/cespare/xxhash/v2/xxhash_amd64.go +++ b/vendor/github.com/cespare/xxhash/v2/xxhash_asm.go @@ -1,10 +1,12 @@ +//go:build (amd64 || arm64) && !appengine && gc && !purego +// +build amd64 arm64 // +build !appengine // +build gc // +build !purego package xxhash -// Sum64 computes the 64-bit xxHash digest of b. +// Sum64 computes the 64-bit xxHash digest of b with a zero seed. // //go:noescape func Sum64(b []byte) uint64 diff --git a/vendor/github.com/cespare/xxhash/v2/xxhash_other.go b/vendor/github.com/cespare/xxhash/v2/xxhash_other.go index 4a5a821..118e49e 100644 --- a/vendor/github.com/cespare/xxhash/v2/xxhash_other.go +++ b/vendor/github.com/cespare/xxhash/v2/xxhash_other.go @@ -1,8 +1,9 @@ -// +build !amd64 appengine !gc purego +//go:build (!amd64 && !arm64) || appengine || !gc || purego +// +build !amd64,!arm64 appengine !gc purego package xxhash -// Sum64 computes the 64-bit xxHash digest of b. +// Sum64 computes the 64-bit xxHash digest of b with a zero seed. func Sum64(b []byte) uint64 { // A simpler version would be // d := New() @@ -14,10 +15,10 @@ func Sum64(b []byte) uint64 { var h uint64 if n >= 32 { - v1 := prime1v + prime2 + v1 := primes[0] + prime2 v2 := prime2 v3 := uint64(0) - v4 := -prime1v + v4 := -primes[0] for len(b) >= 32 { v1 = round(v1, u64(b[0:8:len(b)])) v2 = round(v2, u64(b[8:16:len(b)])) @@ -36,19 +37,18 @@ func Sum64(b []byte) uint64 { h += uint64(n) - i, end := 0, len(b) - for ; i+8 <= end; i += 8 { - k1 := round(0, u64(b[i:i+8:len(b)])) + for ; len(b) >= 8; b = b[8:] { + k1 := round(0, u64(b[:8])) h ^= k1 h = rol27(h)*prime1 + prime4 } - if i+4 <= end { - h ^= uint64(u32(b[i:i+4:len(b)])) * prime1 + if len(b) >= 4 { + h ^= uint64(u32(b[:4])) * prime1 h = rol23(h)*prime2 + prime3 - i += 4 + b = b[4:] } - for ; i < end; i++ { - h ^= uint64(b[i]) * prime5 + for ; len(b) > 0; b = b[1:] { + h ^= uint64(b[0]) * prime5 h = rol11(h) * prime1 } diff --git a/vendor/github.com/cespare/xxhash/v2/xxhash_safe.go b/vendor/github.com/cespare/xxhash/v2/xxhash_safe.go index fc9bea7..05f5e7d 100644 --- a/vendor/github.com/cespare/xxhash/v2/xxhash_safe.go +++ b/vendor/github.com/cespare/xxhash/v2/xxhash_safe.go @@ -1,10 +1,11 @@ +//go:build appengine // +build appengine // This file contains the safe implementations of otherwise unsafe-using code. package xxhash -// Sum64String computes the 64-bit xxHash digest of s. +// Sum64String computes the 64-bit xxHash digest of s with a zero seed. func Sum64String(s string) uint64 { return Sum64([]byte(s)) } diff --git a/vendor/github.com/cespare/xxhash/v2/xxhash_unsafe.go b/vendor/github.com/cespare/xxhash/v2/xxhash_unsafe.go index 376e0ca..cf9d42a 100644 --- a/vendor/github.com/cespare/xxhash/v2/xxhash_unsafe.go +++ b/vendor/github.com/cespare/xxhash/v2/xxhash_unsafe.go @@ -1,3 +1,4 @@ +//go:build !appengine // +build !appengine // This file encapsulates usage of unsafe. @@ -11,7 +12,7 @@ import ( // In the future it's possible that compiler optimizations will make these // XxxString functions unnecessary by realizing that calls such as -// Sum64([]byte(s)) don't need to copy s. See https://golang.org/issue/2205. +// Sum64([]byte(s)) don't need to copy s. See https://go.dev/issue/2205. // If that happens, even if we keep these functions they can be replaced with // the trivial safe code. @@ -32,7 +33,7 @@ import ( // // See https://github.com/golang/go/issues/42739 for discussion. -// Sum64String computes the 64-bit xxHash digest of s. +// Sum64String computes the 64-bit xxHash digest of s with a zero seed. // It may be faster than Sum64([]byte(s)) by avoiding a copy. func Sum64String(s string) uint64 { b := *(*[]byte)(unsafe.Pointer(&sliceHeader{s, len(s)})) diff --git a/vendor/github.com/chen3feng/safecast/.gitignore b/vendor/github.com/chen3feng/safecast/.gitignore new file mode 100644 index 0000000..66fd13c --- /dev/null +++ b/vendor/github.com/chen3feng/safecast/.gitignore @@ -0,0 +1,15 @@ +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib + +# Test binary, built with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out + +# Dependency directories (remove the comment below to include it) +# vendor/ diff --git a/vendor/github.com/chen3feng/safecast/LICENSE.md b/vendor/github.com/chen3feng/safecast/LICENSE.md new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/vendor/github.com/chen3feng/safecast/LICENSE.md @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/github.com/chen3feng/safecast/README.md b/vendor/github.com/chen3feng/safecast/README.md new file mode 100644 index 0000000..a590b17 --- /dev/null +++ b/vendor/github.com/chen3feng/safecast/README.md @@ -0,0 +1,303 @@ +# safecast + +Safe Numeric Type Cast Library for Go + +English | [简体中文](README_zh.md) + +[![License Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-red.svg)](COPYING) +[![Golang](https://img.shields.io/badge/Language-go1.18+-blue.svg)](https://go.dev/) +![Build Status](https://github.com/chen3feng/safecast/actions/workflows/go.yml/badge.svg) +[![Coverage Status](https://coveralls.io/repos/github/chen3feng/safecast/badge.svg?branch=master)](https://coveralls.io/github/chen3feng/safecast?branch=master) +[![GoReport](https://goreportcard.com/badge/github.com/securego/gosec)](https://goreportcard.com/report/github.com/chen3feng/safecast) +[![Go Reference](https://pkg.go.dev/badge/github.com/chen3feng/safecast.svg)](https://pkg.go.dev/github.com/chen3feng/safecast) + +Safe numeric type cast library. suppoer all integral and floating types, except uintptr. + +This library depends on go generics, which is introduced in 1.18+. + +Usage: + +```go +val, ok := To[type](value) +``` + +`ok == false` indicates overflow occured. but whatever,`val` is always equals to the result of the normal type cast (`type(value)`) expression。 + +There are also non-parametric forms of the target types: + +````go +val, ok := ToInt32(value) +```` + +The usage is the same as the according full generic version, and the performance will be better. + + + + + +# safecast + +```go +import "github.com/chen3feng/safecast" +``` + +Package safecast provide a safe way to cast a numeric value from type A to type B, with overflow and underflow check. + +## Index + +- [func To[ToType numericType, FromType numericType](value FromType) (result ToType, ok bool)](<#func-to>) +- [func ToFloat32[FromType numericType](value FromType) (float32, bool)](<#func-tofloat32>) +- [func ToFloat64[FromType numericType](value FromType) (float64, bool)](<#func-tofloat64>) +- [func ToInt[FromType numericType](value FromType) (int, bool)](<#func-toint>) +- [func ToInt16[FromType numericType](value FromType) (int16, bool)](<#func-toint16>) +- [func ToInt32[FromType numericType](value FromType) (int32, bool)](<#func-toint32>) +- [func ToInt64[FromType numericType](value FromType) (int64, bool)](<#func-toint64>) +- [func ToInt8[FromType numericType](value FromType) (int8, bool)](<#func-toint8>) +- [func ToUint[FromType numericType](value FromType) (uint, bool)](<#func-touint>) +- [func ToUint16[FromType numericType](value FromType) (uint16, bool)](<#func-touint16>) +- [func ToUint32[FromType numericType](value FromType) (uint32, bool)](<#func-touint32>) +- [func ToUint64[FromType numericType](value FromType) (uint64, bool)](<#func-touint64>) +- [func ToUint8[FromType numericType](value FromType) (uint8, bool)](<#func-touint8>) + + +## func [To]() + +```go +func To[ToType numericType, FromType numericType](value FromType) (result ToType, ok bool) +``` + +To converts a numeric value from the FromType to the specified ToType type safely. result will always be same as the usual type cast \(type\(value\)\), but ok is false when overflow or underflow occured. + +
Example (Float Overflow) +

+ +```go +package main + +import ( + "fmt" + "github.com/chen3feng/safecast" + "math" +) + +func main() { + n, ok := safecast.To[float32](math.MaxFloat32 * 2) + fmt.Print(n, ok) +} +``` + +#### Output + +``` ++Inf false +``` + +

+
+ +
Example (Int No Overflow) +

+ +```go +package main + +import ( + "fmt" + "github.com/chen3feng/safecast" +) + +func main() { + b, ok := safecast.To[byte](255) + fmt.Print(b, ok) +} +``` + +#### Output + +``` +255 true +``` + +

+
+ +
Example (Int Overflow) +

+ +```go +package main + +import ( + "fmt" + "github.com/chen3feng/safecast" +) + +func main() { + b, ok := safecast.To[byte](256) + fmt.Print(b, ok) +} +``` + +#### Output + +``` +0 false +``` + +

+
+ +
Example (Value In Range) +

+ +```go +package main + +import ( + "fmt" + "github.com/chen3feng/safecast" +) + +func main() { + n, ok := safecast.To[uint](1) + fmt.Print(n, ok) +} +``` + +#### Output + +``` +1 true +``` + +

+
+ +
Example (Value Out Of Range) +

+ +```go +package main + +import ( + "fmt" + "github.com/chen3feng/safecast" +) + +func main() { + n, ok := safecast.To[uint32](-1) + fmt.Print(n, ok) +} +``` + +#### Output + +``` +4294967295 false +``` + +

+
+ +## func [ToFloat32]() + +```go +func ToFloat32[FromType numericType](value FromType) (float32, bool) +``` + +ToFloat32 converts value to float32 type safely. result will always be same as the usual type cast\(float32\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToFloat64]() + +```go +func ToFloat64[FromType numericType](value FromType) (float64, bool) +``` + +ToFloat64 converts value to float64 type safely. result will always be same as the usual type cast\(float64\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToInt]() + +```go +func ToInt[FromType numericType](value FromType) (int, bool) +``` + +ToInt converts value to int type safely. result will always be same as the usual type cast\(int\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToInt16]() + +```go +func ToInt16[FromType numericType](value FromType) (int16, bool) +``` + +ToInt16 converts value to int16 type safely. result will always be same as the usual type cast\(int16\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToInt32]() + +```go +func ToInt32[FromType numericType](value FromType) (int32, bool) +``` + +ToInt32 converts value to int32 type safely. result will always be same as the usual type cast\(int32\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToInt64]() + +```go +func ToInt64[FromType numericType](value FromType) (int64, bool) +``` + +ToInt64 converts value to int64 type safely. result will always be same as the usual type cast\(int64\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToInt8]() + +```go +func ToInt8[FromType numericType](value FromType) (int8, bool) +``` + +ToInt8 converts value to int8 type safely. result will always be same as the usual type cast\(int8\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToUint]() + +```go +func ToUint[FromType numericType](value FromType) (uint, bool) +``` + +ToUint converts value to uint type safely. result will always be same as the usual type cast\(uint\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToUint16]() + +```go +func ToUint16[FromType numericType](value FromType) (uint16, bool) +``` + +ToUint16 converts value to uint16 type safely. result will always be same as the usual type cast\(uint16\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToUint32]() + +```go +func ToUint32[FromType numericType](value FromType) (uint32, bool) +``` + +ToUint32 converts value to uint32 type safely. result will always be same as the usual type cast\(uint32\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToUint64]() + +```go +func ToUint64[FromType numericType](value FromType) (uint64, bool) +``` + +ToUint64 converts value to uint64 type safely. result will always be same as the usual type cast\(uint64\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToUint8]() + +```go +func ToUint8[FromType numericType](value FromType) (uint8, bool) +``` + +ToUint8 converts value to uint8 type safely. result will always be same as the usual type cast\(uint8\(value\)\), but ok is false when overflow or underflow occured. + + + +Generated by [gomarkdoc]() + + + diff --git a/vendor/github.com/chen3feng/safecast/README_zh.md b/vendor/github.com/chen3feng/safecast/README_zh.md new file mode 100644 index 0000000..0972b5b --- /dev/null +++ b/vendor/github.com/chen3feng/safecast/README_zh.md @@ -0,0 +1,299 @@ +# safecast -- Safe Numeric Type Cast Library For Go + +[English](README.md) | 简体中文 + +[![License Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-red.svg)](COPYING) +[![Python](https://img.shields.io/badge/Language-go1.18+-blue.svg)](https://www.python.org/) +![Build Status](https://github.com/chen3feng/safecast/actions/workflows/go.yml/badge.svg) +[![Coverage Status](https://coveralls.io/repos/github/chen3feng/safecast/badge.svg?branch=master)](https://coveralls.io/github/chen3feng/safecast?branch=master) +[![GoReport](https://goreportcard.com/badge/github.com/securego/gosec)](https://goreportcard.com/report/github.com/chen3feng/safecast) +[![Go Reference](https://pkg.go.dev/badge/github.com/chen3feng/safecast.svg)](https://pkg.go.dev/github.com/chen3feng/safecast) + +安全的数值类型转换库。支持除了 `uintptr` 外的所有整数和浮点数类型。 + +用法: + +```go +val, ok := To[type](value) +``` + +`ok == false` 表示值溢出。但是无论成功失败,`val` 都等效于直接用普通类型转换(`type(value)`)的结果。 + +另外也有目标类型非参数化的形式: + +```go +val, ok := ToInt32(value) +``` + +用法同全泛型版本,并且性能上会好一些。 + + + + + +# safecast + +```go +import "github.com/chen3feng/safecast" +``` + +Package safecast provide a safe way to cast a numeric value from type A to type B, with overflow and underflow check. + +## Index + +- [func To[ToType numericType, FromType numericType](value FromType) (result ToType, ok bool)](<#func-to>) +- [func ToFloat32[FromType numericType](value FromType) (float32, bool)](<#func-tofloat32>) +- [func ToFloat64[FromType numericType](value FromType) (float64, bool)](<#func-tofloat64>) +- [func ToInt[FromType numericType](value FromType) (int, bool)](<#func-toint>) +- [func ToInt16[FromType numericType](value FromType) (int16, bool)](<#func-toint16>) +- [func ToInt32[FromType numericType](value FromType) (int32, bool)](<#func-toint32>) +- [func ToInt64[FromType numericType](value FromType) (int64, bool)](<#func-toint64>) +- [func ToInt8[FromType numericType](value FromType) (int8, bool)](<#func-toint8>) +- [func ToUint[FromType numericType](value FromType) (uint, bool)](<#func-touint>) +- [func ToUint16[FromType numericType](value FromType) (uint16, bool)](<#func-touint16>) +- [func ToUint32[FromType numericType](value FromType) (uint32, bool)](<#func-touint32>) +- [func ToUint64[FromType numericType](value FromType) (uint64, bool)](<#func-touint64>) +- [func ToUint8[FromType numericType](value FromType) (uint8, bool)](<#func-touint8>) + + +## func [To]() + +```go +func To[ToType numericType, FromType numericType](value FromType) (result ToType, ok bool) +``` + +To converts a numeric value from the FromType to the specified ToType type safely. result will always be same as the usual type cast \(type\(value\)\), but ok is false when overflow or underflow occured. + +
Example (Float Overflow) +

+ +```go +package main + +import ( + "fmt" + "github.com/chen3feng/safecast" + "math" +) + +func main() { + n, ok := safecast.To[float32](math.MaxFloat32 * 2) + fmt.Print(n, ok) +} +``` + +#### Output + +``` ++Inf false +``` + +

+
+ +
Example (Int No Overflow) +

+ +```go +package main + +import ( + "fmt" + "github.com/chen3feng/safecast" +) + +func main() { + b, ok := safecast.To[byte](255) + fmt.Print(b, ok) +} +``` + +#### Output + +``` +255 true +``` + +

+
+ +
Example (Int Overflow) +

+ +```go +package main + +import ( + "fmt" + "github.com/chen3feng/safecast" +) + +func main() { + b, ok := safecast.To[byte](256) + fmt.Print(b, ok) +} +``` + +#### Output + +``` +0 false +``` + +

+
+ +
Example (Value In Range) +

+ +```go +package main + +import ( + "fmt" + "github.com/chen3feng/safecast" +) + +func main() { + n, ok := safecast.To[uint](1) + fmt.Print(n, ok) +} +``` + +#### Output + +``` +1 true +``` + +

+
+ +
Example (Value Out Of Range) +

+ +```go +package main + +import ( + "fmt" + "github.com/chen3feng/safecast" +) + +func main() { + n, ok := safecast.To[uint32](-1) + fmt.Print(n, ok) +} +``` + +#### Output + +``` +4294967295 false +``` + +

+
+ +## func [ToFloat32]() + +```go +func ToFloat32[FromType numericType](value FromType) (float32, bool) +``` + +ToFloat32 converts value to float32 type safely. result will always be same as the usual type cast\(float32\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToFloat64]() + +```go +func ToFloat64[FromType numericType](value FromType) (float64, bool) +``` + +ToFloat64 converts value to float64 type safely. result will always be same as the usual type cast\(float64\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToInt]() + +```go +func ToInt[FromType numericType](value FromType) (int, bool) +``` + +ToInt converts value to int type safely. result will always be same as the usual type cast\(int\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToInt16]() + +```go +func ToInt16[FromType numericType](value FromType) (int16, bool) +``` + +ToInt16 converts value to int16 type safely. result will always be same as the usual type cast\(int16\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToInt32]() + +```go +func ToInt32[FromType numericType](value FromType) (int32, bool) +``` + +ToInt32 converts value to int32 type safely. result will always be same as the usual type cast\(int32\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToInt64]() + +```go +func ToInt64[FromType numericType](value FromType) (int64, bool) +``` + +ToInt64 converts value to int64 type safely. result will always be same as the usual type cast\(int64\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToInt8]() + +```go +func ToInt8[FromType numericType](value FromType) (int8, bool) +``` + +ToInt8 converts value to int8 type safely. result will always be same as the usual type cast\(int8\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToUint]() + +```go +func ToUint[FromType numericType](value FromType) (uint, bool) +``` + +ToUint converts value to uint type safely. result will always be same as the usual type cast\(uint\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToUint16]() + +```go +func ToUint16[FromType numericType](value FromType) (uint16, bool) +``` + +ToUint16 converts value to uint16 type safely. result will always be same as the usual type cast\(uint16\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToUint32]() + +```go +func ToUint32[FromType numericType](value FromType) (uint32, bool) +``` + +ToUint32 converts value to uint32 type safely. result will always be same as the usual type cast\(uint32\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToUint64]() + +```go +func ToUint64[FromType numericType](value FromType) (uint64, bool) +``` + +ToUint64 converts value to uint64 type safely. result will always be same as the usual type cast\(uint64\(value\)\), but ok is false when overflow or underflow occured. + +## func [ToUint8]() + +```go +func ToUint8[FromType numericType](value FromType) (uint8, bool) +``` + +ToUint8 converts value to uint8 type safely. result will always be same as the usual type cast\(uint8\(value\)\), but ok is false when overflow or underflow occured. + + + +Generated by [gomarkdoc]() + + + diff --git a/vendor/github.com/chen3feng/safecast/casts.go b/vendor/github.com/chen3feng/safecast/casts.go new file mode 100644 index 0000000..dce8188 --- /dev/null +++ b/vendor/github.com/chen3feng/safecast/casts.go @@ -0,0 +1,859 @@ +package safecast + +import "math" + +const intBits = 32 << (^uint(0) >> 63) + +// int8ToInt8 converts the int8 value to int8 safely. +func int8ToInt8(value int8) (result int8, ok bool) { + return int8(value), true +} + +// int8ToInt16 converts the int8 value to int16 safely. +func int8ToInt16(value int8) (result int16, ok bool) { + return int16(value), true +} + +// int8ToInt32 converts the int8 value to int32 safely. +func int8ToInt32(value int8) (result int32, ok bool) { + return int32(value), true +} + +// int8ToInt64 converts the int8 value to int64 safely. +func int8ToInt64(value int8) (result int64, ok bool) { + return int64(value), true +} + +// int8ToUint8 converts the int8 value to uint8 safely. +func int8ToUint8(value int8) (result uint8, ok bool) { + return uint8(value), value >= 0 +} + +// int8ToUint16 converts the int8 value to uint16 safely. +func int8ToUint16(value int8) (result uint16, ok bool) { + return uint16(value), value >= 0 +} + +// int8ToUint32 converts the int8 value to uint32 safely. +func int8ToUint32(value int8) (result uint32, ok bool) { + return uint32(value), value >= 0 +} + +// int8ToUint64 converts the int8 value to uint64 safely. +func int8ToUint64(value int8) (result uint64, ok bool) { + return uint64(value), value >= 0 +} + +// int16ToInt8 converts the int16 value to int8 safely. +func int16ToInt8(value int16) (result int8, ok bool) { + return int8(value), int16(int8(value)) == value +} + +// int16ToInt16 converts the int16 value to int16 safely. +func int16ToInt16(value int16) (result int16, ok bool) { + return int16(value), true +} + +// int16ToInt32 converts the int16 value to int32 safely. +func int16ToInt32(value int16) (result int32, ok bool) { + return int32(value), true +} + +// int16ToInt64 converts the int16 value to int64 safely. +func int16ToInt64(value int16) (result int64, ok bool) { + return int64(value), true +} + +// int16ToUint8 converts the int16 value to uint8 safely. +func int16ToUint8(value int16) (result uint8, ok bool) { + return uint8(value), value >= 0 && int16(uint8(value)) == value +} + +// int16ToUint16 converts the int16 value to uint16 safely. +func int16ToUint16(value int16) (result uint16, ok bool) { + return uint16(value), value >= 0 +} + +// int16ToUint32 converts the int16 value to uint32 safely. +func int16ToUint32(value int16) (result uint32, ok bool) { + return uint32(value), value >= 0 +} + +// int16ToUint64 converts the int16 value to uint64 safely. +func int16ToUint64(value int16) (result uint64, ok bool) { + return uint64(value), value >= 0 +} + +// int32ToInt8 converts the int32 value to int8 safely. +func int32ToInt8(value int32) (result int8, ok bool) { + return int8(value), int32(int8(value)) == value +} + +// int32ToInt16 converts the int32 value to int16 safely. +func int32ToInt16(value int32) (result int16, ok bool) { + return int16(value), int32(int16(value)) == value +} + +// int32ToInt32 converts the int32 value to int32 safely. +func int32ToInt32(value int32) (result int32, ok bool) { + return int32(value), true +} + +// int32ToInt64 converts the int32 value to int64 safely. +func int32ToInt64(value int32) (result int64, ok bool) { + return int64(value), true +} + +// int32ToUint8 converts the int32 value to uint8 safely. +func int32ToUint8(value int32) (result uint8, ok bool) { + return uint8(value), value >= 0 && int32(uint8(value)) == value +} + +// int32ToUint16 converts the int32 value to uint16 safely. +func int32ToUint16(value int32) (result uint16, ok bool) { + return uint16(value), value >= 0 && int32(uint16(value)) == value +} + +// int32ToUint32 converts the int32 value to uint32 safely. +func int32ToUint32(value int32) (result uint32, ok bool) { + return uint32(value), value >= 0 +} + +// int32ToUint64 converts the int32 value to uint64 safely. +func int32ToUint64(value int32) (result uint64, ok bool) { + return uint64(value), value >= 0 +} + +// int64ToInt8 converts the int64 value to int8 safely. +func int64ToInt8(value int64) (result int8, ok bool) { + return int8(value), int64(int8(value)) == value +} + +// int64ToInt16 converts the int64 value to int16 safely. +func int64ToInt16(value int64) (result int16, ok bool) { + return int16(value), int64(int16(value)) == value +} + +// int64ToInt32 converts the int64 value to int32 safely. +func int64ToInt32(value int64) (result int32, ok bool) { + return int32(value), int64(int32(value)) == value +} + +// int64ToInt64 converts the int64 value to int64 safely. +func int64ToInt64(value int64) (result int64, ok bool) { + return int64(value), true +} + +// int64ToUint8 converts the int64 value to uint8 safely. +func int64ToUint8(value int64) (result uint8, ok bool) { + return uint8(value), value >= 0 && int64(uint8(value)) == value +} + +// int64ToUint16 converts the int64 value to uint16 safely. +func int64ToUint16(value int64) (result uint16, ok bool) { + return uint16(value), value >= 0 && int64(uint16(value)) == value +} + +// int64ToUint32 converts the int64 value to uint32 safely. +func int64ToUint32(value int64) (result uint32, ok bool) { + return uint32(value), value >= 0 && int64(uint32(value)) == value +} + +// int64ToUint64 converts the int64 value to uint64 safely. +func int64ToUint64(value int64) (result uint64, ok bool) { + return uint64(value), value >= 0 +} + +// uint8ToInt8 converts the uint8 value to int8 safely. +func uint8ToInt8(value uint8) (result int8, ok bool) { + return int8(value), value <= math.MaxInt8 +} + +// uint8ToInt16 converts the uint8 value to int16 safely. +func uint8ToInt16(value uint8) (result int16, ok bool) { + return int16(value), true +} + +// uint8ToInt32 converts the uint8 value to int32 safely. +func uint8ToInt32(value uint8) (result int32, ok bool) { + return int32(value), true +} + +// uint8ToInt64 converts the uint8 value to int64 safely. +func uint8ToInt64(value uint8) (result int64, ok bool) { + return int64(value), true +} + +// uint8ToUint8 converts the uint8 value to uint8 safely. +func uint8ToUint8(value uint8) (result uint8, ok bool) { + return uint8(value), true +} + +// uint8ToUint16 converts the uint8 value to uint16 safely. +func uint8ToUint16(value uint8) (result uint16, ok bool) { + return uint16(value), true +} + +// uint8ToUint32 converts the uint8 value to uint32 safely. +func uint8ToUint32(value uint8) (result uint32, ok bool) { + return uint32(value), true +} + +// uint8ToUint64 converts the uint8 value to uint64 safely. +func uint8ToUint64(value uint8) (result uint64, ok bool) { + return uint64(value), true +} + +// uint16ToInt8 converts the uint16 value to int8 safely. +func uint16ToInt8(value uint16) (result int8, ok bool) { + return int8(value), uint16(int8(value)) == value +} + +// uint16ToInt16 converts the uint16 value to int16 safely. +func uint16ToInt16(value uint16) (result int16, ok bool) { + return int16(value), value <= math.MaxInt16 +} + +// uint16ToInt32 converts the uint16 value to int32 safely. +func uint16ToInt32(value uint16) (result int32, ok bool) { + return int32(value), true +} + +// uint16ToInt64 converts the uint16 value to int64 safely. +func uint16ToInt64(value uint16) (result int64, ok bool) { + return int64(value), true +} + +// uint16ToUint8 converts the uint16 value to uint8 safely. +func uint16ToUint8(value uint16) (result uint8, ok bool) { + return uint8(value), uint16(uint8(value)) == value +} + +// uint16ToUint16 converts the uint16 value to uint16 safely. +func uint16ToUint16(value uint16) (result uint16, ok bool) { + return uint16(value), true +} + +// uint16ToUint32 converts the uint16 value to uint32 safely. +func uint16ToUint32(value uint16) (result uint32, ok bool) { + return uint32(value), true +} + +// uint16ToUint64 converts the uint16 value to uint64 safely. +func uint16ToUint64(value uint16) (result uint64, ok bool) { + return uint64(value), true +} + +// uint32ToInt8 converts the uint32 value to int8 safely. +func uint32ToInt8(value uint32) (result int8, ok bool) { + return int8(value), uint32(int8(value)) == value +} + +// uint32ToInt16 converts the uint32 value to int16 safely. +func uint32ToInt16(value uint32) (result int16, ok bool) { + return int16(value), uint32(int16(value)) == value +} + +// uint32ToInt32 converts the uint32 value to int32 safely. +func uint32ToInt32(value uint32) (result int32, ok bool) { + return int32(value), value <= math.MaxInt32 +} + +// uint32ToInt64 converts the uint32 value to int64 safely. +func uint32ToInt64(value uint32) (result int64, ok bool) { + return int64(value), true +} + +// uint32ToUint8 converts the uint32 value to uint8 safely. +func uint32ToUint8(value uint32) (result uint8, ok bool) { + return uint8(value), uint32(uint8(value)) == value +} + +// uint32ToUint16 converts the uint32 value to uint16 safely. +func uint32ToUint16(value uint32) (result uint16, ok bool) { + return uint16(value), uint32(uint16(value)) == value +} + +// uint32ToUint32 converts the uint32 value to uint32 safely. +func uint32ToUint32(value uint32) (result uint32, ok bool) { + return uint32(value), true +} + +// uint32ToUint64 converts the uint32 value to uint64 safely. +func uint32ToUint64(value uint32) (result uint64, ok bool) { + return uint64(value), true +} + +// uint64ToInt8 converts the uint64 value to int8 safely. +func uint64ToInt8(value uint64) (result int8, ok bool) { + return int8(value), uint64(int8(value)) == value +} + +// uint64ToInt16 converts the uint64 value to int16 safely. +func uint64ToInt16(value uint64) (result int16, ok bool) { + return int16(value), uint64(int16(value)) == value +} + +// uint64ToInt32 converts the uint64 value to int32 safely. +func uint64ToInt32(value uint64) (result int32, ok bool) { + return int32(value), uint64(int32(value)) == value +} + +// uint64ToInt64 converts the uint64 value to int64 safely. +func uint64ToInt64(value uint64) (result int64, ok bool) { + return int64(value), value <= math.MaxInt64 +} + +// uint64ToUint8 converts the uint64 value to uint8 safely. +func uint64ToUint8(value uint64) (result uint8, ok bool) { + return uint8(value), uint64(uint8(value)) == value +} + +// uint64ToUint16 converts the uint64 value to uint16 safely. +func uint64ToUint16(value uint64) (result uint16, ok bool) { + return uint16(value), uint64(uint16(value)) == value +} + +// uint64ToUint32 converts the uint64 value to uint32 safely. +func uint64ToUint32(value uint64) (result uint32, ok bool) { + return uint32(value), uint64(uint32(value)) == value +} + +// uint64ToUint64 converts the uint64 value to uint64 safely. +func uint64ToUint64(value uint64) (result uint64, ok bool) { + return uint64(value), true +} + +// int8ToInt converts the int8 value to int safely. +func int8ToInt(value int8) (result int, ok bool) { + if intBits == 32 { + var r int32 + r, ok = int8ToInt32(value) + result = int(r) + } + var r int64 + r, ok = int8ToInt64(value) + result = int(r) + return +} + +// int8ToUint converts the int8 value to uint safely. +func int8ToUint(value int8) (result uint, ok bool) { + if intBits == 32 { + var r uint32 + r, ok = int8ToUint32(value) + result = uint(r) + } + var r uint64 + r, ok = int8ToUint64(value) + result = uint(r) + return +} + +// int16ToInt converts the int16 value to int safely. +func int16ToInt(value int16) (result int, ok bool) { + if intBits == 32 { + var r int32 + r, ok = int16ToInt32(value) + result = int(r) + } + var r int64 + r, ok = int16ToInt64(value) + result = int(r) + return +} + +// int16ToUint converts the int16 value to uint safely. +func int16ToUint(value int16) (result uint, ok bool) { + if intBits == 32 { + var r uint32 + r, ok = int16ToUint32(value) + result = uint(r) + } + var r uint64 + r, ok = int16ToUint64(value) + result = uint(r) + return +} + +// int32ToInt converts the int32 value to int safely. +func int32ToInt(value int32) (result int, ok bool) { + if intBits == 32 { + var r int32 + r, ok = int32ToInt32(value) + result = int(r) + } + var r int64 + r, ok = int32ToInt64(value) + result = int(r) + return +} + +// int32ToUint converts the int32 value to uint safely. +func int32ToUint(value int32) (result uint, ok bool) { + if intBits == 32 { + var r uint32 + r, ok = int32ToUint32(value) + result = uint(r) + } + var r uint64 + r, ok = int32ToUint64(value) + result = uint(r) + return +} + +// int64ToInt converts the int64 value to int safely. +func int64ToInt(value int64) (result int, ok bool) { + if intBits == 32 { + var r int32 + r, ok = int64ToInt32(value) + result = int(r) + } + var r int64 + r, ok = int64ToInt64(value) + result = int(r) + return +} + +// int64ToUint converts the int64 value to uint safely. +func int64ToUint(value int64) (result uint, ok bool) { + if intBits == 32 { + var r uint32 + r, ok = int64ToUint32(value) + result = uint(r) + } + var r uint64 + r, ok = int64ToUint64(value) + result = uint(r) + return +} + +// uint8ToInt converts the uint8 value to int safely. +func uint8ToInt(value uint8) (result int, ok bool) { + if intBits == 32 { + var r int32 + r, ok = uint8ToInt32(value) + result = int(r) + } + var r int64 + r, ok = uint8ToInt64(value) + result = int(r) + return +} + +// uint8ToUint converts the uint8 value to uint safely. +func uint8ToUint(value uint8) (result uint, ok bool) { + if intBits == 32 { + var r uint32 + r, ok = uint8ToUint32(value) + result = uint(r) + } + var r uint64 + r, ok = uint8ToUint64(value) + result = uint(r) + return +} + +// uint16ToInt converts the uint16 value to int safely. +func uint16ToInt(value uint16) (result int, ok bool) { + if intBits == 32 { + var r int32 + r, ok = uint16ToInt32(value) + result = int(r) + } + var r int64 + r, ok = uint16ToInt64(value) + result = int(r) + return +} + +// uint16ToUint converts the uint16 value to uint safely. +func uint16ToUint(value uint16) (result uint, ok bool) { + if intBits == 32 { + var r uint32 + r, ok = uint16ToUint32(value) + result = uint(r) + } + var r uint64 + r, ok = uint16ToUint64(value) + result = uint(r) + return +} + +// uint32ToInt converts the uint32 value to int safely. +func uint32ToInt(value uint32) (result int, ok bool) { + if intBits == 32 { + var r int32 + r, ok = uint32ToInt32(value) + result = int(r) + } + var r int64 + r, ok = uint32ToInt64(value) + result = int(r) + return +} + +// uint32ToUint converts the uint32 value to uint safely. +func uint32ToUint(value uint32) (result uint, ok bool) { + if intBits == 32 { + var r uint32 + r, ok = uint32ToUint32(value) + result = uint(r) + } + var r uint64 + r, ok = uint32ToUint64(value) + result = uint(r) + return +} + +// uint64ToInt converts the uint64 value to int safely. +func uint64ToInt(value uint64) (result int, ok bool) { + if intBits == 32 { + var r int32 + r, ok = uint64ToInt32(value) + result = int(r) + } + var r int64 + r, ok = uint64ToInt64(value) + result = int(r) + return +} + +// uint64ToUint converts the uint64 value to uint safely. +func uint64ToUint(value uint64) (result uint, ok bool) { + if intBits == 32 { + var r uint32 + r, ok = uint64ToUint32(value) + result = uint(r) + } + var r uint64 + r, ok = uint64ToUint64(value) + result = uint(r) + return +} + +// intToInt8 converts the int value to int8 safely. +func intToInt8(value int) (result int8, ok bool) { + if intBits == 32 { + return int32ToInt8(int32(value)) + } + return int64ToInt8(int64(value)) +} + +// intToInt16 converts the int value to int16 safely. +func intToInt16(value int) (result int16, ok bool) { + if intBits == 32 { + return int32ToInt16(int32(value)) + } + return int64ToInt16(int64(value)) +} + +// intToInt32 converts the int value to int32 safely. +func intToInt32(value int) (result int32, ok bool) { + if intBits == 32 { + return int32ToInt32(int32(value)) + } + return int64ToInt32(int64(value)) +} + +// intToInt64 converts the int value to int64 safely. +func intToInt64(value int) (result int64, ok bool) { + if intBits == 32 { + return int32ToInt64(int32(value)) + } + return int64ToInt64(int64(value)) +} + +// intToUint8 converts the int value to uint8 safely. +func intToUint8(value int) (result uint8, ok bool) { + if intBits == 32 { + return int32ToUint8(int32(value)) + } + return int64ToUint8(int64(value)) +} + +// intToUint16 converts the int value to uint16 safely. +func intToUint16(value int) (result uint16, ok bool) { + if intBits == 32 { + return int32ToUint16(int32(value)) + } + return int64ToUint16(int64(value)) +} + +// intToUint32 converts the int value to uint32 safely. +func intToUint32(value int) (result uint32, ok bool) { + if intBits == 32 { + return int32ToUint32(int32(value)) + } + return int64ToUint32(int64(value)) +} + +// intToUint64 converts the int value to uint64 safely. +func intToUint64(value int) (result uint64, ok bool) { + if intBits == 32 { + return int32ToUint64(int32(value)) + } + return int64ToUint64(int64(value)) +} + +// uintToInt8 converts the uint value to int8 safely. +func uintToInt8(value uint) (result int8, ok bool) { + if intBits == 32 { + return uint32ToInt8(uint32(value)) + } + return uint64ToInt8(uint64(value)) +} + +// uintToInt16 converts the uint value to int16 safely. +func uintToInt16(value uint) (result int16, ok bool) { + if intBits == 32 { + return uint32ToInt16(uint32(value)) + } + return uint64ToInt16(uint64(value)) +} + +// uintToInt32 converts the uint value to int32 safely. +func uintToInt32(value uint) (result int32, ok bool) { + if intBits == 32 { + return uint32ToInt32(uint32(value)) + } + return uint64ToInt32(uint64(value)) +} + +// uintToInt64 converts the uint value to int64 safely. +func uintToInt64(value uint) (result int64, ok bool) { + if intBits == 32 { + return uint32ToInt64(uint32(value)) + } + return uint64ToInt64(uint64(value)) +} + +// uintToUint8 converts the uint value to uint8 safely. +func uintToUint8(value uint) (result uint8, ok bool) { + if intBits == 32 { + return uint32ToUint8(uint32(value)) + } + return uint64ToUint8(uint64(value)) +} + +// uintToUint16 converts the uint value to uint16 safely. +func uintToUint16(value uint) (result uint16, ok bool) { + if intBits == 32 { + return uint32ToUint16(uint32(value)) + } + return uint64ToUint16(uint64(value)) +} + +// uintToUint32 converts the uint value to uint32 safely. +func uintToUint32(value uint) (result uint32, ok bool) { + if intBits == 32 { + return uint32ToUint32(uint32(value)) + } + return uint64ToUint32(uint64(value)) +} + +// uintToUint64 converts the uint value to uint64 safely. +func uintToUint64(value uint) (result uint64, ok bool) { + if intBits == 32 { + return uint32ToUint64(uint32(value)) + } + return uint64ToUint64(uint64(value)) +} + +// intToUint converts the int value to uint safely. +func intToUint(value int) (result uint, ok bool) { + return uint(value), value >= 0 +} + +// uintToInt converts the uint value to int safely. +func uintToInt(value uint) (result int, ok bool) { + return int(value), value <= math.MaxInt +} + +// float32ToInt8 converts the float32 value to int8 safely. +func float32ToInt8(value float32) (result int8, ok bool) { + return int8(value), value >= math.MinInt8 && value <= math.MaxInt8 +} + +func int8ToFloat32(value int8) (float32, bool) { + return float32(value), true +} + +// float32ToInt16 converts the float32 value to int16 safely. +func float32ToInt16(value float32) (result int16, ok bool) { + return int16(value), value >= math.MinInt16 && value <= math.MaxInt16 +} + +func int16ToFloat32(value int16) (float32, bool) { + return float32(value), true +} + +// float32ToInt32 converts the float32 value to int32 safely. +func float32ToInt32(value float32) (result int32, ok bool) { + return int32(value), value >= math.MinInt32 && value <= math.MaxInt32 +} + +func int32ToFloat32(value int32) (float32, bool) { + return float32(value), true +} + +// float32ToInt64 converts the float32 value to int64 safely. +func float32ToInt64(value float32) (result int64, ok bool) { + return int64(value), value >= math.MinInt64 && value <= math.MaxInt64 +} + +func int64ToFloat32(value int64) (float32, bool) { + return float32(value), true +} + +// float32ToInt converts the float32 value to int safely. +func float32ToInt(value float32) (result int, ok bool) { + return int(value), value >= math.MinInt && value <= math.MaxInt +} + +func intToFloat32(value int) (float32, bool) { + return float32(value), true +} + +// float32ToUint8 converts the float32 value to uint8 safely. +func float32ToUint8(value float32) (result uint8, ok bool) { + return uint8(value), value >= 0 && value <= math.MaxUint8 +} + +func uint8ToFloat32(value uint8) (float32, bool) { + return float32(value), true +} + +// float32ToUint16 converts the float32 value to uint16 safely. +func float32ToUint16(value float32) (result uint16, ok bool) { + return uint16(value), value >= 0 && value <= math.MaxUint16 +} + +func uint16ToFloat32(value uint16) (float32, bool) { + return float32(value), true +} + +// float32ToUint32 converts the float32 value to uint32 safely. +func float32ToUint32(value float32) (result uint32, ok bool) { + return uint32(value), value >= 0 && value <= math.MaxUint32 +} + +func uint32ToFloat32(value uint32) (float32, bool) { + return float32(value), true +} + +// float32ToUint64 converts the float32 value to uint64 safely. +func float32ToUint64(value float32) (result uint64, ok bool) { + return uint64(value), value >= 0 && value <= math.MaxUint64 +} + +func uint64ToFloat32(value uint64) (float32, bool) { + return float32(value), true +} + +// float32ToUint converts the float32 value to uint safely. +func float32ToUint(value float32) (result uint, ok bool) { + return uint(value), value >= 0 && value <= math.MaxUint +} + +func uintToFloat32(value uint) (float32, bool) { + return float32(value), true +} + +// float64ToInt8 converts the float64 value to int8 safely. +func float64ToInt8(value float64) (result int8, ok bool) { + return int8(value), value >= math.MinInt8 && value <= math.MaxInt8 +} + +func int8ToFloat64(value int8) (float64, bool) { + return float64(value), true +} + +// float64ToInt16 converts the float64 value to int16 safely. +func float64ToInt16(value float64) (result int16, ok bool) { + return int16(value), value >= math.MinInt16 && value <= math.MaxInt16 +} + +func int16ToFloat64(value int16) (float64, bool) { + return float64(value), true +} + +// float64ToInt32 converts the float64 value to int32 safely. +func float64ToInt32(value float64) (result int32, ok bool) { + return int32(value), value >= math.MinInt32 && value <= math.MaxInt32 +} + +func int32ToFloat64(value int32) (float64, bool) { + return float64(value), true +} + +// float64ToInt64 converts the float64 value to int64 safely. +func float64ToInt64(value float64) (result int64, ok bool) { + return int64(value), value >= math.MinInt64 && value <= math.MaxInt64 +} + +func int64ToFloat64(value int64) (float64, bool) { + return float64(value), true +} + +// float64ToInt converts the float64 value to int safely. +func float64ToInt(value float64) (result int, ok bool) { + return int(value), value >= math.MinInt && value <= math.MaxInt +} + +func intToFloat64(value int) (float64, bool) { + return float64(value), true +} + +// float64ToUint8 converts the float64 value to uint8 safely. +func float64ToUint8(value float64) (result uint8, ok bool) { + return uint8(value), value >= 0 && value <= math.MaxUint8 +} + +func uint8ToFloat64(value uint8) (float64, bool) { + return float64(value), true +} + +// float64ToUint16 converts the float64 value to uint16 safely. +func float64ToUint16(value float64) (result uint16, ok bool) { + return uint16(value), value >= 0 && value <= math.MaxUint16 +} + +func uint16ToFloat64(value uint16) (float64, bool) { + return float64(value), true +} + +// float64ToUint32 converts the float64 value to uint32 safely. +func float64ToUint32(value float64) (result uint32, ok bool) { + return uint32(value), value >= 0 && value <= math.MaxUint32 +} + +func uint32ToFloat64(value uint32) (float64, bool) { + return float64(value), true +} + +// float64ToUint64 converts the float64 value to uint64 safely. +func float64ToUint64(value float64) (result uint64, ok bool) { + return uint64(value), value >= 0 && value <= math.MaxUint64 +} + +func uint64ToFloat64(value uint64) (float64, bool) { + return float64(value), true +} + +// float64ToUint converts the float64 value to uint safely. +func float64ToUint(value float64) (result uint, ok bool) { + return uint(value), value >= 0 && value <= math.MaxUint +} + +func uintToFloat64(value uint) (float64, bool) { + return float64(value), true +} + +func float32ToFloat64(value float32) (float64, bool) { + return float64(value), true +} + +func float64ToFloat32(value float64) (float32, bool) { + return float32(value), value >= -math.MaxFloat32 && value <= math.MaxFloat32 +} diff --git a/vendor/github.com/chen3feng/safecast/check.sh b/vendor/github.com/chen3feng/safecast/check.sh new file mode 100644 index 0000000..211b409 --- /dev/null +++ b/vendor/github.com/chen3feng/safecast/check.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +set -e + +go build + +go test ./... -coverprofile=coverage.out + +golint + +gosec . + +go tool cover -html=coverage.out diff --git a/vendor/github.com/chen3feng/safecast/generate.py b/vendor/github.com/chen3feng/safecast/generate.py new file mode 100644 index 0000000..f1728e4 --- /dev/null +++ b/vendor/github.com/chen3feng/safecast/generate.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 + +from safecast import ALL_INT_BITS, ALL_FIXED_INT_BITS, to_camel_case + + +def generate_header(): + print('package safecast\n') + print('import "math"\n') + print('const intBits = 32 << (^uint(0) >> 63)\n') + + +def generate_fixed_int(from_type, from_bits, to_type, to_bits): + full_from_type = f'{from_type}{from_bits}' + full_to_type = f'{to_type}{to_bits}' + + no_overflow = f'{full_from_type}({full_to_type}(value)) == value' + + def same_signedness(): + if int(from_bits) <= int(to_bits): + return + return no_overflow + + def uint_to_int(): + if int(from_bits) < int(to_bits): + return + if int(from_bits) > int(to_bits): + return no_overflow + else: + return f'value <= math.Max{to_camel_case(full_to_type)}' + + def int_to_uint(): + cond = 'value >= 0' + if from_bits and to_bits: # intnn to uintxx + if int(from_bits) <= int(to_bits): + return cond + return f'{cond} && {no_overflow}' + + def range_chack(): + if from_type == to_type: + return same_signedness() + elif from_type == 'uint': + return uint_to_int() + else: # int to uint + return int_to_uint() + + print_function_header(full_from_type, full_to_type) + ok = range_chack() + print_function_footer(full_to_type, ok) + + +def print_function_header(full_from_type, full_to_type): + func_name = f'{full_from_type}To{to_camel_case(full_to_type)}' + print(f'// {func_name} converts the {full_from_type} value to {full_to_type} safely.') + print(f'func {func_name}(value {full_from_type}) (result {full_to_type}, ok bool) {{') + + +def print_function_footer(full_to_type, ok): + if not ok: + ok = 'true' + print(f'\treturn {full_to_type}(value), {ok}') + print('}\n') + + +def generate_int_convs(): + """Generate int conversion functions.""" + for from_type in ('int', 'uint'): + for from_bits in ALL_FIXED_INT_BITS: + for to_type in ('int', 'uint'): + for to_bits in ALL_FIXED_INT_BITS: + generate_fixed_int(from_type, from_bits, to_type, to_bits) + for from_type in ('int', 'uint'): + for from_bits in ALL_FIXED_INT_BITS: + for to_type in ('int', 'uint'): + generate_fixed_int_to_int(from_type, from_bits, to_type) + for from_type in ('int', 'uint'): + for to_type in ('int', 'uint'): + for to_bit in ALL_FIXED_INT_BITS: + generate_int_to_fixed_int(from_type, to_type, to_bit) + + print_function_header('int', 'uint') + print(''' return uint(value), value >= 0''') + print('}') + + print_function_header('uint', 'int') + print(''' return int(value), value <= math.MaxInt''') + print('}') + + +def generate_fixed_int_to_int(from_type, from_bits, to_type): + full_from_type = f'{from_type}{from_bits}' + print_function_header(full_from_type, to_type) + print(f'''\ + if intBits == 32 {{ + var r {to_type}32 + r, ok = {full_from_type}To{to_camel_case(to_type)}32(value) + result = {to_type}(r) + }} + var r {to_type}64 + r, ok = {full_from_type}To{to_camel_case(to_type)}64(value) + result = {to_type}(r) + return''') + print('}') + + +def generate_int_to_fixed_int(from_type, to_type, to_bits): + full_to_type = f'{to_type}{to_bits}' + print_function_header(from_type, full_to_type) + print(f'''\ + if intBits == 32 {{ + return {from_type}32To{to_camel_case(full_to_type)}({from_type}32(value)) + }} + return {from_type}64To{to_camel_case(full_to_type)}({from_type}64(value))''') + print('}') + + +def generate_float_convs(): + """Generate int conversion functions.""" + for float_bit in (32, 64): + for int_type in ('int', 'uint'): + for int_bits in ALL_INT_BITS: + generate_float_to_conv(float_bit, int_type, int_bits) + generate_to_float_conv(int_type, int_bits, float_bit) + + print(""" + func float32ToFloat64(value float32) (float64, bool) { + return float64(value), true + } + + func float64ToFloat32(value float64) (float32, bool) { + return float32(value), value >= -math.MaxFloat32 && value <= math.MaxFloat32 + } + """) + + +def generate_float_to_conv(from_bits, to_type, to_bits): + full_from_type = f'float{from_bits}' + full_to_type = f'{to_type}{to_bits}' + + def range_chack(): + min = f'math.Min{to_camel_case(full_to_type)}' if to_type == 'int' else '0' + cond = f'value >= {min}' + cond += f' && value <= math.Max{to_camel_case(full_to_type)}' + return cond + + func_name = f'float{from_bits}To{to_camel_case(to_type)}{to_bits}' + print(f'// {func_name} converts the {full_from_type} value to {full_to_type} safely.') + print(f'func {func_name}(value {full_from_type}) (result {full_to_type}, ok bool) {{') + ok = range_chack() or 'true' + print(f'\treturn {full_to_type}(value), {ok}') + print('}\n') + + +def generate_to_float_conv(from_type, from_bits, to_bits): + full_from_type = f'{from_type}{from_bits}' + full_to_type = f'float{to_bits}' + print(f'func {full_from_type}To{to_camel_case(full_to_type)}(value {full_from_type}) ({full_to_type}, bool) {{') + print(f'\treturn {full_to_type}(value), true') + print('}') + + +def main(): + generate_header() + generate_int_convs() + generate_float_convs() + + +main() diff --git a/vendor/github.com/chen3feng/safecast/generate_generic.py b/vendor/github.com/chen3feng/safecast/generate_generic.py new file mode 100644 index 0000000..0bb437c --- /dev/null +++ b/vendor/github.com/chen3feng/safecast/generate_generic.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 + +"""Generate the generic 'To' function.""" + +from safecast import ALL_INT_BITS, to_camel_case + + +def generate_header(): + """Generate file header.""" + print('package safecast\n') + print(""" +type numericType interface { + ~int | ~int8 | ~int16 | ~int32 | ~int64 | + ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | + ~float32 | ~float64 +}""" + ) + + +def generate_to_function(): + """Generate int conversion functions.""" + + print(""" +// To converts a numeric value from the FromType to the specified ToType type safely. +// result will always be same as the usual type cast (type(value)), +// but ok is false when overflow or underflow occured. +func To[ToType numericType, FromType numericType](value FromType)(result ToType, ok bool) { + ok = true + switch t := any(result).(type) {""") + for to_type in ('int', 'uint'): + for to_bits in ALL_INT_BITS: + print(f'\tcase {to_type}{to_bits}:') + print(f'\t\tt, ok = To{to_camel_case(to_type)}{to_bits}(value)') + print(f'\t\tresult = ToType(t)') + for to_bits in (32, 64): + print(f'\tcase float{to_bits}:') + print(f'\t\tt, ok = ToFloat{to_bits}(value)') + print(f'\t\tresult = ToType(t)') + + print('\t}\n\treturn result, ok') + print('}\n') + + for to_type in ('int', 'uint'): + for to_bits in ALL_INT_BITS: + generate_to_type(to_type, to_bits) + for to_bits in (32, 64): + generate_to_type('float', to_bits) + + +def generate_to_type(to_type, to_bits): + full_to_type = f'{to_type}{to_bits}' + funcname = f'To{to_camel_case(to_type)}{to_bits}' + print(f'// {funcname} converts value to {full_to_type} type safely.\n' + f'// result will always be same as the usual type cast({full_to_type}(value)),\n' + f'// but ok is false when overflow or underflow occured.' + ) + print( + f'func {funcname}[FromType numericType](value FromType) ({full_to_type}, bool) {{') + print('\tvar zero FromType // Use zero to any for type switch to avoid malloc') + print(f'\tswitch any(zero).(type) {{') + for from_type in ('int', 'uint'): + for from_bits in ALL_INT_BITS: + full_from_type = f'{from_type}{from_bits}' + generate_call(full_from_type, full_to_type) + + for from_bits in (32, 64): + full_from_type = f'float{from_bits}' + generate_call(full_from_type, full_to_type) + print(f'\t}}\n\treturn {full_to_type}(value), false') + print('}\n') + + +def generate_call(full_from_type, full_to_type): + if full_from_type == full_to_type: + print(f'\tcase {full_from_type}: return {full_to_type}(value), true') + else: + print( + f'\tcase {full_from_type}: return {full_from_type}To{to_camel_case(full_to_type)}({full_from_type}(value))') + + +def main(): + generate_header() + generate_to_function() + + +main() diff --git a/vendor/github.com/chen3feng/safecast/generate_test.py b/vendor/github.com/chen3feng/safecast/generate_test.py new file mode 100644 index 0000000..9979db3 --- /dev/null +++ b/vendor/github.com/chen3feng/safecast/generate_test.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 + + +from safecast import ALL_INT_BITS, to_camel_case + + +print(''' +package safecast_test + +import ( + _ "fmt" + "math" + "testing" + + "github.com/chen3feng/safecast" +) + +func expectFalse(t * testing.T, value bool) { + if value { + t.Helper() + t.Errorf("Expect false, got true") + } +} + +func expectTrue(t * testing.T, value bool) { + if !value { + t.Helper() + t.Errorf("Expect true, got false") + } +} +''') + +for from_bit in ALL_INT_BITS: + for to_bit in ALL_INT_BITS: + to_type = f'uint{to_bit}' + from_type = f'int{from_bit}' + print(f''' + func TestTo_{from_type}_to_{to_type}(t * testing.T) {{ + var i {from_type} = -1 + _, ok := safecast.To[{to_type}](i) + expectFalse(t, ok) + i = 1 + _, ok = safecast.To[{to_type}](i) + expectTrue(t, ok) + }}''') + +for from_bit in ALL_INT_BITS: + for to_bit in ALL_INT_BITS: + from_type = f'int{from_bit}' + to_type = f'int{to_bit}' + print(f''' + func TestTo_{from_type}_to_{to_type}(t * testing.T) {{ + var i {from_type} = -1 + _, ok := safecast.To[{to_type}](i) + expectTrue(t, ok) + i = 1 + _, ok = safecast.To[{to_type}](i) + expectTrue(t, ok) + }}''') + +for from_bit in ALL_INT_BITS: + for to_bit in ALL_INT_BITS: + from_type = f'uint{from_bit}' + to_type = f'int{to_bit}' + print(f''' + func TestTo_{from_type}_to_{to_type}(t * testing.T) {{ + var i {from_type} = 1 + _, ok := safecast.To[{to_type}](i) + expectTrue(t, ok)''') + if from_bit and to_bit and int(from_bit) >= int(to_bit): + print(f''' + i = math.MaxInt{to_bit} + 1 + _, ok = safecast.To[{to_type}](i) + expectFalse(t, ok)''') + print('}') + +for from_bit in ALL_INT_BITS: + for to_bit in ALL_INT_BITS: + from_type = f'uint{from_bit}' + to_type = f'uint{to_bit}' + print(f''' + func TestTo_{from_type}_to_{to_type}(t * testing.T) {{ + var i {from_type} = 1 + _, ok := safecast.To[{to_type}](i) + expectTrue(t, ok)''') + if from_bit and to_bit and int(from_bit) > int(to_bit) and int(to_bit) < 64: + print(f''' + i = math.MaxUint{to_bit} + 1 + _, ok = safecast.To[{to_type}](i) + expectFalse(t, ok)''') + print('}') + +for from_bit in ALL_INT_BITS: + for to_bit in (32, 64): + to_type = f'float{to_bit}' + for int_type in ('int', 'uint'): + from_type = f'{int_type}{from_bit}' + print(f''' + func TestTo_{from_type}_to_{to_type}(t * testing.T) {{ + var i {from_type} = 1 + _, ok := safecast.To[{to_type}](i) + expectTrue(t, ok)''') + print('}') + +for from_bit in (32, 64): + for to_bit in ALL_INT_BITS: + from_type = f'float{from_bit}' + to_type = f'int{to_bit}' + print(f''' + func TestTo_{from_type}_to_{to_type}(t * testing.T) {{ + var i {from_type} = 1 + _, ok := safecast.To[{to_type}](i) + expectTrue(t, ok) + i = math.Max{to_camel_case(to_type)} + i *= 2 + _, ok = safecast.To[{to_type}](i) + expectFalse(t, ok)''') + print('}') + +for from_bit in (32, 64): + for to_bit in ALL_INT_BITS: + from_type = f'float{from_bit}' + to_type = f'uint{to_bit}' + print(f''' + func TestTo_{from_type}_to_{to_type}(t * testing.T) {{ + var i {from_type} = -1 + _, ok := safecast.To[{to_type}](i) + expectFalse(t, ok) + i = math.Max{to_camel_case(to_type)} + i *= 2 + _, ok = safecast.To[{to_type}](i) + expectFalse(t, ok) + i = 1 + _, ok = safecast.To[{to_type}](i) + expectTrue(t, ok)''') + print('}') + +for from_bit in (32, 64): + for to_bit in (32, 64): + from_type = f'float{from_bit}' + to_type = f'float{to_bit}' + print(f''' + func TestTo_{from_type}_to_{to_type}(t * testing.T) {{ + var i {from_type} = -1 + _, ok := safecast.To[{to_type}](i) + expectTrue(t, ok) + i = 1 + _, ok = safecast.To[{to_type}](i) + expectTrue(t, ok)''') + print('}') diff --git a/vendor/github.com/chen3feng/safecast/generics.go b/vendor/github.com/chen3feng/safecast/generics.go new file mode 100644 index 0000000..04bb538 --- /dev/null +++ b/vendor/github.com/chen3feng/safecast/generics.go @@ -0,0 +1,461 @@ +package safecast + +type numericType interface { + ~int | ~int8 | ~int16 | ~int32 | ~int64 | + ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | + ~float32 | ~float64 +} + +// To converts a numeric value from the FromType to the specified ToType type safely. +// result will always be same as the usual type cast (type(value)), +// but ok is false when overflow or underflow occured. +func To[ToType numericType, FromType numericType](value FromType) (result ToType, ok bool) { + ok = true + switch t := any(result).(type) { + case int8: + t, ok = ToInt8(value) + result = ToType(t) + case int16: + t, ok = ToInt16(value) + result = ToType(t) + case int32: + t, ok = ToInt32(value) + result = ToType(t) + case int64: + t, ok = ToInt64(value) + result = ToType(t) + case int: + t, ok = ToInt(value) + result = ToType(t) + case uint8: + t, ok = ToUint8(value) + result = ToType(t) + case uint16: + t, ok = ToUint16(value) + result = ToType(t) + case uint32: + t, ok = ToUint32(value) + result = ToType(t) + case uint64: + t, ok = ToUint64(value) + result = ToType(t) + case uint: + t, ok = ToUint(value) + result = ToType(t) + case float32: + t, ok = ToFloat32(value) + result = ToType(t) + case float64: + t, ok = ToFloat64(value) + result = ToType(t) + } + return result, ok +} + +// ToInt8 converts value to int8 type safely. +// result will always be same as the usual type cast(int8(value)), +// but ok is false when overflow or underflow occured. +func ToInt8[FromType numericType](value FromType) (int8, bool) { + var zero FromType // Use zero to any for type switch to avoid malloc + switch any(zero).(type) { + case int8: + return int8(value), true + case int16: + return int16ToInt8(int16(value)) + case int32: + return int32ToInt8(int32(value)) + case int64: + return int64ToInt8(int64(value)) + case int: + return intToInt8(int(value)) + case uint8: + return uint8ToInt8(uint8(value)) + case uint16: + return uint16ToInt8(uint16(value)) + case uint32: + return uint32ToInt8(uint32(value)) + case uint64: + return uint64ToInt8(uint64(value)) + case uint: + return uintToInt8(uint(value)) + case float32: + return float32ToInt8(float32(value)) + case float64: + return float64ToInt8(float64(value)) + } + return int8(value), false +} + +// ToInt16 converts value to int16 type safely. +// result will always be same as the usual type cast(int16(value)), +// but ok is false when overflow or underflow occured. +func ToInt16[FromType numericType](value FromType) (int16, bool) { + var zero FromType // Use zero to any for type switch to avoid malloc + switch any(zero).(type) { + case int8: + return int8ToInt16(int8(value)) + case int16: + return int16(value), true + case int32: + return int32ToInt16(int32(value)) + case int64: + return int64ToInt16(int64(value)) + case int: + return intToInt16(int(value)) + case uint8: + return uint8ToInt16(uint8(value)) + case uint16: + return uint16ToInt16(uint16(value)) + case uint32: + return uint32ToInt16(uint32(value)) + case uint64: + return uint64ToInt16(uint64(value)) + case uint: + return uintToInt16(uint(value)) + case float32: + return float32ToInt16(float32(value)) + case float64: + return float64ToInt16(float64(value)) + } + return int16(value), false +} + +// ToInt32 converts value to int32 type safely. +// result will always be same as the usual type cast(int32(value)), +// but ok is false when overflow or underflow occured. +func ToInt32[FromType numericType](value FromType) (int32, bool) { + var zero FromType // Use zero to any for type switch to avoid malloc + switch any(zero).(type) { + case int8: + return int8ToInt32(int8(value)) + case int16: + return int16ToInt32(int16(value)) + case int32: + return int32(value), true + case int64: + return int64ToInt32(int64(value)) + case int: + return intToInt32(int(value)) + case uint8: + return uint8ToInt32(uint8(value)) + case uint16: + return uint16ToInt32(uint16(value)) + case uint32: + return uint32ToInt32(uint32(value)) + case uint64: + return uint64ToInt32(uint64(value)) + case uint: + return uintToInt32(uint(value)) + case float32: + return float32ToInt32(float32(value)) + case float64: + return float64ToInt32(float64(value)) + } + return int32(value), false +} + +// ToInt64 converts value to int64 type safely. +// result will always be same as the usual type cast(int64(value)), +// but ok is false when overflow or underflow occured. +func ToInt64[FromType numericType](value FromType) (int64, bool) { + var zero FromType // Use zero to any for type switch to avoid malloc + switch any(zero).(type) { + case int8: + return int8ToInt64(int8(value)) + case int16: + return int16ToInt64(int16(value)) + case int32: + return int32ToInt64(int32(value)) + case int64: + return int64(value), true + case int: + return intToInt64(int(value)) + case uint8: + return uint8ToInt64(uint8(value)) + case uint16: + return uint16ToInt64(uint16(value)) + case uint32: + return uint32ToInt64(uint32(value)) + case uint64: + return uint64ToInt64(uint64(value)) + case uint: + return uintToInt64(uint(value)) + case float32: + return float32ToInt64(float32(value)) + case float64: + return float64ToInt64(float64(value)) + } + return int64(value), false +} + +// ToInt converts value to int type safely. +// result will always be same as the usual type cast(int(value)), +// but ok is false when overflow or underflow occured. +func ToInt[FromType numericType](value FromType) (int, bool) { + var zero FromType // Use zero to any for type switch to avoid malloc + switch any(zero).(type) { + case int8: + return int8ToInt(int8(value)) + case int16: + return int16ToInt(int16(value)) + case int32: + return int32ToInt(int32(value)) + case int64: + return int64ToInt(int64(value)) + case int: + return int(value), true + case uint8: + return uint8ToInt(uint8(value)) + case uint16: + return uint16ToInt(uint16(value)) + case uint32: + return uint32ToInt(uint32(value)) + case uint64: + return uint64ToInt(uint64(value)) + case uint: + return uintToInt(uint(value)) + case float32: + return float32ToInt(float32(value)) + case float64: + return float64ToInt(float64(value)) + } + return int(value), false +} + +// ToUint8 converts value to uint8 type safely. +// result will always be same as the usual type cast(uint8(value)), +// but ok is false when overflow or underflow occured. +func ToUint8[FromType numericType](value FromType) (uint8, bool) { + var zero FromType // Use zero to any for type switch to avoid malloc + switch any(zero).(type) { + case int8: + return int8ToUint8(int8(value)) + case int16: + return int16ToUint8(int16(value)) + case int32: + return int32ToUint8(int32(value)) + case int64: + return int64ToUint8(int64(value)) + case int: + return intToUint8(int(value)) + case uint8: + return uint8(value), true + case uint16: + return uint16ToUint8(uint16(value)) + case uint32: + return uint32ToUint8(uint32(value)) + case uint64: + return uint64ToUint8(uint64(value)) + case uint: + return uintToUint8(uint(value)) + case float32: + return float32ToUint8(float32(value)) + case float64: + return float64ToUint8(float64(value)) + } + return uint8(value), false +} + +// ToUint16 converts value to uint16 type safely. +// result will always be same as the usual type cast(uint16(value)), +// but ok is false when overflow or underflow occured. +func ToUint16[FromType numericType](value FromType) (uint16, bool) { + var zero FromType // Use zero to any for type switch to avoid malloc + switch any(zero).(type) { + case int8: + return int8ToUint16(int8(value)) + case int16: + return int16ToUint16(int16(value)) + case int32: + return int32ToUint16(int32(value)) + case int64: + return int64ToUint16(int64(value)) + case int: + return intToUint16(int(value)) + case uint8: + return uint8ToUint16(uint8(value)) + case uint16: + return uint16(value), true + case uint32: + return uint32ToUint16(uint32(value)) + case uint64: + return uint64ToUint16(uint64(value)) + case uint: + return uintToUint16(uint(value)) + case float32: + return float32ToUint16(float32(value)) + case float64: + return float64ToUint16(float64(value)) + } + return uint16(value), false +} + +// ToUint32 converts value to uint32 type safely. +// result will always be same as the usual type cast(uint32(value)), +// but ok is false when overflow or underflow occured. +func ToUint32[FromType numericType](value FromType) (uint32, bool) { + var zero FromType // Use zero to any for type switch to avoid malloc + switch any(zero).(type) { + case int8: + return int8ToUint32(int8(value)) + case int16: + return int16ToUint32(int16(value)) + case int32: + return int32ToUint32(int32(value)) + case int64: + return int64ToUint32(int64(value)) + case int: + return intToUint32(int(value)) + case uint8: + return uint8ToUint32(uint8(value)) + case uint16: + return uint16ToUint32(uint16(value)) + case uint32: + return uint32(value), true + case uint64: + return uint64ToUint32(uint64(value)) + case uint: + return uintToUint32(uint(value)) + case float32: + return float32ToUint32(float32(value)) + case float64: + return float64ToUint32(float64(value)) + } + return uint32(value), false +} + +// ToUint64 converts value to uint64 type safely. +// result will always be same as the usual type cast(uint64(value)), +// but ok is false when overflow or underflow occured. +func ToUint64[FromType numericType](value FromType) (uint64, bool) { + var zero FromType // Use zero to any for type switch to avoid malloc + switch any(zero).(type) { + case int8: + return int8ToUint64(int8(value)) + case int16: + return int16ToUint64(int16(value)) + case int32: + return int32ToUint64(int32(value)) + case int64: + return int64ToUint64(int64(value)) + case int: + return intToUint64(int(value)) + case uint8: + return uint8ToUint64(uint8(value)) + case uint16: + return uint16ToUint64(uint16(value)) + case uint32: + return uint32ToUint64(uint32(value)) + case uint64: + return uint64(value), true + case uint: + return uintToUint64(uint(value)) + case float32: + return float32ToUint64(float32(value)) + case float64: + return float64ToUint64(float64(value)) + } + return uint64(value), false +} + +// ToUint converts value to uint type safely. +// result will always be same as the usual type cast(uint(value)), +// but ok is false when overflow or underflow occured. +func ToUint[FromType numericType](value FromType) (uint, bool) { + var zero FromType // Use zero to any for type switch to avoid malloc + switch any(zero).(type) { + case int8: + return int8ToUint(int8(value)) + case int16: + return int16ToUint(int16(value)) + case int32: + return int32ToUint(int32(value)) + case int64: + return int64ToUint(int64(value)) + case int: + return intToUint(int(value)) + case uint8: + return uint8ToUint(uint8(value)) + case uint16: + return uint16ToUint(uint16(value)) + case uint32: + return uint32ToUint(uint32(value)) + case uint64: + return uint64ToUint(uint64(value)) + case uint: + return uint(value), true + case float32: + return float32ToUint(float32(value)) + case float64: + return float64ToUint(float64(value)) + } + return uint(value), false +} + +// ToFloat32 converts value to float32 type safely. +// result will always be same as the usual type cast(float32(value)), +// but ok is false when overflow or underflow occured. +func ToFloat32[FromType numericType](value FromType) (float32, bool) { + var zero FromType // Use zero to any for type switch to avoid malloc + switch any(zero).(type) { + case int8: + return int8ToFloat32(int8(value)) + case int16: + return int16ToFloat32(int16(value)) + case int32: + return int32ToFloat32(int32(value)) + case int64: + return int64ToFloat32(int64(value)) + case int: + return intToFloat32(int(value)) + case uint8: + return uint8ToFloat32(uint8(value)) + case uint16: + return uint16ToFloat32(uint16(value)) + case uint32: + return uint32ToFloat32(uint32(value)) + case uint64: + return uint64ToFloat32(uint64(value)) + case uint: + return uintToFloat32(uint(value)) + case float32: + return float32(value), true + case float64: + return float64ToFloat32(float64(value)) + } + return float32(value), false +} + +// ToFloat64 converts value to float64 type safely. +// result will always be same as the usual type cast(float64(value)), +// but ok is false when overflow or underflow occured. +func ToFloat64[FromType numericType](value FromType) (float64, bool) { + var zero FromType // Use zero to any for type switch to avoid malloc + switch any(zero).(type) { + case int8: + return int8ToFloat64(int8(value)) + case int16: + return int16ToFloat64(int16(value)) + case int32: + return int32ToFloat64(int32(value)) + case int64: + return int64ToFloat64(int64(value)) + case int: + return intToFloat64(int(value)) + case uint8: + return uint8ToFloat64(uint8(value)) + case uint16: + return uint16ToFloat64(uint16(value)) + case uint32: + return uint32ToFloat64(uint32(value)) + case uint64: + return uint64ToFloat64(uint64(value)) + case uint: + return uintToFloat64(uint(value)) + case float32: + return float32ToFloat64(float32(value)) + case float64: + return float64(value), true + } + return float64(value), false +} diff --git a/vendor/github.com/chen3feng/safecast/safecast.go b/vendor/github.com/chen3feng/safecast/safecast.go new file mode 100644 index 0000000..c86180d --- /dev/null +++ b/vendor/github.com/chen3feng/safecast/safecast.go @@ -0,0 +1,9 @@ +// Package safecast provide a safe way to cast a numeric value from type A to type B, +// with overflow and underflow check. +package safecast + +// All the code are generated. + +//go:generate sh -c "python3 generate.py | gofmt > casts.go" +//go:generate sh -c "python3 generate_generic.py | gofmt > generics.go" +//go:generate sh -c "python3 generate_test.py | gofmt > safecast_test.go" diff --git a/vendor/github.com/chen3feng/safecast/safecast.py b/vendor/github.com/chen3feng/safecast/safecast.py new file mode 100644 index 0000000..ba9663d --- /dev/null +++ b/vendor/github.com/chen3feng/safecast/safecast.py @@ -0,0 +1,6 @@ +ALL_FIXED_INT_BITS = ('8', '16', '32', '64') +ALL_INT_BITS = ALL_FIXED_INT_BITS + ('',) + + +def to_camel_case(type): + return type[0].upper() + type[1:] diff --git a/vendor/github.com/chen3feng/safecast/updatedoc.sh b/vendor/github.com/chen3feng/safecast/updatedoc.sh new file mode 100644 index 0000000..a988f78 --- /dev/null +++ b/vendor/github.com/chen3feng/safecast/updatedoc.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +gomarkdoc -o README.md -e . +gomarkdoc -o README_zh.md -e . diff --git a/vendor/github.com/fatih/color/color.go b/vendor/github.com/fatih/color/color.go index 889f9e7..81094e8 100644 --- a/vendor/github.com/fatih/color/color.go +++ b/vendor/github.com/fatih/color/color.go @@ -65,6 +65,29 @@ const ( CrossedOut ) +const ( + ResetBold Attribute = iota + 22 + ResetItalic + ResetUnderline + ResetBlinking + _ + ResetReversed + ResetConcealed + ResetCrossedOut +) + +var mapResetAttributes map[Attribute]Attribute = map[Attribute]Attribute{ + Bold: ResetBold, + Faint: ResetBold, + Italic: ResetItalic, + Underline: ResetUnderline, + BlinkSlow: ResetBlinking, + BlinkRapid: ResetBlinking, + ReverseVideo: ResetReversed, + Concealed: ResetConcealed, + CrossedOut: ResetCrossedOut, +} + // Foreground text colors const ( FgBlack Attribute = iota + 30 @@ -246,10 +269,7 @@ func (c *Color) Printf(format string, a ...interface{}) (n int, err error) { // On Windows, users should wrap w with colorable.NewColorable() if w is of // type *os.File. func (c *Color) Fprintln(w io.Writer, a ...interface{}) (n int, err error) { - c.SetWriter(w) - defer c.UnsetWriter(w) - - return fmt.Fprintln(w, a...) + return fmt.Fprintln(w, c.wrap(sprintln(a...))) } // Println formats using the default formats for its operands and writes to @@ -258,10 +278,7 @@ func (c *Color) Fprintln(w io.Writer, a ...interface{}) (n int, err error) { // encountered. This is the standard fmt.Print() method wrapped with the given // color. func (c *Color) Println(a ...interface{}) (n int, err error) { - c.Set() - defer c.unset() - - return fmt.Fprintln(Output, a...) + return fmt.Fprintln(Output, c.wrap(sprintln(a...))) } // Sprint is just like Print, but returns a string instead of printing it. @@ -271,7 +288,7 @@ func (c *Color) Sprint(a ...interface{}) string { // Sprintln is just like Println, but returns a string instead of printing it. func (c *Color) Sprintln(a ...interface{}) string { - return c.wrap(fmt.Sprintln(a...)) + return c.wrap(sprintln(a...)) + "\n" } // Sprintf is just like Printf, but returns a string instead of printing it. @@ -353,7 +370,7 @@ func (c *Color) SprintfFunc() func(format string, a ...interface{}) string { // string. Windows users should use this in conjunction with color.Output. func (c *Color) SprintlnFunc() func(a ...interface{}) string { return func(a ...interface{}) string { - return c.wrap(fmt.Sprintln(a...)) + return c.wrap(sprintln(a...)) + "\n" } } @@ -383,7 +400,18 @@ func (c *Color) format() string { } func (c *Color) unformat() string { - return fmt.Sprintf("%s[%dm", escape, Reset) + //return fmt.Sprintf("%s[%dm", escape, Reset) + //for each element in sequence let's use the speficic reset escape, ou the generic one if not found + format := make([]string, len(c.params)) + for i, v := range c.params { + format[i] = strconv.Itoa(int(Reset)) + ra, ok := mapResetAttributes[v] + if ok { + format[i] = strconv.Itoa(int(ra)) + } + } + + return fmt.Sprintf("%s[%sm", escape, strings.Join(format, ";")) } // DisableColor disables the color output. Useful to not change any existing @@ -411,6 +439,12 @@ func (c *Color) isNoColorSet() bool { // Equals returns a boolean value indicating whether two colors are equal. func (c *Color) Equals(c2 *Color) bool { + if c == nil && c2 == nil { + return true + } + if c == nil || c2 == nil { + return false + } if len(c.params) != len(c2.params) { return false } @@ -614,3 +648,8 @@ func HiCyanString(format string, a ...interface{}) string { return colorString(f func HiWhiteString(format string, a ...interface{}) string { return colorString(format, FgHiWhite, a...) } + +// sprintln is a helper function to format a string with fmt.Sprintln and trim the trailing newline. +func sprintln(a ...interface{}) string { + return strings.TrimSuffix(fmt.Sprintln(a...), "\n") +} diff --git a/vendor/github.com/go-chi/chi/v5/CHANGELOG.md b/vendor/github.com/go-chi/chi/v5/CHANGELOG.md index 83d5aa2..25b45b9 100644 --- a/vendor/github.com/go-chi/chi/v5/CHANGELOG.md +++ b/vendor/github.com/go-chi/chi/v5/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## v5.0.12 (2024-02-16) + +- History of changes: see https://github.com/go-chi/chi/compare/v5.0.11...v5.0.12 + + ## v5.0.11 (2023-12-19) - History of changes: see https://github.com/go-chi/chi/compare/v5.0.10...v5.0.11 diff --git a/vendor/github.com/go-chi/chi/v5/CONTRIBUTING.md b/vendor/github.com/go-chi/chi/v5/CONTRIBUTING.md index c0ac2df..b4a6268 100644 --- a/vendor/github.com/go-chi/chi/v5/CONTRIBUTING.md +++ b/vendor/github.com/go-chi/chi/v5/CONTRIBUTING.md @@ -14,7 +14,7 @@ A typical workflow is: -1. [Fork the repository.][fork] [This tip maybe also helpful.][go-fork-tip] +1. [Fork the repository.][fork] 2. [Create a topic branch.][branch] 3. Add tests for your change. 4. Run `go test`. If your tests pass, return to the step 3. @@ -24,8 +24,8 @@ A typical workflow is: 8. [Submit a pull request.][pull-req] [go-install]: https://golang.org/doc/install -[go-fork-tip]: http://blog.campoy.cat/2014/03/github-and-go-forking-pull-requests-and.html -[fork]: https://help.github.com/articles/fork-a-repo -[branch]: http://learn.github.com/p/branching.html -[git-help]: https://guides.github.com -[pull-req]: https://help.github.com/articles/using-pull-requests +[fork]: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo +[branch]: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-branches +[git-help]: https://docs.github.com/en +[pull-req]: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests + diff --git a/vendor/github.com/go-chi/chi/v5/README.md b/vendor/github.com/go-chi/chi/v5/README.md index 4b1c99d..21cbb0f 100644 --- a/vendor/github.com/go-chi/chi/v5/README.md +++ b/vendor/github.com/go-chi/chi/v5/README.md @@ -354,6 +354,7 @@ with `net/http` can be used with chi's mux. | [RouteHeaders] | Route handling for request headers | | [SetHeader] | Short-hand middleware to set a response header key/value | | [StripSlashes] | Strip slashes on routing paths | +| [Sunset] | Sunset set Deprecation/Sunset header to response | | [Throttle] | Puts a ceiling on the number of concurrent requests | | [Timeout] | Signals to the request context when the timeout deadline is reached | | [URLFormat] | Parse extension from url and put it on request context | @@ -380,6 +381,7 @@ with `net/http` can be used with chi's mux. [RouteHeaders]: https://pkg.go.dev/github.com/go-chi/chi/middleware#RouteHeaders [SetHeader]: https://pkg.go.dev/github.com/go-chi/chi/middleware#SetHeader [StripSlashes]: https://pkg.go.dev/github.com/go-chi/chi/middleware#StripSlashes +[Sunset]: https://pkg.go.dev/github.com/go-chi/chi/v5/middleware#Sunset [Throttle]: https://pkg.go.dev/github.com/go-chi/chi/middleware#Throttle [ThrottleBacklog]: https://pkg.go.dev/github.com/go-chi/chi/middleware#ThrottleBacklog [ThrottleWithOpts]: https://pkg.go.dev/github.com/go-chi/chi/middleware#ThrottleWithOpts diff --git a/vendor/github.com/go-chi/chi/v5/context.go b/vendor/github.com/go-chi/chi/v5/context.go index 82e5f28..e2cd908 100644 --- a/vendor/github.com/go-chi/chi/v5/context.go +++ b/vendor/github.com/go-chi/chi/v5/context.go @@ -74,9 +74,8 @@ type Context struct { // patterns across a stack of sub-routers. RoutePatterns []string - // methodNotAllowed hint - methodNotAllowed bool methodsAllowed []methodTyp // allowed methods in case of a 405 + methodNotAllowed bool } // Reset a routing context to its initial state. diff --git a/vendor/github.com/go-chi/chi/v5/middleware/maybe.go b/vendor/github.com/go-chi/chi/v5/middleware/maybe.go index d8ca63b..eabca00 100644 --- a/vendor/github.com/go-chi/chi/v5/middleware/maybe.go +++ b/vendor/github.com/go-chi/chi/v5/middleware/maybe.go @@ -4,7 +4,7 @@ import "net/http" // Maybe middleware will allow you to change the flow of the middleware stack execution depending on return // value of maybeFn(request). This is useful for example if you'd like to skip a middleware handler if -// a request does not satisfied the maybeFn logic. +// a request does not satisfy the maybeFn logic. func Maybe(mw func(http.Handler) http.Handler, maybeFn func(r *http.Request) bool) func(http.Handler) http.Handler { return func(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { diff --git a/vendor/github.com/go-chi/chi/v5/middleware/realip.go b/vendor/github.com/go-chi/chi/v5/middleware/realip.go index 2c6b3b3..55c95a8 100644 --- a/vendor/github.com/go-chi/chi/v5/middleware/realip.go +++ b/vendor/github.com/go-chi/chi/v5/middleware/realip.go @@ -22,7 +22,7 @@ var xRealIP = http.CanonicalHeaderKey("X-Real-IP") // RemoteAddr will see the intended value. // // You should only use this middleware if you can trust the headers passed to -// you (in particular, the two headers this middleware uses), for example +// you (in particular, the three headers this middleware uses), for example // because you have placed a reverse proxy like HAProxy or nginx in front of // chi. If your reverse proxies are configured to pass along arbitrary header // values from the client, or if you use this middleware without a reverse diff --git a/vendor/github.com/go-chi/chi/v5/middleware/wrap_writer.go b/vendor/github.com/go-chi/chi/v5/middleware/wrap_writer.go index cf5c44d..bf27088 100644 --- a/vendor/github.com/go-chi/chi/v5/middleware/wrap_writer.go +++ b/vendor/github.com/go-chi/chi/v5/middleware/wrap_writer.go @@ -6,6 +6,7 @@ package middleware import ( "bufio" "io" + "io/ioutil" "net" "net/http" ) @@ -61,6 +62,11 @@ type WrapResponseWriter interface { Tee(io.Writer) // Unwrap returns the original proxied target. Unwrap() http.ResponseWriter + // Discard causes all writes to the original ResponseWriter be discarded, + // instead writing only to the tee'd writer if it's set. + // The caller is responsible for calling WriteHeader and Write on the + // original ResponseWriter once the processing is done. + Discard() } // basicWriter wraps a http.ResponseWriter that implements the minimal @@ -71,25 +77,34 @@ type basicWriter struct { code int bytes int tee io.Writer + discard bool } func (b *basicWriter) WriteHeader(code int) { if !b.wroteHeader { b.code = code b.wroteHeader = true - b.ResponseWriter.WriteHeader(code) + if !b.discard { + b.ResponseWriter.WriteHeader(code) + } } } -func (b *basicWriter) Write(buf []byte) (int, error) { +func (b *basicWriter) Write(buf []byte) (n int, err error) { b.maybeWriteHeader() - n, err := b.ResponseWriter.Write(buf) - if b.tee != nil { - _, err2 := b.tee.Write(buf[:n]) - // Prefer errors generated by the proxied writer. - if err == nil { - err = err2 + if !b.discard { + n, err = b.ResponseWriter.Write(buf) + if b.tee != nil { + _, err2 := b.tee.Write(buf[:n]) + // Prefer errors generated by the proxied writer. + if err == nil { + err = err2 + } } + } else if b.tee != nil { + n, err = b.tee.Write(buf) + } else { + n, err = ioutil.Discard.Write(buf) } b.bytes += n return n, err @@ -117,6 +132,10 @@ func (b *basicWriter) Unwrap() http.ResponseWriter { return b.ResponseWriter } +func (b *basicWriter) Discard() { + b.discard = true +} + // flushWriter ... type flushWriter struct { basicWriter diff --git a/vendor/github.com/go-chi/chi/v5/mux.go b/vendor/github.com/go-chi/chi/v5/mux.go index 735ab23..6dc1904 100644 --- a/vendor/github.com/go-chi/chi/v5/mux.go +++ b/vendor/github.com/go-chi/chi/v5/mux.go @@ -107,12 +107,24 @@ func (mx *Mux) Use(middlewares ...func(http.Handler) http.Handler) { // Handle adds the route `pattern` that matches any http method to // execute the `handler` http.Handler. func (mx *Mux) Handle(pattern string, handler http.Handler) { + parts := strings.SplitN(pattern, " ", 2) + if len(parts) == 2 { + mx.Method(parts[0], parts[1], handler) + return + } + mx.handle(mALL, pattern, handler) } // HandleFunc adds the route `pattern` that matches any http method to // execute the `handlerFn` http.HandlerFunc. func (mx *Mux) HandleFunc(pattern string, handlerFn http.HandlerFunc) { + parts := strings.SplitN(pattern, " ", 2) + if len(parts) == 2 { + mx.Method(parts[0], parts[1], handlerFn) + return + } + mx.handle(mALL, pattern, handlerFn) } @@ -440,6 +452,10 @@ func (mx *Mux) routeHTTP(w http.ResponseWriter, r *http.Request) { // Find the route if _, _, h := mx.tree.FindRoute(rctx, method, routePath); h != nil { + if supportsPathValue { + setPathValue(rctx, r) + } + h.ServeHTTP(w, r) return } diff --git a/vendor/github.com/go-chi/chi/v5/path_value.go b/vendor/github.com/go-chi/chi/v5/path_value.go new file mode 100644 index 0000000..7e78171 --- /dev/null +++ b/vendor/github.com/go-chi/chi/v5/path_value.go @@ -0,0 +1,20 @@ +//go:build go1.22 +// +build go1.22 + +package chi + +import "net/http" + +// supportsPathValue is true if the Go version is 1.22 and above. +// +// If this is true, `net/http.Request` has methods `SetPathValue` and `PathValue`. +const supportsPathValue = true + +// setPathValue sets the path values in the Request value +// based on the provided request context. +func setPathValue(rctx *Context, r *http.Request) { + for i, key := range rctx.URLParams.Keys { + value := rctx.URLParams.Values[i] + r.SetPathValue(key, value) + } +} diff --git a/vendor/github.com/go-chi/chi/v5/path_value_fallback.go b/vendor/github.com/go-chi/chi/v5/path_value_fallback.go new file mode 100644 index 0000000..f551781 --- /dev/null +++ b/vendor/github.com/go-chi/chi/v5/path_value_fallback.go @@ -0,0 +1,19 @@ +//go:build !go1.22 +// +build !go1.22 + +package chi + +import "net/http" + +// supportsPathValue is true if the Go version is 1.22 and above. +// +// If this is true, `net/http.Request` has methods `SetPathValue` and `PathValue`. +const supportsPathValue = false + +// setPathValue sets the path values in the Request value +// based on the provided request context. +// +// setPathValue is only supported in Go 1.22 and above so +// this is just a blank function so that it compiles. +func setPathValue(rctx *Context, r *http.Request) { +} diff --git a/vendor/github.com/go-task/slim-sprig/.editorconfig b/vendor/github.com/go-task/slim-sprig/v3/.editorconfig similarity index 100% rename from vendor/github.com/go-task/slim-sprig/.editorconfig rename to vendor/github.com/go-task/slim-sprig/v3/.editorconfig diff --git a/vendor/github.com/go-task/slim-sprig/.gitattributes b/vendor/github.com/go-task/slim-sprig/v3/.gitattributes similarity index 100% rename from vendor/github.com/go-task/slim-sprig/.gitattributes rename to vendor/github.com/go-task/slim-sprig/v3/.gitattributes diff --git a/vendor/github.com/go-task/slim-sprig/.gitignore b/vendor/github.com/go-task/slim-sprig/v3/.gitignore similarity index 100% rename from vendor/github.com/go-task/slim-sprig/.gitignore rename to vendor/github.com/go-task/slim-sprig/v3/.gitignore diff --git a/vendor/github.com/go-task/slim-sprig/CHANGELOG.md b/vendor/github.com/go-task/slim-sprig/v3/CHANGELOG.md similarity index 95% rename from vendor/github.com/go-task/slim-sprig/CHANGELOG.md rename to vendor/github.com/go-task/slim-sprig/v3/CHANGELOG.md index 61d8ebf..2ce45dd 100644 --- a/vendor/github.com/go-task/slim-sprig/CHANGELOG.md +++ b/vendor/github.com/go-task/slim-sprig/v3/CHANGELOG.md @@ -1,5 +1,24 @@ # Changelog +## Release 3.2.3 (2022-11-29) + +### Changed + +- Updated docs (thanks @book987 @aJetHorn @neelayu @pellizzetti @apricote @SaigyoujiYuyuko233 @AlekSi) +- #348: Updated huandu/xstrings which fixed a snake case bug (thanks @yxxhero) +- #353: Updated masterminds/semver which included bug fixes +- #354: Updated golang.org/x/crypto which included bug fixes + +## Release 3.2.2 (2021-02-04) + +This is a re-release of 3.2.1 to satisfy something with the Go module system. + +## Release 3.2.1 (2021-02-04) + +### Changed + +- Upgraded `Masterminds/goutils` to `v1.1.1`. see the [Security Advisory](https://github.com/Masterminds/goutils/security/advisories/GHSA-xg2h-wx96-xgxr) + ## Release 3.2.0 (2020-12-14) ### Added diff --git a/vendor/github.com/go-task/slim-sprig/LICENSE.txt b/vendor/github.com/go-task/slim-sprig/v3/LICENSE.txt similarity index 100% rename from vendor/github.com/go-task/slim-sprig/LICENSE.txt rename to vendor/github.com/go-task/slim-sprig/v3/LICENSE.txt diff --git a/vendor/github.com/go-task/slim-sprig/README.md b/vendor/github.com/go-task/slim-sprig/v3/README.md similarity index 88% rename from vendor/github.com/go-task/slim-sprig/README.md rename to vendor/github.com/go-task/slim-sprig/v3/README.md index 7257947..b5ab564 100644 --- a/vendor/github.com/go-task/slim-sprig/README.md +++ b/vendor/github.com/go-task/slim-sprig/v3/README.md @@ -1,4 +1,4 @@ -# Slim-Sprig: Template functions for Go templates [![GoDoc](https://godoc.org/github.com/go-task/slim-sprig?status.svg)](https://godoc.org/github.com/go-task/slim-sprig) [![Go Report Card](https://goreportcard.com/badge/github.com/go-task/slim-sprig)](https://goreportcard.com/report/github.com/go-task/slim-sprig) +# Slim-Sprig: Template functions for Go templates [![Go Reference](https://pkg.go.dev/badge/github.com/go-task/slim-sprig/v3.svg)](https://pkg.go.dev/github.com/go-task/slim-sprig/v3) Slim-Sprig is a fork of [Sprig](https://github.com/Masterminds/sprig), but with all functions that depend on external (non standard library) or crypto packages diff --git a/vendor/github.com/go-task/slim-sprig/Taskfile.yml b/vendor/github.com/go-task/slim-sprig/v3/Taskfile.yml similarity index 89% rename from vendor/github.com/go-task/slim-sprig/Taskfile.yml rename to vendor/github.com/go-task/slim-sprig/v3/Taskfile.yml index cdcfd22..8e6346b 100644 --- a/vendor/github.com/go-task/slim-sprig/Taskfile.yml +++ b/vendor/github.com/go-task/slim-sprig/v3/Taskfile.yml @@ -1,6 +1,6 @@ # https://taskfile.dev -version: '2' +version: '3' tasks: default: diff --git a/vendor/github.com/go-task/slim-sprig/crypto.go b/vendor/github.com/go-task/slim-sprig/v3/crypto.go similarity index 100% rename from vendor/github.com/go-task/slim-sprig/crypto.go rename to vendor/github.com/go-task/slim-sprig/v3/crypto.go diff --git a/vendor/github.com/go-task/slim-sprig/date.go b/vendor/github.com/go-task/slim-sprig/v3/date.go similarity index 100% rename from vendor/github.com/go-task/slim-sprig/date.go rename to vendor/github.com/go-task/slim-sprig/v3/date.go diff --git a/vendor/github.com/go-task/slim-sprig/defaults.go b/vendor/github.com/go-task/slim-sprig/v3/defaults.go similarity index 100% rename from vendor/github.com/go-task/slim-sprig/defaults.go rename to vendor/github.com/go-task/slim-sprig/v3/defaults.go diff --git a/vendor/github.com/go-task/slim-sprig/dict.go b/vendor/github.com/go-task/slim-sprig/v3/dict.go similarity index 100% rename from vendor/github.com/go-task/slim-sprig/dict.go rename to vendor/github.com/go-task/slim-sprig/v3/dict.go diff --git a/vendor/github.com/go-task/slim-sprig/doc.go b/vendor/github.com/go-task/slim-sprig/v3/doc.go similarity index 100% rename from vendor/github.com/go-task/slim-sprig/doc.go rename to vendor/github.com/go-task/slim-sprig/v3/doc.go diff --git a/vendor/github.com/go-task/slim-sprig/functions.go b/vendor/github.com/go-task/slim-sprig/v3/functions.go similarity index 100% rename from vendor/github.com/go-task/slim-sprig/functions.go rename to vendor/github.com/go-task/slim-sprig/v3/functions.go diff --git a/vendor/github.com/go-task/slim-sprig/list.go b/vendor/github.com/go-task/slim-sprig/v3/list.go similarity index 100% rename from vendor/github.com/go-task/slim-sprig/list.go rename to vendor/github.com/go-task/slim-sprig/v3/list.go diff --git a/vendor/github.com/go-task/slim-sprig/network.go b/vendor/github.com/go-task/slim-sprig/v3/network.go similarity index 100% rename from vendor/github.com/go-task/slim-sprig/network.go rename to vendor/github.com/go-task/slim-sprig/v3/network.go diff --git a/vendor/github.com/go-task/slim-sprig/numeric.go b/vendor/github.com/go-task/slim-sprig/v3/numeric.go similarity index 100% rename from vendor/github.com/go-task/slim-sprig/numeric.go rename to vendor/github.com/go-task/slim-sprig/v3/numeric.go diff --git a/vendor/github.com/go-task/slim-sprig/reflect.go b/vendor/github.com/go-task/slim-sprig/v3/reflect.go similarity index 100% rename from vendor/github.com/go-task/slim-sprig/reflect.go rename to vendor/github.com/go-task/slim-sprig/v3/reflect.go diff --git a/vendor/github.com/go-task/slim-sprig/regex.go b/vendor/github.com/go-task/slim-sprig/v3/regex.go similarity index 100% rename from vendor/github.com/go-task/slim-sprig/regex.go rename to vendor/github.com/go-task/slim-sprig/v3/regex.go diff --git a/vendor/github.com/go-task/slim-sprig/strings.go b/vendor/github.com/go-task/slim-sprig/v3/strings.go similarity index 100% rename from vendor/github.com/go-task/slim-sprig/strings.go rename to vendor/github.com/go-task/slim-sprig/v3/strings.go diff --git a/vendor/github.com/go-task/slim-sprig/url.go b/vendor/github.com/go-task/slim-sprig/v3/url.go similarity index 100% rename from vendor/github.com/go-task/slim-sprig/url.go rename to vendor/github.com/go-task/slim-sprig/v3/url.go diff --git a/vendor/github.com/gocarina/gocsv/decode.go b/vendor/github.com/gocarina/gocsv/decode.go index 24d49d0..e91c595 100644 --- a/vendor/github.com/gocarina/gocsv/decode.go +++ b/vendor/github.com/gocarina/gocsv/decode.go @@ -203,22 +203,22 @@ func readToWithErrorHandler(decoder Decoder, errHandler ErrorHandler, out interf objectIface := reflect.New(outValue.Index(i).Type()).Interface() outInner := createNewOutInner(outInnerWasPointer, outInnerType) for j, csvColumnContent := range csvRow { - if fieldInfo, ok := csvHeadersLabels[j]; ok { // Position found accordingly to header name - - if outInner.CanInterface() { - fieldTypeUnmarshallerWithKeys, withFieldsOK = objectIface.(TypeUnmarshalCSVWithFields) - if withFieldsOK { - if err := fieldTypeUnmarshallerWithKeys.UnmarshalCSVWithFields(fieldInfo.getFirstKey(), csvColumnContent); err != nil { - parseError := csv.ParseError{ - Line: i + 2, //add 2 to account for the header & 0-indexing of arrays - Column: j + 1, - Err: err, - } - return &parseError + if outInner.CanInterface() { + fieldTypeUnmarshallerWithKeys, withFieldsOK = objectIface.(TypeUnmarshalCSVWithFields) + if withFieldsOK { + if err := fieldTypeUnmarshallerWithKeys.UnmarshalCSVWithFields(headers[j], csvColumnContent); err != nil { + parseError := csv.ParseError{ + Line: i + 2, //add 2 to account for the header & 0-indexing of arrays + Column: j + 1, + Err: err, } - continue + return &parseError } + continue } + } + + if fieldInfo, ok := csvHeadersLabels[j]; ok { // Position found accordingly to header name value := csvColumnContent if value == "" { value = fieldInfo.defaultValue @@ -289,8 +289,13 @@ func readEach(decoder SimpleDecoder, errHandler ErrorHandler, c interface{}) err return err } } + + var withFieldsOK bool + var fieldTypeUnmarshallerWithKeys TypeUnmarshalCSVWithFields + i := 0 for { + objectIface := reflect.New(outValue.Type().Elem()).Interface() line, err := decoder.GetCSVRow() if err == io.EOF { break @@ -299,8 +304,31 @@ func readEach(decoder SimpleDecoder, errHandler ErrorHandler, c interface{}) err } outInner := createNewOutInner(outInnerWasPointer, outInnerType) for j, csvColumnContent := range line { + + if outInner.CanInterface() { + fieldTypeUnmarshallerWithKeys, withFieldsOK = objectIface.(TypeUnmarshalCSVWithFields) + if withFieldsOK { + if err := fieldTypeUnmarshallerWithKeys.UnmarshalCSVWithFields(headers[j], csvColumnContent); err != nil { + parseError := csv.ParseError{ + Line: i + 2, //add 2 to account for the header & 0-indexing of arrays + Column: j + 1, + Err: err, + } + return &parseError + } + + continue + } + } + if fieldInfo, ok := csvHeadersLabels[j]; ok { // Position found accordingly to header name - if err := setInnerField(&outInner, outInnerWasPointer, fieldInfo.IndexChain, csvColumnContent, fieldInfo.omitEmpty); err != nil { // Set field of struct + + value := csvColumnContent + if value == "" { + value = fieldInfo.defaultValue + } + + if err := setInnerField(&outInner, outInnerWasPointer, fieldInfo.IndexChain, value, fieldInfo.omitEmpty); err != nil { // Set field of struct parseError := &csv.ParseError{ Line: i + 2, //add 2 to account for the header & 0-indexing of arrays Column: j + 1, @@ -313,6 +341,12 @@ func readEach(decoder SimpleDecoder, errHandler ErrorHandler, c interface{}) err } } } + + if withFieldsOK { + reflectedObject := reflect.ValueOf(objectIface) + outInner = reflectedObject.Elem() + } + outValue.Send(outInner) i++ } diff --git a/vendor/github.com/google/pprof/profile/encode.go b/vendor/github.com/google/pprof/profile/encode.go index 182c926..8ce9d3c 100644 --- a/vendor/github.com/google/pprof/profile/encode.go +++ b/vendor/github.com/google/pprof/profile/encode.go @@ -122,6 +122,7 @@ func (p *Profile) preEncode() { } p.defaultSampleTypeX = addString(strings, p.DefaultSampleType) + p.docURLX = addString(strings, p.DocURL) p.stringTable = make([]string, len(strings)) for s, i := range strings { @@ -156,6 +157,7 @@ func (p *Profile) encode(b *buffer) { encodeInt64Opt(b, 12, p.Period) encodeInt64s(b, 13, p.commentX) encodeInt64(b, 14, p.defaultSampleTypeX) + encodeInt64Opt(b, 15, p.docURLX) } var profileDecoder = []decoder{ @@ -237,6 +239,8 @@ var profileDecoder = []decoder{ func(b *buffer, m message) error { return decodeInt64s(b, &m.(*Profile).commentX) }, // int64 defaultSampleType = 14 func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).defaultSampleTypeX) }, + // string doc_link = 15; + func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).docURLX) }, } // postDecode takes the unexported fields populated by decode (with @@ -384,6 +388,7 @@ func (p *Profile) postDecode() error { p.commentX = nil p.DefaultSampleType, err = getString(p.stringTable, &p.defaultSampleTypeX, err) + p.DocURL, err = getString(p.stringTable, &p.docURLX, err) p.stringTable = nil return err } @@ -530,6 +535,7 @@ func (p *Line) decoder() []decoder { func (p *Line) encode(b *buffer) { encodeUint64Opt(b, 1, p.functionIDX) encodeInt64Opt(b, 2, p.Line) + encodeInt64Opt(b, 3, p.Column) } var lineDecoder = []decoder{ @@ -538,6 +544,8 @@ var lineDecoder = []decoder{ func(b *buffer, m message) error { return decodeUint64(b, &m.(*Line).functionIDX) }, // optional int64 line = 2 func(b *buffer, m message) error { return decodeInt64(b, &m.(*Line).Line) }, + // optional int64 column = 3 + func(b *buffer, m message) error { return decodeInt64(b, &m.(*Line).Column) }, } func (p *Function) decoder() []decoder { diff --git a/vendor/github.com/google/pprof/profile/legacy_java_profile.go b/vendor/github.com/google/pprof/profile/legacy_java_profile.go index 91f45e5..4580bab 100644 --- a/vendor/github.com/google/pprof/profile/legacy_java_profile.go +++ b/vendor/github.com/google/pprof/profile/legacy_java_profile.go @@ -56,7 +56,7 @@ func javaCPUProfile(b []byte, period int64, parse func(b []byte) (uint64, []byte } // Strip out addresses for better merge. - if err = p.Aggregate(true, true, true, true, false); err != nil { + if err = p.Aggregate(true, true, true, true, false, false); err != nil { return nil, err } @@ -99,7 +99,7 @@ func parseJavaProfile(b []byte) (*Profile, error) { } // Strip out addresses for better merge. - if err = p.Aggregate(true, true, true, true, false); err != nil { + if err = p.Aggregate(true, true, true, true, false, false); err != nil { return nil, err } diff --git a/vendor/github.com/google/pprof/profile/merge.go b/vendor/github.com/google/pprof/profile/merge.go index 4b66282..ba4d746 100644 --- a/vendor/github.com/google/pprof/profile/merge.go +++ b/vendor/github.com/google/pprof/profile/merge.go @@ -326,12 +326,13 @@ func (l *Location) key() locationKey { key.addr -= l.Mapping.Start key.mappingID = l.Mapping.ID } - lines := make([]string, len(l.Line)*2) + lines := make([]string, len(l.Line)*3) for i, line := range l.Line { if line.Function != nil { lines[i*2] = strconv.FormatUint(line.Function.ID, 16) } lines[i*2+1] = strconv.FormatInt(line.Line, 16) + lines[i*2+2] = strconv.FormatInt(line.Column, 16) } key.lines = strings.Join(lines, "|") return key @@ -418,6 +419,7 @@ func (pm *profileMerger) mapLine(src Line) Line { ln := Line{ Function: pm.mapFunction(src.Function), Line: src.Line, + Column: src.Column, } return ln } @@ -474,6 +476,7 @@ func combineHeaders(srcs []*Profile) (*Profile, error) { var timeNanos, durationNanos, period int64 var comments []string seenComments := map[string]bool{} + var docURL string var defaultSampleType string for _, s := range srcs { if timeNanos == 0 || s.TimeNanos < timeNanos { @@ -492,6 +495,9 @@ func combineHeaders(srcs []*Profile) (*Profile, error) { if defaultSampleType == "" { defaultSampleType = s.DefaultSampleType } + if docURL == "" { + docURL = s.DocURL + } } p := &Profile{ @@ -507,6 +513,7 @@ func combineHeaders(srcs []*Profile) (*Profile, error) { Comments: comments, DefaultSampleType: defaultSampleType, + DocURL: docURL, } copy(p.SampleType, srcs[0].SampleType) return p, nil diff --git a/vendor/github.com/google/pprof/profile/profile.go b/vendor/github.com/google/pprof/profile/profile.go index 60ef7e9..f47a243 100644 --- a/vendor/github.com/google/pprof/profile/profile.go +++ b/vendor/github.com/google/pprof/profile/profile.go @@ -39,6 +39,7 @@ type Profile struct { Location []*Location Function []*Function Comments []string + DocURL string DropFrames string KeepFrames string @@ -53,6 +54,7 @@ type Profile struct { encodeMu sync.Mutex commentX []int64 + docURLX int64 dropFramesX int64 keepFramesX int64 stringTable []string @@ -145,6 +147,7 @@ type Location struct { type Line struct { Function *Function Line int64 + Column int64 functionIDX uint64 } @@ -436,7 +439,7 @@ func (p *Profile) CheckValid() error { // Aggregate merges the locations in the profile into equivalence // classes preserving the request attributes. It also updates the // samples to point to the merged locations. -func (p *Profile) Aggregate(inlineFrame, function, filename, linenumber, address bool) error { +func (p *Profile) Aggregate(inlineFrame, function, filename, linenumber, columnnumber, address bool) error { for _, m := range p.Mapping { m.HasInlineFrames = m.HasInlineFrames && inlineFrame m.HasFunctions = m.HasFunctions && function @@ -458,7 +461,7 @@ func (p *Profile) Aggregate(inlineFrame, function, filename, linenumber, address } // Aggregate locations - if !inlineFrame || !address || !linenumber { + if !inlineFrame || !address || !linenumber || !columnnumber { for _, l := range p.Location { if !inlineFrame && len(l.Line) > 1 { l.Line = l.Line[len(l.Line)-1:] @@ -466,6 +469,12 @@ func (p *Profile) Aggregate(inlineFrame, function, filename, linenumber, address if !linenumber { for i := range l.Line { l.Line[i].Line = 0 + l.Line[i].Column = 0 + } + } + if !columnnumber { + for i := range l.Line { + l.Line[i].Column = 0 } } if !address { @@ -548,6 +557,9 @@ func (p *Profile) String() string { for _, c := range p.Comments { ss = append(ss, "Comment: "+c) } + if url := p.DocURL; url != "" { + ss = append(ss, fmt.Sprintf("Doc: %s", url)) + } if pt := p.PeriodType; pt != nil { ss = append(ss, fmt.Sprintf("PeriodType: %s %s", pt.Type, pt.Unit)) } @@ -627,10 +639,11 @@ func (l *Location) string() string { for li := range l.Line { lnStr := "??" if fn := l.Line[li].Function; fn != nil { - lnStr = fmt.Sprintf("%s %s:%d s=%d", + lnStr = fmt.Sprintf("%s %s:%d:%d s=%d", fn.Name, fn.Filename, l.Line[li].Line, + l.Line[li].Column, fn.StartLine) if fn.Name != fn.SystemName { lnStr = lnStr + "(" + fn.SystemName + ")" @@ -836,10 +849,10 @@ func (p *Profile) HasFileLines() bool { // Unsymbolizable returns true if a mapping points to a binary for which // locations can't be symbolized in principle, at least now. Examples are -// "[vdso]", [vsyscall]" and some others, see the code. +// "[vdso]", "[vsyscall]" and some others, see the code. func (m *Mapping) Unsymbolizable() bool { name := filepath.Base(m.File) - return strings.HasPrefix(name, "[") || strings.HasPrefix(name, "linux-vdso") || strings.HasPrefix(m.File, "/dev/dri/") + return strings.HasPrefix(name, "[") || strings.HasPrefix(name, "linux-vdso") || strings.HasPrefix(m.File, "/dev/dri/") || m.File == "//anon" } // Copy makes a fully independent copy of a profile. diff --git a/vendor/github.com/google/uuid/CHANGELOG.md b/vendor/github.com/google/uuid/CHANGELOG.md index 2bd7866..7ec5ac7 100644 --- a/vendor/github.com/google/uuid/CHANGELOG.md +++ b/vendor/github.com/google/uuid/CHANGELOG.md @@ -1,5 +1,36 @@ # Changelog +## [1.6.0](https://github.com/google/uuid/compare/v1.5.0...v1.6.0) (2024-01-16) + + +### Features + +* add Max UUID constant ([#149](https://github.com/google/uuid/issues/149)) ([c58770e](https://github.com/google/uuid/commit/c58770eb495f55fe2ced6284f93c5158a62e53e3)) + + +### Bug Fixes + +* fix typo in version 7 uuid documentation ([#153](https://github.com/google/uuid/issues/153)) ([016b199](https://github.com/google/uuid/commit/016b199544692f745ffc8867b914129ecb47ef06)) +* Monotonicity in UUIDv7 ([#150](https://github.com/google/uuid/issues/150)) ([a2b2b32](https://github.com/google/uuid/commit/a2b2b32373ff0b1a312b7fdf6d38a977099698a6)) + +## [1.5.0](https://github.com/google/uuid/compare/v1.4.0...v1.5.0) (2023-12-12) + + +### Features + +* Validate UUID without creating new UUID ([#141](https://github.com/google/uuid/issues/141)) ([9ee7366](https://github.com/google/uuid/commit/9ee7366e66c9ad96bab89139418a713dc584ae29)) + +## [1.4.0](https://github.com/google/uuid/compare/v1.3.1...v1.4.0) (2023-10-26) + + +### Features + +* UUIDs slice type with Strings() convenience method ([#133](https://github.com/google/uuid/issues/133)) ([cd5fbbd](https://github.com/google/uuid/commit/cd5fbbdd02f3e3467ac18940e07e062be1f864b4)) + +### Fixes + +* Clarify that Parse's job is to parse but not necessarily validate strings. (Documents current behavior) + ## [1.3.1](https://github.com/google/uuid/compare/v1.3.0...v1.3.1) (2023-08-18) diff --git a/vendor/github.com/google/uuid/CONTRIBUTING.md b/vendor/github.com/google/uuid/CONTRIBUTING.md index 5566888..a502fdc 100644 --- a/vendor/github.com/google/uuid/CONTRIBUTING.md +++ b/vendor/github.com/google/uuid/CONTRIBUTING.md @@ -11,7 +11,7 @@ please explain why in the pull request description. ### Releasing -Commits that would precipitate a SemVer change, as desrcibed in the Conventional +Commits that would precipitate a SemVer change, as described in the Conventional Commits Specification, will trigger [`release-please`](https://github.com/google-github-actions/release-please-action) to create a release candidate pull request. Once submitted, `release-please` will create a release. diff --git a/vendor/github.com/google/uuid/hash.go b/vendor/github.com/google/uuid/hash.go index b404f4b..dc60082 100644 --- a/vendor/github.com/google/uuid/hash.go +++ b/vendor/github.com/google/uuid/hash.go @@ -17,6 +17,12 @@ var ( NameSpaceOID = Must(Parse("6ba7b812-9dad-11d1-80b4-00c04fd430c8")) NameSpaceX500 = Must(Parse("6ba7b814-9dad-11d1-80b4-00c04fd430c8")) Nil UUID // empty UUID, all zeros + + // The Max UUID is special form of UUID that is specified to have all 128 bits set to 1. + Max = UUID{ + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + } ) // NewHash returns a new UUID derived from the hash of space concatenated with diff --git a/vendor/github.com/google/uuid/time.go b/vendor/github.com/google/uuid/time.go index e6ef06c..c351129 100644 --- a/vendor/github.com/google/uuid/time.go +++ b/vendor/github.com/google/uuid/time.go @@ -108,12 +108,23 @@ func setClockSequence(seq int) { } // Time returns the time in 100s of nanoseconds since 15 Oct 1582 encoded in -// uuid. The time is only defined for version 1 and 2 UUIDs. +// uuid. The time is only defined for version 1, 2, 6 and 7 UUIDs. func (uuid UUID) Time() Time { - time := int64(binary.BigEndian.Uint32(uuid[0:4])) - time |= int64(binary.BigEndian.Uint16(uuid[4:6])) << 32 - time |= int64(binary.BigEndian.Uint16(uuid[6:8])&0xfff) << 48 - return Time(time) + var t Time + switch uuid.Version() { + case 6: + time := binary.BigEndian.Uint64(uuid[:8]) // Ignore uuid[6] version b0110 + t = Time(time) + case 7: + time := binary.BigEndian.Uint64(uuid[:8]) + t = Time((time>>16)*10000 + g1582ns100) + default: // forward compatible + time := int64(binary.BigEndian.Uint32(uuid[0:4])) + time |= int64(binary.BigEndian.Uint16(uuid[4:6])) << 32 + time |= int64(binary.BigEndian.Uint16(uuid[6:8])&0xfff) << 48 + t = Time(time) + } + return t } // ClockSequence returns the clock sequence encoded in uuid. diff --git a/vendor/github.com/google/uuid/uuid.go b/vendor/github.com/google/uuid/uuid.go index a56138c..5232b48 100644 --- a/vendor/github.com/google/uuid/uuid.go +++ b/vendor/github.com/google/uuid/uuid.go @@ -56,11 +56,15 @@ func IsInvalidLengthError(err error) bool { return ok } -// Parse decodes s into a UUID or returns an error. Both the standard UUID -// forms of xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx and -// urn:uuid:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx are decoded as well as the -// Microsoft encoding {xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx} and the raw hex -// encoding: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx. +// Parse decodes s into a UUID or returns an error if it cannot be parsed. Both +// the standard UUID forms defined in RFC 4122 +// (xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx and +// urn:uuid:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx) are decoded. In addition, +// Parse accepts non-standard strings such as the raw hex encoding +// xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx and 38 byte "Microsoft style" encodings, +// e.g. {xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx}. Only the middle 36 bytes are +// examined in the latter case. Parse should not be used to validate strings as +// it parses non-standard encodings as indicated above. func Parse(s string) (UUID, error) { var uuid UUID switch len(s) { @@ -182,6 +186,59 @@ func Must(uuid UUID, err error) UUID { return uuid } +// Validate returns an error if s is not a properly formatted UUID in one of the following formats: +// xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx +// urn:uuid:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx +// xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx +// {xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx} +// It returns an error if the format is invalid, otherwise nil. +func Validate(s string) error { + switch len(s) { + // Standard UUID format + case 36: + + // UUID with "urn:uuid:" prefix + case 36 + 9: + if !strings.EqualFold(s[:9], "urn:uuid:") { + return fmt.Errorf("invalid urn prefix: %q", s[:9]) + } + s = s[9:] + + // UUID enclosed in braces + case 36 + 2: + if s[0] != '{' || s[len(s)-1] != '}' { + return fmt.Errorf("invalid bracketed UUID format") + } + s = s[1 : len(s)-1] + + // UUID without hyphens + case 32: + for i := 0; i < len(s); i += 2 { + _, ok := xtob(s[i], s[i+1]) + if !ok { + return errors.New("invalid UUID format") + } + } + + default: + return invalidLengthError{len(s)} + } + + // Check for standard UUID format + if len(s) == 36 { + if s[8] != '-' || s[13] != '-' || s[18] != '-' || s[23] != '-' { + return errors.New("invalid UUID format") + } + for _, x := range []int{0, 2, 4, 6, 9, 11, 14, 16, 19, 21, 24, 26, 28, 30, 32, 34} { + if _, ok := xtob(s[x], s[x+1]); !ok { + return errors.New("invalid UUID format") + } + } + } + + return nil +} + // String returns the string form of uuid, xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx // , or "" if uuid is invalid. func (uuid UUID) String() string { @@ -294,3 +351,15 @@ func DisableRandPool() { poolMu.Lock() poolPos = randPoolSize } + +// UUIDs is a slice of UUID types. +type UUIDs []UUID + +// Strings returns a string slice containing the string form of each UUID in uuids. +func (uuids UUIDs) Strings() []string { + var uuidStrs = make([]string, len(uuids)) + for i, uuid := range uuids { + uuidStrs[i] = uuid.String() + } + return uuidStrs +} diff --git a/vendor/github.com/google/uuid/version6.go b/vendor/github.com/google/uuid/version6.go new file mode 100644 index 0000000..339a959 --- /dev/null +++ b/vendor/github.com/google/uuid/version6.go @@ -0,0 +1,56 @@ +// Copyright 2023 Google Inc. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package uuid + +import "encoding/binary" + +// UUID version 6 is a field-compatible version of UUIDv1, reordered for improved DB locality. +// It is expected that UUIDv6 will primarily be used in contexts where there are existing v1 UUIDs. +// Systems that do not involve legacy UUIDv1 SHOULD consider using UUIDv7 instead. +// +// see https://datatracker.ietf.org/doc/html/draft-peabody-dispatch-new-uuid-format-03#uuidv6 +// +// NewV6 returns a Version 6 UUID based on the current NodeID and clock +// sequence, and the current time. If the NodeID has not been set by SetNodeID +// or SetNodeInterface then it will be set automatically. If the NodeID cannot +// be set NewV6 set NodeID is random bits automatically . If clock sequence has not been set by +// SetClockSequence then it will be set automatically. If GetTime fails to +// return the current NewV6 returns Nil and an error. +func NewV6() (UUID, error) { + var uuid UUID + now, seq, err := GetTime() + if err != nil { + return uuid, err + } + + /* + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | time_high | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | time_mid | time_low_and_version | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + |clk_seq_hi_res | clk_seq_low | node (0-1) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | node (2-5) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ + + binary.BigEndian.PutUint64(uuid[0:], uint64(now)) + binary.BigEndian.PutUint16(uuid[8:], seq) + + uuid[6] = 0x60 | (uuid[6] & 0x0F) + uuid[8] = 0x80 | (uuid[8] & 0x3F) + + nodeMu.Lock() + if nodeID == zeroID { + setNodeInterface("") + } + copy(uuid[10:], nodeID[:]) + nodeMu.Unlock() + + return uuid, nil +} diff --git a/vendor/github.com/google/uuid/version7.go b/vendor/github.com/google/uuid/version7.go new file mode 100644 index 0000000..3167b64 --- /dev/null +++ b/vendor/github.com/google/uuid/version7.go @@ -0,0 +1,104 @@ +// Copyright 2023 Google Inc. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package uuid + +import ( + "io" +) + +// UUID version 7 features a time-ordered value field derived from the widely +// implemented and well known Unix Epoch timestamp source, +// the number of milliseconds seconds since midnight 1 Jan 1970 UTC, leap seconds excluded. +// As well as improved entropy characteristics over versions 1 or 6. +// +// see https://datatracker.ietf.org/doc/html/draft-peabody-dispatch-new-uuid-format-03#name-uuid-version-7 +// +// Implementations SHOULD utilize UUID version 7 over UUID version 1 and 6 if possible. +// +// NewV7 returns a Version 7 UUID based on the current time(Unix Epoch). +// Uses the randomness pool if it was enabled with EnableRandPool. +// On error, NewV7 returns Nil and an error +func NewV7() (UUID, error) { + uuid, err := NewRandom() + if err != nil { + return uuid, err + } + makeV7(uuid[:]) + return uuid, nil +} + +// NewV7FromReader returns a Version 7 UUID based on the current time(Unix Epoch). +// it use NewRandomFromReader fill random bits. +// On error, NewV7FromReader returns Nil and an error. +func NewV7FromReader(r io.Reader) (UUID, error) { + uuid, err := NewRandomFromReader(r) + if err != nil { + return uuid, err + } + + makeV7(uuid[:]) + return uuid, nil +} + +// makeV7 fill 48 bits time (uuid[0] - uuid[5]), set version b0111 (uuid[6]) +// uuid[8] already has the right version number (Variant is 10) +// see function NewV7 and NewV7FromReader +func makeV7(uuid []byte) { + /* + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | unix_ts_ms | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | unix_ts_ms | ver | rand_a (12 bit seq) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + |var| rand_b | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | rand_b | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ + _ = uuid[15] // bounds check + + t, s := getV7Time() + + uuid[0] = byte(t >> 40) + uuid[1] = byte(t >> 32) + uuid[2] = byte(t >> 24) + uuid[3] = byte(t >> 16) + uuid[4] = byte(t >> 8) + uuid[5] = byte(t) + + uuid[6] = 0x70 | (0x0F & byte(s>>8)) + uuid[7] = byte(s) +} + +// lastV7time is the last time we returned stored as: +// +// 52 bits of time in milliseconds since epoch +// 12 bits of (fractional nanoseconds) >> 8 +var lastV7time int64 + +const nanoPerMilli = 1000000 + +// getV7Time returns the time in milliseconds and nanoseconds / 256. +// The returned (milli << 12 + seq) is guarenteed to be greater than +// (milli << 12 + seq) returned by any previous call to getV7Time. +func getV7Time() (milli, seq int64) { + timeMu.Lock() + defer timeMu.Unlock() + + nano := timeNow().UnixNano() + milli = nano / nanoPerMilli + // Sequence number is between 0 and 3906 (nanoPerMilli>>8) + seq = (nano - milli*nanoPerMilli) >> 8 + now := milli<<12 + seq + if now <= lastV7time { + now = lastV7time + 1 + milli = now >> 12 + seq = now & 0xfff + } + lastV7time = now + return milli, seq +} diff --git a/vendor/github.com/hashicorp/yamux/LICENSE b/vendor/github.com/hashicorp/yamux/LICENSE index f0e5c79..b7df72e 100644 --- a/vendor/github.com/hashicorp/yamux/LICENSE +++ b/vendor/github.com/hashicorp/yamux/LICENSE @@ -1,3 +1,5 @@ +Copyright (c) 2014 HashiCorp, Inc. + Mozilla Public License, version 2.0 1. Definitions diff --git a/vendor/github.com/hashicorp/yamux/mux.go b/vendor/github.com/hashicorp/yamux/mux.go index 0c3e67b..d3d5b3f 100644 --- a/vendor/github.com/hashicorp/yamux/mux.go +++ b/vendor/github.com/hashicorp/yamux/mux.go @@ -3,7 +3,6 @@ package yamux import ( "fmt" "io" - "log" "os" "time" ) @@ -51,7 +50,12 @@ type Config struct { // Logger is used to pass in the logger to be used. Either Logger or // LogOutput can be set, not both. - Logger *log.Logger + Logger Logger +} + +func (c *Config) Clone() *Config { + c2 := *c + return &c2 } // DefaultConfig is used to return a default configuration diff --git a/vendor/github.com/hashicorp/yamux/session.go b/vendor/github.com/hashicorp/yamux/session.go index 38fe3ed..c08c4da 100644 --- a/vendor/github.com/hashicorp/yamux/session.go +++ b/vendor/github.com/hashicorp/yamux/session.go @@ -3,6 +3,7 @@ package yamux import ( "bufio" "bytes" + "context" "fmt" "io" "io/ioutil" @@ -34,7 +35,7 @@ type Session struct { config *Config // logger is used for our logs - logger *log.Logger + logger Logger // conn is the underlying connection conn io.ReadWriteCloser @@ -250,6 +251,22 @@ func (s *Session) AcceptStream() (*Stream, error) { } } +// AcceptStream is used to block until the next available stream +// is ready to be accepted. +func (s *Session) AcceptStreamWithContext(ctx context.Context) (*Stream, error) { + select { + case <-ctx.Done(): + return nil, ctx.Err() + case stream := <-s.acceptCh: + if err := stream.sendWindowUpdate(); err != nil { + return nil, err + } + return stream, nil + case <-s.shutdownCh: + return nil, s.shutdownErr + } +} + // Close is used to close the session and all streams. // Attempts to send a GoAway before closing the connection. func (s *Session) Close() error { @@ -339,7 +356,7 @@ func (s *Session) Ping() (time.Duration, error) { } // Compute the RTT - return time.Now().Sub(start), nil + return time.Since(start), nil } // keepalive is a long running goroutine that periodically does diff --git a/vendor/github.com/hashicorp/yamux/spec.md b/vendor/github.com/hashicorp/yamux/spec.md index 183d797..cb98618 100644 --- a/vendor/github.com/hashicorp/yamux/spec.md +++ b/vendor/github.com/hashicorp/yamux/spec.md @@ -100,7 +100,7 @@ fire a request without waiting for the RTT of the ACK. This does introduce the possibility of a connection being rejected after data has been sent already. This is a slight semantic difference -from TCP, where the conection cannot be refused after it is opened. +from TCP, where the connection cannot be refused after it is opened. Clients should be prepared to handle this by checking for an error that indicates a RST was received. diff --git a/vendor/github.com/hashicorp/yamux/stream.go b/vendor/github.com/hashicorp/yamux/stream.go index 23d08fc..31168d9 100644 --- a/vendor/github.com/hashicorp/yamux/stream.go +++ b/vendor/github.com/hashicorp/yamux/stream.go @@ -95,10 +95,12 @@ func (s *Stream) StreamID() uint32 { func (s *Stream) Read(b []byte) (n int, err error) { defer asyncNotify(s.recvNotifyCh) START: + + // If the stream is closed and there's no data buffered, return EOF s.stateLock.Lock() switch s.state { case streamLocalClose: - fallthrough + // LocalClose only prohibits further local writes. Handle reads normally. case streamRemoteClose: fallthrough case streamClosed: @@ -138,19 +140,22 @@ WAIT: var timer *time.Timer readDeadline := s.readDeadline.Load().(time.Time) if !readDeadline.IsZero() { - delay := readDeadline.Sub(time.Now()) + delay := time.Until(readDeadline) timer = time.NewTimer(delay) timeout = timer.C } select { + case <-s.session.shutdownCh: case <-s.recvNotifyCh: - if timer != nil { - timer.Stop() - } - goto START case <-timeout: return 0, ErrTimeout } + if timer != nil { + if !timer.Stop() { + <-timeout + } + } + goto START } // Write is used to write to the stream @@ -219,18 +224,25 @@ START: WAIT: var timeout <-chan time.Time + var timer *time.Timer writeDeadline := s.writeDeadline.Load().(time.Time) if !writeDeadline.IsZero() { - delay := writeDeadline.Sub(time.Now()) - timeout = time.After(delay) + delay := time.Until(writeDeadline) + timer = time.NewTimer(delay) + timeout = timer.C } select { + case <-s.session.shutdownCh: case <-s.sendNotifyCh: - goto START case <-timeout: return 0, ErrTimeout } - return 0, nil + if timer != nil { + if !timer.Stop() { + <-timeout + } + } + goto START } // sendFlags determines any flags that are appropriate @@ -380,7 +392,7 @@ func (s *Stream) closeTimeout() { defer s.sendLock.Unlock() hdr := header(make([]byte, headerSize)) hdr.encode(typeWindowUpdate, flagRST, s.id, 0) - s.session.sendNoWait(hdr) + _ = s.session.sendNoWait(hdr) } // forceClose is used for when the session is exiting diff --git a/vendor/github.com/hashicorp/yamux/util.go b/vendor/github.com/hashicorp/yamux/util.go index 8a73e92..4462518 100644 --- a/vendor/github.com/hashicorp/yamux/util.go +++ b/vendor/github.com/hashicorp/yamux/util.go @@ -5,6 +5,13 @@ import ( "time" ) +// Logger is a abstract of *log.Logger +type Logger interface { + Print(v ...interface{}) + Printf(format string, v ...interface{}) + Println(v ...interface{}) +} + var ( timerPool = &sync.Pool{ New: func() interface{} { diff --git a/vendor/github.com/jackc/chunkreader/v2/.travis.yml b/vendor/github.com/jackc/chunkreader/v2/.travis.yml deleted file mode 100644 index e176228..0000000 --- a/vendor/github.com/jackc/chunkreader/v2/.travis.yml +++ /dev/null @@ -1,9 +0,0 @@ -language: go - -go: - - 1.x - - tip - -matrix: - allow_failures: - - go: tip diff --git a/vendor/github.com/jackc/chunkreader/v2/README.md b/vendor/github.com/jackc/chunkreader/v2/README.md deleted file mode 100644 index 01209bf..0000000 --- a/vendor/github.com/jackc/chunkreader/v2/README.md +++ /dev/null @@ -1,8 +0,0 @@ -[![](https://godoc.org/github.com/jackc/chunkreader?status.svg)](https://godoc.org/github.com/jackc/chunkreader) -[![Build Status](https://travis-ci.org/jackc/chunkreader.svg)](https://travis-ci.org/jackc/chunkreader) - -# chunkreader - -Package chunkreader provides an io.Reader wrapper that minimizes IO reads and memory allocations. - -Extracted from original implementation in https://github.com/jackc/pgx. diff --git a/vendor/github.com/jackc/chunkreader/v2/chunkreader.go b/vendor/github.com/jackc/chunkreader/v2/chunkreader.go deleted file mode 100644 index afea1c5..0000000 --- a/vendor/github.com/jackc/chunkreader/v2/chunkreader.go +++ /dev/null @@ -1,104 +0,0 @@ -// Package chunkreader provides an io.Reader wrapper that minimizes IO reads and memory allocations. -package chunkreader - -import ( - "io" -) - -// ChunkReader is a io.Reader wrapper that minimizes IO reads and memory allocations. It allocates memory in chunks and -// will read as much as will fit in the current buffer in a single call regardless of how large a read is actually -// requested. The memory returned via Next is owned by the caller. This avoids the need for an additional copy. -// -// The downside of this approach is that a large buffer can be pinned in memory even if only a small slice is -// referenced. For example, an entire 4096 byte block could be pinned in memory by even a 1 byte slice. In these rare -// cases it would be advantageous to copy the bytes to another slice. -type ChunkReader struct { - r io.Reader - - buf []byte - rp, wp int // buf read position and write position - - config Config -} - -// Config contains configuration parameters for ChunkReader. -type Config struct { - MinBufLen int // Minimum buffer length -} - -// New creates and returns a new ChunkReader for r with default configuration. -func New(r io.Reader) *ChunkReader { - cr, err := NewConfig(r, Config{}) - if err != nil { - panic("default config can't be bad") - } - - return cr -} - -// NewConfig creates and a new ChunkReader for r configured by config. -func NewConfig(r io.Reader, config Config) (*ChunkReader, error) { - if config.MinBufLen == 0 { - // By historical reasons Postgres currently has 8KB send buffer inside, - // so here we want to have at least the same size buffer. - // @see https://github.com/postgres/postgres/blob/249d64999615802752940e017ee5166e726bc7cd/src/backend/libpq/pqcomm.c#L134 - // @see https://www.postgresql.org/message-id/0cdc5485-cb3c-5e16-4a46-e3b2f7a41322%40ya.ru - config.MinBufLen = 8192 - } - - return &ChunkReader{ - r: r, - buf: make([]byte, config.MinBufLen), - config: config, - }, nil -} - -// Next returns buf filled with the next n bytes. The caller gains ownership of buf. It is not necessary to make a copy -// of buf. If an error occurs, buf will be nil. -func (r *ChunkReader) Next(n int) (buf []byte, err error) { - // n bytes already in buf - if (r.wp - r.rp) >= n { - buf = r.buf[r.rp : r.rp+n] - r.rp += n - return buf, err - } - - // available space in buf is less than n - if len(r.buf) < n { - r.copyBufContents(r.newBuf(n)) - } - - // buf is large enough, but need to shift filled area to start to make enough contiguous space - minReadCount := n - (r.wp - r.rp) - if (len(r.buf) - r.wp) < minReadCount { - newBuf := r.newBuf(n) - r.copyBufContents(newBuf) - } - - if err := r.appendAtLeast(minReadCount); err != nil { - return nil, err - } - - buf = r.buf[r.rp : r.rp+n] - r.rp += n - return buf, nil -} - -func (r *ChunkReader) appendAtLeast(fillLen int) error { - n, err := io.ReadAtLeast(r.r, r.buf[r.wp:], fillLen) - r.wp += n - return err -} - -func (r *ChunkReader) newBuf(size int) []byte { - if size < r.config.MinBufLen { - size = r.config.MinBufLen - } - return make([]byte, size) -} - -func (r *ChunkReader) copyBufContents(dest []byte) { - r.wp = copy(dest, r.buf[r.rp:r.wp]) - r.rp = 0 - r.buf = dest -} diff --git a/vendor/github.com/jackc/pgconn/.gitignore b/vendor/github.com/jackc/pgconn/.gitignore deleted file mode 100644 index e980f55..0000000 --- a/vendor/github.com/jackc/pgconn/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -.envrc -vendor/ -.vscode diff --git a/vendor/github.com/jackc/pgconn/CHANGELOG.md b/vendor/github.com/jackc/pgconn/CHANGELOG.md deleted file mode 100644 index 519996c..0000000 --- a/vendor/github.com/jackc/pgconn/CHANGELOG.md +++ /dev/null @@ -1,177 +0,0 @@ -# 1.14.3 (March 4, 2024) - -* Update golang.org/x/crypto and golang.org/x/text - -# 1.14.2 (March 4, 2024) - -* Fix CVE-2024-27304. SQL injection can occur if an attacker can cause a single query or bind message to exceed 4 GB in -size. An integer overflow in the calculated message size can cause the one large message to be sent as multiple messages -under the attacker's control. - -# 1.14.1 (July 19, 2023) - -* Fix: Enable failover efforts when pg_hba.conf disallows non-ssl connections (Brandon Kauffman) -* Fix: connect_timeout is not obeyed for sslmode=allow|prefer (smaher-edb) -* Optimize redundant pgpass parsing in case password is explicitly set (Aleksandr Alekseev) - -# 1.14.0 (February 11, 2023) - -* Fix: each connection attempt to new node gets own timeout (Nathan Giardina) -* Set SNI for SSL connections (Stas Kelvich) -* Fix: CopyFrom I/O race (Tommy Reilly) -* Minor dependency upgrades - -# 1.13.0 (August 6, 2022) - -* Add sslpassword support (Eric McCormack and yun.xu) -* Add prefer-standby target_session_attrs support (sergey.bashilov) -* Fix GSS ErrorResponse handling (Oliver Tan) - -# 1.12.1 (May 7, 2022) - -* Fix: setting krbspn and krbsrvname in connection string (sireax) -* Add support for Unix sockets on Windows (Eno Compton) -* Stop ignoring ErrorResponse during SCRAM auth (Rafi Shamim) - -# 1.12.0 (April 21, 2022) - -* Add pluggable GSSAPI support (Oliver Tan) -* Fix: Consider any "0A000" error a possible cached plan changed error due to locale -* Better match psql fallback behavior with multiple hosts - -# 1.11.0 (February 7, 2022) - -* Support port in ip from LookupFunc to override config (James Hartig) -* Fix TLS connection timeout (Blake Embrey) -* Add support for read-only, primary, standby, prefer-standby target_session_attributes (Oscar) -* Fix connect when receiving NoticeResponse - -# 1.10.1 (November 20, 2021) - -* Close without waiting for response (Kei Kamikawa) -* Save waiting for network round-trip in CopyFrom (Rueian) -* Fix concurrency issue with ContextWatcher -* LRU.Get always checks context for cancellation / expiration (Georges Varouchas) - -# 1.10.0 (July 24, 2021) - -* net.Timeout errors are no longer returned when a query is canceled via context. A wrapped context error is returned. - -# 1.9.0 (July 10, 2021) - -* pgconn.Timeout only is true for errors originating in pgconn (Michael Darr) -* Add defaults for sslcert, sslkey, and sslrootcert (Joshua Brindle) -* Solve issue with 'sslmode=verify-full' when there are multiple hosts (mgoddard) -* Fix default host when parsing URL without host but with port -* Allow dbname query parameter in URL conn string -* Update underlying dependencies - -# 1.8.1 (March 25, 2021) - -* Better connection string sanitization (ip.novikov) -* Use proper pgpass location on Windows (Moshe Katz) -* Use errors instead of golang.org/x/xerrors -* Resume fallback on server error in Connect (Andrey Borodin) - -# 1.8.0 (December 3, 2020) - -* Add StatementErrored method to stmtcache.Cache. This allows the cache to purge invalidated prepared statements. (Ethan Pailes) - -# 1.7.2 (November 3, 2020) - -* Fix data value slices into work buffer with capacities larger than length. - -# 1.7.1 (October 31, 2020) - -* Do not asyncClose after receiving FATAL error from PostgreSQL server - -# 1.7.0 (September 26, 2020) - -* Exec(Params|Prepared) return ResultReader with FieldDescriptions loaded -* Add ReceiveResults (Sebastiaan Mannem) -* Fix parsing DSN connection with bad backslash -* Add PgConn.CleanupDone so connection pools can determine when async close is complete - -# 1.6.4 (July 29, 2020) - -* Fix deadlock on error after CommandComplete but before ReadyForQuery -* Fix panic on parsing DSN with trailing '=' - -# 1.6.3 (July 22, 2020) - -* Fix error message after AppendCertsFromPEM failure (vahid-sohrabloo) - -# 1.6.2 (July 14, 2020) - -* Update pgservicefile library - -# 1.6.1 (June 27, 2020) - -* Update golang.org/x/crypto to latest -* Update golang.org/x/text to 0.3.3 -* Fix error handling for bad PGSERVICE definition -* Redact passwords in ParseConfig errors (Lukas Vogel) - -# 1.6.0 (June 6, 2020) - -* Fix panic when closing conn during cancellable query -* Fix behavior of sslmode=require with sslrootcert present (Petr Jediný) -* Fix field descriptions available after command concluded (Tobias Salzmann) -* Support connect_timeout (georgysavva) -* Handle IPv6 in connection URLs (Lukas Vogel) -* Fix ValidateConnect with cancelable context -* Improve CopyFrom performance -* Add Config.Copy (georgysavva) - -# 1.5.0 (March 30, 2020) - -* Update golang.org/x/crypto for security fix -* Implement "verify-ca" SSL mode (Greg Curtis) - -# 1.4.0 (March 7, 2020) - -* Fix ExecParams and ExecPrepared handling of empty query. -* Support reading config from PostgreSQL service files. - -# 1.3.2 (February 14, 2020) - -* Update chunkreader to v2.0.1 for optimized default buffer size. - -# 1.3.1 (February 5, 2020) - -* Fix CopyFrom deadlock when multiple NoticeResponse received during copy - -# 1.3.0 (January 23, 2020) - -* Add Hijack and Construct. -* Update pgproto3 to v2.0.1. - -# 1.2.1 (January 13, 2020) - -* Fix data race in context cancellation introduced in v1.2.0. - -# 1.2.0 (January 11, 2020) - -## Features - -* Add Insert(), Update(), Delete(), and Select() statement type query methods to CommandTag. -* Add PgError.SQLState method. This could be used for compatibility with other drivers and databases. - -## Performance - -* Improve performance when context.Background() is used. (bakape) -* CommandTag.RowsAffected is faster and does not allocate. - -## Fixes - -* Try to cancel any in-progress query when a conn is closed by ctx cancel. -* Handle NoticeResponse during CopyFrom. -* Ignore errors sending Terminate message while closing connection. This mimics the behavior of libpq PGfinish. - -# 1.1.0 (October 12, 2019) - -* Add PgConn.IsBusy() method. - -# 1.0.1 (September 19, 2019) - -* Fix statement cache not properly cleaning discarded statements. diff --git a/vendor/github.com/jackc/pgconn/LICENSE b/vendor/github.com/jackc/pgconn/LICENSE deleted file mode 100644 index aebadd6..0000000 --- a/vendor/github.com/jackc/pgconn/LICENSE +++ /dev/null @@ -1,22 +0,0 @@ -Copyright (c) 2019-2021 Jack Christensen - -MIT License - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/vendor/github.com/jackc/pgconn/README.md b/vendor/github.com/jackc/pgconn/README.md deleted file mode 100644 index 9af04fe..0000000 --- a/vendor/github.com/jackc/pgconn/README.md +++ /dev/null @@ -1,62 +0,0 @@ -[![](https://godoc.org/github.com/jackc/pgconn?status.svg)](https://godoc.org/github.com/jackc/pgconn) -![CI](https://github.com/jackc/pgconn/workflows/CI/badge.svg) - ---- - -This version is used with pgx `v4`. In pgx `v5` it is part of the https://github.com/jackc/pgx repository. - ---- - -# pgconn - -Package pgconn is a low-level PostgreSQL database driver. It operates at nearly the same level as the C library libpq. -It is primarily intended to serve as the foundation for higher level libraries such as https://github.com/jackc/pgx. -Applications should handle normal queries with a higher level library and only use pgconn directly when required for -low-level access to PostgreSQL functionality. - -## Example Usage - -```go -pgConn, err := pgconn.Connect(context.Background(), os.Getenv("DATABASE_URL")) -if err != nil { - log.Fatalln("pgconn failed to connect:", err) -} -defer pgConn.Close(context.Background()) - -result := pgConn.ExecParams(context.Background(), "SELECT email FROM users WHERE id=$1", [][]byte{[]byte("123")}, nil, nil, nil) -for result.NextRow() { - fmt.Println("User 123 has email:", string(result.Values()[0])) -} -_, err = result.Close() -if err != nil { - log.Fatalln("failed reading result:", err) -} -``` - -## Testing - -The pgconn tests require a PostgreSQL database. It will connect to the database specified in the `PGX_TEST_CONN_STRING` -environment variable. The `PGX_TEST_CONN_STRING` environment variable can be a URL or DSN. In addition, the standard `PG*` -environment variables will be respected. Consider using [direnv](https://github.com/direnv/direnv) to simplify -environment variable handling. - -### Example Test Environment - -Connect to your PostgreSQL server and run: - -``` -create database pgx_test; -``` - -Now you can run the tests: - -```bash -PGX_TEST_CONN_STRING="host=/var/run/postgresql dbname=pgx_test" go test ./... -``` - -### Connection and Authentication Tests - -Pgconn supports multiple connection types and means of authentication. These tests are optional. They -will only run if the appropriate environment variable is set. Run `go test -v | grep SKIP` to see if any tests are being -skipped. Most developers will not need to enable these tests. See `ci/setup_test.bash` for an example set up if you need change -authentication code. diff --git a/vendor/github.com/jackc/pgconn/doc.go b/vendor/github.com/jackc/pgconn/doc.go deleted file mode 100644 index cde58cd..0000000 --- a/vendor/github.com/jackc/pgconn/doc.go +++ /dev/null @@ -1,29 +0,0 @@ -// Package pgconn is a low-level PostgreSQL database driver. -/* -pgconn provides lower level access to a PostgreSQL connection than a database/sql or pgx connection. It operates at -nearly the same level is the C library libpq. - -Establishing a Connection - -Use Connect to establish a connection. It accepts a connection string in URL or DSN and will read the environment for -libpq style environment variables. - -Executing a Query - -ExecParams and ExecPrepared execute a single query. They return readers that iterate over each row. The Read method -reads all rows into memory. - -Executing Multiple Queries in a Single Round Trip - -Exec and ExecBatch can execute multiple queries in a single round trip. They return readers that iterate over each query -result. The ReadAll method reads all query results into memory. - -Context Support - -All potentially blocking operations take a context.Context. If a context is canceled while the method is in progress the -method immediately returns. In most circumstances, this will close the underlying connection. - -The CancelRequest method may be used to request the PostgreSQL server cancel an in-progress query without forcing the -client to abort. -*/ -package pgconn diff --git a/vendor/github.com/jackc/pgconn/stmtcache/lru.go b/vendor/github.com/jackc/pgconn/stmtcache/lru.go deleted file mode 100644 index f0fb53b..0000000 --- a/vendor/github.com/jackc/pgconn/stmtcache/lru.go +++ /dev/null @@ -1,169 +0,0 @@ -package stmtcache - -import ( - "container/list" - "context" - "fmt" - "sync/atomic" - - "github.com/jackc/pgconn" -) - -var lruCount uint64 - -// LRU implements Cache with a Least Recently Used (LRU) cache. -type LRU struct { - conn *pgconn.PgConn - mode int - cap int - prepareCount int - m map[string]*list.Element - l *list.List - psNamePrefix string - stmtsToClear []string -} - -// NewLRU creates a new LRU. mode is either ModePrepare or ModeDescribe. cap is the maximum size of the cache. -func NewLRU(conn *pgconn.PgConn, mode int, cap int) *LRU { - mustBeValidMode(mode) - mustBeValidCap(cap) - - n := atomic.AddUint64(&lruCount, 1) - - return &LRU{ - conn: conn, - mode: mode, - cap: cap, - m: make(map[string]*list.Element), - l: list.New(), - psNamePrefix: fmt.Sprintf("lrupsc_%d", n), - } -} - -// Get returns the prepared statement description for sql preparing or describing the sql on the server as needed. -func (c *LRU) Get(ctx context.Context, sql string) (*pgconn.StatementDescription, error) { - if ctx != context.Background() { - select { - case <-ctx.Done(): - return nil, ctx.Err() - default: - } - } - - // flush an outstanding bad statements - txStatus := c.conn.TxStatus() - if (txStatus == 'I' || txStatus == 'T') && len(c.stmtsToClear) > 0 { - for _, stmt := range c.stmtsToClear { - err := c.clearStmt(ctx, stmt) - if err != nil { - return nil, err - } - } - } - - if el, ok := c.m[sql]; ok { - c.l.MoveToFront(el) - return el.Value.(*pgconn.StatementDescription), nil - } - - if c.l.Len() == c.cap { - err := c.removeOldest(ctx) - if err != nil { - return nil, err - } - } - - psd, err := c.prepare(ctx, sql) - if err != nil { - return nil, err - } - - el := c.l.PushFront(psd) - c.m[sql] = el - - return psd, nil -} - -// Clear removes all entries in the cache. Any prepared statements will be deallocated from the PostgreSQL session. -func (c *LRU) Clear(ctx context.Context) error { - for c.l.Len() > 0 { - err := c.removeOldest(ctx) - if err != nil { - return err - } - } - - return nil -} - -func (c *LRU) StatementErrored(sql string, err error) { - pgErr, ok := err.(*pgconn.PgError) - if !ok { - return - } - - // https://github.com/jackc/pgx/issues/1162 - // - // We used to look for the message "cached plan must not change result type". However, that message can be localized. - // Unfortunately, error code "0A000" - "FEATURE NOT SUPPORTED" is used for many different errors and the only way to - // tell the difference is by the message. But all that happens is we clear a statement that we otherwise wouldn't - // have so it should be safe. - possibleInvalidCachedPlanError := pgErr.Code == "0A000" - if possibleInvalidCachedPlanError { - c.stmtsToClear = append(c.stmtsToClear, sql) - } -} - -func (c *LRU) clearStmt(ctx context.Context, sql string) error { - elem, inMap := c.m[sql] - if !inMap { - // The statement probably fell off the back of the list. In that case, we've - // ensured that it isn't in the cache, so we can declare victory. - return nil - } - - c.l.Remove(elem) - - psd := elem.Value.(*pgconn.StatementDescription) - delete(c.m, psd.SQL) - if c.mode == ModePrepare { - return c.conn.Exec(ctx, fmt.Sprintf("deallocate %s", psd.Name)).Close() - } - return nil -} - -// Len returns the number of cached prepared statement descriptions. -func (c *LRU) Len() int { - return c.l.Len() -} - -// Cap returns the maximum number of cached prepared statement descriptions. -func (c *LRU) Cap() int { - return c.cap -} - -// Mode returns the mode of the cache (ModePrepare or ModeDescribe) -func (c *LRU) Mode() int { - return c.mode -} - -func (c *LRU) prepare(ctx context.Context, sql string) (*pgconn.StatementDescription, error) { - var name string - if c.mode == ModePrepare { - name = fmt.Sprintf("%s_%d", c.psNamePrefix, c.prepareCount) - c.prepareCount += 1 - } - - return c.conn.Prepare(ctx, name, sql, nil) -} - -func (c *LRU) removeOldest(ctx context.Context) error { - oldest := c.l.Back() - c.l.Remove(oldest) - psd := oldest.Value.(*pgconn.StatementDescription) - delete(c.m, psd.SQL) - if c.mode == ModePrepare { - return c.conn.Exec(ctx, fmt.Sprintf("deallocate %s", psd.Name)).Close() - } - return nil -} diff --git a/vendor/github.com/jackc/pgconn/stmtcache/stmtcache.go b/vendor/github.com/jackc/pgconn/stmtcache/stmtcache.go deleted file mode 100644 index d083e1b..0000000 --- a/vendor/github.com/jackc/pgconn/stmtcache/stmtcache.go +++ /dev/null @@ -1,58 +0,0 @@ -// Package stmtcache is a cache that can be used to implement lazy prepared statements. -package stmtcache - -import ( - "context" - - "github.com/jackc/pgconn" -) - -const ( - ModePrepare = iota // Cache should prepare named statements. - ModeDescribe // Cache should prepare the anonymous prepared statement to only fetch the description of the statement. -) - -// Cache prepares and caches prepared statement descriptions. -type Cache interface { - // Get returns the prepared statement description for sql preparing or describing the sql on the server as needed. - Get(ctx context.Context, sql string) (*pgconn.StatementDescription, error) - - // Clear removes all entries in the cache. Any prepared statements will be deallocated from the PostgreSQL session. - Clear(ctx context.Context) error - - // StatementErrored informs the cache that the given statement resulted in an error when it - // was last used against the database. In some cases, this will cause the cache to maer that - // statement as bad. The bad statement will instead be flushed during the next call to Get - // that occurs outside of a failed transaction. - StatementErrored(sql string, err error) - - // Len returns the number of cached prepared statement descriptions. - Len() int - - // Cap returns the maximum number of cached prepared statement descriptions. - Cap() int - - // Mode returns the mode of the cache (ModePrepare or ModeDescribe) - Mode() int -} - -// New returns the preferred cache implementation for mode and cap. mode is either ModePrepare or ModeDescribe. cap is -// the maximum size of the cache. -func New(conn *pgconn.PgConn, mode int, cap int) Cache { - mustBeValidMode(mode) - mustBeValidCap(cap) - - return NewLRU(conn, mode, cap) -} - -func mustBeValidMode(mode int) { - if mode != ModePrepare && mode != ModeDescribe { - panic("mode must be ModePrepare or ModeDescribe") - } -} - -func mustBeValidCap(cap int) { - if cap < 1 { - panic("cache must have cap of >= 1") - } -} diff --git a/vendor/github.com/jackc/pgio/.travis.yml b/vendor/github.com/jackc/pgio/.travis.yml deleted file mode 100644 index e176228..0000000 --- a/vendor/github.com/jackc/pgio/.travis.yml +++ /dev/null @@ -1,9 +0,0 @@ -language: go - -go: - - 1.x - - tip - -matrix: - allow_failures: - - go: tip diff --git a/vendor/github.com/jackc/pgio/LICENSE b/vendor/github.com/jackc/pgio/LICENSE deleted file mode 100644 index c1c4f50..0000000 --- a/vendor/github.com/jackc/pgio/LICENSE +++ /dev/null @@ -1,22 +0,0 @@ -Copyright (c) 2019 Jack Christensen - -MIT License - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/vendor/github.com/jackc/pgio/README.md b/vendor/github.com/jackc/pgio/README.md deleted file mode 100644 index 1952ed8..0000000 --- a/vendor/github.com/jackc/pgio/README.md +++ /dev/null @@ -1,11 +0,0 @@ -[![](https://godoc.org/github.com/jackc/pgio?status.svg)](https://godoc.org/github.com/jackc/pgio) -[![Build Status](https://travis-ci.org/jackc/pgio.svg)](https://travis-ci.org/jackc/pgio) - -# pgio - -Package pgio is a low-level toolkit building messages in the PostgreSQL wire protocol. - -pgio provides functions for appending integers to a []byte while doing byte -order conversion. - -Extracted from original implementation in https://github.com/jackc/pgx. diff --git a/vendor/github.com/jackc/pgproto3/v2/.travis.yml b/vendor/github.com/jackc/pgproto3/v2/.travis.yml deleted file mode 100644 index e176228..0000000 --- a/vendor/github.com/jackc/pgproto3/v2/.travis.yml +++ /dev/null @@ -1,9 +0,0 @@ -language: go - -go: - - 1.x - - tip - -matrix: - allow_failures: - - go: tip diff --git a/vendor/github.com/jackc/pgproto3/v2/LICENSE b/vendor/github.com/jackc/pgproto3/v2/LICENSE deleted file mode 100644 index c1c4f50..0000000 --- a/vendor/github.com/jackc/pgproto3/v2/LICENSE +++ /dev/null @@ -1,22 +0,0 @@ -Copyright (c) 2019 Jack Christensen - -MIT License - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/vendor/github.com/jackc/pgproto3/v2/README.md b/vendor/github.com/jackc/pgproto3/v2/README.md deleted file mode 100644 index 77a3170..0000000 --- a/vendor/github.com/jackc/pgproto3/v2/README.md +++ /dev/null @@ -1,18 +0,0 @@ -[![](https://godoc.org/github.com/jackc/pgproto3?status.svg)](https://godoc.org/github.com/jackc/pgproto3) -[![Build Status](https://travis-ci.org/jackc/pgproto3.svg)](https://travis-ci.org/jackc/pgproto3) - ---- - -This version is used with pgx `v4`. In pgx `v5` it is part of the https://github.com/jackc/pgx repository. - ---- - -# pgproto3 - -Package pgproto3 is a encoder and decoder of the PostgreSQL wire protocol version 3. - -pgproto3 can be used as a foundation for PostgreSQL drivers, proxies, mock servers, load balancers and more. - -See example/pgfortune for a playful example of a fake PostgreSQL server. - -Extracted from original implementation in https://github.com/jackc/pgx. diff --git a/vendor/github.com/jackc/pgproto3/v2/chunkreader.go b/vendor/github.com/jackc/pgproto3/v2/chunkreader.go deleted file mode 100644 index 92206f3..0000000 --- a/vendor/github.com/jackc/pgproto3/v2/chunkreader.go +++ /dev/null @@ -1,19 +0,0 @@ -package pgproto3 - -import ( - "io" - - "github.com/jackc/chunkreader/v2" -) - -// ChunkReader is an interface to decouple github.com/jackc/chunkreader from this package. -type ChunkReader interface { - // Next returns buf filled with the next n bytes. If an error (including a partial read) occurs, - // buf must be nil. Next must preserve any partially read data. Next must not reuse buf. - Next(n int) (buf []byte, err error) -} - -// NewChunkReader creates and returns a new default ChunkReader. -func NewChunkReader(r io.Reader) ChunkReader { - return chunkreader.New(r) -} diff --git a/vendor/github.com/jackc/pgproto3/v2/doc.go b/vendor/github.com/jackc/pgproto3/v2/doc.go deleted file mode 100644 index 8226dc9..0000000 --- a/vendor/github.com/jackc/pgproto3/v2/doc.go +++ /dev/null @@ -1,4 +0,0 @@ -// Package pgproto3 is a encoder and decoder of the PostgreSQL wire protocol version 3. -// -// See https://www.postgresql.org/docs/current/protocol-message-formats.html for meanings of the different messages. -package pgproto3 diff --git a/vendor/github.com/jackc/pgproto3/v2/frontend.go b/vendor/github.com/jackc/pgproto3/v2/frontend.go deleted file mode 100644 index 623b0a9..0000000 --- a/vendor/github.com/jackc/pgproto3/v2/frontend.go +++ /dev/null @@ -1,210 +0,0 @@ -package pgproto3 - -import ( - "encoding/binary" - "errors" - "fmt" - "io" -) - -// Frontend acts as a client for the PostgreSQL wire protocol version 3. -type Frontend struct { - cr ChunkReader - w io.Writer - - // Backend message flyweights - authenticationOk AuthenticationOk - authenticationCleartextPassword AuthenticationCleartextPassword - authenticationMD5Password AuthenticationMD5Password - authenticationGSS AuthenticationGSS - authenticationGSSContinue AuthenticationGSSContinue - authenticationSASL AuthenticationSASL - authenticationSASLContinue AuthenticationSASLContinue - authenticationSASLFinal AuthenticationSASLFinal - backendKeyData BackendKeyData - bindComplete BindComplete - closeComplete CloseComplete - commandComplete CommandComplete - copyBothResponse CopyBothResponse - copyData CopyData - copyInResponse CopyInResponse - copyOutResponse CopyOutResponse - copyDone CopyDone - dataRow DataRow - emptyQueryResponse EmptyQueryResponse - errorResponse ErrorResponse - functionCallResponse FunctionCallResponse - noData NoData - noticeResponse NoticeResponse - notificationResponse NotificationResponse - parameterDescription ParameterDescription - parameterStatus ParameterStatus - parseComplete ParseComplete - readyForQuery ReadyForQuery - rowDescription RowDescription - portalSuspended PortalSuspended - - bodyLen int - msgType byte - partialMsg bool - authType uint32 -} - -// NewFrontend creates a new Frontend. -func NewFrontend(cr ChunkReader, w io.Writer) *Frontend { - return &Frontend{cr: cr, w: w} -} - -// Send sends a message to the backend. -func (f *Frontend) Send(msg FrontendMessage) error { - buf, err := msg.Encode(nil) - if err != nil { - return err - } - _, err = f.w.Write(buf) - return err -} - -func translateEOFtoErrUnexpectedEOF(err error) error { - if err == io.EOF { - return io.ErrUnexpectedEOF - } - return err -} - -// Receive receives a message from the backend. The returned message is only valid until the next call to Receive. -func (f *Frontend) Receive() (BackendMessage, error) { - if !f.partialMsg { - header, err := f.cr.Next(5) - if err != nil { - return nil, translateEOFtoErrUnexpectedEOF(err) - } - - f.msgType = header[0] - f.bodyLen = int(binary.BigEndian.Uint32(header[1:])) - 4 - f.partialMsg = true - if f.bodyLen < 0 { - return nil, errors.New("invalid message with negative body length received") - } - } - - msgBody, err := f.cr.Next(f.bodyLen) - if err != nil { - return nil, translateEOFtoErrUnexpectedEOF(err) - } - - f.partialMsg = false - - var msg BackendMessage - switch f.msgType { - case '1': - msg = &f.parseComplete - case '2': - msg = &f.bindComplete - case '3': - msg = &f.closeComplete - case 'A': - msg = &f.notificationResponse - case 'c': - msg = &f.copyDone - case 'C': - msg = &f.commandComplete - case 'd': - msg = &f.copyData - case 'D': - msg = &f.dataRow - case 'E': - msg = &f.errorResponse - case 'G': - msg = &f.copyInResponse - case 'H': - msg = &f.copyOutResponse - case 'I': - msg = &f.emptyQueryResponse - case 'K': - msg = &f.backendKeyData - case 'n': - msg = &f.noData - case 'N': - msg = &f.noticeResponse - case 'R': - var err error - msg, err = f.findAuthenticationMessageType(msgBody) - if err != nil { - return nil, err - } - case 's': - msg = &f.portalSuspended - case 'S': - msg = &f.parameterStatus - case 't': - msg = &f.parameterDescription - case 'T': - msg = &f.rowDescription - case 'V': - msg = &f.functionCallResponse - case 'W': - msg = &f.copyBothResponse - case 'Z': - msg = &f.readyForQuery - default: - return nil, fmt.Errorf("unknown message type: %c", f.msgType) - } - - err = msg.Decode(msgBody) - return msg, err -} - -// Authentication message type constants. -// See src/include/libpq/pqcomm.h for all -// constants. -const ( - AuthTypeOk = 0 - AuthTypeCleartextPassword = 3 - AuthTypeMD5Password = 5 - AuthTypeSCMCreds = 6 - AuthTypeGSS = 7 - AuthTypeGSSCont = 8 - AuthTypeSSPI = 9 - AuthTypeSASL = 10 - AuthTypeSASLContinue = 11 - AuthTypeSASLFinal = 12 -) - -func (f *Frontend) findAuthenticationMessageType(src []byte) (BackendMessage, error) { - if len(src) < 4 { - return nil, errors.New("authentication message too short") - } - f.authType = binary.BigEndian.Uint32(src[:4]) - - switch f.authType { - case AuthTypeOk: - return &f.authenticationOk, nil - case AuthTypeCleartextPassword: - return &f.authenticationCleartextPassword, nil - case AuthTypeMD5Password: - return &f.authenticationMD5Password, nil - case AuthTypeSCMCreds: - return nil, errors.New("AuthTypeSCMCreds is unimplemented") - case AuthTypeGSS: - return &f.authenticationGSS, nil - case AuthTypeGSSCont: - return &f.authenticationGSSContinue, nil - case AuthTypeSSPI: - return nil, errors.New("AuthTypeSSPI is unimplemented") - case AuthTypeSASL: - return &f.authenticationSASL, nil - case AuthTypeSASLContinue: - return &f.authenticationSASLContinue, nil - case AuthTypeSASLFinal: - return &f.authenticationSASLFinal, nil - default: - return nil, fmt.Errorf("unknown authentication type: %d", f.authType) - } -} - -// GetAuthType returns the authType used in the current state of the frontend. -// See SetAuthType for more information. -func (f *Frontend) GetAuthType() uint32 { - return f.authType -} diff --git a/vendor/github.com/jackc/pgservicefile/.travis.yml b/vendor/github.com/jackc/pgservicefile/.travis.yml deleted file mode 100644 index e176228..0000000 --- a/vendor/github.com/jackc/pgservicefile/.travis.yml +++ /dev/null @@ -1,9 +0,0 @@ -language: go - -go: - - 1.x - - tip - -matrix: - allow_failures: - - go: tip diff --git a/vendor/github.com/jackc/pgservicefile/README.md b/vendor/github.com/jackc/pgservicefile/README.md index e50ca12..2fc7e01 100644 --- a/vendor/github.com/jackc/pgservicefile/README.md +++ b/vendor/github.com/jackc/pgservicefile/README.md @@ -1,5 +1,6 @@ -[![](https://godoc.org/github.com/jackc/pgservicefile?status.svg)](https://godoc.org/github.com/jackc/pgservicefile) -[![Build Status](https://travis-ci.org/jackc/pgservicefile.svg)](https://travis-ci.org/jackc/pgservicefile) +[![Go Reference](https://pkg.go.dev/badge/github.com/jackc/pgservicefile.svg)](https://pkg.go.dev/github.com/jackc/pgservicefile) +[![Build Status](https://github.com/jackc/pgservicefile/actions/workflows/ci.yml/badge.svg)](https://github.com/jackc/pgservicefile/actions/workflows/ci.yml) + # pgservicefile diff --git a/vendor/github.com/jackc/pgservicefile/pgservicefile.go b/vendor/github.com/jackc/pgservicefile/pgservicefile.go index 797bbab..c62caa7 100644 --- a/vendor/github.com/jackc/pgservicefile/pgservicefile.go +++ b/vendor/github.com/jackc/pgservicefile/pgservicefile.go @@ -57,7 +57,7 @@ func ParseServicefile(r io.Reader) (*Servicefile, error) { } else if strings.HasPrefix(line, "[") && strings.HasSuffix(line, "]") { service = &Service{Name: line[1 : len(line)-1], Settings: make(map[string]string)} servicefile.Services = append(servicefile.Services, service) - } else { + } else if service != nil { parts := strings.SplitN(line, "=", 2) if len(parts) != 2 { return nil, fmt.Errorf("unable to parse line %d", lineNum) @@ -67,6 +67,8 @@ func ParseServicefile(r io.Reader) (*Servicefile, error) { value := strings.TrimSpace(parts[1]) service.Settings[key] = value + } else { + return nil, fmt.Errorf("line %d is not in a section", lineNum) } } diff --git a/vendor/github.com/jackc/pgtype/CHANGELOG.md b/vendor/github.com/jackc/pgtype/CHANGELOG.md deleted file mode 100644 index a362a1d..0000000 --- a/vendor/github.com/jackc/pgtype/CHANGELOG.md +++ /dev/null @@ -1,164 +0,0 @@ -# 1.14.0 (February 11, 2023) - -* Fix: BC timestamp text format support (jozeflami) -* Add Scanner and Valuer interfaces to CIDR (Yurii Popivniak) -* Fix crash when nilifying pointer to sql.Scanner - -# 1.13.0 (December 1, 2022) - -* Fix: Reset jsonb before unmarshal (Tomas Odinas) -* Fix: return correct zero value when UUID conversion fails (ndrpnt) -* Fix: EncodeText for Lseg includes [ and ] -* Support sql Value and Scan for custom date type (Hubert Krauze) -* Support Ltree binary encoding (AmineChikhaoui) -* Fix: dates with "BC" (jozeflami) - -# 1.12.0 (August 6, 2022) - -* Add JSONArray (Jakob Ackermann) -* Support Inet from fmt.Stringer and encoding.TextMarshaler (Ville Skyttä) -* Support UUID from fmt.Stringer interface (Lasse Hyldahl Jensen) -* Fix: shopspring-numeric extension does not panic on NaN -* Numeric can be assigned to string -* Fix: Do not send IPv4 networks as IPv4-mapped IPv6 (William Storey) -* Fix: PlanScan for interface{}(nil) (James Hartig) -* Fix: *sql.Scanner for NULL handling (James Hartig) -* Timestamp[tz].Set() supports string (Harmen) -* Fix: Hstore AssignTo with map of *string (Diego Becciolini) - -# 1.11.0 (April 21, 2022) - -* Add multirange for numeric, int4, and int8 (Vu) -* JSONBArray now supports json.RawMessage (Jens Emil Schulz Østergaard) -* Add RecordArray (WGH) -* Add UnmarshalJSON to pgtype.Int2 -* Hstore.Set accepts map[string]Text - -# 1.10.0 (February 7, 2022) - -* Normalize UTC timestamps to comply with stdlib (Torkel Rogstad) -* Assign Numeric to *big.Rat (Oleg Lomaka) -* Fix typo in float8 error message (Pinank Solanki) -* Scan type aliases for floating point types (Collin Forsyth) - -# 1.9.1 (November 28, 2021) - -* Fix: binary timestamp is assumed to be in UTC (restored behavior changed in v1.9.0) - -# 1.9.0 (November 20, 2021) - -* Fix binary hstore null decoding -* Add shopspring/decimal.NullDecimal support to integration (Eli Treuherz) -* Inet.Set supports bare IP address (Carl Dunham) -* Add zeronull.Float8 -* Fix NULL being lost when scanning unknown OID into sql.Scanner -* Fix BPChar.AssignTo **rune -* Add support for fmt.Stringer and driver.Valuer in String fields encoding (Jan Dubsky) -* Fix really big timestamp(tz)s binary format parsing (e.g. year 294276) (Jim Tsao) -* Support `map[string]*string` as hstore (Adrian Sieger) -* Fix parsing text array with negative bounds -* Add infinity support for numeric (Jim Tsao) - -# 1.8.1 (July 24, 2021) - -* Cleaned up Go module dependency chain - -# 1.8.0 (July 10, 2021) - -* Maintain host bits for inet types (Cameron Daniel) -* Support pointers of wrapping structs (Ivan Daunis) -* Register JSONBArray at NewConnInfo() (Rueian) -* CompositeTextScanner handles backslash escapes - -# 1.7.0 (March 25, 2021) - -* Fix scanning int into **sql.Scanner implementor -* Add tsrange array type (Vasilii Novikov) -* Fix: escaped strings when they start or end with a newline char (Stephane Martin) -* Accept nil *time.Time in Time.Set -* Fix numeric NaN support -* Use Go 1.13 errors instead of xerrors - -# 1.6.2 (December 3, 2020) - -* Fix panic on assigning empty array to non-slice or array -* Fix text array parsing disambiguates NULL and "NULL" -* Fix Timestamptz.DecodeText with too short text - -# 1.6.1 (October 31, 2020) - -* Fix simple protocol empty array support - -# 1.6.0 (October 24, 2020) - -* Fix AssignTo pointer to pointer to slice and named types. -* Fix zero length array assignment (Simo Haasanen) -* Add float64, float32 convert to int2, int4, int8 (lqu3j) -* Support setting infinite timestamps (Erik Agsjö) -* Polygon improvements (duohedron) -* Fix Inet.Set with nil (Tomas Volf) - -# 1.5.0 (September 26, 2020) - -* Add slice of slice mapping to multi-dimensional arrays (Simo Haasanen) -* Fix JSONBArray -* Fix selecting empty array -* Text formatted values except bytea can be directly scanned to []byte -* Add JSON marshalling for UUID (bakmataliev) -* Improve point type conversions (bakmataliev) - -# 1.4.2 (July 22, 2020) - -* Fix encoding of a large composite data type (Yaz Saito) - -# 1.4.1 (July 14, 2020) - -* Fix ArrayType DecodeBinary empty array breaks future reads - -# 1.4.0 (June 27, 2020) - -* Add JSON support to ext/gofrs-uuid -* Performance improvements in Scan path -* Improved ext/shopspring-numeric binary decoding performance -* Add composite type support (Maxim Ivanov and Jack Christensen) -* Add better generic enum type support -* Add generic array type support -* Clarify and normalize Value semantics -* Fix hstore with empty string values -* Numeric supports NaN values (leighhopcroft) -* Add slice of pointer support to array types (megaturbo) -* Add jsonb array type (tserakhau) -* Allow converting intervals with months and days to duration - -# 1.3.0 (March 30, 2020) - -* Get implemented on T instead of *T -* Set will call Get on src if possible -* Range types Set method supports its own type, string, and nil -* Date.Set parses string -* Fix correct format verb for unknown type error (Robert Welin) -* Truncate nanoseconds in EncodeText for Timestamptz and Timestamp - -# 1.2.0 (February 5, 2020) - -* Add zeronull package for easier NULL <-> zero conversion -* Add JSON marshalling for shopspring-numeric extension -* Add JSON marshalling for Bool, Date, JSON/B, Timestamptz (Jeffrey Stiles) -* Fix null status in UnmarshalJSON for some types (Jeffrey Stiles) - -# 1.1.0 (January 11, 2020) - -* Add PostgreSQL time type support -* Add more automatic conversions of integer arrays of different types (Jean-Philippe Quéméner) - -# 1.0.3 (November 16, 2019) - -* Support initializing Array types from a slice of the value (Alex Gaynor) - -# 1.0.2 (October 22, 2019) - -* Fix scan into null into pointer to pointer implementing Decode* interface. (Jeremy Altavilla) - -# 1.0.1 (September 19, 2019) - -* Fix daterange OID diff --git a/vendor/github.com/jackc/pgtype/README.md b/vendor/github.com/jackc/pgtype/README.md deleted file mode 100644 index 72dadcf..0000000 --- a/vendor/github.com/jackc/pgtype/README.md +++ /dev/null @@ -1,14 +0,0 @@ -[![](https://godoc.org/github.com/jackc/pgtype?status.svg)](https://godoc.org/github.com/jackc/pgtype) -![CI](https://github.com/jackc/pgtype/workflows/CI/badge.svg) - ---- - -This version is used with pgx `v4`. In pgx `v5` it is part of the https://github.com/jackc/pgx repository. - ---- - -# pgtype - -pgtype implements Go types for over 70 PostgreSQL types. pgtype is the type system underlying the -https://github.com/jackc/pgx PostgreSQL driver. These types support the binary format for enhanced performance with pgx. -They also support the database/sql `Scan` and `Value` interfaces and can be used with https://github.com/lib/pq. diff --git a/vendor/github.com/jackc/pgtype/aclitem.go b/vendor/github.com/jackc/pgtype/aclitem.go deleted file mode 100644 index 9f6587b..0000000 --- a/vendor/github.com/jackc/pgtype/aclitem.go +++ /dev/null @@ -1,138 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "fmt" -) - -// ACLItem is used for PostgreSQL's aclitem data type. A sample aclitem -// might look like this: -// -// postgres=arwdDxt/postgres -// -// Note, however, that because the user/role name part of an aclitem is -// an identifier, it follows all the usual formatting rules for SQL -// identifiers: if it contains spaces and other special characters, -// it should appear in double-quotes: -// -// postgres=arwdDxt/"role with spaces" -// -type ACLItem struct { - String string - Status Status -} - -func (dst *ACLItem) Set(src interface{}) error { - if src == nil { - *dst = ACLItem{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case string: - *dst = ACLItem{String: value, Status: Present} - case *string: - if value == nil { - *dst = ACLItem{Status: Null} - } else { - *dst = ACLItem{String: *value, Status: Present} - } - default: - if originalSrc, ok := underlyingStringType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to ACLItem", value) - } - - return nil -} - -func (dst ACLItem) Get() interface{} { - switch dst.Status { - case Present: - return dst.String - case Null: - return nil - default: - return dst.Status - } -} - -func (src *ACLItem) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - switch v := dst.(type) { - case *string: - *v = src.String - return nil - default: - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - return fmt.Errorf("unable to assign to %T", dst) - } - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (dst *ACLItem) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = ACLItem{Status: Null} - return nil - } - - *dst = ACLItem{String: string(src), Status: Present} - return nil -} - -func (src ACLItem) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return append(buf, src.String...), nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *ACLItem) Scan(src interface{}) error { - if src == nil { - *dst = ACLItem{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src ACLItem) Value() (driver.Value, error) { - switch src.Status { - case Present: - return src.String, nil - case Null: - return nil, nil - default: - return nil, errUndefined - } -} diff --git a/vendor/github.com/jackc/pgtype/aclitem_array.go b/vendor/github.com/jackc/pgtype/aclitem_array.go deleted file mode 100644 index 4e3be3b..0000000 --- a/vendor/github.com/jackc/pgtype/aclitem_array.go +++ /dev/null @@ -1,428 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "fmt" - "reflect" -) - -type ACLItemArray struct { - Elements []ACLItem - Dimensions []ArrayDimension - Status Status -} - -func (dst *ACLItemArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = ACLItemArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []string: - if value == nil { - *dst = ACLItemArray{Status: Null} - } else if len(value) == 0 { - *dst = ACLItemArray{Status: Present} - } else { - elements := make([]ACLItem, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = ACLItemArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*string: - if value == nil { - *dst = ACLItemArray{Status: Null} - } else if len(value) == 0 { - *dst = ACLItemArray{Status: Present} - } else { - elements := make([]ACLItem, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = ACLItemArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []ACLItem: - if value == nil { - *dst = ACLItemArray{Status: Null} - } else if len(value) == 0 { - *dst = ACLItemArray{Status: Present} - } else { - *dst = ACLItemArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = ACLItemArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for ACLItemArray", src) - } - if elementsLength == 0 { - *dst = ACLItemArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to ACLItemArray", src) - } - - *dst = ACLItemArray{ - Elements: make([]ACLItem, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]ACLItem, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to ACLItemArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *ACLItemArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to ACLItemArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in ACLItemArray", err) - } - index++ - - return index, nil -} - -func (dst ACLItemArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *ACLItemArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]string: - *v = make([]string, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*string: - *v = make([]*string, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *ACLItemArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from ACLItemArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from ACLItemArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *ACLItemArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = ACLItemArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []ACLItem - - if len(uta.Elements) > 0 { - elements = make([]ACLItem, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem ACLItem - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = ACLItemArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (src ACLItemArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *ACLItemArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src ACLItemArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/array_type.go b/vendor/github.com/jackc/pgtype/array_type.go deleted file mode 100644 index 7146655..0000000 --- a/vendor/github.com/jackc/pgtype/array_type.go +++ /dev/null @@ -1,353 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "reflect" - - "github.com/jackc/pgio" -) - -// ArrayType represents an array type. While it implements Value, this is only in service of its type conversion duties -// when registered as a data type in a ConnType. It should not be used directly as a Value. ArrayType is a convenience -// type for types that do not have a concrete array type. -type ArrayType struct { - elements []ValueTranscoder - dimensions []ArrayDimension - - typeName string - newElement func() ValueTranscoder - - elementOID uint32 - status Status -} - -func NewArrayType(typeName string, elementOID uint32, newElement func() ValueTranscoder) *ArrayType { - return &ArrayType{typeName: typeName, elementOID: elementOID, newElement: newElement} -} - -func (at *ArrayType) NewTypeValue() Value { - return &ArrayType{ - elements: at.elements, - dimensions: at.dimensions, - status: at.status, - - typeName: at.typeName, - elementOID: at.elementOID, - newElement: at.newElement, - } -} - -func (at *ArrayType) TypeName() string { - return at.typeName -} - -func (dst *ArrayType) setNil() { - dst.elements = nil - dst.dimensions = nil - dst.status = Null -} - -func (dst *ArrayType) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - dst.setNil() - return nil - } - - sliceVal := reflect.ValueOf(src) - if sliceVal.Kind() != reflect.Slice { - return fmt.Errorf("cannot set non-slice") - } - - if sliceVal.IsNil() { - dst.setNil() - return nil - } - - dst.elements = make([]ValueTranscoder, sliceVal.Len()) - for i := range dst.elements { - v := dst.newElement() - err := v.Set(sliceVal.Index(i).Interface()) - if err != nil { - return err - } - - dst.elements[i] = v - } - dst.dimensions = []ArrayDimension{{Length: int32(len(dst.elements)), LowerBound: 1}} - dst.status = Present - - return nil -} - -func (dst ArrayType) Get() interface{} { - switch dst.status { - case Present: - elementValues := make([]interface{}, len(dst.elements)) - for i := range dst.elements { - elementValues[i] = dst.elements[i].Get() - } - return elementValues - case Null: - return nil - default: - return dst.status - } -} - -func (src *ArrayType) AssignTo(dst interface{}) error { - ptrSlice := reflect.ValueOf(dst) - if ptrSlice.Kind() != reflect.Ptr { - return fmt.Errorf("cannot assign to non-pointer") - } - - sliceVal := ptrSlice.Elem() - sliceType := sliceVal.Type() - - if sliceType.Kind() != reflect.Slice { - return fmt.Errorf("cannot assign to pointer to non-slice") - } - - switch src.status { - case Present: - slice := reflect.MakeSlice(sliceType, len(src.elements), len(src.elements)) - elemType := sliceType.Elem() - - for i := range src.elements { - ptrElem := reflect.New(elemType) - err := src.elements[i].AssignTo(ptrElem.Interface()) - if err != nil { - return err - } - - slice.Index(i).Set(ptrElem.Elem()) - } - - sliceVal.Set(slice) - return nil - case Null: - sliceVal.Set(reflect.Zero(sliceType)) - return nil - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (dst *ArrayType) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - dst.setNil() - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []ValueTranscoder - - if len(uta.Elements) > 0 { - elements = make([]ValueTranscoder, len(uta.Elements)) - - for i, s := range uta.Elements { - elem := dst.newElement() - var elemSrc []byte - if s != "NULL" { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - dst.elements = elements - dst.dimensions = uta.Dimensions - dst.status = Present - - return nil -} - -func (dst *ArrayType) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - dst.setNil() - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - var elements []ValueTranscoder - - if len(arrayHeader.Dimensions) == 0 { - dst.elements = elements - dst.dimensions = arrayHeader.Dimensions - dst.status = Present - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements = make([]ValueTranscoder, elementCount) - - for i := range elements { - elem := dst.newElement() - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elem.DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - - dst.elements = elements - dst.dimensions = arrayHeader.Dimensions - dst.status = Present - - return nil -} - -func (src ArrayType) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.dimensions)) - dimElemCounts[len(src.dimensions)-1] = int(src.dimensions[len(src.dimensions)-1].Length) - for i := len(src.dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src ArrayType) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.dimensions, - ElementOID: int32(src.elementOID), - } - - for i := range src.elements { - if src.elements[i].Get() == nil { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *ArrayType) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src ArrayType) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/bit.go b/vendor/github.com/jackc/pgtype/bit.go deleted file mode 100644 index c1709e6..0000000 --- a/vendor/github.com/jackc/pgtype/bit.go +++ /dev/null @@ -1,45 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" -) - -type Bit Varbit - -func (dst *Bit) Set(src interface{}) error { - return (*Varbit)(dst).Set(src) -} - -func (dst Bit) Get() interface{} { - return (Varbit)(dst).Get() -} - -func (src *Bit) AssignTo(dst interface{}) error { - return (*Varbit)(src).AssignTo(dst) -} - -func (dst *Bit) DecodeBinary(ci *ConnInfo, src []byte) error { - return (*Varbit)(dst).DecodeBinary(ci, src) -} - -func (src Bit) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - return (Varbit)(src).EncodeBinary(ci, buf) -} - -func (dst *Bit) DecodeText(ci *ConnInfo, src []byte) error { - return (*Varbit)(dst).DecodeText(ci, src) -} - -func (src Bit) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - return (Varbit)(src).EncodeText(ci, buf) -} - -// Scan implements the database/sql Scanner interface. -func (dst *Bit) Scan(src interface{}) error { - return (*Varbit)(dst).Scan(src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Bit) Value() (driver.Value, error) { - return (Varbit)(src).Value() -} diff --git a/vendor/github.com/jackc/pgtype/bool.go b/vendor/github.com/jackc/pgtype/bool.go deleted file mode 100644 index 676c8e5..0000000 --- a/vendor/github.com/jackc/pgtype/bool.go +++ /dev/null @@ -1,217 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/json" - "fmt" - "strconv" -) - -type Bool struct { - Bool bool - Status Status -} - -func (dst *Bool) Set(src interface{}) error { - if src == nil { - *dst = Bool{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case bool: - *dst = Bool{Bool: value, Status: Present} - case string: - bb, err := strconv.ParseBool(value) - if err != nil { - return err - } - *dst = Bool{Bool: bb, Status: Present} - case *bool: - if value == nil { - *dst = Bool{Status: Null} - } else { - return dst.Set(*value) - } - case *string: - if value == nil { - *dst = Bool{Status: Null} - } else { - return dst.Set(*value) - } - default: - if originalSrc, ok := underlyingBoolType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Bool", value) - } - - return nil -} - -func (dst Bool) Get() interface{} { - switch dst.Status { - case Present: - return dst.Bool - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Bool) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - switch v := dst.(type) { - case *bool: - *v = src.Bool - return nil - default: - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - return fmt.Errorf("unable to assign to %T", dst) - } - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (dst *Bool) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Bool{Status: Null} - return nil - } - - if len(src) != 1 { - return fmt.Errorf("invalid length for bool: %v", len(src)) - } - - *dst = Bool{Bool: src[0] == 't', Status: Present} - return nil -} - -func (dst *Bool) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Bool{Status: Null} - return nil - } - - if len(src) != 1 { - return fmt.Errorf("invalid length for bool: %v", len(src)) - } - - *dst = Bool{Bool: src[0] == 1, Status: Present} - return nil -} - -func (src Bool) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if src.Bool { - buf = append(buf, 't') - } else { - buf = append(buf, 'f') - } - - return buf, nil -} - -func (src Bool) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if src.Bool { - buf = append(buf, 1) - } else { - buf = append(buf, 0) - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Bool) Scan(src interface{}) error { - if src == nil { - *dst = Bool{Status: Null} - return nil - } - - switch src := src.(type) { - case bool: - *dst = Bool{Bool: src, Status: Present} - return nil - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Bool) Value() (driver.Value, error) { - switch src.Status { - case Present: - return src.Bool, nil - case Null: - return nil, nil - default: - return nil, errUndefined - } -} - -func (src Bool) MarshalJSON() ([]byte, error) { - switch src.Status { - case Present: - if src.Bool { - return []byte("true"), nil - } else { - return []byte("false"), nil - } - case Null: - return []byte("null"), nil - case Undefined: - return nil, errUndefined - } - - return nil, errBadStatus -} - -func (dst *Bool) UnmarshalJSON(b []byte) error { - var v *bool - err := json.Unmarshal(b, &v) - if err != nil { - return err - } - - if v == nil { - *dst = Bool{Status: Null} - } else { - *dst = Bool{Bool: *v, Status: Present} - } - - return nil -} diff --git a/vendor/github.com/jackc/pgtype/bool_array.go b/vendor/github.com/jackc/pgtype/bool_array.go deleted file mode 100644 index 6558d97..0000000 --- a/vendor/github.com/jackc/pgtype/bool_array.go +++ /dev/null @@ -1,517 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "reflect" - - "github.com/jackc/pgio" -) - -type BoolArray struct { - Elements []Bool - Dimensions []ArrayDimension - Status Status -} - -func (dst *BoolArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = BoolArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []bool: - if value == nil { - *dst = BoolArray{Status: Null} - } else if len(value) == 0 { - *dst = BoolArray{Status: Present} - } else { - elements := make([]Bool, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = BoolArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*bool: - if value == nil { - *dst = BoolArray{Status: Null} - } else if len(value) == 0 { - *dst = BoolArray{Status: Present} - } else { - elements := make([]Bool, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = BoolArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []Bool: - if value == nil { - *dst = BoolArray{Status: Null} - } else if len(value) == 0 { - *dst = BoolArray{Status: Present} - } else { - *dst = BoolArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = BoolArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for BoolArray", src) - } - if elementsLength == 0 { - *dst = BoolArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to BoolArray", src) - } - - *dst = BoolArray{ - Elements: make([]Bool, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]Bool, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to BoolArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *BoolArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to BoolArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in BoolArray", err) - } - index++ - - return index, nil -} - -func (dst BoolArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *BoolArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]bool: - *v = make([]bool, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*bool: - *v = make([]*bool, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *BoolArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from BoolArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from BoolArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *BoolArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = BoolArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []Bool - - if len(uta.Elements) > 0 { - elements = make([]Bool, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem Bool - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = BoolArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *BoolArray) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = BoolArray{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = BoolArray{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]Bool, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = BoolArray{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src BoolArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src BoolArray) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("bool"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "bool") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *BoolArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src BoolArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/box.go b/vendor/github.com/jackc/pgtype/box.go deleted file mode 100644 index 27fb829..0000000 --- a/vendor/github.com/jackc/pgtype/box.go +++ /dev/null @@ -1,165 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "math" - "strconv" - "strings" - - "github.com/jackc/pgio" -) - -type Box struct { - P [2]Vec2 - Status Status -} - -func (dst *Box) Set(src interface{}) error { - return fmt.Errorf("cannot convert %v to Box", src) -} - -func (dst Box) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Box) AssignTo(dst interface{}) error { - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *Box) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Box{Status: Null} - return nil - } - - if len(src) < 11 { - return fmt.Errorf("invalid length for Box: %v", len(src)) - } - - str := string(src[1:]) - - var end int - end = strings.IndexByte(str, ',') - - x1, err := strconv.ParseFloat(str[:end], 64) - if err != nil { - return err - } - - str = str[end+1:] - end = strings.IndexByte(str, ')') - - y1, err := strconv.ParseFloat(str[:end], 64) - if err != nil { - return err - } - - str = str[end+3:] - end = strings.IndexByte(str, ',') - - x2, err := strconv.ParseFloat(str[:end], 64) - if err != nil { - return err - } - - str = str[end+1 : len(str)-1] - - y2, err := strconv.ParseFloat(str, 64) - if err != nil { - return err - } - - *dst = Box{P: [2]Vec2{{x1, y1}, {x2, y2}}, Status: Present} - return nil -} - -func (dst *Box) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Box{Status: Null} - return nil - } - - if len(src) != 32 { - return fmt.Errorf("invalid length for Box: %v", len(src)) - } - - x1 := binary.BigEndian.Uint64(src) - y1 := binary.BigEndian.Uint64(src[8:]) - x2 := binary.BigEndian.Uint64(src[16:]) - y2 := binary.BigEndian.Uint64(src[24:]) - - *dst = Box{ - P: [2]Vec2{ - {math.Float64frombits(x1), math.Float64frombits(y1)}, - {math.Float64frombits(x2), math.Float64frombits(y2)}, - }, - Status: Present, - } - return nil -} - -func (src Box) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = append(buf, fmt.Sprintf(`(%s,%s),(%s,%s)`, - strconv.FormatFloat(src.P[0].X, 'f', -1, 64), - strconv.FormatFloat(src.P[0].Y, 'f', -1, 64), - strconv.FormatFloat(src.P[1].X, 'f', -1, 64), - strconv.FormatFloat(src.P[1].Y, 'f', -1, 64), - )...) - return buf, nil -} - -func (src Box) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = pgio.AppendUint64(buf, math.Float64bits(src.P[0].X)) - buf = pgio.AppendUint64(buf, math.Float64bits(src.P[0].Y)) - buf = pgio.AppendUint64(buf, math.Float64bits(src.P[1].X)) - buf = pgio.AppendUint64(buf, math.Float64bits(src.P[1].Y)) - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Box) Scan(src interface{}) error { - if src == nil { - *dst = Box{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Box) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/bpchar.go b/vendor/github.com/jackc/pgtype/bpchar.go deleted file mode 100644 index c5fa42e..0000000 --- a/vendor/github.com/jackc/pgtype/bpchar.go +++ /dev/null @@ -1,93 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "fmt" -) - -// BPChar is fixed-length, blank padded char type -// character(n), char(n) -type BPChar Text - -// Set converts from src to dst. -func (dst *BPChar) Set(src interface{}) error { - return (*Text)(dst).Set(src) -} - -// Get returns underlying value -func (dst BPChar) Get() interface{} { - return (Text)(dst).Get() -} - -// AssignTo assigns from src to dst. -func (src *BPChar) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - switch v := dst.(type) { - case *rune: - runes := []rune(src.String) - if len(runes) == 1 { - *v = runes[0] - return nil - } - case *string: - *v = src.String - return nil - case *[]byte: - *v = make([]byte, len(src.String)) - copy(*v, src.String) - return nil - default: - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - return fmt.Errorf("unable to assign to %T", dst) - } - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (BPChar) PreferredResultFormat() int16 { - return TextFormatCode -} - -func (dst *BPChar) DecodeText(ci *ConnInfo, src []byte) error { - return (*Text)(dst).DecodeText(ci, src) -} - -func (dst *BPChar) DecodeBinary(ci *ConnInfo, src []byte) error { - return (*Text)(dst).DecodeBinary(ci, src) -} - -func (BPChar) PreferredParamFormat() int16 { - return TextFormatCode -} - -func (src BPChar) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - return (Text)(src).EncodeText(ci, buf) -} - -func (src BPChar) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - return (Text)(src).EncodeBinary(ci, buf) -} - -// Scan implements the database/sql Scanner interface. -func (dst *BPChar) Scan(src interface{}) error { - return (*Text)(dst).Scan(src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src BPChar) Value() (driver.Value, error) { - return (Text)(src).Value() -} - -func (src BPChar) MarshalJSON() ([]byte, error) { - return (Text)(src).MarshalJSON() -} - -func (dst *BPChar) UnmarshalJSON(b []byte) error { - return (*Text)(dst).UnmarshalJSON(b) -} diff --git a/vendor/github.com/jackc/pgtype/bpchar_array.go b/vendor/github.com/jackc/pgtype/bpchar_array.go deleted file mode 100644 index 8e79221..0000000 --- a/vendor/github.com/jackc/pgtype/bpchar_array.go +++ /dev/null @@ -1,517 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "reflect" - - "github.com/jackc/pgio" -) - -type BPCharArray struct { - Elements []BPChar - Dimensions []ArrayDimension - Status Status -} - -func (dst *BPCharArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = BPCharArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []string: - if value == nil { - *dst = BPCharArray{Status: Null} - } else if len(value) == 0 { - *dst = BPCharArray{Status: Present} - } else { - elements := make([]BPChar, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = BPCharArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*string: - if value == nil { - *dst = BPCharArray{Status: Null} - } else if len(value) == 0 { - *dst = BPCharArray{Status: Present} - } else { - elements := make([]BPChar, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = BPCharArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []BPChar: - if value == nil { - *dst = BPCharArray{Status: Null} - } else if len(value) == 0 { - *dst = BPCharArray{Status: Present} - } else { - *dst = BPCharArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = BPCharArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for BPCharArray", src) - } - if elementsLength == 0 { - *dst = BPCharArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to BPCharArray", src) - } - - *dst = BPCharArray{ - Elements: make([]BPChar, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]BPChar, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to BPCharArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *BPCharArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to BPCharArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in BPCharArray", err) - } - index++ - - return index, nil -} - -func (dst BPCharArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *BPCharArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]string: - *v = make([]string, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*string: - *v = make([]*string, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *BPCharArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from BPCharArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from BPCharArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *BPCharArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = BPCharArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []BPChar - - if len(uta.Elements) > 0 { - elements = make([]BPChar, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem BPChar - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = BPCharArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *BPCharArray) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = BPCharArray{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = BPCharArray{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]BPChar, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = BPCharArray{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src BPCharArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src BPCharArray) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("bpchar"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "bpchar") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *BPCharArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src BPCharArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/bytea.go b/vendor/github.com/jackc/pgtype/bytea.go deleted file mode 100644 index 67eba35..0000000 --- a/vendor/github.com/jackc/pgtype/bytea.go +++ /dev/null @@ -1,163 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/hex" - "fmt" -) - -type Bytea struct { - Bytes []byte - Status Status -} - -func (dst *Bytea) Set(src interface{}) error { - if src == nil { - *dst = Bytea{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case []byte: - if value != nil { - *dst = Bytea{Bytes: value, Status: Present} - } else { - *dst = Bytea{Status: Null} - } - default: - if originalSrc, ok := underlyingBytesType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Bytea", value) - } - - return nil -} - -func (dst Bytea) Get() interface{} { - switch dst.Status { - case Present: - return dst.Bytes - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Bytea) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - switch v := dst.(type) { - case *[]byte: - buf := make([]byte, len(src.Bytes)) - copy(buf, src.Bytes) - *v = buf - return nil - default: - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - return fmt.Errorf("unable to assign to %T", dst) - } - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -// DecodeText only supports the hex format. This has been the default since -// PostgreSQL 9.0. -func (dst *Bytea) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Bytea{Status: Null} - return nil - } - - if len(src) < 2 || src[0] != '\\' || src[1] != 'x' { - return fmt.Errorf("invalid hex format") - } - - buf := make([]byte, (len(src)-2)/2) - _, err := hex.Decode(buf, src[2:]) - if err != nil { - return err - } - - *dst = Bytea{Bytes: buf, Status: Present} - return nil -} - -func (dst *Bytea) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Bytea{Status: Null} - return nil - } - - *dst = Bytea{Bytes: src, Status: Present} - return nil -} - -func (src Bytea) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = append(buf, `\x`...) - buf = append(buf, hex.EncodeToString(src.Bytes)...) - return buf, nil -} - -func (src Bytea) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return append(buf, src.Bytes...), nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Bytea) Scan(src interface{}) error { - if src == nil { - *dst = Bytea{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - buf := make([]byte, len(src)) - copy(buf, src) - *dst = Bytea{Bytes: buf, Status: Present} - return nil - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Bytea) Value() (driver.Value, error) { - switch src.Status { - case Present: - return src.Bytes, nil - case Null: - return nil, nil - default: - return nil, errUndefined - } -} diff --git a/vendor/github.com/jackc/pgtype/bytea_array.go b/vendor/github.com/jackc/pgtype/bytea_array.go deleted file mode 100644 index 69d1ceb..0000000 --- a/vendor/github.com/jackc/pgtype/bytea_array.go +++ /dev/null @@ -1,489 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "reflect" - - "github.com/jackc/pgio" -) - -type ByteaArray struct { - Elements []Bytea - Dimensions []ArrayDimension - Status Status -} - -func (dst *ByteaArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = ByteaArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case [][]byte: - if value == nil { - *dst = ByteaArray{Status: Null} - } else if len(value) == 0 { - *dst = ByteaArray{Status: Present} - } else { - elements := make([]Bytea, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = ByteaArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []Bytea: - if value == nil { - *dst = ByteaArray{Status: Null} - } else if len(value) == 0 { - *dst = ByteaArray{Status: Present} - } else { - *dst = ByteaArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = ByteaArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for ByteaArray", src) - } - if elementsLength == 0 { - *dst = ByteaArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to ByteaArray", src) - } - - *dst = ByteaArray{ - Elements: make([]Bytea, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]Bytea, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to ByteaArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *ByteaArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to ByteaArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in ByteaArray", err) - } - index++ - - return index, nil -} - -func (dst ByteaArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *ByteaArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[][]byte: - *v = make([][]byte, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *ByteaArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from ByteaArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from ByteaArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *ByteaArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = ByteaArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []Bytea - - if len(uta.Elements) > 0 { - elements = make([]Bytea, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem Bytea - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = ByteaArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *ByteaArray) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = ByteaArray{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = ByteaArray{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]Bytea, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = ByteaArray{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src ByteaArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src ByteaArray) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("bytea"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "bytea") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *ByteaArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src ByteaArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/cid.go b/vendor/github.com/jackc/pgtype/cid.go deleted file mode 100644 index b944748..0000000 --- a/vendor/github.com/jackc/pgtype/cid.go +++ /dev/null @@ -1,61 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" -) - -// CID is PostgreSQL's Command Identifier type. -// -// When one does -// -// select cmin, cmax, * from some_table; -// -// it is the data type of the cmin and cmax hidden system columns. -// -// It is currently implemented as an unsigned four byte integer. -// Its definition can be found in src/include/c.h as CommandId -// in the PostgreSQL sources. -type CID pguint32 - -// Set converts from src to dst. Note that as CID is not a general -// number type Set does not do automatic type conversion as other number -// types do. -func (dst *CID) Set(src interface{}) error { - return (*pguint32)(dst).Set(src) -} - -func (dst CID) Get() interface{} { - return (pguint32)(dst).Get() -} - -// AssignTo assigns from src to dst. Note that as CID is not a general number -// type AssignTo does not do automatic type conversion as other number types do. -func (src *CID) AssignTo(dst interface{}) error { - return (*pguint32)(src).AssignTo(dst) -} - -func (dst *CID) DecodeText(ci *ConnInfo, src []byte) error { - return (*pguint32)(dst).DecodeText(ci, src) -} - -func (dst *CID) DecodeBinary(ci *ConnInfo, src []byte) error { - return (*pguint32)(dst).DecodeBinary(ci, src) -} - -func (src CID) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - return (pguint32)(src).EncodeText(ci, buf) -} - -func (src CID) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - return (pguint32)(src).EncodeBinary(ci, buf) -} - -// Scan implements the database/sql Scanner interface. -func (dst *CID) Scan(src interface{}) error { - return (*pguint32)(dst).Scan(src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src CID) Value() (driver.Value, error) { - return (pguint32)(src).Value() -} diff --git a/vendor/github.com/jackc/pgtype/cidr.go b/vendor/github.com/jackc/pgtype/cidr.go deleted file mode 100644 index 7c562cf..0000000 --- a/vendor/github.com/jackc/pgtype/cidr.go +++ /dev/null @@ -1,43 +0,0 @@ -package pgtype - -import "database/sql/driver" - -type CIDR Inet - -func (dst *CIDR) Set(src interface{}) error { - return (*Inet)(dst).Set(src) -} - -func (dst CIDR) Get() interface{} { - return (Inet)(dst).Get() -} - -func (src *CIDR) AssignTo(dst interface{}) error { - return (*Inet)(src).AssignTo(dst) -} - -func (dst *CIDR) DecodeText(ci *ConnInfo, src []byte) error { - return (*Inet)(dst).DecodeText(ci, src) -} - -func (dst *CIDR) DecodeBinary(ci *ConnInfo, src []byte) error { - return (*Inet)(dst).DecodeBinary(ci, src) -} - -func (src CIDR) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - return (Inet)(src).EncodeText(ci, buf) -} - -func (src CIDR) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - return (Inet)(src).EncodeBinary(ci, buf) -} - -// Scan implements the database/sql Scanner interface. -func (dst *CIDR) Scan(src interface{}) error { - return (*Inet)(dst).Scan(src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src CIDR) Value() (driver.Value, error) { - return (Inet)(src).Value() -} diff --git a/vendor/github.com/jackc/pgtype/cidr_array.go b/vendor/github.com/jackc/pgtype/cidr_array.go deleted file mode 100644 index 783c599..0000000 --- a/vendor/github.com/jackc/pgtype/cidr_array.go +++ /dev/null @@ -1,546 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "net" - "reflect" - - "github.com/jackc/pgio" -) - -type CIDRArray struct { - Elements []CIDR - Dimensions []ArrayDimension - Status Status -} - -func (dst *CIDRArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = CIDRArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []*net.IPNet: - if value == nil { - *dst = CIDRArray{Status: Null} - } else if len(value) == 0 { - *dst = CIDRArray{Status: Present} - } else { - elements := make([]CIDR, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = CIDRArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []net.IP: - if value == nil { - *dst = CIDRArray{Status: Null} - } else if len(value) == 0 { - *dst = CIDRArray{Status: Present} - } else { - elements := make([]CIDR, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = CIDRArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*net.IP: - if value == nil { - *dst = CIDRArray{Status: Null} - } else if len(value) == 0 { - *dst = CIDRArray{Status: Present} - } else { - elements := make([]CIDR, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = CIDRArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []CIDR: - if value == nil { - *dst = CIDRArray{Status: Null} - } else if len(value) == 0 { - *dst = CIDRArray{Status: Present} - } else { - *dst = CIDRArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = CIDRArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for CIDRArray", src) - } - if elementsLength == 0 { - *dst = CIDRArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to CIDRArray", src) - } - - *dst = CIDRArray{ - Elements: make([]CIDR, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]CIDR, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to CIDRArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *CIDRArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to CIDRArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in CIDRArray", err) - } - index++ - - return index, nil -} - -func (dst CIDRArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *CIDRArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]*net.IPNet: - *v = make([]*net.IPNet, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]net.IP: - *v = make([]net.IP, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*net.IP: - *v = make([]*net.IP, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *CIDRArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from CIDRArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from CIDRArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *CIDRArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = CIDRArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []CIDR - - if len(uta.Elements) > 0 { - elements = make([]CIDR, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem CIDR - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = CIDRArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *CIDRArray) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = CIDRArray{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = CIDRArray{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]CIDR, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = CIDRArray{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src CIDRArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src CIDRArray) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("cidr"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "cidr") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *CIDRArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src CIDRArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/circle.go b/vendor/github.com/jackc/pgtype/circle.go deleted file mode 100644 index 4279650..0000000 --- a/vendor/github.com/jackc/pgtype/circle.go +++ /dev/null @@ -1,150 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "math" - "strconv" - "strings" - - "github.com/jackc/pgio" -) - -type Circle struct { - P Vec2 - R float64 - Status Status -} - -func (dst *Circle) Set(src interface{}) error { - return fmt.Errorf("cannot convert %v to Circle", src) -} - -func (dst Circle) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Circle) AssignTo(dst interface{}) error { - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *Circle) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Circle{Status: Null} - return nil - } - - if len(src) < 9 { - return fmt.Errorf("invalid length for Circle: %v", len(src)) - } - - str := string(src[2:]) - end := strings.IndexByte(str, ',') - x, err := strconv.ParseFloat(str[:end], 64) - if err != nil { - return err - } - - str = str[end+1:] - end = strings.IndexByte(str, ')') - - y, err := strconv.ParseFloat(str[:end], 64) - if err != nil { - return err - } - - str = str[end+2 : len(str)-1] - - r, err := strconv.ParseFloat(str, 64) - if err != nil { - return err - } - - *dst = Circle{P: Vec2{x, y}, R: r, Status: Present} - return nil -} - -func (dst *Circle) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Circle{Status: Null} - return nil - } - - if len(src) != 24 { - return fmt.Errorf("invalid length for Circle: %v", len(src)) - } - - x := binary.BigEndian.Uint64(src) - y := binary.BigEndian.Uint64(src[8:]) - r := binary.BigEndian.Uint64(src[16:]) - - *dst = Circle{ - P: Vec2{math.Float64frombits(x), math.Float64frombits(y)}, - R: math.Float64frombits(r), - Status: Present, - } - return nil -} - -func (src Circle) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = append(buf, fmt.Sprintf(`<(%s,%s),%s>`, - strconv.FormatFloat(src.P.X, 'f', -1, 64), - strconv.FormatFloat(src.P.Y, 'f', -1, 64), - strconv.FormatFloat(src.R, 'f', -1, 64), - )...) - - return buf, nil -} - -func (src Circle) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = pgio.AppendUint64(buf, math.Float64bits(src.P.X)) - buf = pgio.AppendUint64(buf, math.Float64bits(src.P.Y)) - buf = pgio.AppendUint64(buf, math.Float64bits(src.R)) - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Circle) Scan(src interface{}) error { - if src == nil { - *dst = Circle{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Circle) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/composite_fields.go b/vendor/github.com/jackc/pgtype/composite_fields.go deleted file mode 100644 index b6d09fc..0000000 --- a/vendor/github.com/jackc/pgtype/composite_fields.go +++ /dev/null @@ -1,107 +0,0 @@ -package pgtype - -import "fmt" - -// CompositeFields scans the fields of a composite type into the elements of the CompositeFields value. To scan a -// nullable value use a *CompositeFields. It will be set to nil in case of null. -// -// CompositeFields implements EncodeBinary and EncodeText. However, functionality is limited due to CompositeFields not -// knowing the PostgreSQL schema of the composite type. Prefer using a registered CompositeType. -type CompositeFields []interface{} - -func (cf CompositeFields) DecodeBinary(ci *ConnInfo, src []byte) error { - if len(cf) == 0 { - return fmt.Errorf("cannot decode into empty CompositeFields") - } - - if src == nil { - return fmt.Errorf("cannot decode unexpected null into CompositeFields") - } - - scanner := NewCompositeBinaryScanner(ci, src) - - for _, f := range cf { - scanner.ScanValue(f) - } - - if scanner.Err() != nil { - return scanner.Err() - } - - return nil -} - -func (cf CompositeFields) DecodeText(ci *ConnInfo, src []byte) error { - if len(cf) == 0 { - return fmt.Errorf("cannot decode into empty CompositeFields") - } - - if src == nil { - return fmt.Errorf("cannot decode unexpected null into CompositeFields") - } - - scanner := NewCompositeTextScanner(ci, src) - - for _, f := range cf { - scanner.ScanValue(f) - } - - if scanner.Err() != nil { - return scanner.Err() - } - - return nil -} - -// EncodeText encodes composite fields into the text format. Prefer registering a CompositeType to using -// CompositeFields to encode directly. -func (cf CompositeFields) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - b := NewCompositeTextBuilder(ci, buf) - - for _, f := range cf { - if textEncoder, ok := f.(TextEncoder); ok { - b.AppendEncoder(textEncoder) - } else { - b.AppendValue(f) - } - } - - return b.Finish() -} - -// EncodeBinary encodes composite fields into the binary format. Unlike CompositeType the schema of the destination is -// unknown. Prefer registering a CompositeType to using CompositeFields to encode directly. Because the binary -// composite format requires the OID of each field to be specified the only types that will work are those known to -// ConnInfo. -// -// In particular: -// -// * Nil cannot be used because there is no way to determine what type it. -// * Integer types must be exact matches. e.g. A Go int32 into a PostgreSQL bigint will fail. -// * No dereferencing will be done. e.g. *Text must be used instead of Text. -func (cf CompositeFields) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - b := NewCompositeBinaryBuilder(ci, buf) - - for _, f := range cf { - dt, ok := ci.DataTypeForValue(f) - if !ok { - return nil, fmt.Errorf("Unknown OID for %#v", f) - } - - if binaryEncoder, ok := f.(BinaryEncoder); ok { - b.AppendEncoder(dt.OID, binaryEncoder) - } else { - err := dt.Value.Set(f) - if err != nil { - return nil, err - } - if binaryEncoder, ok := dt.Value.(BinaryEncoder); ok { - b.AppendEncoder(dt.OID, binaryEncoder) - } else { - return nil, fmt.Errorf("Cannot encode binary format for %v", f) - } - } - } - - return b.Finish() -} diff --git a/vendor/github.com/jackc/pgtype/composite_type.go b/vendor/github.com/jackc/pgtype/composite_type.go deleted file mode 100644 index 32e0aa2..0000000 --- a/vendor/github.com/jackc/pgtype/composite_type.go +++ /dev/null @@ -1,682 +0,0 @@ -package pgtype - -import ( - "encoding/binary" - "errors" - "fmt" - "reflect" - "strings" - - "github.com/jackc/pgio" -) - -type CompositeTypeField struct { - Name string - OID uint32 -} - -type CompositeType struct { - status Status - - typeName string - - fields []CompositeTypeField - valueTranscoders []ValueTranscoder -} - -// NewCompositeType creates a CompositeType from fields and ci. ci is used to find the ValueTranscoders used -// for fields. All field OIDs must be previously registered in ci. -func NewCompositeType(typeName string, fields []CompositeTypeField, ci *ConnInfo) (*CompositeType, error) { - valueTranscoders := make([]ValueTranscoder, len(fields)) - - for i := range fields { - dt, ok := ci.DataTypeForOID(fields[i].OID) - if !ok { - return nil, fmt.Errorf("no data type registered for oid: %d", fields[i].OID) - } - - value := NewValue(dt.Value) - valueTranscoder, ok := value.(ValueTranscoder) - if !ok { - return nil, fmt.Errorf("data type for oid does not implement ValueTranscoder: %d", fields[i].OID) - } - - valueTranscoders[i] = valueTranscoder - } - - return &CompositeType{typeName: typeName, fields: fields, valueTranscoders: valueTranscoders}, nil -} - -// NewCompositeTypeValues creates a CompositeType from fields and values. fields and values must have the same length. -// Prefer NewCompositeType unless overriding the transcoding of fields is required. -func NewCompositeTypeValues(typeName string, fields []CompositeTypeField, values []ValueTranscoder) (*CompositeType, error) { - if len(fields) != len(values) { - return nil, errors.New("fields and valueTranscoders must have same length") - } - - return &CompositeType{typeName: typeName, fields: fields, valueTranscoders: values}, nil -} - -func (src CompositeType) Get() interface{} { - switch src.status { - case Present: - results := make(map[string]interface{}, len(src.valueTranscoders)) - for i := range src.valueTranscoders { - results[src.fields[i].Name] = src.valueTranscoders[i].Get() - } - return results - case Null: - return nil - default: - return src.status - } -} - -func (ct *CompositeType) NewTypeValue() Value { - a := &CompositeType{ - typeName: ct.typeName, - fields: ct.fields, - valueTranscoders: make([]ValueTranscoder, len(ct.valueTranscoders)), - } - - for i := range ct.valueTranscoders { - a.valueTranscoders[i] = NewValue(ct.valueTranscoders[i]).(ValueTranscoder) - } - - return a -} - -func (ct *CompositeType) TypeName() string { - return ct.typeName -} - -func (ct *CompositeType) Fields() []CompositeTypeField { - return ct.fields -} - -func (dst *CompositeType) Set(src interface{}) error { - if src == nil { - dst.status = Null - return nil - } - - switch value := src.(type) { - case []interface{}: - if len(value) != len(dst.valueTranscoders) { - return fmt.Errorf("Number of fields don't match. CompositeType has %d fields", len(dst.valueTranscoders)) - } - for i, v := range value { - if err := dst.valueTranscoders[i].Set(v); err != nil { - return err - } - } - dst.status = Present - case *[]interface{}: - if value == nil { - dst.status = Null - return nil - } - return dst.Set(*value) - default: - return fmt.Errorf("Can not convert %v to Composite", src) - } - - return nil -} - -// AssignTo should never be called on composite value directly -func (src CompositeType) AssignTo(dst interface{}) error { - switch src.status { - case Present: - switch v := dst.(type) { - case []interface{}: - if len(v) != len(src.valueTranscoders) { - return fmt.Errorf("Number of fields don't match. CompositeType has %d fields", len(src.valueTranscoders)) - } - for i := range src.valueTranscoders { - if v[i] == nil { - continue - } - - err := assignToOrSet(src.valueTranscoders[i], v[i]) - if err != nil { - return fmt.Errorf("unable to assign to dst[%d]: %v", i, err) - } - } - return nil - case *[]interface{}: - return src.AssignTo(*v) - default: - if isPtrStruct, err := src.assignToPtrStruct(dst); isPtrStruct { - return err - } - - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - return fmt.Errorf("unable to assign to %T", dst) - } - case Null: - return NullAssignTo(dst) - } - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func assignToOrSet(src Value, dst interface{}) error { - assignToErr := src.AssignTo(dst) - if assignToErr != nil { - // Try to use get / set instead -- this avoids every type having to be able to AssignTo type of self. - setSucceeded := false - if setter, ok := dst.(Value); ok { - err := setter.Set(src.Get()) - setSucceeded = err == nil - } - if !setSucceeded { - return assignToErr - } - } - - return nil -} - -func (src CompositeType) assignToPtrStruct(dst interface{}) (bool, error) { - dstValue := reflect.ValueOf(dst) - if dstValue.Kind() != reflect.Ptr { - return false, nil - } - - if dstValue.IsNil() { - return false, nil - } - - dstElemValue := dstValue.Elem() - dstElemType := dstElemValue.Type() - - if dstElemType.Kind() != reflect.Struct { - return false, nil - } - - exportedFields := make([]int, 0, dstElemType.NumField()) - for i := 0; i < dstElemType.NumField(); i++ { - sf := dstElemType.Field(i) - if sf.PkgPath == "" { - exportedFields = append(exportedFields, i) - } - } - - if len(exportedFields) != len(src.valueTranscoders) { - return false, nil - } - - for i := range exportedFields { - err := assignToOrSet(src.valueTranscoders[i], dstElemValue.Field(exportedFields[i]).Addr().Interface()) - if err != nil { - return true, fmt.Errorf("unable to assign to field %s: %v", dstElemType.Field(exportedFields[i]).Name, err) - } - } - - return true, nil -} - -func (src CompositeType) EncodeBinary(ci *ConnInfo, buf []byte) (newBuf []byte, err error) { - switch src.status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - b := NewCompositeBinaryBuilder(ci, buf) - for i := range src.valueTranscoders { - b.AppendEncoder(src.fields[i].OID, src.valueTranscoders[i]) - } - - return b.Finish() -} - -// DecodeBinary implements BinaryDecoder interface. -// Opposite to Record, fields in a composite act as a "schema" -// and decoding fails if SQL value can't be assigned due to -// type mismatch -func (dst *CompositeType) DecodeBinary(ci *ConnInfo, buf []byte) error { - if buf == nil { - dst.status = Null - return nil - } - - scanner := NewCompositeBinaryScanner(ci, buf) - - for _, f := range dst.valueTranscoders { - scanner.ScanDecoder(f) - } - - if scanner.Err() != nil { - return scanner.Err() - } - - dst.status = Present - - return nil -} - -func (dst *CompositeType) DecodeText(ci *ConnInfo, buf []byte) error { - if buf == nil { - dst.status = Null - return nil - } - - scanner := NewCompositeTextScanner(ci, buf) - - for _, f := range dst.valueTranscoders { - scanner.ScanDecoder(f) - } - - if scanner.Err() != nil { - return scanner.Err() - } - - dst.status = Present - - return nil -} - -func (src CompositeType) EncodeText(ci *ConnInfo, buf []byte) (newBuf []byte, err error) { - switch src.status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - b := NewCompositeTextBuilder(ci, buf) - for _, f := range src.valueTranscoders { - b.AppendEncoder(f) - } - - return b.Finish() -} - -type CompositeBinaryScanner struct { - ci *ConnInfo - rp int - src []byte - - fieldCount int32 - fieldBytes []byte - fieldOID uint32 - err error -} - -// NewCompositeBinaryScanner a scanner over a binary encoded composite balue. -func NewCompositeBinaryScanner(ci *ConnInfo, src []byte) *CompositeBinaryScanner { - rp := 0 - if len(src[rp:]) < 4 { - return &CompositeBinaryScanner{err: fmt.Errorf("Record incomplete %v", src)} - } - - fieldCount := int32(binary.BigEndian.Uint32(src[rp:])) - rp += 4 - - return &CompositeBinaryScanner{ - ci: ci, - rp: rp, - src: src, - fieldCount: fieldCount, - } -} - -// ScanDecoder calls Next and decodes the result with d. -func (cfs *CompositeBinaryScanner) ScanDecoder(d BinaryDecoder) { - if cfs.err != nil { - return - } - - if cfs.Next() { - cfs.err = d.DecodeBinary(cfs.ci, cfs.fieldBytes) - } else { - cfs.err = errors.New("read past end of composite") - } -} - -// ScanDecoder calls Next and scans the result into d. -func (cfs *CompositeBinaryScanner) ScanValue(d interface{}) { - if cfs.err != nil { - return - } - - if cfs.Next() { - cfs.err = cfs.ci.Scan(cfs.OID(), BinaryFormatCode, cfs.Bytes(), d) - } else { - cfs.err = errors.New("read past end of composite") - } -} - -// Next advances the scanner to the next field. It returns false after the last field is read or an error occurs. After -// Next returns false, the Err method can be called to check if any errors occurred. -func (cfs *CompositeBinaryScanner) Next() bool { - if cfs.err != nil { - return false - } - - if cfs.rp == len(cfs.src) { - return false - } - - if len(cfs.src[cfs.rp:]) < 8 { - cfs.err = fmt.Errorf("Record incomplete %v", cfs.src) - return false - } - cfs.fieldOID = binary.BigEndian.Uint32(cfs.src[cfs.rp:]) - cfs.rp += 4 - - fieldLen := int(int32(binary.BigEndian.Uint32(cfs.src[cfs.rp:]))) - cfs.rp += 4 - - if fieldLen >= 0 { - if len(cfs.src[cfs.rp:]) < fieldLen { - cfs.err = fmt.Errorf("Record incomplete rp=%d src=%v", cfs.rp, cfs.src) - return false - } - cfs.fieldBytes = cfs.src[cfs.rp : cfs.rp+fieldLen] - cfs.rp += fieldLen - } else { - cfs.fieldBytes = nil - } - - return true -} - -func (cfs *CompositeBinaryScanner) FieldCount() int { - return int(cfs.fieldCount) -} - -// Bytes returns the bytes of the field most recently read by Scan(). -func (cfs *CompositeBinaryScanner) Bytes() []byte { - return cfs.fieldBytes -} - -// OID returns the OID of the field most recently read by Scan(). -func (cfs *CompositeBinaryScanner) OID() uint32 { - return cfs.fieldOID -} - -// Err returns any error encountered by the scanner. -func (cfs *CompositeBinaryScanner) Err() error { - return cfs.err -} - -type CompositeTextScanner struct { - ci *ConnInfo - rp int - src []byte - - fieldBytes []byte - err error -} - -// NewCompositeTextScanner a scanner over a text encoded composite value. -func NewCompositeTextScanner(ci *ConnInfo, src []byte) *CompositeTextScanner { - if len(src) < 2 { - return &CompositeTextScanner{err: fmt.Errorf("Record incomplete %v", src)} - } - - if src[0] != '(' { - return &CompositeTextScanner{err: fmt.Errorf("composite text format must start with '('")} - } - - if src[len(src)-1] != ')' { - return &CompositeTextScanner{err: fmt.Errorf("composite text format must end with ')'")} - } - - return &CompositeTextScanner{ - ci: ci, - rp: 1, - src: src, - } -} - -// ScanDecoder calls Next and decodes the result with d. -func (cfs *CompositeTextScanner) ScanDecoder(d TextDecoder) { - if cfs.err != nil { - return - } - - if cfs.Next() { - cfs.err = d.DecodeText(cfs.ci, cfs.fieldBytes) - } else { - cfs.err = errors.New("read past end of composite") - } -} - -// ScanDecoder calls Next and scans the result into d. -func (cfs *CompositeTextScanner) ScanValue(d interface{}) { - if cfs.err != nil { - return - } - - if cfs.Next() { - cfs.err = cfs.ci.Scan(0, TextFormatCode, cfs.Bytes(), d) - } else { - cfs.err = errors.New("read past end of composite") - } -} - -// Next advances the scanner to the next field. It returns false after the last field is read or an error occurs. After -// Next returns false, the Err method can be called to check if any errors occurred. -func (cfs *CompositeTextScanner) Next() bool { - if cfs.err != nil { - return false - } - - if cfs.rp == len(cfs.src) { - return false - } - - switch cfs.src[cfs.rp] { - case ',', ')': // null - cfs.rp++ - cfs.fieldBytes = nil - return true - case '"': // quoted value - cfs.rp++ - cfs.fieldBytes = make([]byte, 0, 16) - for { - ch := cfs.src[cfs.rp] - - if ch == '"' { - cfs.rp++ - if cfs.src[cfs.rp] == '"' { - cfs.fieldBytes = append(cfs.fieldBytes, '"') - cfs.rp++ - } else { - break - } - } else if ch == '\\' { - cfs.rp++ - cfs.fieldBytes = append(cfs.fieldBytes, cfs.src[cfs.rp]) - cfs.rp++ - } else { - cfs.fieldBytes = append(cfs.fieldBytes, ch) - cfs.rp++ - } - } - cfs.rp++ - return true - default: // unquoted value - start := cfs.rp - for { - ch := cfs.src[cfs.rp] - if ch == ',' || ch == ')' { - break - } - cfs.rp++ - } - cfs.fieldBytes = cfs.src[start:cfs.rp] - cfs.rp++ - return true - } -} - -// Bytes returns the bytes of the field most recently read by Scan(). -func (cfs *CompositeTextScanner) Bytes() []byte { - return cfs.fieldBytes -} - -// Err returns any error encountered by the scanner. -func (cfs *CompositeTextScanner) Err() error { - return cfs.err -} - -type CompositeBinaryBuilder struct { - ci *ConnInfo - buf []byte - startIdx int - fieldCount uint32 - err error -} - -func NewCompositeBinaryBuilder(ci *ConnInfo, buf []byte) *CompositeBinaryBuilder { - startIdx := len(buf) - buf = append(buf, 0, 0, 0, 0) // allocate room for number of fields - return &CompositeBinaryBuilder{ci: ci, buf: buf, startIdx: startIdx} -} - -func (b *CompositeBinaryBuilder) AppendValue(oid uint32, field interface{}) { - if b.err != nil { - return - } - - dt, ok := b.ci.DataTypeForOID(oid) - if !ok { - b.err = fmt.Errorf("unknown data type for OID: %d", oid) - return - } - - err := dt.Value.Set(field) - if err != nil { - b.err = err - return - } - - binaryEncoder, ok := dt.Value.(BinaryEncoder) - if !ok { - b.err = fmt.Errorf("unable to encode binary for OID: %d", oid) - return - } - - b.AppendEncoder(oid, binaryEncoder) -} - -func (b *CompositeBinaryBuilder) AppendEncoder(oid uint32, field BinaryEncoder) { - if b.err != nil { - return - } - - b.buf = pgio.AppendUint32(b.buf, oid) - lengthPos := len(b.buf) - b.buf = pgio.AppendInt32(b.buf, -1) - fieldBuf, err := field.EncodeBinary(b.ci, b.buf) - if err != nil { - b.err = err - return - } - if fieldBuf != nil { - binary.BigEndian.PutUint32(fieldBuf[lengthPos:], uint32(len(fieldBuf)-len(b.buf))) - b.buf = fieldBuf - } - - b.fieldCount++ -} - -func (b *CompositeBinaryBuilder) Finish() ([]byte, error) { - if b.err != nil { - return nil, b.err - } - - binary.BigEndian.PutUint32(b.buf[b.startIdx:], b.fieldCount) - return b.buf, nil -} - -type CompositeTextBuilder struct { - ci *ConnInfo - buf []byte - startIdx int - fieldCount uint32 - err error - fieldBuf [32]byte -} - -func NewCompositeTextBuilder(ci *ConnInfo, buf []byte) *CompositeTextBuilder { - buf = append(buf, '(') // allocate room for number of fields - return &CompositeTextBuilder{ci: ci, buf: buf} -} - -func (b *CompositeTextBuilder) AppendValue(field interface{}) { - if b.err != nil { - return - } - - if field == nil { - b.buf = append(b.buf, ',') - return - } - - dt, ok := b.ci.DataTypeForValue(field) - if !ok { - b.err = fmt.Errorf("unknown data type for field: %v", field) - return - } - - err := dt.Value.Set(field) - if err != nil { - b.err = err - return - } - - textEncoder, ok := dt.Value.(TextEncoder) - if !ok { - b.err = fmt.Errorf("unable to encode text for value: %v", field) - return - } - - b.AppendEncoder(textEncoder) -} - -func (b *CompositeTextBuilder) AppendEncoder(field TextEncoder) { - if b.err != nil { - return - } - - fieldBuf, err := field.EncodeText(b.ci, b.fieldBuf[0:0]) - if err != nil { - b.err = err - return - } - if fieldBuf != nil { - b.buf = append(b.buf, quoteCompositeFieldIfNeeded(string(fieldBuf))...) - } - - b.buf = append(b.buf, ',') -} - -func (b *CompositeTextBuilder) Finish() ([]byte, error) { - if b.err != nil { - return nil, b.err - } - - b.buf[len(b.buf)-1] = ')' - return b.buf, nil -} - -var quoteCompositeReplacer = strings.NewReplacer(`\`, `\\`, `"`, `\"`) - -func quoteCompositeField(src string) string { - return `"` + quoteCompositeReplacer.Replace(src) + `"` -} - -func quoteCompositeFieldIfNeeded(src string) string { - if src == "" || src[0] == ' ' || src[len(src)-1] == ' ' || strings.ContainsAny(src, `(),"\`) { - return quoteCompositeField(src) - } - return src -} diff --git a/vendor/github.com/jackc/pgtype/convert.go b/vendor/github.com/jackc/pgtype/convert.go deleted file mode 100644 index 377fe3e..0000000 --- a/vendor/github.com/jackc/pgtype/convert.go +++ /dev/null @@ -1,476 +0,0 @@ -package pgtype - -import ( - "database/sql" - "fmt" - "math" - "reflect" - "time" -) - -const ( - maxUint = ^uint(0) - maxInt = int(maxUint >> 1) - minInt = -maxInt - 1 -) - -// underlyingNumberType gets the underlying type that can be converted to Int2, Int4, Int8, Float4, or Float8 -func underlyingNumberType(val interface{}) (interface{}, bool) { - refVal := reflect.ValueOf(val) - - switch refVal.Kind() { - case reflect.Ptr: - if refVal.IsNil() { - return nil, false - } - convVal := refVal.Elem().Interface() - return convVal, true - case reflect.Int: - convVal := int(refVal.Int()) - return convVal, reflect.TypeOf(convVal) != refVal.Type() - case reflect.Int8: - convVal := int8(refVal.Int()) - return convVal, reflect.TypeOf(convVal) != refVal.Type() - case reflect.Int16: - convVal := int16(refVal.Int()) - return convVal, reflect.TypeOf(convVal) != refVal.Type() - case reflect.Int32: - convVal := int32(refVal.Int()) - return convVal, reflect.TypeOf(convVal) != refVal.Type() - case reflect.Int64: - convVal := int64(refVal.Int()) - return convVal, reflect.TypeOf(convVal) != refVal.Type() - case reflect.Uint: - convVal := uint(refVal.Uint()) - return convVal, reflect.TypeOf(convVal) != refVal.Type() - case reflect.Uint8: - convVal := uint8(refVal.Uint()) - return convVal, reflect.TypeOf(convVal) != refVal.Type() - case reflect.Uint16: - convVal := uint16(refVal.Uint()) - return convVal, reflect.TypeOf(convVal) != refVal.Type() - case reflect.Uint32: - convVal := uint32(refVal.Uint()) - return convVal, reflect.TypeOf(convVal) != refVal.Type() - case reflect.Uint64: - convVal := uint64(refVal.Uint()) - return convVal, reflect.TypeOf(convVal) != refVal.Type() - case reflect.Float32: - convVal := float32(refVal.Float()) - return convVal, reflect.TypeOf(convVal) != refVal.Type() - case reflect.Float64: - convVal := refVal.Float() - return convVal, reflect.TypeOf(convVal) != refVal.Type() - case reflect.String: - convVal := refVal.String() - return convVal, reflect.TypeOf(convVal) != refVal.Type() - } - - return nil, false -} - -// underlyingBoolType gets the underlying type that can be converted to Bool -func underlyingBoolType(val interface{}) (interface{}, bool) { - refVal := reflect.ValueOf(val) - - switch refVal.Kind() { - case reflect.Ptr: - if refVal.IsNil() { - return nil, false - } - convVal := refVal.Elem().Interface() - return convVal, true - case reflect.Bool: - convVal := refVal.Bool() - return convVal, reflect.TypeOf(convVal) != refVal.Type() - } - - return nil, false -} - -// underlyingBytesType gets the underlying type that can be converted to []byte -func underlyingBytesType(val interface{}) (interface{}, bool) { - refVal := reflect.ValueOf(val) - - switch refVal.Kind() { - case reflect.Ptr: - if refVal.IsNil() { - return nil, false - } - convVal := refVal.Elem().Interface() - return convVal, true - case reflect.Slice: - if refVal.Type().Elem().Kind() == reflect.Uint8 { - convVal := refVal.Bytes() - return convVal, reflect.TypeOf(convVal) != refVal.Type() - } - } - - return nil, false -} - -// underlyingStringType gets the underlying type that can be converted to String -func underlyingStringType(val interface{}) (interface{}, bool) { - refVal := reflect.ValueOf(val) - - switch refVal.Kind() { - case reflect.Ptr: - if refVal.IsNil() { - return nil, false - } - convVal := refVal.Elem().Interface() - return convVal, true - case reflect.String: - convVal := refVal.String() - return convVal, reflect.TypeOf(convVal) != refVal.Type() - } - - return nil, false -} - -// underlyingPtrType dereferences a pointer -func underlyingPtrType(val interface{}) (interface{}, bool) { - refVal := reflect.ValueOf(val) - - switch refVal.Kind() { - case reflect.Ptr: - if refVal.IsNil() { - return nil, false - } - convVal := refVal.Elem().Interface() - return convVal, true - } - - return nil, false -} - -// underlyingTimeType gets the underlying type that can be converted to time.Time -func underlyingTimeType(val interface{}) (interface{}, bool) { - refVal := reflect.ValueOf(val) - - switch refVal.Kind() { - case reflect.Ptr: - if refVal.IsNil() { - return nil, false - } - convVal := refVal.Elem().Interface() - return convVal, true - } - - timeType := reflect.TypeOf(time.Time{}) - if refVal.Type().ConvertibleTo(timeType) { - return refVal.Convert(timeType).Interface(), true - } - - return nil, false -} - -// underlyingUUIDType gets the underlying type that can be converted to [16]byte -func underlyingUUIDType(val interface{}) (interface{}, bool) { - refVal := reflect.ValueOf(val) - - switch refVal.Kind() { - case reflect.Ptr: - if refVal.IsNil() { - return nil, false - } - convVal := refVal.Elem().Interface() - return convVal, true - } - - uuidType := reflect.TypeOf([16]byte{}) - if refVal.Type().ConvertibleTo(uuidType) { - return refVal.Convert(uuidType).Interface(), true - } - - return nil, false -} - -// underlyingSliceType gets the underlying slice type -func underlyingSliceType(val interface{}) (interface{}, bool) { - refVal := reflect.ValueOf(val) - - switch refVal.Kind() { - case reflect.Ptr: - if refVal.IsNil() { - return nil, false - } - convVal := refVal.Elem().Interface() - return convVal, true - case reflect.Slice: - baseSliceType := reflect.SliceOf(refVal.Type().Elem()) - if refVal.Type().ConvertibleTo(baseSliceType) { - convVal := refVal.Convert(baseSliceType) - return convVal.Interface(), reflect.TypeOf(convVal.Interface()) != refVal.Type() - } - } - - return nil, false -} - -func int64AssignTo(srcVal int64, srcStatus Status, dst interface{}) error { - if srcStatus == Present { - switch v := dst.(type) { - case *int: - if srcVal < int64(minInt) { - return fmt.Errorf("%d is less than minimum value for int", srcVal) - } else if srcVal > int64(maxInt) { - return fmt.Errorf("%d is greater than maximum value for int", srcVal) - } - *v = int(srcVal) - case *int8: - if srcVal < math.MinInt8 { - return fmt.Errorf("%d is less than minimum value for int8", srcVal) - } else if srcVal > math.MaxInt8 { - return fmt.Errorf("%d is greater than maximum value for int8", srcVal) - } - *v = int8(srcVal) - case *int16: - if srcVal < math.MinInt16 { - return fmt.Errorf("%d is less than minimum value for int16", srcVal) - } else if srcVal > math.MaxInt16 { - return fmt.Errorf("%d is greater than maximum value for int16", srcVal) - } - *v = int16(srcVal) - case *int32: - if srcVal < math.MinInt32 { - return fmt.Errorf("%d is less than minimum value for int32", srcVal) - } else if srcVal > math.MaxInt32 { - return fmt.Errorf("%d is greater than maximum value for int32", srcVal) - } - *v = int32(srcVal) - case *int64: - if srcVal < math.MinInt64 { - return fmt.Errorf("%d is less than minimum value for int64", srcVal) - } else if srcVal > math.MaxInt64 { - return fmt.Errorf("%d is greater than maximum value for int64", srcVal) - } - *v = int64(srcVal) - case *uint: - if srcVal < 0 { - return fmt.Errorf("%d is less than zero for uint", srcVal) - } else if uint64(srcVal) > uint64(maxUint) { - return fmt.Errorf("%d is greater than maximum value for uint", srcVal) - } - *v = uint(srcVal) - case *uint8: - if srcVal < 0 { - return fmt.Errorf("%d is less than zero for uint8", srcVal) - } else if srcVal > math.MaxUint8 { - return fmt.Errorf("%d is greater than maximum value for uint8", srcVal) - } - *v = uint8(srcVal) - case *uint16: - if srcVal < 0 { - return fmt.Errorf("%d is less than zero for uint32", srcVal) - } else if srcVal > math.MaxUint16 { - return fmt.Errorf("%d is greater than maximum value for uint16", srcVal) - } - *v = uint16(srcVal) - case *uint32: - if srcVal < 0 { - return fmt.Errorf("%d is less than zero for uint32", srcVal) - } else if srcVal > math.MaxUint32 { - return fmt.Errorf("%d is greater than maximum value for uint32", srcVal) - } - *v = uint32(srcVal) - case *uint64: - if srcVal < 0 { - return fmt.Errorf("%d is less than zero for uint64", srcVal) - } - *v = uint64(srcVal) - case sql.Scanner: - return v.Scan(srcVal) - default: - if v := reflect.ValueOf(dst); v.Kind() == reflect.Ptr { - el := v.Elem() - switch el.Kind() { - // if dst is a pointer to pointer, strip the pointer and try again - case reflect.Ptr: - if el.IsNil() { - // allocate destination - el.Set(reflect.New(el.Type().Elem())) - } - return int64AssignTo(srcVal, srcStatus, el.Interface()) - case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: - if el.OverflowInt(int64(srcVal)) { - return fmt.Errorf("cannot put %d into %T", srcVal, dst) - } - el.SetInt(int64(srcVal)) - return nil - case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64: - if srcVal < 0 { - return fmt.Errorf("%d is less than zero for %T", srcVal, dst) - } - if el.OverflowUint(uint64(srcVal)) { - return fmt.Errorf("cannot put %d into %T", srcVal, dst) - } - el.SetUint(uint64(srcVal)) - return nil - } - } - return fmt.Errorf("cannot assign %v into %T", srcVal, dst) - } - return nil - } - - // if dst is a pointer to pointer and srcStatus is not Present, nil it out - if v := reflect.ValueOf(dst); v.Kind() == reflect.Ptr { - el := v.Elem() - if el.Kind() == reflect.Ptr { - el.Set(reflect.Zero(el.Type())) - return nil - } - } - - return fmt.Errorf("cannot assign %v %v into %T", srcVal, srcStatus, dst) -} - -func float64AssignTo(srcVal float64, srcStatus Status, dst interface{}) error { - if srcStatus == Present { - switch v := dst.(type) { - case *float32: - *v = float32(srcVal) - case *float64: - *v = srcVal - default: - if v := reflect.ValueOf(dst); v.Kind() == reflect.Ptr { - el := v.Elem() - switch el.Kind() { - // if dst is a type alias of a float32 or 64, set dst val - case reflect.Float32, reflect.Float64: - el.SetFloat(srcVal) - return nil - // if dst is a pointer to pointer, strip the pointer and try again - case reflect.Ptr: - if el.IsNil() { - // allocate destination - el.Set(reflect.New(el.Type().Elem())) - } - return float64AssignTo(srcVal, srcStatus, el.Interface()) - case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64: - i64 := int64(srcVal) - if float64(i64) == srcVal { - return int64AssignTo(i64, srcStatus, dst) - } - } - } - return fmt.Errorf("cannot assign %v into %T", srcVal, dst) - } - return nil - } - - // if dst is a pointer to pointer and srcStatus is not Present, nil it out - if v := reflect.ValueOf(dst); v.Kind() == reflect.Ptr { - el := v.Elem() - if el.Kind() == reflect.Ptr { - el.Set(reflect.Zero(el.Type())) - return nil - } - } - - return fmt.Errorf("cannot assign %v %v into %T", srcVal, srcStatus, dst) -} - -func NullAssignTo(dst interface{}) error { - dstPtr := reflect.ValueOf(dst) - - // AssignTo dst must always be a pointer - if dstPtr.Kind() != reflect.Ptr { - return &nullAssignmentError{dst: dst} - } - - dstVal := dstPtr.Elem() - - switch dstVal.Kind() { - case reflect.Ptr, reflect.Slice, reflect.Map: - dstVal.Set(reflect.Zero(dstVal.Type())) - return nil - } - - return &nullAssignmentError{dst: dst} -} - -var kindTypes map[reflect.Kind]reflect.Type - -func toInterface(dst reflect.Value, t reflect.Type) (interface{}, bool) { - nextDst := dst.Convert(t) - return nextDst.Interface(), dst.Type() != nextDst.Type() -} - -// GetAssignToDstType attempts to convert dst to something AssignTo can assign -// to. If dst is a pointer to pointer it allocates a value and returns the -// dereferences pointer. If dst is a named type such as *Foo where Foo is type -// Foo int16, it converts dst to *int16. -// -// GetAssignToDstType returns the converted dst and a bool representing if any -// change was made. -func GetAssignToDstType(dst interface{}) (interface{}, bool) { - dstPtr := reflect.ValueOf(dst) - - // AssignTo dst must always be a pointer - if dstPtr.Kind() != reflect.Ptr { - return nil, false - } - - dstVal := dstPtr.Elem() - - // if dst is a pointer to pointer, allocate space try again with the dereferenced pointer - if dstVal.Kind() == reflect.Ptr { - dstVal.Set(reflect.New(dstVal.Type().Elem())) - return dstVal.Interface(), true - } - - // if dst is pointer to a base type that has been renamed - if baseValType, ok := kindTypes[dstVal.Kind()]; ok { - return toInterface(dstPtr, reflect.PtrTo(baseValType)) - } - - if dstVal.Kind() == reflect.Slice { - if baseElemType, ok := kindTypes[dstVal.Type().Elem().Kind()]; ok { - return toInterface(dstPtr, reflect.PtrTo(reflect.SliceOf(baseElemType))) - } - } - - if dstVal.Kind() == reflect.Array { - if baseElemType, ok := kindTypes[dstVal.Type().Elem().Kind()]; ok { - return toInterface(dstPtr, reflect.PtrTo(reflect.ArrayOf(dstVal.Len(), baseElemType))) - } - } - - if dstVal.Kind() == reflect.Struct { - if dstVal.Type().NumField() == 1 && dstVal.Type().Field(0).Anonymous { - dstPtr = dstVal.Field(0).Addr() - nested := dstVal.Type().Field(0).Type - if nested.Kind() == reflect.Array { - if baseElemType, ok := kindTypes[nested.Elem().Kind()]; ok { - return toInterface(dstPtr, reflect.PtrTo(reflect.ArrayOf(nested.Len(), baseElemType))) - } - } - if _, ok := kindTypes[nested.Kind()]; ok && dstPtr.CanInterface() { - return dstPtr.Interface(), true - } - } - } - - return nil, false -} - -func init() { - kindTypes = map[reflect.Kind]reflect.Type{ - reflect.Bool: reflect.TypeOf(false), - reflect.Float32: reflect.TypeOf(float32(0)), - reflect.Float64: reflect.TypeOf(float64(0)), - reflect.Int: reflect.TypeOf(int(0)), - reflect.Int8: reflect.TypeOf(int8(0)), - reflect.Int16: reflect.TypeOf(int16(0)), - reflect.Int32: reflect.TypeOf(int32(0)), - reflect.Int64: reflect.TypeOf(int64(0)), - reflect.Uint: reflect.TypeOf(uint(0)), - reflect.Uint8: reflect.TypeOf(uint8(0)), - reflect.Uint16: reflect.TypeOf(uint16(0)), - reflect.Uint32: reflect.TypeOf(uint32(0)), - reflect.Uint64: reflect.TypeOf(uint64(0)), - reflect.String: reflect.TypeOf(""), - } -} diff --git a/vendor/github.com/jackc/pgtype/database_sql.go b/vendor/github.com/jackc/pgtype/database_sql.go deleted file mode 100644 index 9d1cf82..0000000 --- a/vendor/github.com/jackc/pgtype/database_sql.go +++ /dev/null @@ -1,41 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "errors" -) - -func DatabaseSQLValue(ci *ConnInfo, src Value) (interface{}, error) { - if valuer, ok := src.(driver.Valuer); ok { - return valuer.Value() - } - - if textEncoder, ok := src.(TextEncoder); ok { - buf, err := textEncoder.EncodeText(ci, nil) - if err != nil { - return nil, err - } - return string(buf), nil - } - - if binaryEncoder, ok := src.(BinaryEncoder); ok { - buf, err := binaryEncoder.EncodeBinary(ci, nil) - if err != nil { - return nil, err - } - return buf, nil - } - - return nil, errors.New("cannot convert to database/sql compatible value") -} - -func EncodeValueText(src TextEncoder) (interface{}, error) { - buf, err := src.EncodeText(nil, make([]byte, 0, 32)) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - return string(buf), err -} diff --git a/vendor/github.com/jackc/pgtype/date.go b/vendor/github.com/jackc/pgtype/date.go deleted file mode 100644 index e68abf0..0000000 --- a/vendor/github.com/jackc/pgtype/date.go +++ /dev/null @@ -1,324 +0,0 @@ -package pgtype - -import ( - "database/sql" - "database/sql/driver" - "encoding/binary" - "encoding/json" - "fmt" - "strings" - "time" - - "github.com/jackc/pgio" -) - -type Date struct { - Time time.Time - Status Status - InfinityModifier InfinityModifier -} - -const ( - negativeInfinityDayOffset = -2147483648 - infinityDayOffset = 2147483647 -) - -func (dst *Date) Set(src interface{}) error { - if src == nil { - *dst = Date{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - if value, ok := src.(interface{ Value() (driver.Value, error) }); ok { - v, err := value.Value() - if err != nil { - return fmt.Errorf("cannot get value %v for Date: %v", value, err) - } - return dst.Set(v) - } - - switch value := src.(type) { - case time.Time: - *dst = Date{Time: value, Status: Present} - case *time.Time: - if value == nil { - *dst = Date{Status: Null} - } else { - return dst.Set(*value) - } - case string: - return dst.DecodeText(nil, []byte(value)) - case *string: - if value == nil { - *dst = Date{Status: Null} - } else { - return dst.Set(*value) - } - default: - if originalSrc, ok := underlyingTimeType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Date", value) - } - - return nil -} - -func (dst Date) Get() interface{} { - switch dst.Status { - case Present: - if dst.InfinityModifier != None { - return dst.InfinityModifier - } - return dst.Time - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Date) AssignTo(dst interface{}) error { - if scanner, ok := dst.(sql.Scanner); ok { - var err error - switch src.Status { - case Present: - if src.InfinityModifier != None { - err = scanner.Scan(src.InfinityModifier.String()) - } else { - err = scanner.Scan(src.Time) - } - case Null: - err = scanner.Scan(nil) - } - if err != nil { - return fmt.Errorf("unable assign %v to %T: %s", src, dst, err) - } - return nil - } - - switch src.Status { - case Present: - switch v := dst.(type) { - case *time.Time: - if src.InfinityModifier != None { - return fmt.Errorf("cannot assign %v to %T", src, dst) - } - *v = src.Time - return nil - default: - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - return fmt.Errorf("unable to assign to %T", dst) - } - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (dst *Date) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Date{Status: Null} - return nil - } - - sbuf := string(src) - switch sbuf { - case "infinity": - *dst = Date{Status: Present, InfinityModifier: Infinity} - case "-infinity": - *dst = Date{Status: Present, InfinityModifier: -Infinity} - default: - if strings.HasSuffix(sbuf, " BC") { - t, err := time.ParseInLocation("2006-01-02", strings.TrimRight(sbuf, " BC"), time.UTC) - t2 := time.Date(1-t.Year(), t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), t.Location()) - if err != nil { - return err - } - *dst = Date{Time: t2, Status: Present} - return nil - } - t, err := time.ParseInLocation("2006-01-02", sbuf, time.UTC) - if err != nil { - return err - } - - *dst = Date{Time: t, Status: Present} - } - - return nil -} - -func (dst *Date) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Date{Status: Null} - return nil - } - - if len(src) != 4 { - return fmt.Errorf("invalid length for date: %v", len(src)) - } - - dayOffset := int32(binary.BigEndian.Uint32(src)) - - switch dayOffset { - case infinityDayOffset: - *dst = Date{Status: Present, InfinityModifier: Infinity} - case negativeInfinityDayOffset: - *dst = Date{Status: Present, InfinityModifier: -Infinity} - default: - t := time.Date(2000, 1, int(1+dayOffset), 0, 0, 0, 0, time.UTC) - *dst = Date{Time: t, Status: Present} - } - - return nil -} - -func (src Date) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - var s string - - switch src.InfinityModifier { - case None: - s = src.Time.Format("2006-01-02") - case Infinity: - s = "infinity" - case NegativeInfinity: - s = "-infinity" - } - - return append(buf, s...), nil -} - -func (src Date) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - var daysSinceDateEpoch int32 - switch src.InfinityModifier { - case None: - tUnix := time.Date(src.Time.Year(), src.Time.Month(), src.Time.Day(), 0, 0, 0, 0, time.UTC).Unix() - dateEpoch := time.Date(2000, 1, 1, 0, 0, 0, 0, time.UTC).Unix() - - secSinceDateEpoch := tUnix - dateEpoch - daysSinceDateEpoch = int32(secSinceDateEpoch / 86400) - case Infinity: - daysSinceDateEpoch = infinityDayOffset - case NegativeInfinity: - daysSinceDateEpoch = negativeInfinityDayOffset - } - - return pgio.AppendInt32(buf, daysSinceDateEpoch), nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Date) Scan(src interface{}) error { - if src == nil { - *dst = Date{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - case time.Time: - *dst = Date{Time: src, Status: Present} - return nil - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Date) Value() (driver.Value, error) { - switch src.Status { - case Present: - if src.InfinityModifier != None { - return src.InfinityModifier.String(), nil - } - return src.Time, nil - case Null: - return nil, nil - default: - return nil, errUndefined - } -} - -func (src Date) MarshalJSON() ([]byte, error) { - switch src.Status { - case Null: - return []byte("null"), nil - case Undefined: - return nil, errUndefined - } - - if src.Status != Present { - return nil, errBadStatus - } - - var s string - - switch src.InfinityModifier { - case None: - s = src.Time.Format("2006-01-02") - case Infinity: - s = "infinity" - case NegativeInfinity: - s = "-infinity" - } - - return json.Marshal(s) -} - -func (dst *Date) UnmarshalJSON(b []byte) error { - var s *string - err := json.Unmarshal(b, &s) - if err != nil { - return err - } - - if s == nil { - *dst = Date{Status: Null} - return nil - } - - switch *s { - case "infinity": - *dst = Date{Status: Present, InfinityModifier: Infinity} - case "-infinity": - *dst = Date{Status: Present, InfinityModifier: -Infinity} - default: - t, err := time.ParseInLocation("2006-01-02", *s, time.UTC) - if err != nil { - return err - } - - *dst = Date{Time: t, Status: Present} - } - - return nil -} diff --git a/vendor/github.com/jackc/pgtype/date_array.go b/vendor/github.com/jackc/pgtype/date_array.go deleted file mode 100644 index 24152fa..0000000 --- a/vendor/github.com/jackc/pgtype/date_array.go +++ /dev/null @@ -1,518 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "reflect" - "time" - - "github.com/jackc/pgio" -) - -type DateArray struct { - Elements []Date - Dimensions []ArrayDimension - Status Status -} - -func (dst *DateArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = DateArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []time.Time: - if value == nil { - *dst = DateArray{Status: Null} - } else if len(value) == 0 { - *dst = DateArray{Status: Present} - } else { - elements := make([]Date, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = DateArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*time.Time: - if value == nil { - *dst = DateArray{Status: Null} - } else if len(value) == 0 { - *dst = DateArray{Status: Present} - } else { - elements := make([]Date, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = DateArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []Date: - if value == nil { - *dst = DateArray{Status: Null} - } else if len(value) == 0 { - *dst = DateArray{Status: Present} - } else { - *dst = DateArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = DateArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for DateArray", src) - } - if elementsLength == 0 { - *dst = DateArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to DateArray", src) - } - - *dst = DateArray{ - Elements: make([]Date, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]Date, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to DateArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *DateArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to DateArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in DateArray", err) - } - index++ - - return index, nil -} - -func (dst DateArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *DateArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]time.Time: - *v = make([]time.Time, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*time.Time: - *v = make([]*time.Time, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *DateArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from DateArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from DateArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *DateArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = DateArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []Date - - if len(uta.Elements) > 0 { - elements = make([]Date, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem Date - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = DateArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *DateArray) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = DateArray{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = DateArray{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]Date, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = DateArray{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src DateArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src DateArray) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("date"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "date") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *DateArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src DateArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/daterange.go b/vendor/github.com/jackc/pgtype/daterange.go deleted file mode 100644 index 63164a5..0000000 --- a/vendor/github.com/jackc/pgtype/daterange.go +++ /dev/null @@ -1,267 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "fmt" - - "github.com/jackc/pgio" -) - -type Daterange struct { - Lower Date - Upper Date - LowerType BoundType - UpperType BoundType - Status Status -} - -func (dst *Daterange) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = Daterange{Status: Null} - return nil - } - - switch value := src.(type) { - case Daterange: - *dst = value - case *Daterange: - *dst = *value - case string: - return dst.DecodeText(nil, []byte(value)) - default: - return fmt.Errorf("cannot convert %v to Daterange", src) - } - - return nil -} - -func (dst Daterange) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Daterange) AssignTo(dst interface{}) error { - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *Daterange) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Daterange{Status: Null} - return nil - } - - utr, err := ParseUntypedTextRange(string(src)) - if err != nil { - return err - } - - *dst = Daterange{Status: Present} - - dst.LowerType = utr.LowerType - dst.UpperType = utr.UpperType - - if dst.LowerType == Empty { - return nil - } - - if dst.LowerType == Inclusive || dst.LowerType == Exclusive { - if err := dst.Lower.DecodeText(ci, []byte(utr.Lower)); err != nil { - return err - } - } - - if dst.UpperType == Inclusive || dst.UpperType == Exclusive { - if err := dst.Upper.DecodeText(ci, []byte(utr.Upper)); err != nil { - return err - } - } - - return nil -} - -func (dst *Daterange) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Daterange{Status: Null} - return nil - } - - ubr, err := ParseUntypedBinaryRange(src) - if err != nil { - return err - } - - *dst = Daterange{Status: Present} - - dst.LowerType = ubr.LowerType - dst.UpperType = ubr.UpperType - - if dst.LowerType == Empty { - return nil - } - - if dst.LowerType == Inclusive || dst.LowerType == Exclusive { - if err := dst.Lower.DecodeBinary(ci, ubr.Lower); err != nil { - return err - } - } - - if dst.UpperType == Inclusive || dst.UpperType == Exclusive { - if err := dst.Upper.DecodeBinary(ci, ubr.Upper); err != nil { - return err - } - } - - return nil -} - -func (src Daterange) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - switch src.LowerType { - case Exclusive, Unbounded: - buf = append(buf, '(') - case Inclusive: - buf = append(buf, '[') - case Empty: - return append(buf, "empty"...), nil - default: - return nil, fmt.Errorf("unknown lower bound type %v", src.LowerType) - } - - var err error - - if src.LowerType != Unbounded { - buf, err = src.Lower.EncodeText(ci, buf) - if err != nil { - return nil, err - } else if buf == nil { - return nil, fmt.Errorf("Lower cannot be null unless LowerType is Unbounded") - } - } - - buf = append(buf, ',') - - if src.UpperType != Unbounded { - buf, err = src.Upper.EncodeText(ci, buf) - if err != nil { - return nil, err - } else if buf == nil { - return nil, fmt.Errorf("Upper cannot be null unless UpperType is Unbounded") - } - } - - switch src.UpperType { - case Exclusive, Unbounded: - buf = append(buf, ')') - case Inclusive: - buf = append(buf, ']') - default: - return nil, fmt.Errorf("unknown upper bound type %v", src.UpperType) - } - - return buf, nil -} - -func (src Daterange) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - var rangeType byte - switch src.LowerType { - case Inclusive: - rangeType |= lowerInclusiveMask - case Unbounded: - rangeType |= lowerUnboundedMask - case Exclusive: - case Empty: - return append(buf, emptyMask), nil - default: - return nil, fmt.Errorf("unknown LowerType: %v", src.LowerType) - } - - switch src.UpperType { - case Inclusive: - rangeType |= upperInclusiveMask - case Unbounded: - rangeType |= upperUnboundedMask - case Exclusive: - default: - return nil, fmt.Errorf("unknown UpperType: %v", src.UpperType) - } - - buf = append(buf, rangeType) - - var err error - - if src.LowerType != Unbounded { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - buf, err = src.Lower.EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if buf == nil { - return nil, fmt.Errorf("Lower cannot be null unless LowerType is Unbounded") - } - - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - - if src.UpperType != Unbounded { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - buf, err = src.Upper.EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if buf == nil { - return nil, fmt.Errorf("Upper cannot be null unless UpperType is Unbounded") - } - - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Daterange) Scan(src interface{}) error { - if src == nil { - *dst = Daterange{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Daterange) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/enum_array.go b/vendor/github.com/jackc/pgtype/enum_array.go deleted file mode 100644 index 59b5a3e..0000000 --- a/vendor/github.com/jackc/pgtype/enum_array.go +++ /dev/null @@ -1,428 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "fmt" - "reflect" -) - -type EnumArray struct { - Elements []GenericText - Dimensions []ArrayDimension - Status Status -} - -func (dst *EnumArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = EnumArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []string: - if value == nil { - *dst = EnumArray{Status: Null} - } else if len(value) == 0 { - *dst = EnumArray{Status: Present} - } else { - elements := make([]GenericText, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = EnumArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*string: - if value == nil { - *dst = EnumArray{Status: Null} - } else if len(value) == 0 { - *dst = EnumArray{Status: Present} - } else { - elements := make([]GenericText, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = EnumArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []GenericText: - if value == nil { - *dst = EnumArray{Status: Null} - } else if len(value) == 0 { - *dst = EnumArray{Status: Present} - } else { - *dst = EnumArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = EnumArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for EnumArray", src) - } - if elementsLength == 0 { - *dst = EnumArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to EnumArray", src) - } - - *dst = EnumArray{ - Elements: make([]GenericText, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]GenericText, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to EnumArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *EnumArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to EnumArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in EnumArray", err) - } - index++ - - return index, nil -} - -func (dst EnumArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *EnumArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]string: - *v = make([]string, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*string: - *v = make([]*string, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *EnumArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from EnumArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from EnumArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *EnumArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = EnumArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []GenericText - - if len(uta.Elements) > 0 { - elements = make([]GenericText, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem GenericText - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = EnumArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (src EnumArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *EnumArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src EnumArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/enum_type.go b/vendor/github.com/jackc/pgtype/enum_type.go deleted file mode 100644 index 5265782..0000000 --- a/vendor/github.com/jackc/pgtype/enum_type.go +++ /dev/null @@ -1,168 +0,0 @@ -package pgtype - -import "fmt" - -// EnumType represents an enum type. While it implements Value, this is only in service of its type conversion duties -// when registered as a data type in a ConnType. It should not be used directly as a Value. -type EnumType struct { - value string - status Status - - typeName string // PostgreSQL type name - members []string // enum members - membersMap map[string]string // map to quickly lookup member and reuse string instead of allocating -} - -// NewEnumType initializes a new EnumType. It retains a read-only reference to members. members must not be changed. -func NewEnumType(typeName string, members []string) *EnumType { - et := &EnumType{typeName: typeName, members: members} - et.membersMap = make(map[string]string, len(members)) - for _, m := range members { - et.membersMap[m] = m - } - return et -} - -func (et *EnumType) NewTypeValue() Value { - return &EnumType{ - value: et.value, - status: et.status, - - typeName: et.typeName, - members: et.members, - membersMap: et.membersMap, - } -} - -func (et *EnumType) TypeName() string { - return et.typeName -} - -func (et *EnumType) Members() []string { - return et.members -} - -// Set assigns src to dst. Set purposely does not check that src is a member. This allows continued error free -// operation in the event the PostgreSQL enum type is modified during a connection. -func (dst *EnumType) Set(src interface{}) error { - if src == nil { - dst.status = Null - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case string: - dst.value = value - dst.status = Present - case *string: - if value == nil { - dst.status = Null - } else { - dst.value = *value - dst.status = Present - } - case []byte: - if value == nil { - dst.status = Null - } else { - dst.value = string(value) - dst.status = Present - } - default: - if originalSrc, ok := underlyingStringType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to enum %s", value, dst.typeName) - } - - return nil -} - -func (dst EnumType) Get() interface{} { - switch dst.status { - case Present: - return dst.value - case Null: - return nil - default: - return dst.status - } -} - -func (src *EnumType) AssignTo(dst interface{}) error { - switch src.status { - case Present: - switch v := dst.(type) { - case *string: - *v = src.value - return nil - case *[]byte: - *v = make([]byte, len(src.value)) - copy(*v, src.value) - return nil - default: - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - return fmt.Errorf("unable to assign to %T", dst) - } - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (EnumType) PreferredResultFormat() int16 { - return TextFormatCode -} - -func (dst *EnumType) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - dst.status = Null - return nil - } - - // Lookup the string in membersMap to avoid an allocation. - if s, found := dst.membersMap[string(src)]; found { - dst.value = s - } else { - // If an enum type is modified after the initial connection it is possible to receive an unexpected value. - // Gracefully handle this situation. Purposely NOT modifying members and membersMap to allow for sharing members - // and membersMap between connections. - dst.value = string(src) - } - dst.status = Present - - return nil -} - -func (dst *EnumType) DecodeBinary(ci *ConnInfo, src []byte) error { - return dst.DecodeText(ci, src) -} - -func (EnumType) PreferredParamFormat() int16 { - return TextFormatCode -} - -func (src EnumType) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return append(buf, src.value...), nil -} - -func (src EnumType) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - return src.EncodeText(ci, buf) -} diff --git a/vendor/github.com/jackc/pgtype/float4.go b/vendor/github.com/jackc/pgtype/float4.go deleted file mode 100644 index 89b9e8f..0000000 --- a/vendor/github.com/jackc/pgtype/float4.go +++ /dev/null @@ -1,282 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "math" - "strconv" - - "github.com/jackc/pgio" -) - -type Float4 struct { - Float float32 - Status Status -} - -func (dst *Float4) Set(src interface{}) error { - if src == nil { - *dst = Float4{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case float32: - *dst = Float4{Float: value, Status: Present} - case float64: - *dst = Float4{Float: float32(value), Status: Present} - case int8: - *dst = Float4{Float: float32(value), Status: Present} - case uint8: - *dst = Float4{Float: float32(value), Status: Present} - case int16: - *dst = Float4{Float: float32(value), Status: Present} - case uint16: - *dst = Float4{Float: float32(value), Status: Present} - case int32: - f32 := float32(value) - if int32(f32) == value { - *dst = Float4{Float: f32, Status: Present} - } else { - return fmt.Errorf("%v cannot be exactly represented as float32", value) - } - case uint32: - f32 := float32(value) - if uint32(f32) == value { - *dst = Float4{Float: f32, Status: Present} - } else { - return fmt.Errorf("%v cannot be exactly represented as float32", value) - } - case int64: - f32 := float32(value) - if int64(f32) == value { - *dst = Float4{Float: f32, Status: Present} - } else { - return fmt.Errorf("%v cannot be exactly represented as float32", value) - } - case uint64: - f32 := float32(value) - if uint64(f32) == value { - *dst = Float4{Float: f32, Status: Present} - } else { - return fmt.Errorf("%v cannot be exactly represented as float32", value) - } - case int: - f32 := float32(value) - if int(f32) == value { - *dst = Float4{Float: f32, Status: Present} - } else { - return fmt.Errorf("%v cannot be exactly represented as float32", value) - } - case uint: - f32 := float32(value) - if uint(f32) == value { - *dst = Float4{Float: f32, Status: Present} - } else { - return fmt.Errorf("%v cannot be exactly represented as float32", value) - } - case string: - num, err := strconv.ParseFloat(value, 32) - if err != nil { - return err - } - *dst = Float4{Float: float32(num), Status: Present} - case *float64: - if value == nil { - *dst = Float4{Status: Null} - } else { - return dst.Set(*value) - } - case *float32: - if value == nil { - *dst = Float4{Status: Null} - } else { - return dst.Set(*value) - } - case *int8: - if value == nil { - *dst = Float4{Status: Null} - } else { - return dst.Set(*value) - } - case *uint8: - if value == nil { - *dst = Float4{Status: Null} - } else { - return dst.Set(*value) - } - case *int16: - if value == nil { - *dst = Float4{Status: Null} - } else { - return dst.Set(*value) - } - case *uint16: - if value == nil { - *dst = Float4{Status: Null} - } else { - return dst.Set(*value) - } - case *int32: - if value == nil { - *dst = Float4{Status: Null} - } else { - return dst.Set(*value) - } - case *uint32: - if value == nil { - *dst = Float4{Status: Null} - } else { - return dst.Set(*value) - } - case *int64: - if value == nil { - *dst = Float4{Status: Null} - } else { - return dst.Set(*value) - } - case *uint64: - if value == nil { - *dst = Float4{Status: Null} - } else { - return dst.Set(*value) - } - case *int: - if value == nil { - *dst = Float4{Status: Null} - } else { - return dst.Set(*value) - } - case *uint: - if value == nil { - *dst = Float4{Status: Null} - } else { - return dst.Set(*value) - } - case *string: - if value == nil { - *dst = Float4{Status: Null} - } else { - return dst.Set(*value) - } - default: - if originalSrc, ok := underlyingNumberType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Float8", value) - } - - return nil -} - -func (dst Float4) Get() interface{} { - switch dst.Status { - case Present: - return dst.Float - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Float4) AssignTo(dst interface{}) error { - return float64AssignTo(float64(src.Float), src.Status, dst) -} - -func (dst *Float4) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Float4{Status: Null} - return nil - } - - n, err := strconv.ParseFloat(string(src), 32) - if err != nil { - return err - } - - *dst = Float4{Float: float32(n), Status: Present} - return nil -} - -func (dst *Float4) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Float4{Status: Null} - return nil - } - - if len(src) != 4 { - return fmt.Errorf("invalid length for float4: %v", len(src)) - } - - n := int32(binary.BigEndian.Uint32(src)) - - *dst = Float4{Float: math.Float32frombits(uint32(n)), Status: Present} - return nil -} - -func (src Float4) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = append(buf, strconv.FormatFloat(float64(src.Float), 'f', -1, 32)...) - return buf, nil -} - -func (src Float4) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = pgio.AppendUint32(buf, math.Float32bits(src.Float)) - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Float4) Scan(src interface{}) error { - if src == nil { - *dst = Float4{Status: Null} - return nil - } - - switch src := src.(type) { - case float64: - *dst = Float4{Float: float32(src), Status: Present} - return nil - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Float4) Value() (driver.Value, error) { - switch src.Status { - case Present: - return float64(src.Float), nil - case Null: - return nil, nil - default: - return nil, errUndefined - } -} diff --git a/vendor/github.com/jackc/pgtype/float4_array.go b/vendor/github.com/jackc/pgtype/float4_array.go deleted file mode 100644 index 41f2ec8..0000000 --- a/vendor/github.com/jackc/pgtype/float4_array.go +++ /dev/null @@ -1,517 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "reflect" - - "github.com/jackc/pgio" -) - -type Float4Array struct { - Elements []Float4 - Dimensions []ArrayDimension - Status Status -} - -func (dst *Float4Array) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = Float4Array{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []float32: - if value == nil { - *dst = Float4Array{Status: Null} - } else if len(value) == 0 { - *dst = Float4Array{Status: Present} - } else { - elements := make([]Float4, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Float4Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*float32: - if value == nil { - *dst = Float4Array{Status: Null} - } else if len(value) == 0 { - *dst = Float4Array{Status: Present} - } else { - elements := make([]Float4, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Float4Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []Float4: - if value == nil { - *dst = Float4Array{Status: Null} - } else if len(value) == 0 { - *dst = Float4Array{Status: Present} - } else { - *dst = Float4Array{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = Float4Array{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for Float4Array", src) - } - if elementsLength == 0 { - *dst = Float4Array{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Float4Array", src) - } - - *dst = Float4Array{ - Elements: make([]Float4, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]Float4, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to Float4Array, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *Float4Array) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to Float4Array") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in Float4Array", err) - } - index++ - - return index, nil -} - -func (dst Float4Array) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Float4Array) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]float32: - *v = make([]float32, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*float32: - *v = make([]*float32, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *Float4Array) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from Float4Array") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from Float4Array") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *Float4Array) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Float4Array{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []Float4 - - if len(uta.Elements) > 0 { - elements = make([]Float4, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem Float4 - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = Float4Array{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *Float4Array) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Float4Array{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = Float4Array{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]Float4, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = Float4Array{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src Float4Array) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src Float4Array) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("float4"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "float4") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Float4Array) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Float4Array) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/float8.go b/vendor/github.com/jackc/pgtype/float8.go deleted file mode 100644 index 6297ab5..0000000 --- a/vendor/github.com/jackc/pgtype/float8.go +++ /dev/null @@ -1,272 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "math" - "strconv" - - "github.com/jackc/pgio" -) - -type Float8 struct { - Float float64 - Status Status -} - -func (dst *Float8) Set(src interface{}) error { - if src == nil { - *dst = Float8{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case float32: - *dst = Float8{Float: float64(value), Status: Present} - case float64: - *dst = Float8{Float: value, Status: Present} - case int8: - *dst = Float8{Float: float64(value), Status: Present} - case uint8: - *dst = Float8{Float: float64(value), Status: Present} - case int16: - *dst = Float8{Float: float64(value), Status: Present} - case uint16: - *dst = Float8{Float: float64(value), Status: Present} - case int32: - *dst = Float8{Float: float64(value), Status: Present} - case uint32: - *dst = Float8{Float: float64(value), Status: Present} - case int64: - f64 := float64(value) - if int64(f64) == value { - *dst = Float8{Float: f64, Status: Present} - } else { - return fmt.Errorf("%v cannot be exactly represented as float64", value) - } - case uint64: - f64 := float64(value) - if uint64(f64) == value { - *dst = Float8{Float: f64, Status: Present} - } else { - return fmt.Errorf("%v cannot be exactly represented as float64", value) - } - case int: - f64 := float64(value) - if int(f64) == value { - *dst = Float8{Float: f64, Status: Present} - } else { - return fmt.Errorf("%v cannot be exactly represented as float64", value) - } - case uint: - f64 := float64(value) - if uint(f64) == value { - *dst = Float8{Float: f64, Status: Present} - } else { - return fmt.Errorf("%v cannot be exactly represented as float64", value) - } - case string: - num, err := strconv.ParseFloat(value, 64) - if err != nil { - return err - } - *dst = Float8{Float: float64(num), Status: Present} - case *float64: - if value == nil { - *dst = Float8{Status: Null} - } else { - return dst.Set(*value) - } - case *float32: - if value == nil { - *dst = Float8{Status: Null} - } else { - return dst.Set(*value) - } - case *int8: - if value == nil { - *dst = Float8{Status: Null} - } else { - return dst.Set(*value) - } - case *uint8: - if value == nil { - *dst = Float8{Status: Null} - } else { - return dst.Set(*value) - } - case *int16: - if value == nil { - *dst = Float8{Status: Null} - } else { - return dst.Set(*value) - } - case *uint16: - if value == nil { - *dst = Float8{Status: Null} - } else { - return dst.Set(*value) - } - case *int32: - if value == nil { - *dst = Float8{Status: Null} - } else { - return dst.Set(*value) - } - case *uint32: - if value == nil { - *dst = Float8{Status: Null} - } else { - return dst.Set(*value) - } - case *int64: - if value == nil { - *dst = Float8{Status: Null} - } else { - return dst.Set(*value) - } - case *uint64: - if value == nil { - *dst = Float8{Status: Null} - } else { - return dst.Set(*value) - } - case *int: - if value == nil { - *dst = Float8{Status: Null} - } else { - return dst.Set(*value) - } - case *uint: - if value == nil { - *dst = Float8{Status: Null} - } else { - return dst.Set(*value) - } - case *string: - if value == nil { - *dst = Float8{Status: Null} - } else { - return dst.Set(*value) - } - default: - if originalSrc, ok := underlyingNumberType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Float8", value) - } - - return nil -} - -func (dst Float8) Get() interface{} { - switch dst.Status { - case Present: - return dst.Float - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Float8) AssignTo(dst interface{}) error { - return float64AssignTo(src.Float, src.Status, dst) -} - -func (dst *Float8) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Float8{Status: Null} - return nil - } - - n, err := strconv.ParseFloat(string(src), 64) - if err != nil { - return err - } - - *dst = Float8{Float: n, Status: Present} - return nil -} - -func (dst *Float8) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Float8{Status: Null} - return nil - } - - if len(src) != 8 { - return fmt.Errorf("invalid length for float8: %v", len(src)) - } - - n := int64(binary.BigEndian.Uint64(src)) - - *dst = Float8{Float: math.Float64frombits(uint64(n)), Status: Present} - return nil -} - -func (src Float8) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = append(buf, strconv.FormatFloat(float64(src.Float), 'f', -1, 64)...) - return buf, nil -} - -func (src Float8) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = pgio.AppendUint64(buf, math.Float64bits(src.Float)) - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Float8) Scan(src interface{}) error { - if src == nil { - *dst = Float8{Status: Null} - return nil - } - - switch src := src.(type) { - case float64: - *dst = Float8{Float: src, Status: Present} - return nil - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Float8) Value() (driver.Value, error) { - switch src.Status { - case Present: - return src.Float, nil - case Null: - return nil, nil - default: - return nil, errUndefined - } -} diff --git a/vendor/github.com/jackc/pgtype/float8_array.go b/vendor/github.com/jackc/pgtype/float8_array.go deleted file mode 100644 index 836ee19..0000000 --- a/vendor/github.com/jackc/pgtype/float8_array.go +++ /dev/null @@ -1,517 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "reflect" - - "github.com/jackc/pgio" -) - -type Float8Array struct { - Elements []Float8 - Dimensions []ArrayDimension - Status Status -} - -func (dst *Float8Array) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = Float8Array{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []float64: - if value == nil { - *dst = Float8Array{Status: Null} - } else if len(value) == 0 { - *dst = Float8Array{Status: Present} - } else { - elements := make([]Float8, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Float8Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*float64: - if value == nil { - *dst = Float8Array{Status: Null} - } else if len(value) == 0 { - *dst = Float8Array{Status: Present} - } else { - elements := make([]Float8, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Float8Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []Float8: - if value == nil { - *dst = Float8Array{Status: Null} - } else if len(value) == 0 { - *dst = Float8Array{Status: Present} - } else { - *dst = Float8Array{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = Float8Array{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for Float8Array", src) - } - if elementsLength == 0 { - *dst = Float8Array{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Float8Array", src) - } - - *dst = Float8Array{ - Elements: make([]Float8, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]Float8, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to Float8Array, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *Float8Array) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to Float8Array") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in Float8Array", err) - } - index++ - - return index, nil -} - -func (dst Float8Array) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Float8Array) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]float64: - *v = make([]float64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*float64: - *v = make([]*float64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *Float8Array) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from Float8Array") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from Float8Array") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *Float8Array) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Float8Array{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []Float8 - - if len(uta.Elements) > 0 { - elements = make([]Float8, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem Float8 - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = Float8Array{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *Float8Array) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Float8Array{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = Float8Array{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]Float8, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = Float8Array{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src Float8Array) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src Float8Array) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("float8"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "float8") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Float8Array) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Float8Array) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/generic_binary.go b/vendor/github.com/jackc/pgtype/generic_binary.go deleted file mode 100644 index 76a1d35..0000000 --- a/vendor/github.com/jackc/pgtype/generic_binary.go +++ /dev/null @@ -1,39 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" -) - -// GenericBinary is a placeholder for binary format values that no other type exists -// to handle. -type GenericBinary Bytea - -func (dst *GenericBinary) Set(src interface{}) error { - return (*Bytea)(dst).Set(src) -} - -func (dst GenericBinary) Get() interface{} { - return (Bytea)(dst).Get() -} - -func (src *GenericBinary) AssignTo(dst interface{}) error { - return (*Bytea)(src).AssignTo(dst) -} - -func (dst *GenericBinary) DecodeBinary(ci *ConnInfo, src []byte) error { - return (*Bytea)(dst).DecodeBinary(ci, src) -} - -func (src GenericBinary) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - return (Bytea)(src).EncodeBinary(ci, buf) -} - -// Scan implements the database/sql Scanner interface. -func (dst *GenericBinary) Scan(src interface{}) error { - return (*Bytea)(dst).Scan(src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src GenericBinary) Value() (driver.Value, error) { - return (Bytea)(src).Value() -} diff --git a/vendor/github.com/jackc/pgtype/generic_text.go b/vendor/github.com/jackc/pgtype/generic_text.go deleted file mode 100644 index dbf5b47..0000000 --- a/vendor/github.com/jackc/pgtype/generic_text.go +++ /dev/null @@ -1,39 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" -) - -// GenericText is a placeholder for text format values that no other type exists -// to handle. -type GenericText Text - -func (dst *GenericText) Set(src interface{}) error { - return (*Text)(dst).Set(src) -} - -func (dst GenericText) Get() interface{} { - return (Text)(dst).Get() -} - -func (src *GenericText) AssignTo(dst interface{}) error { - return (*Text)(src).AssignTo(dst) -} - -func (dst *GenericText) DecodeText(ci *ConnInfo, src []byte) error { - return (*Text)(dst).DecodeText(ci, src) -} - -func (src GenericText) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - return (Text)(src).EncodeText(ci, buf) -} - -// Scan implements the database/sql Scanner interface. -func (dst *GenericText) Scan(src interface{}) error { - return (*Text)(dst).Scan(src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src GenericText) Value() (driver.Value, error) { - return (Text)(src).Value() -} diff --git a/vendor/github.com/jackc/pgtype/hstore.go b/vendor/github.com/jackc/pgtype/hstore.go deleted file mode 100644 index e42b755..0000000 --- a/vendor/github.com/jackc/pgtype/hstore.go +++ /dev/null @@ -1,465 +0,0 @@ -package pgtype - -import ( - "bytes" - "database/sql/driver" - "encoding/binary" - "errors" - "fmt" - "strings" - "unicode" - "unicode/utf8" - - "github.com/jackc/pgio" -) - -// Hstore represents an hstore column that can be null or have null values -// associated with its keys. -type Hstore struct { - Map map[string]Text - Status Status -} - -func (dst *Hstore) Set(src interface{}) error { - if src == nil { - *dst = Hstore{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case map[string]string: - m := make(map[string]Text, len(value)) - for k, v := range value { - m[k] = Text{String: v, Status: Present} - } - *dst = Hstore{Map: m, Status: Present} - case map[string]*string: - m := make(map[string]Text, len(value)) - for k, v := range value { - if v == nil { - m[k] = Text{Status: Null} - } else { - m[k] = Text{String: *v, Status: Present} - } - } - *dst = Hstore{Map: m, Status: Present} - case map[string]Text: - *dst = Hstore{Map: value, Status: Present} - default: - return fmt.Errorf("cannot convert %v to Hstore", src) - } - - return nil -} - -func (dst Hstore) Get() interface{} { - switch dst.Status { - case Present: - return dst.Map - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Hstore) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - switch v := dst.(type) { - case *map[string]string: - *v = make(map[string]string, len(src.Map)) - for k, val := range src.Map { - if val.Status != Present { - return fmt.Errorf("cannot decode %#v into %T", src, dst) - } - (*v)[k] = val.String - } - return nil - case *map[string]*string: - *v = make(map[string]*string, len(src.Map)) - for k, val := range src.Map { - switch val.Status { - case Null: - (*v)[k] = nil - case Present: - str := val.String - (*v)[k] = &str - default: - return fmt.Errorf("cannot decode %#v into %T", src, dst) - } - } - return nil - default: - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - return fmt.Errorf("unable to assign to %T", dst) - } - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (dst *Hstore) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Hstore{Status: Null} - return nil - } - - keys, values, err := parseHstore(string(src)) - if err != nil { - return err - } - - m := make(map[string]Text, len(keys)) - for i := range keys { - m[keys[i]] = values[i] - } - - *dst = Hstore{Map: m, Status: Present} - return nil -} - -func (dst *Hstore) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Hstore{Status: Null} - return nil - } - - rp := 0 - - if len(src[rp:]) < 4 { - return fmt.Errorf("hstore incomplete %v", src) - } - pairCount := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - - m := make(map[string]Text, pairCount) - - for i := 0; i < pairCount; i++ { - if len(src[rp:]) < 4 { - return fmt.Errorf("hstore incomplete %v", src) - } - keyLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - - if len(src[rp:]) < keyLen { - return fmt.Errorf("hstore incomplete %v", src) - } - key := string(src[rp : rp+keyLen]) - rp += keyLen - - if len(src[rp:]) < 4 { - return fmt.Errorf("hstore incomplete %v", src) - } - valueLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - - var valueBuf []byte - if valueLen >= 0 { - valueBuf = src[rp : rp+valueLen] - rp += valueLen - } - - var value Text - err := value.DecodeBinary(ci, valueBuf) - if err != nil { - return err - } - m[key] = value - } - - *dst = Hstore{Map: m, Status: Present} - - return nil -} - -func (src Hstore) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - firstPair := true - - inElemBuf := make([]byte, 0, 32) - for k, v := range src.Map { - if firstPair { - firstPair = false - } else { - buf = append(buf, ',') - } - - buf = append(buf, quoteHstoreElementIfNeeded(k)...) - buf = append(buf, "=>"...) - - elemBuf, err := v.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - - if elemBuf == nil { - buf = append(buf, "NULL"...) - } else { - buf = append(buf, quoteHstoreElementIfNeeded(string(elemBuf))...) - } - } - - return buf, nil -} - -func (src Hstore) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = pgio.AppendInt32(buf, int32(len(src.Map))) - - var err error - for k, v := range src.Map { - buf = pgio.AppendInt32(buf, int32(len(k))) - buf = append(buf, k...) - - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := v.EncodeText(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, err -} - -var quoteHstoreReplacer = strings.NewReplacer(`\`, `\\`, `"`, `\"`) - -func quoteHstoreElement(src string) string { - return `"` + quoteArrayReplacer.Replace(src) + `"` -} - -func quoteHstoreElementIfNeeded(src string) string { - if src == "" || (len(src) == 4 && strings.ToLower(src) == "null") || strings.ContainsAny(src, ` {},"\=>`) { - return quoteArrayElement(src) - } - return src -} - -const ( - hsPre = iota - hsKey - hsSep - hsVal - hsNul - hsNext -) - -type hstoreParser struct { - str string - pos int -} - -func newHSP(in string) *hstoreParser { - return &hstoreParser{ - pos: 0, - str: in, - } -} - -func (p *hstoreParser) Consume() (r rune, end bool) { - if p.pos >= len(p.str) { - end = true - return - } - r, w := utf8.DecodeRuneInString(p.str[p.pos:]) - p.pos += w - return -} - -func (p *hstoreParser) Peek() (r rune, end bool) { - if p.pos >= len(p.str) { - end = true - return - } - r, _ = utf8.DecodeRuneInString(p.str[p.pos:]) - return -} - -// parseHstore parses the string representation of an hstore column (the same -// you would get from an ordinary SELECT) into two slices of keys and values. it -// is used internally in the default parsing of hstores. -func parseHstore(s string) (k []string, v []Text, err error) { - if s == "" { - return - } - - buf := bytes.Buffer{} - keys := []string{} - values := []Text{} - p := newHSP(s) - - r, end := p.Consume() - state := hsPre - - for !end { - switch state { - case hsPre: - if r == '"' { - state = hsKey - } else { - err = errors.New("String does not begin with \"") - } - case hsKey: - switch r { - case '"': //End of the key - keys = append(keys, buf.String()) - buf = bytes.Buffer{} - state = hsSep - case '\\': //Potential escaped character - n, end := p.Consume() - switch { - case end: - err = errors.New("Found EOS in key, expecting character or \"") - case n == '"', n == '\\': - buf.WriteRune(n) - default: - buf.WriteRune(r) - buf.WriteRune(n) - } - default: //Any other character - buf.WriteRune(r) - } - case hsSep: - if r == '=' { - r, end = p.Consume() - switch { - case end: - err = errors.New("Found EOS after '=', expecting '>'") - case r == '>': - r, end = p.Consume() - switch { - case end: - err = errors.New("Found EOS after '=>', expecting '\"' or 'NULL'") - case r == '"': - state = hsVal - case r == 'N': - state = hsNul - default: - err = fmt.Errorf("Invalid character '%c' after '=>', expecting '\"' or 'NULL'", r) - } - default: - err = fmt.Errorf("Invalid character after '=', expecting '>'") - } - } else { - err = fmt.Errorf("Invalid character '%c' after value, expecting '='", r) - } - case hsVal: - switch r { - case '"': //End of the value - values = append(values, Text{String: buf.String(), Status: Present}) - buf = bytes.Buffer{} - state = hsNext - case '\\': //Potential escaped character - n, end := p.Consume() - switch { - case end: - err = errors.New("Found EOS in key, expecting character or \"") - case n == '"', n == '\\': - buf.WriteRune(n) - default: - buf.WriteRune(r) - buf.WriteRune(n) - } - default: //Any other character - buf.WriteRune(r) - } - case hsNul: - nulBuf := make([]rune, 3) - nulBuf[0] = r - for i := 1; i < 3; i++ { - r, end = p.Consume() - if end { - err = errors.New("Found EOS in NULL value") - return - } - nulBuf[i] = r - } - if nulBuf[0] == 'U' && nulBuf[1] == 'L' && nulBuf[2] == 'L' { - values = append(values, Text{Status: Null}) - state = hsNext - } else { - err = fmt.Errorf("Invalid NULL value: 'N%s'", string(nulBuf)) - } - case hsNext: - if r == ',' { - r, end = p.Consume() - switch { - case end: - err = errors.New("Found EOS after ',', expecting space") - case (unicode.IsSpace(r)): - r, end = p.Consume() - state = hsKey - default: - err = fmt.Errorf("Invalid character '%c' after ', ', expecting \"", r) - } - } else { - err = fmt.Errorf("Invalid character '%c' after value, expecting ','", r) - } - } - - if err != nil { - return - } - r, end = p.Consume() - } - if state != hsNext { - err = errors.New("Improperly formatted hstore") - return - } - k = keys - v = values - return -} - -// Scan implements the database/sql Scanner interface. -func (dst *Hstore) Scan(src interface{}) error { - if src == nil { - *dst = Hstore{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Hstore) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/hstore_array.go b/vendor/github.com/jackc/pgtype/hstore_array.go deleted file mode 100644 index 47b4b3f..0000000 --- a/vendor/github.com/jackc/pgtype/hstore_array.go +++ /dev/null @@ -1,489 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "reflect" - - "github.com/jackc/pgio" -) - -type HstoreArray struct { - Elements []Hstore - Dimensions []ArrayDimension - Status Status -} - -func (dst *HstoreArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = HstoreArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []map[string]string: - if value == nil { - *dst = HstoreArray{Status: Null} - } else if len(value) == 0 { - *dst = HstoreArray{Status: Present} - } else { - elements := make([]Hstore, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = HstoreArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []Hstore: - if value == nil { - *dst = HstoreArray{Status: Null} - } else if len(value) == 0 { - *dst = HstoreArray{Status: Present} - } else { - *dst = HstoreArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = HstoreArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for HstoreArray", src) - } - if elementsLength == 0 { - *dst = HstoreArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to HstoreArray", src) - } - - *dst = HstoreArray{ - Elements: make([]Hstore, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]Hstore, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to HstoreArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *HstoreArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to HstoreArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in HstoreArray", err) - } - index++ - - return index, nil -} - -func (dst HstoreArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *HstoreArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]map[string]string: - *v = make([]map[string]string, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *HstoreArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from HstoreArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from HstoreArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *HstoreArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = HstoreArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []Hstore - - if len(uta.Elements) > 0 { - elements = make([]Hstore, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem Hstore - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = HstoreArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *HstoreArray) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = HstoreArray{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = HstoreArray{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]Hstore, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = HstoreArray{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src HstoreArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src HstoreArray) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("hstore"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "hstore") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *HstoreArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src HstoreArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/inet.go b/vendor/github.com/jackc/pgtype/inet.go deleted file mode 100644 index 976f0d7..0000000 --- a/vendor/github.com/jackc/pgtype/inet.go +++ /dev/null @@ -1,304 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding" - "fmt" - "net" - "strings" -) - -// Network address family is dependent on server socket.h value for AF_INET. -// In practice, all platforms appear to have the same value. See -// src/include/utils/inet.h for more information. -const ( - defaultAFInet = 2 - defaultAFInet6 = 3 -) - -// Inet represents both inet and cidr PostgreSQL types. -type Inet struct { - IPNet *net.IPNet - Status Status -} - -func (dst *Inet) Set(src interface{}) error { - if src == nil { - *dst = Inet{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case net.IPNet: - *dst = Inet{IPNet: &value, Status: Present} - case net.IP: - if len(value) == 0 { - *dst = Inet{Status: Null} - } else { - bitCount := len(value) * 8 - mask := net.CIDRMask(bitCount, bitCount) - *dst = Inet{IPNet: &net.IPNet{Mask: mask, IP: value}, Status: Present} - } - case string: - ip, ipnet, err := net.ParseCIDR(value) - if err != nil { - ip := net.ParseIP(value) - if ip == nil { - return fmt.Errorf("unable to parse inet address: %s", value) - } - - if ipv4 := maybeGetIPv4(value, ip); ipv4 != nil { - ipnet = &net.IPNet{IP: ipv4, Mask: net.CIDRMask(32, 32)} - } else { - ipnet = &net.IPNet{IP: ip, Mask: net.CIDRMask(128, 128)} - } - } else { - ipnet.IP = ip - if ipv4 := maybeGetIPv4(value, ipnet.IP); ipv4 != nil { - ipnet.IP = ipv4 - if len(ipnet.Mask) == 16 { - ipnet.Mask = ipnet.Mask[12:] // Not sure this is ever needed. - } - } - } - - *dst = Inet{IPNet: ipnet, Status: Present} - case *net.IPNet: - if value == nil { - *dst = Inet{Status: Null} - } else { - return dst.Set(*value) - } - case *net.IP: - if value == nil { - *dst = Inet{Status: Null} - } else { - return dst.Set(*value) - } - case *string: - if value == nil { - *dst = Inet{Status: Null} - } else { - return dst.Set(*value) - } - default: - if tv, ok := src.(encoding.TextMarshaler); ok { - text, err := tv.MarshalText() - if err != nil { - return fmt.Errorf("cannot marshal %v: %w", value, err) - } - return dst.Set(string(text)) - } - if sv, ok := src.(fmt.Stringer); ok { - return dst.Set(sv.String()) - } - if originalSrc, ok := underlyingPtrType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Inet", value) - } - - return nil -} - -// Convert the net.IP to IPv4, if appropriate. -// -// When parsing a string to a net.IP using net.ParseIP() and the like, we get a -// 16 byte slice for IPv4 addresses as well as IPv6 addresses. This function -// calls To4() to convert them to a 4 byte slice. This is useful as it allows -// users of the net.IP check for IPv4 addresses based on the length and makes -// it clear we are handling IPv4 as opposed to IPv6 or IPv4-mapped IPv6 -// addresses. -func maybeGetIPv4(input string, ip net.IP) net.IP { - // Do not do this if the provided input looks like IPv6. This is because - // To4() on IPv4-mapped IPv6 addresses converts them to IPv4, which behave - // different in some cases. - if strings.Contains(input, ":") { - return nil - } - - return ip.To4() -} - -func (dst Inet) Get() interface{} { - switch dst.Status { - case Present: - return dst.IPNet - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Inet) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - switch v := dst.(type) { - case *net.IPNet: - *v = net.IPNet{ - IP: make(net.IP, len(src.IPNet.IP)), - Mask: make(net.IPMask, len(src.IPNet.Mask)), - } - copy(v.IP, src.IPNet.IP) - copy(v.Mask, src.IPNet.Mask) - return nil - case *net.IP: - if oneCount, bitCount := src.IPNet.Mask.Size(); oneCount != bitCount { - return fmt.Errorf("cannot assign %v to %T", src, dst) - } - *v = make(net.IP, len(src.IPNet.IP)) - copy(*v, src.IPNet.IP) - return nil - default: - if tv, ok := dst.(encoding.TextUnmarshaler); ok { - if err := tv.UnmarshalText([]byte(src.IPNet.String())); err != nil { - return fmt.Errorf("cannot unmarshal %v to %T: %w", src, dst, err) - } - return nil - } - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - return fmt.Errorf("unable to assign to %T", dst) - } - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (dst *Inet) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Inet{Status: Null} - return nil - } - - var ipnet *net.IPNet - var err error - - if ip := net.ParseIP(string(src)); ip != nil { - if ipv4 := ip.To4(); ipv4 != nil { - ip = ipv4 - } - bitCount := len(ip) * 8 - mask := net.CIDRMask(bitCount, bitCount) - ipnet = &net.IPNet{Mask: mask, IP: ip} - } else { - ip, ipnet, err = net.ParseCIDR(string(src)) - if err != nil { - return err - } - if ipv4 := ip.To4(); ipv4 != nil { - ip = ipv4 - } - ones, _ := ipnet.Mask.Size() - *ipnet = net.IPNet{IP: ip, Mask: net.CIDRMask(ones, len(ip)*8)} - } - - *dst = Inet{IPNet: ipnet, Status: Present} - return nil -} - -func (dst *Inet) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Inet{Status: Null} - return nil - } - - if len(src) != 8 && len(src) != 20 { - return fmt.Errorf("Received an invalid size for an inet: %d", len(src)) - } - - // ignore family - bits := src[1] - // ignore is_cidr - addressLength := src[3] - - var ipnet net.IPNet - ipnet.IP = make(net.IP, int(addressLength)) - copy(ipnet.IP, src[4:]) - if ipv4 := ipnet.IP.To4(); ipv4 != nil { - ipnet.IP = ipv4 - } - ipnet.Mask = net.CIDRMask(int(bits), len(ipnet.IP)*8) - - *dst = Inet{IPNet: &ipnet, Status: Present} - - return nil -} - -func (src Inet) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return append(buf, src.IPNet.String()...), nil -} - -// EncodeBinary encodes src into w. -func (src Inet) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - var family byte - switch len(src.IPNet.IP) { - case net.IPv4len: - family = defaultAFInet - case net.IPv6len: - family = defaultAFInet6 - default: - return nil, fmt.Errorf("Unexpected IP length: %v", len(src.IPNet.IP)) - } - - buf = append(buf, family) - - ones, _ := src.IPNet.Mask.Size() - buf = append(buf, byte(ones)) - - // is_cidr is ignored on server - buf = append(buf, 0) - - buf = append(buf, byte(len(src.IPNet.IP))) - - return append(buf, src.IPNet.IP...), nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Inet) Scan(src interface{}) error { - if src == nil { - *dst = Inet{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Inet) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/inet_array.go b/vendor/github.com/jackc/pgtype/inet_array.go deleted file mode 100644 index 2460a1c..0000000 --- a/vendor/github.com/jackc/pgtype/inet_array.go +++ /dev/null @@ -1,546 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "net" - "reflect" - - "github.com/jackc/pgio" -) - -type InetArray struct { - Elements []Inet - Dimensions []ArrayDimension - Status Status -} - -func (dst *InetArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = InetArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []*net.IPNet: - if value == nil { - *dst = InetArray{Status: Null} - } else if len(value) == 0 { - *dst = InetArray{Status: Present} - } else { - elements := make([]Inet, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = InetArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []net.IP: - if value == nil { - *dst = InetArray{Status: Null} - } else if len(value) == 0 { - *dst = InetArray{Status: Present} - } else { - elements := make([]Inet, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = InetArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*net.IP: - if value == nil { - *dst = InetArray{Status: Null} - } else if len(value) == 0 { - *dst = InetArray{Status: Present} - } else { - elements := make([]Inet, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = InetArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []Inet: - if value == nil { - *dst = InetArray{Status: Null} - } else if len(value) == 0 { - *dst = InetArray{Status: Present} - } else { - *dst = InetArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = InetArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for InetArray", src) - } - if elementsLength == 0 { - *dst = InetArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to InetArray", src) - } - - *dst = InetArray{ - Elements: make([]Inet, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]Inet, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to InetArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *InetArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to InetArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in InetArray", err) - } - index++ - - return index, nil -} - -func (dst InetArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *InetArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]*net.IPNet: - *v = make([]*net.IPNet, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]net.IP: - *v = make([]net.IP, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*net.IP: - *v = make([]*net.IP, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *InetArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from InetArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from InetArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *InetArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = InetArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []Inet - - if len(uta.Elements) > 0 { - elements = make([]Inet, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem Inet - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = InetArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *InetArray) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = InetArray{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = InetArray{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]Inet, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = InetArray{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src InetArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src InetArray) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("inet"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "inet") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *InetArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src InetArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/int2.go b/vendor/github.com/jackc/pgtype/int2.go deleted file mode 100644 index 0775882..0000000 --- a/vendor/github.com/jackc/pgtype/int2.go +++ /dev/null @@ -1,321 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "encoding/json" - "fmt" - "math" - "strconv" - - "github.com/jackc/pgio" -) - -type Int2 struct { - Int int16 - Status Status -} - -func (dst *Int2) Set(src interface{}) error { - if src == nil { - *dst = Int2{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case int8: - *dst = Int2{Int: int16(value), Status: Present} - case uint8: - *dst = Int2{Int: int16(value), Status: Present} - case int16: - *dst = Int2{Int: int16(value), Status: Present} - case uint16: - if value > math.MaxInt16 { - return fmt.Errorf("%d is greater than maximum value for Int2", value) - } - *dst = Int2{Int: int16(value), Status: Present} - case int32: - if value < math.MinInt16 { - return fmt.Errorf("%d is greater than maximum value for Int2", value) - } - if value > math.MaxInt16 { - return fmt.Errorf("%d is greater than maximum value for Int2", value) - } - *dst = Int2{Int: int16(value), Status: Present} - case uint32: - if value > math.MaxInt16 { - return fmt.Errorf("%d is greater than maximum value for Int2", value) - } - *dst = Int2{Int: int16(value), Status: Present} - case int64: - if value < math.MinInt16 { - return fmt.Errorf("%d is greater than maximum value for Int2", value) - } - if value > math.MaxInt16 { - return fmt.Errorf("%d is greater than maximum value for Int2", value) - } - *dst = Int2{Int: int16(value), Status: Present} - case uint64: - if value > math.MaxInt16 { - return fmt.Errorf("%d is greater than maximum value for Int2", value) - } - *dst = Int2{Int: int16(value), Status: Present} - case int: - if value < math.MinInt16 { - return fmt.Errorf("%d is greater than maximum value for Int2", value) - } - if value > math.MaxInt16 { - return fmt.Errorf("%d is greater than maximum value for Int2", value) - } - *dst = Int2{Int: int16(value), Status: Present} - case uint: - if value > math.MaxInt16 { - return fmt.Errorf("%d is greater than maximum value for Int2", value) - } - *dst = Int2{Int: int16(value), Status: Present} - case string: - num, err := strconv.ParseInt(value, 10, 16) - if err != nil { - return err - } - *dst = Int2{Int: int16(num), Status: Present} - case float32: - if value > math.MaxInt16 { - return fmt.Errorf("%f is greater than maximum value for Int2", value) - } - *dst = Int2{Int: int16(value), Status: Present} - case float64: - if value > math.MaxInt16 { - return fmt.Errorf("%f is greater than maximum value for Int2", value) - } - *dst = Int2{Int: int16(value), Status: Present} - case *int8: - if value == nil { - *dst = Int2{Status: Null} - } else { - return dst.Set(*value) - } - case *uint8: - if value == nil { - *dst = Int2{Status: Null} - } else { - return dst.Set(*value) - } - case *int16: - if value == nil { - *dst = Int2{Status: Null} - } else { - return dst.Set(*value) - } - case *uint16: - if value == nil { - *dst = Int2{Status: Null} - } else { - return dst.Set(*value) - } - case *int32: - if value == nil { - *dst = Int2{Status: Null} - } else { - return dst.Set(*value) - } - case *uint32: - if value == nil { - *dst = Int2{Status: Null} - } else { - return dst.Set(*value) - } - case *int64: - if value == nil { - *dst = Int2{Status: Null} - } else { - return dst.Set(*value) - } - case *uint64: - if value == nil { - *dst = Int2{Status: Null} - } else { - return dst.Set(*value) - } - case *int: - if value == nil { - *dst = Int2{Status: Null} - } else { - return dst.Set(*value) - } - case *uint: - if value == nil { - *dst = Int2{Status: Null} - } else { - return dst.Set(*value) - } - case *string: - if value == nil { - *dst = Int2{Status: Null} - } else { - return dst.Set(*value) - } - case *float32: - if value == nil { - *dst = Int2{Status: Null} - } else { - return dst.Set(*value) - } - case *float64: - if value == nil { - *dst = Int2{Status: Null} - } else { - return dst.Set(*value) - } - default: - if originalSrc, ok := underlyingNumberType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Int2", value) - } - - return nil -} - -func (dst Int2) Get() interface{} { - switch dst.Status { - case Present: - return dst.Int - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Int2) AssignTo(dst interface{}) error { - return int64AssignTo(int64(src.Int), src.Status, dst) -} - -func (dst *Int2) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int2{Status: Null} - return nil - } - - n, err := strconv.ParseInt(string(src), 10, 16) - if err != nil { - return err - } - - *dst = Int2{Int: int16(n), Status: Present} - return nil -} - -func (dst *Int2) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int2{Status: Null} - return nil - } - - if len(src) != 2 { - return fmt.Errorf("invalid length for int2: %v", len(src)) - } - - n := int16(binary.BigEndian.Uint16(src)) - *dst = Int2{Int: n, Status: Present} - return nil -} - -func (src Int2) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return append(buf, strconv.FormatInt(int64(src.Int), 10)...), nil -} - -func (src Int2) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return pgio.AppendInt16(buf, src.Int), nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Int2) Scan(src interface{}) error { - if src == nil { - *dst = Int2{Status: Null} - return nil - } - - switch src := src.(type) { - case int64: - if src < math.MinInt16 { - return fmt.Errorf("%d is greater than maximum value for Int2", src) - } - if src > math.MaxInt16 { - return fmt.Errorf("%d is greater than maximum value for Int2", src) - } - *dst = Int2{Int: int16(src), Status: Present} - return nil - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Int2) Value() (driver.Value, error) { - switch src.Status { - case Present: - return int64(src.Int), nil - case Null: - return nil, nil - default: - return nil, errUndefined - } -} - -func (src Int2) MarshalJSON() ([]byte, error) { - switch src.Status { - case Present: - return []byte(strconv.FormatInt(int64(src.Int), 10)), nil - case Null: - return []byte("null"), nil - case Undefined: - return nil, errUndefined - } - - return nil, errBadStatus -} - -func (dst *Int2) UnmarshalJSON(b []byte) error { - var n *int16 - err := json.Unmarshal(b, &n) - if err != nil { - return err - } - - if n == nil { - *dst = Int2{Status: Null} - } else { - *dst = Int2{Int: *n, Status: Present} - } - - return nil -} diff --git a/vendor/github.com/jackc/pgtype/int2_array.go b/vendor/github.com/jackc/pgtype/int2_array.go deleted file mode 100644 index a513384..0000000 --- a/vendor/github.com/jackc/pgtype/int2_array.go +++ /dev/null @@ -1,909 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "reflect" - - "github.com/jackc/pgio" -) - -type Int2Array struct { - Elements []Int2 - Dimensions []ArrayDimension - Status Status -} - -func (dst *Int2Array) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = Int2Array{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []int16: - if value == nil { - *dst = Int2Array{Status: Null} - } else if len(value) == 0 { - *dst = Int2Array{Status: Present} - } else { - elements := make([]Int2, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int2Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*int16: - if value == nil { - *dst = Int2Array{Status: Null} - } else if len(value) == 0 { - *dst = Int2Array{Status: Present} - } else { - elements := make([]Int2, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int2Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []uint16: - if value == nil { - *dst = Int2Array{Status: Null} - } else if len(value) == 0 { - *dst = Int2Array{Status: Present} - } else { - elements := make([]Int2, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int2Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*uint16: - if value == nil { - *dst = Int2Array{Status: Null} - } else if len(value) == 0 { - *dst = Int2Array{Status: Present} - } else { - elements := make([]Int2, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int2Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []int32: - if value == nil { - *dst = Int2Array{Status: Null} - } else if len(value) == 0 { - *dst = Int2Array{Status: Present} - } else { - elements := make([]Int2, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int2Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*int32: - if value == nil { - *dst = Int2Array{Status: Null} - } else if len(value) == 0 { - *dst = Int2Array{Status: Present} - } else { - elements := make([]Int2, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int2Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []uint32: - if value == nil { - *dst = Int2Array{Status: Null} - } else if len(value) == 0 { - *dst = Int2Array{Status: Present} - } else { - elements := make([]Int2, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int2Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*uint32: - if value == nil { - *dst = Int2Array{Status: Null} - } else if len(value) == 0 { - *dst = Int2Array{Status: Present} - } else { - elements := make([]Int2, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int2Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []int64: - if value == nil { - *dst = Int2Array{Status: Null} - } else if len(value) == 0 { - *dst = Int2Array{Status: Present} - } else { - elements := make([]Int2, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int2Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*int64: - if value == nil { - *dst = Int2Array{Status: Null} - } else if len(value) == 0 { - *dst = Int2Array{Status: Present} - } else { - elements := make([]Int2, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int2Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []uint64: - if value == nil { - *dst = Int2Array{Status: Null} - } else if len(value) == 0 { - *dst = Int2Array{Status: Present} - } else { - elements := make([]Int2, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int2Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*uint64: - if value == nil { - *dst = Int2Array{Status: Null} - } else if len(value) == 0 { - *dst = Int2Array{Status: Present} - } else { - elements := make([]Int2, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int2Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []int: - if value == nil { - *dst = Int2Array{Status: Null} - } else if len(value) == 0 { - *dst = Int2Array{Status: Present} - } else { - elements := make([]Int2, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int2Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*int: - if value == nil { - *dst = Int2Array{Status: Null} - } else if len(value) == 0 { - *dst = Int2Array{Status: Present} - } else { - elements := make([]Int2, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int2Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []uint: - if value == nil { - *dst = Int2Array{Status: Null} - } else if len(value) == 0 { - *dst = Int2Array{Status: Present} - } else { - elements := make([]Int2, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int2Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*uint: - if value == nil { - *dst = Int2Array{Status: Null} - } else if len(value) == 0 { - *dst = Int2Array{Status: Present} - } else { - elements := make([]Int2, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int2Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []Int2: - if value == nil { - *dst = Int2Array{Status: Null} - } else if len(value) == 0 { - *dst = Int2Array{Status: Present} - } else { - *dst = Int2Array{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = Int2Array{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for Int2Array", src) - } - if elementsLength == 0 { - *dst = Int2Array{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Int2Array", src) - } - - *dst = Int2Array{ - Elements: make([]Int2, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]Int2, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to Int2Array, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *Int2Array) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to Int2Array") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in Int2Array", err) - } - index++ - - return index, nil -} - -func (dst Int2Array) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Int2Array) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]int16: - *v = make([]int16, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*int16: - *v = make([]*int16, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]uint16: - *v = make([]uint16, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*uint16: - *v = make([]*uint16, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]int32: - *v = make([]int32, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*int32: - *v = make([]*int32, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]uint32: - *v = make([]uint32, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*uint32: - *v = make([]*uint32, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]int64: - *v = make([]int64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*int64: - *v = make([]*int64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]uint64: - *v = make([]uint64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*uint64: - *v = make([]*uint64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]int: - *v = make([]int, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*int: - *v = make([]*int, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]uint: - *v = make([]uint, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*uint: - *v = make([]*uint, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *Int2Array) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from Int2Array") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from Int2Array") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *Int2Array) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int2Array{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []Int2 - - if len(uta.Elements) > 0 { - elements = make([]Int2, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem Int2 - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = Int2Array{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *Int2Array) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int2Array{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = Int2Array{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]Int2, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = Int2Array{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src Int2Array) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src Int2Array) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("int2"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "int2") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Int2Array) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Int2Array) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/int4.go b/vendor/github.com/jackc/pgtype/int4.go deleted file mode 100644 index 22b48e5..0000000 --- a/vendor/github.com/jackc/pgtype/int4.go +++ /dev/null @@ -1,312 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "encoding/json" - "fmt" - "math" - "strconv" - - "github.com/jackc/pgio" -) - -type Int4 struct { - Int int32 - Status Status -} - -func (dst *Int4) Set(src interface{}) error { - if src == nil { - *dst = Int4{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case int8: - *dst = Int4{Int: int32(value), Status: Present} - case uint8: - *dst = Int4{Int: int32(value), Status: Present} - case int16: - *dst = Int4{Int: int32(value), Status: Present} - case uint16: - *dst = Int4{Int: int32(value), Status: Present} - case int32: - *dst = Int4{Int: int32(value), Status: Present} - case uint32: - if value > math.MaxInt32 { - return fmt.Errorf("%d is greater than maximum value for Int4", value) - } - *dst = Int4{Int: int32(value), Status: Present} - case int64: - if value < math.MinInt32 { - return fmt.Errorf("%d is greater than maximum value for Int4", value) - } - if value > math.MaxInt32 { - return fmt.Errorf("%d is greater than maximum value for Int4", value) - } - *dst = Int4{Int: int32(value), Status: Present} - case uint64: - if value > math.MaxInt32 { - return fmt.Errorf("%d is greater than maximum value for Int4", value) - } - *dst = Int4{Int: int32(value), Status: Present} - case int: - if value < math.MinInt32 { - return fmt.Errorf("%d is greater than maximum value for Int4", value) - } - if value > math.MaxInt32 { - return fmt.Errorf("%d is greater than maximum value for Int4", value) - } - *dst = Int4{Int: int32(value), Status: Present} - case uint: - if value > math.MaxInt32 { - return fmt.Errorf("%d is greater than maximum value for Int4", value) - } - *dst = Int4{Int: int32(value), Status: Present} - case string: - num, err := strconv.ParseInt(value, 10, 32) - if err != nil { - return err - } - *dst = Int4{Int: int32(num), Status: Present} - case float32: - if value > math.MaxInt32 { - return fmt.Errorf("%f is greater than maximum value for Int4", value) - } - *dst = Int4{Int: int32(value), Status: Present} - case float64: - if value > math.MaxInt32 { - return fmt.Errorf("%f is greater than maximum value for Int4", value) - } - *dst = Int4{Int: int32(value), Status: Present} - case *int8: - if value == nil { - *dst = Int4{Status: Null} - } else { - return dst.Set(*value) - } - case *uint8: - if value == nil { - *dst = Int4{Status: Null} - } else { - return dst.Set(*value) - } - case *int16: - if value == nil { - *dst = Int4{Status: Null} - } else { - return dst.Set(*value) - } - case *uint16: - if value == nil { - *dst = Int4{Status: Null} - } else { - return dst.Set(*value) - } - case *int32: - if value == nil { - *dst = Int4{Status: Null} - } else { - return dst.Set(*value) - } - case *uint32: - if value == nil { - *dst = Int4{Status: Null} - } else { - return dst.Set(*value) - } - case *int64: - if value == nil { - *dst = Int4{Status: Null} - } else { - return dst.Set(*value) - } - case *uint64: - if value == nil { - *dst = Int4{Status: Null} - } else { - return dst.Set(*value) - } - case *int: - if value == nil { - *dst = Int4{Status: Null} - } else { - return dst.Set(*value) - } - case *uint: - if value == nil { - *dst = Int4{Status: Null} - } else { - return dst.Set(*value) - } - case *string: - if value == nil { - *dst = Int4{Status: Null} - } else { - return dst.Set(*value) - } - case *float32: - if value == nil { - *dst = Int4{Status: Null} - } else { - return dst.Set(*value) - } - case *float64: - if value == nil { - *dst = Int4{Status: Null} - } else { - return dst.Set(*value) - } - default: - if originalSrc, ok := underlyingNumberType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Int4", value) - } - - return nil -} - -func (dst Int4) Get() interface{} { - switch dst.Status { - case Present: - return dst.Int - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Int4) AssignTo(dst interface{}) error { - return int64AssignTo(int64(src.Int), src.Status, dst) -} - -func (dst *Int4) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int4{Status: Null} - return nil - } - - n, err := strconv.ParseInt(string(src), 10, 32) - if err != nil { - return err - } - - *dst = Int4{Int: int32(n), Status: Present} - return nil -} - -func (dst *Int4) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int4{Status: Null} - return nil - } - - if len(src) != 4 { - return fmt.Errorf("invalid length for int4: %v", len(src)) - } - - n := int32(binary.BigEndian.Uint32(src)) - *dst = Int4{Int: n, Status: Present} - return nil -} - -func (src Int4) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return append(buf, strconv.FormatInt(int64(src.Int), 10)...), nil -} - -func (src Int4) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return pgio.AppendInt32(buf, src.Int), nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Int4) Scan(src interface{}) error { - if src == nil { - *dst = Int4{Status: Null} - return nil - } - - switch src := src.(type) { - case int64: - if src < math.MinInt32 { - return fmt.Errorf("%d is greater than maximum value for Int4", src) - } - if src > math.MaxInt32 { - return fmt.Errorf("%d is greater than maximum value for Int4", src) - } - *dst = Int4{Int: int32(src), Status: Present} - return nil - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Int4) Value() (driver.Value, error) { - switch src.Status { - case Present: - return int64(src.Int), nil - case Null: - return nil, nil - default: - return nil, errUndefined - } -} - -func (src Int4) MarshalJSON() ([]byte, error) { - switch src.Status { - case Present: - return []byte(strconv.FormatInt(int64(src.Int), 10)), nil - case Null: - return []byte("null"), nil - case Undefined: - return nil, errUndefined - } - - return nil, errBadStatus -} - -func (dst *Int4) UnmarshalJSON(b []byte) error { - var n *int32 - err := json.Unmarshal(b, &n) - if err != nil { - return err - } - - if n == nil { - *dst = Int4{Status: Null} - } else { - *dst = Int4{Int: *n, Status: Present} - } - - return nil -} diff --git a/vendor/github.com/jackc/pgtype/int4_array.go b/vendor/github.com/jackc/pgtype/int4_array.go deleted file mode 100644 index de26236..0000000 --- a/vendor/github.com/jackc/pgtype/int4_array.go +++ /dev/null @@ -1,909 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "reflect" - - "github.com/jackc/pgio" -) - -type Int4Array struct { - Elements []Int4 - Dimensions []ArrayDimension - Status Status -} - -func (dst *Int4Array) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = Int4Array{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []int16: - if value == nil { - *dst = Int4Array{Status: Null} - } else if len(value) == 0 { - *dst = Int4Array{Status: Present} - } else { - elements := make([]Int4, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int4Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*int16: - if value == nil { - *dst = Int4Array{Status: Null} - } else if len(value) == 0 { - *dst = Int4Array{Status: Present} - } else { - elements := make([]Int4, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int4Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []uint16: - if value == nil { - *dst = Int4Array{Status: Null} - } else if len(value) == 0 { - *dst = Int4Array{Status: Present} - } else { - elements := make([]Int4, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int4Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*uint16: - if value == nil { - *dst = Int4Array{Status: Null} - } else if len(value) == 0 { - *dst = Int4Array{Status: Present} - } else { - elements := make([]Int4, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int4Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []int32: - if value == nil { - *dst = Int4Array{Status: Null} - } else if len(value) == 0 { - *dst = Int4Array{Status: Present} - } else { - elements := make([]Int4, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int4Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*int32: - if value == nil { - *dst = Int4Array{Status: Null} - } else if len(value) == 0 { - *dst = Int4Array{Status: Present} - } else { - elements := make([]Int4, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int4Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []uint32: - if value == nil { - *dst = Int4Array{Status: Null} - } else if len(value) == 0 { - *dst = Int4Array{Status: Present} - } else { - elements := make([]Int4, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int4Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*uint32: - if value == nil { - *dst = Int4Array{Status: Null} - } else if len(value) == 0 { - *dst = Int4Array{Status: Present} - } else { - elements := make([]Int4, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int4Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []int64: - if value == nil { - *dst = Int4Array{Status: Null} - } else if len(value) == 0 { - *dst = Int4Array{Status: Present} - } else { - elements := make([]Int4, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int4Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*int64: - if value == nil { - *dst = Int4Array{Status: Null} - } else if len(value) == 0 { - *dst = Int4Array{Status: Present} - } else { - elements := make([]Int4, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int4Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []uint64: - if value == nil { - *dst = Int4Array{Status: Null} - } else if len(value) == 0 { - *dst = Int4Array{Status: Present} - } else { - elements := make([]Int4, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int4Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*uint64: - if value == nil { - *dst = Int4Array{Status: Null} - } else if len(value) == 0 { - *dst = Int4Array{Status: Present} - } else { - elements := make([]Int4, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int4Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []int: - if value == nil { - *dst = Int4Array{Status: Null} - } else if len(value) == 0 { - *dst = Int4Array{Status: Present} - } else { - elements := make([]Int4, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int4Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*int: - if value == nil { - *dst = Int4Array{Status: Null} - } else if len(value) == 0 { - *dst = Int4Array{Status: Present} - } else { - elements := make([]Int4, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int4Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []uint: - if value == nil { - *dst = Int4Array{Status: Null} - } else if len(value) == 0 { - *dst = Int4Array{Status: Present} - } else { - elements := make([]Int4, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int4Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*uint: - if value == nil { - *dst = Int4Array{Status: Null} - } else if len(value) == 0 { - *dst = Int4Array{Status: Present} - } else { - elements := make([]Int4, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int4Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []Int4: - if value == nil { - *dst = Int4Array{Status: Null} - } else if len(value) == 0 { - *dst = Int4Array{Status: Present} - } else { - *dst = Int4Array{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = Int4Array{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for Int4Array", src) - } - if elementsLength == 0 { - *dst = Int4Array{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Int4Array", src) - } - - *dst = Int4Array{ - Elements: make([]Int4, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]Int4, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to Int4Array, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *Int4Array) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to Int4Array") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in Int4Array", err) - } - index++ - - return index, nil -} - -func (dst Int4Array) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Int4Array) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]int16: - *v = make([]int16, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*int16: - *v = make([]*int16, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]uint16: - *v = make([]uint16, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*uint16: - *v = make([]*uint16, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]int32: - *v = make([]int32, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*int32: - *v = make([]*int32, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]uint32: - *v = make([]uint32, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*uint32: - *v = make([]*uint32, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]int64: - *v = make([]int64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*int64: - *v = make([]*int64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]uint64: - *v = make([]uint64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*uint64: - *v = make([]*uint64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]int: - *v = make([]int, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*int: - *v = make([]*int, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]uint: - *v = make([]uint, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*uint: - *v = make([]*uint, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *Int4Array) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from Int4Array") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from Int4Array") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *Int4Array) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int4Array{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []Int4 - - if len(uta.Elements) > 0 { - elements = make([]Int4, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem Int4 - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = Int4Array{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *Int4Array) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int4Array{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = Int4Array{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]Int4, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = Int4Array{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src Int4Array) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src Int4Array) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("int4"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "int4") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Int4Array) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Int4Array) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/int4_multirange.go b/vendor/github.com/jackc/pgtype/int4_multirange.go deleted file mode 100644 index c3432ce..0000000 --- a/vendor/github.com/jackc/pgtype/int4_multirange.go +++ /dev/null @@ -1,239 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - - "github.com/jackc/pgio" -) - -type Int4multirange struct { - Ranges []Int4range - Status Status -} - -func (dst *Int4multirange) Set(src interface{}) error { - //untyped nil and typed nil interfaces are different - if src == nil { - *dst = Int4multirange{Status: Null} - return nil - } - - switch value := src.(type) { - case Int4multirange: - *dst = value - case *Int4multirange: - *dst = *value - case string: - return dst.DecodeText(nil, []byte(value)) - case []Int4range: - if value == nil { - *dst = Int4multirange{Status: Null} - } else if len(value) == 0 { - *dst = Int4multirange{Status: Present} - } else { - elements := make([]Int4range, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int4multirange{ - Ranges: elements, - Status: Present, - } - } - case []*Int4range: - if value == nil { - *dst = Int4multirange{Status: Null} - } else if len(value) == 0 { - *dst = Int4multirange{Status: Present} - } else { - elements := make([]Int4range, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int4multirange{ - Ranges: elements, - Status: Present, - } - } - default: - return fmt.Errorf("cannot convert %v to Int4multirange", src) - } - - return nil - -} - -func (dst Int4multirange) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Int4multirange) AssignTo(dst interface{}) error { - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *Int4multirange) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int4multirange{Status: Null} - return nil - } - - utmr, err := ParseUntypedTextMultirange(string(src)) - if err != nil { - return err - } - - var elements []Int4range - - if len(utmr.Elements) > 0 { - elements = make([]Int4range, len(utmr.Elements)) - - for i, s := range utmr.Elements { - var elem Int4range - - elemSrc := []byte(s) - - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = Int4multirange{Ranges: elements, Status: Present} - - return nil -} - -func (dst *Int4multirange) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int4multirange{Status: Null} - return nil - } - - rp := 0 - - numElems := int(binary.BigEndian.Uint32(src[rp:])) - rp += 4 - - if numElems == 0 { - *dst = Int4multirange{Status: Present} - return nil - } - - elements := make([]Int4range, numElems) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err := elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = Int4multirange{Ranges: elements, Status: Present} - return nil -} - -func (src Int4multirange) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = append(buf, '{') - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Ranges { - if i > 0 { - buf = append(buf, ',') - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - return nil, fmt.Errorf("multi-range does not allow null range") - } else { - buf = append(buf, string(elemBuf)...) - } - - } - - buf = append(buf, '}') - - return buf, nil -} - -func (src Int4multirange) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = pgio.AppendInt32(buf, int32(len(src.Ranges))) - - for i := range src.Ranges { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Ranges[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Int4multirange) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Int4multirange) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/int4range.go b/vendor/github.com/jackc/pgtype/int4range.go deleted file mode 100644 index c7f51fa..0000000 --- a/vendor/github.com/jackc/pgtype/int4range.go +++ /dev/null @@ -1,267 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "fmt" - - "github.com/jackc/pgio" -) - -type Int4range struct { - Lower Int4 - Upper Int4 - LowerType BoundType - UpperType BoundType - Status Status -} - -func (dst *Int4range) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = Int4range{Status: Null} - return nil - } - - switch value := src.(type) { - case Int4range: - *dst = value - case *Int4range: - *dst = *value - case string: - return dst.DecodeText(nil, []byte(value)) - default: - return fmt.Errorf("cannot convert %v to Int4range", src) - } - - return nil -} - -func (dst Int4range) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Int4range) AssignTo(dst interface{}) error { - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *Int4range) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int4range{Status: Null} - return nil - } - - utr, err := ParseUntypedTextRange(string(src)) - if err != nil { - return err - } - - *dst = Int4range{Status: Present} - - dst.LowerType = utr.LowerType - dst.UpperType = utr.UpperType - - if dst.LowerType == Empty { - return nil - } - - if dst.LowerType == Inclusive || dst.LowerType == Exclusive { - if err := dst.Lower.DecodeText(ci, []byte(utr.Lower)); err != nil { - return err - } - } - - if dst.UpperType == Inclusive || dst.UpperType == Exclusive { - if err := dst.Upper.DecodeText(ci, []byte(utr.Upper)); err != nil { - return err - } - } - - return nil -} - -func (dst *Int4range) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int4range{Status: Null} - return nil - } - - ubr, err := ParseUntypedBinaryRange(src) - if err != nil { - return err - } - - *dst = Int4range{Status: Present} - - dst.LowerType = ubr.LowerType - dst.UpperType = ubr.UpperType - - if dst.LowerType == Empty { - return nil - } - - if dst.LowerType == Inclusive || dst.LowerType == Exclusive { - if err := dst.Lower.DecodeBinary(ci, ubr.Lower); err != nil { - return err - } - } - - if dst.UpperType == Inclusive || dst.UpperType == Exclusive { - if err := dst.Upper.DecodeBinary(ci, ubr.Upper); err != nil { - return err - } - } - - return nil -} - -func (src Int4range) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - switch src.LowerType { - case Exclusive, Unbounded: - buf = append(buf, '(') - case Inclusive: - buf = append(buf, '[') - case Empty: - return append(buf, "empty"...), nil - default: - return nil, fmt.Errorf("unknown lower bound type %v", src.LowerType) - } - - var err error - - if src.LowerType != Unbounded { - buf, err = src.Lower.EncodeText(ci, buf) - if err != nil { - return nil, err - } else if buf == nil { - return nil, fmt.Errorf("Lower cannot be null unless LowerType is Unbounded") - } - } - - buf = append(buf, ',') - - if src.UpperType != Unbounded { - buf, err = src.Upper.EncodeText(ci, buf) - if err != nil { - return nil, err - } else if buf == nil { - return nil, fmt.Errorf("Upper cannot be null unless UpperType is Unbounded") - } - } - - switch src.UpperType { - case Exclusive, Unbounded: - buf = append(buf, ')') - case Inclusive: - buf = append(buf, ']') - default: - return nil, fmt.Errorf("unknown upper bound type %v", src.UpperType) - } - - return buf, nil -} - -func (src Int4range) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - var rangeType byte - switch src.LowerType { - case Inclusive: - rangeType |= lowerInclusiveMask - case Unbounded: - rangeType |= lowerUnboundedMask - case Exclusive: - case Empty: - return append(buf, emptyMask), nil - default: - return nil, fmt.Errorf("unknown LowerType: %v", src.LowerType) - } - - switch src.UpperType { - case Inclusive: - rangeType |= upperInclusiveMask - case Unbounded: - rangeType |= upperUnboundedMask - case Exclusive: - default: - return nil, fmt.Errorf("unknown UpperType: %v", src.UpperType) - } - - buf = append(buf, rangeType) - - var err error - - if src.LowerType != Unbounded { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - buf, err = src.Lower.EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if buf == nil { - return nil, fmt.Errorf("Lower cannot be null unless LowerType is Unbounded") - } - - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - - if src.UpperType != Unbounded { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - buf, err = src.Upper.EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if buf == nil { - return nil, fmt.Errorf("Upper cannot be null unless UpperType is Unbounded") - } - - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Int4range) Scan(src interface{}) error { - if src == nil { - *dst = Int4range{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Int4range) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/int8.go b/vendor/github.com/jackc/pgtype/int8.go deleted file mode 100644 index 0e08997..0000000 --- a/vendor/github.com/jackc/pgtype/int8.go +++ /dev/null @@ -1,298 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "encoding/json" - "fmt" - "math" - "strconv" - - "github.com/jackc/pgio" -) - -type Int8 struct { - Int int64 - Status Status -} - -func (dst *Int8) Set(src interface{}) error { - if src == nil { - *dst = Int8{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case int8: - *dst = Int8{Int: int64(value), Status: Present} - case uint8: - *dst = Int8{Int: int64(value), Status: Present} - case int16: - *dst = Int8{Int: int64(value), Status: Present} - case uint16: - *dst = Int8{Int: int64(value), Status: Present} - case int32: - *dst = Int8{Int: int64(value), Status: Present} - case uint32: - *dst = Int8{Int: int64(value), Status: Present} - case int64: - *dst = Int8{Int: int64(value), Status: Present} - case uint64: - if value > math.MaxInt64 { - return fmt.Errorf("%d is greater than maximum value for Int8", value) - } - *dst = Int8{Int: int64(value), Status: Present} - case int: - if int64(value) < math.MinInt64 { - return fmt.Errorf("%d is greater than maximum value for Int8", value) - } - if int64(value) > math.MaxInt64 { - return fmt.Errorf("%d is greater than maximum value for Int8", value) - } - *dst = Int8{Int: int64(value), Status: Present} - case uint: - if uint64(value) > math.MaxInt64 { - return fmt.Errorf("%d is greater than maximum value for Int8", value) - } - *dst = Int8{Int: int64(value), Status: Present} - case string: - num, err := strconv.ParseInt(value, 10, 64) - if err != nil { - return err - } - *dst = Int8{Int: num, Status: Present} - case float32: - if value > math.MaxInt64 { - return fmt.Errorf("%f is greater than maximum value for Int8", value) - } - *dst = Int8{Int: int64(value), Status: Present} - case float64: - if value > math.MaxInt64 { - return fmt.Errorf("%f is greater than maximum value for Int8", value) - } - *dst = Int8{Int: int64(value), Status: Present} - case *int8: - if value == nil { - *dst = Int8{Status: Null} - } else { - return dst.Set(*value) - } - case *uint8: - if value == nil { - *dst = Int8{Status: Null} - } else { - return dst.Set(*value) - } - case *int16: - if value == nil { - *dst = Int8{Status: Null} - } else { - return dst.Set(*value) - } - case *uint16: - if value == nil { - *dst = Int8{Status: Null} - } else { - return dst.Set(*value) - } - case *int32: - if value == nil { - *dst = Int8{Status: Null} - } else { - return dst.Set(*value) - } - case *uint32: - if value == nil { - *dst = Int8{Status: Null} - } else { - return dst.Set(*value) - } - case *int64: - if value == nil { - *dst = Int8{Status: Null} - } else { - return dst.Set(*value) - } - case *uint64: - if value == nil { - *dst = Int8{Status: Null} - } else { - return dst.Set(*value) - } - case *int: - if value == nil { - *dst = Int8{Status: Null} - } else { - return dst.Set(*value) - } - case *uint: - if value == nil { - *dst = Int8{Status: Null} - } else { - return dst.Set(*value) - } - case *string: - if value == nil { - *dst = Int8{Status: Null} - } else { - return dst.Set(*value) - } - case *float32: - if value == nil { - *dst = Int8{Status: Null} - } else { - return dst.Set(*value) - } - case *float64: - if value == nil { - *dst = Int8{Status: Null} - } else { - return dst.Set(*value) - } - default: - if originalSrc, ok := underlyingNumberType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Int8", value) - } - - return nil -} - -func (dst Int8) Get() interface{} { - switch dst.Status { - case Present: - return dst.Int - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Int8) AssignTo(dst interface{}) error { - return int64AssignTo(int64(src.Int), src.Status, dst) -} - -func (dst *Int8) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int8{Status: Null} - return nil - } - - n, err := strconv.ParseInt(string(src), 10, 64) - if err != nil { - return err - } - - *dst = Int8{Int: n, Status: Present} - return nil -} - -func (dst *Int8) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int8{Status: Null} - return nil - } - - if len(src) != 8 { - return fmt.Errorf("invalid length for int8: %v", len(src)) - } - - n := int64(binary.BigEndian.Uint64(src)) - - *dst = Int8{Int: n, Status: Present} - return nil -} - -func (src Int8) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return append(buf, strconv.FormatInt(src.Int, 10)...), nil -} - -func (src Int8) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return pgio.AppendInt64(buf, src.Int), nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Int8) Scan(src interface{}) error { - if src == nil { - *dst = Int8{Status: Null} - return nil - } - - switch src := src.(type) { - case int64: - *dst = Int8{Int: src, Status: Present} - return nil - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Int8) Value() (driver.Value, error) { - switch src.Status { - case Present: - return int64(src.Int), nil - case Null: - return nil, nil - default: - return nil, errUndefined - } -} - -func (src Int8) MarshalJSON() ([]byte, error) { - switch src.Status { - case Present: - return []byte(strconv.FormatInt(src.Int, 10)), nil - case Null: - return []byte("null"), nil - case Undefined: - return nil, errUndefined - } - - return nil, errBadStatus -} - -func (dst *Int8) UnmarshalJSON(b []byte) error { - var n *int64 - err := json.Unmarshal(b, &n) - if err != nil { - return err - } - - if n == nil { - *dst = Int8{Status: Null} - } else { - *dst = Int8{Int: *n, Status: Present} - } - - return nil -} diff --git a/vendor/github.com/jackc/pgtype/int8_array.go b/vendor/github.com/jackc/pgtype/int8_array.go deleted file mode 100644 index e405b32..0000000 --- a/vendor/github.com/jackc/pgtype/int8_array.go +++ /dev/null @@ -1,909 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "reflect" - - "github.com/jackc/pgio" -) - -type Int8Array struct { - Elements []Int8 - Dimensions []ArrayDimension - Status Status -} - -func (dst *Int8Array) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = Int8Array{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []int16: - if value == nil { - *dst = Int8Array{Status: Null} - } else if len(value) == 0 { - *dst = Int8Array{Status: Present} - } else { - elements := make([]Int8, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int8Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*int16: - if value == nil { - *dst = Int8Array{Status: Null} - } else if len(value) == 0 { - *dst = Int8Array{Status: Present} - } else { - elements := make([]Int8, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int8Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []uint16: - if value == nil { - *dst = Int8Array{Status: Null} - } else if len(value) == 0 { - *dst = Int8Array{Status: Present} - } else { - elements := make([]Int8, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int8Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*uint16: - if value == nil { - *dst = Int8Array{Status: Null} - } else if len(value) == 0 { - *dst = Int8Array{Status: Present} - } else { - elements := make([]Int8, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int8Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []int32: - if value == nil { - *dst = Int8Array{Status: Null} - } else if len(value) == 0 { - *dst = Int8Array{Status: Present} - } else { - elements := make([]Int8, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int8Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*int32: - if value == nil { - *dst = Int8Array{Status: Null} - } else if len(value) == 0 { - *dst = Int8Array{Status: Present} - } else { - elements := make([]Int8, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int8Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []uint32: - if value == nil { - *dst = Int8Array{Status: Null} - } else if len(value) == 0 { - *dst = Int8Array{Status: Present} - } else { - elements := make([]Int8, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int8Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*uint32: - if value == nil { - *dst = Int8Array{Status: Null} - } else if len(value) == 0 { - *dst = Int8Array{Status: Present} - } else { - elements := make([]Int8, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int8Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []int64: - if value == nil { - *dst = Int8Array{Status: Null} - } else if len(value) == 0 { - *dst = Int8Array{Status: Present} - } else { - elements := make([]Int8, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int8Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*int64: - if value == nil { - *dst = Int8Array{Status: Null} - } else if len(value) == 0 { - *dst = Int8Array{Status: Present} - } else { - elements := make([]Int8, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int8Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []uint64: - if value == nil { - *dst = Int8Array{Status: Null} - } else if len(value) == 0 { - *dst = Int8Array{Status: Present} - } else { - elements := make([]Int8, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int8Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*uint64: - if value == nil { - *dst = Int8Array{Status: Null} - } else if len(value) == 0 { - *dst = Int8Array{Status: Present} - } else { - elements := make([]Int8, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int8Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []int: - if value == nil { - *dst = Int8Array{Status: Null} - } else if len(value) == 0 { - *dst = Int8Array{Status: Present} - } else { - elements := make([]Int8, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int8Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*int: - if value == nil { - *dst = Int8Array{Status: Null} - } else if len(value) == 0 { - *dst = Int8Array{Status: Present} - } else { - elements := make([]Int8, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int8Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []uint: - if value == nil { - *dst = Int8Array{Status: Null} - } else if len(value) == 0 { - *dst = Int8Array{Status: Present} - } else { - elements := make([]Int8, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int8Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*uint: - if value == nil { - *dst = Int8Array{Status: Null} - } else if len(value) == 0 { - *dst = Int8Array{Status: Present} - } else { - elements := make([]Int8, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int8Array{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []Int8: - if value == nil { - *dst = Int8Array{Status: Null} - } else if len(value) == 0 { - *dst = Int8Array{Status: Present} - } else { - *dst = Int8Array{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = Int8Array{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for Int8Array", src) - } - if elementsLength == 0 { - *dst = Int8Array{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Int8Array", src) - } - - *dst = Int8Array{ - Elements: make([]Int8, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]Int8, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to Int8Array, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *Int8Array) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to Int8Array") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in Int8Array", err) - } - index++ - - return index, nil -} - -func (dst Int8Array) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Int8Array) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]int16: - *v = make([]int16, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*int16: - *v = make([]*int16, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]uint16: - *v = make([]uint16, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*uint16: - *v = make([]*uint16, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]int32: - *v = make([]int32, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*int32: - *v = make([]*int32, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]uint32: - *v = make([]uint32, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*uint32: - *v = make([]*uint32, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]int64: - *v = make([]int64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*int64: - *v = make([]*int64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]uint64: - *v = make([]uint64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*uint64: - *v = make([]*uint64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]int: - *v = make([]int, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*int: - *v = make([]*int, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]uint: - *v = make([]uint, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*uint: - *v = make([]*uint, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *Int8Array) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from Int8Array") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from Int8Array") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *Int8Array) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int8Array{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []Int8 - - if len(uta.Elements) > 0 { - elements = make([]Int8, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem Int8 - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = Int8Array{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *Int8Array) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int8Array{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = Int8Array{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]Int8, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = Int8Array{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src Int8Array) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src Int8Array) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("int8"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "int8") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Int8Array) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Int8Array) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/int8_multirange.go b/vendor/github.com/jackc/pgtype/int8_multirange.go deleted file mode 100644 index e097642..0000000 --- a/vendor/github.com/jackc/pgtype/int8_multirange.go +++ /dev/null @@ -1,239 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - - "github.com/jackc/pgio" -) - -type Int8multirange struct { - Ranges []Int8range - Status Status -} - -func (dst *Int8multirange) Set(src interface{}) error { - //untyped nil and typed nil interfaces are different - if src == nil { - *dst = Int8multirange{Status: Null} - return nil - } - - switch value := src.(type) { - case Int8multirange: - *dst = value - case *Int8multirange: - *dst = *value - case string: - return dst.DecodeText(nil, []byte(value)) - case []Int8range: - if value == nil { - *dst = Int8multirange{Status: Null} - } else if len(value) == 0 { - *dst = Int8multirange{Status: Present} - } else { - elements := make([]Int8range, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int8multirange{ - Ranges: elements, - Status: Present, - } - } - case []*Int8range: - if value == nil { - *dst = Int8multirange{Status: Null} - } else if len(value) == 0 { - *dst = Int8multirange{Status: Present} - } else { - elements := make([]Int8range, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Int8multirange{ - Ranges: elements, - Status: Present, - } - } - default: - return fmt.Errorf("cannot convert %v to Int8multirange", src) - } - - return nil - -} - -func (dst Int8multirange) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Int8multirange) AssignTo(dst interface{}) error { - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *Int8multirange) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int8multirange{Status: Null} - return nil - } - - utmr, err := ParseUntypedTextMultirange(string(src)) - if err != nil { - return err - } - - var elements []Int8range - - if len(utmr.Elements) > 0 { - elements = make([]Int8range, len(utmr.Elements)) - - for i, s := range utmr.Elements { - var elem Int8range - - elemSrc := []byte(s) - - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = Int8multirange{Ranges: elements, Status: Present} - - return nil -} - -func (dst *Int8multirange) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int8multirange{Status: Null} - return nil - } - - rp := 0 - - numElems := int(binary.BigEndian.Uint32(src[rp:])) - rp += 4 - - if numElems == 0 { - *dst = Int8multirange{Status: Present} - return nil - } - - elements := make([]Int8range, numElems) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err := elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = Int8multirange{Ranges: elements, Status: Present} - return nil -} - -func (src Int8multirange) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = append(buf, '{') - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Ranges { - if i > 0 { - buf = append(buf, ',') - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - return nil, fmt.Errorf("multi-range does not allow null range") - } else { - buf = append(buf, string(elemBuf)...) - } - - } - - buf = append(buf, '}') - - return buf, nil -} - -func (src Int8multirange) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = pgio.AppendInt32(buf, int32(len(src.Ranges))) - - for i := range src.Ranges { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Ranges[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Int8multirange) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Int8multirange) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/int8range.go b/vendor/github.com/jackc/pgtype/int8range.go deleted file mode 100644 index 7136937..0000000 --- a/vendor/github.com/jackc/pgtype/int8range.go +++ /dev/null @@ -1,267 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "fmt" - - "github.com/jackc/pgio" -) - -type Int8range struct { - Lower Int8 - Upper Int8 - LowerType BoundType - UpperType BoundType - Status Status -} - -func (dst *Int8range) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = Int8range{Status: Null} - return nil - } - - switch value := src.(type) { - case Int8range: - *dst = value - case *Int8range: - *dst = *value - case string: - return dst.DecodeText(nil, []byte(value)) - default: - return fmt.Errorf("cannot convert %v to Int8range", src) - } - - return nil -} - -func (dst Int8range) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Int8range) AssignTo(dst interface{}) error { - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *Int8range) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int8range{Status: Null} - return nil - } - - utr, err := ParseUntypedTextRange(string(src)) - if err != nil { - return err - } - - *dst = Int8range{Status: Present} - - dst.LowerType = utr.LowerType - dst.UpperType = utr.UpperType - - if dst.LowerType == Empty { - return nil - } - - if dst.LowerType == Inclusive || dst.LowerType == Exclusive { - if err := dst.Lower.DecodeText(ci, []byte(utr.Lower)); err != nil { - return err - } - } - - if dst.UpperType == Inclusive || dst.UpperType == Exclusive { - if err := dst.Upper.DecodeText(ci, []byte(utr.Upper)); err != nil { - return err - } - } - - return nil -} - -func (dst *Int8range) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Int8range{Status: Null} - return nil - } - - ubr, err := ParseUntypedBinaryRange(src) - if err != nil { - return err - } - - *dst = Int8range{Status: Present} - - dst.LowerType = ubr.LowerType - dst.UpperType = ubr.UpperType - - if dst.LowerType == Empty { - return nil - } - - if dst.LowerType == Inclusive || dst.LowerType == Exclusive { - if err := dst.Lower.DecodeBinary(ci, ubr.Lower); err != nil { - return err - } - } - - if dst.UpperType == Inclusive || dst.UpperType == Exclusive { - if err := dst.Upper.DecodeBinary(ci, ubr.Upper); err != nil { - return err - } - } - - return nil -} - -func (src Int8range) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - switch src.LowerType { - case Exclusive, Unbounded: - buf = append(buf, '(') - case Inclusive: - buf = append(buf, '[') - case Empty: - return append(buf, "empty"...), nil - default: - return nil, fmt.Errorf("unknown lower bound type %v", src.LowerType) - } - - var err error - - if src.LowerType != Unbounded { - buf, err = src.Lower.EncodeText(ci, buf) - if err != nil { - return nil, err - } else if buf == nil { - return nil, fmt.Errorf("Lower cannot be null unless LowerType is Unbounded") - } - } - - buf = append(buf, ',') - - if src.UpperType != Unbounded { - buf, err = src.Upper.EncodeText(ci, buf) - if err != nil { - return nil, err - } else if buf == nil { - return nil, fmt.Errorf("Upper cannot be null unless UpperType is Unbounded") - } - } - - switch src.UpperType { - case Exclusive, Unbounded: - buf = append(buf, ')') - case Inclusive: - buf = append(buf, ']') - default: - return nil, fmt.Errorf("unknown upper bound type %v", src.UpperType) - } - - return buf, nil -} - -func (src Int8range) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - var rangeType byte - switch src.LowerType { - case Inclusive: - rangeType |= lowerInclusiveMask - case Unbounded: - rangeType |= lowerUnboundedMask - case Exclusive: - case Empty: - return append(buf, emptyMask), nil - default: - return nil, fmt.Errorf("unknown LowerType: %v", src.LowerType) - } - - switch src.UpperType { - case Inclusive: - rangeType |= upperInclusiveMask - case Unbounded: - rangeType |= upperUnboundedMask - case Exclusive: - default: - return nil, fmt.Errorf("unknown UpperType: %v", src.UpperType) - } - - buf = append(buf, rangeType) - - var err error - - if src.LowerType != Unbounded { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - buf, err = src.Lower.EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if buf == nil { - return nil, fmt.Errorf("Lower cannot be null unless LowerType is Unbounded") - } - - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - - if src.UpperType != Unbounded { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - buf, err = src.Upper.EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if buf == nil { - return nil, fmt.Errorf("Upper cannot be null unless UpperType is Unbounded") - } - - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Int8range) Scan(src interface{}) error { - if src == nil { - *dst = Int8range{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Int8range) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/interval.go b/vendor/github.com/jackc/pgtype/interval.go deleted file mode 100644 index 00ec47c..0000000 --- a/vendor/github.com/jackc/pgtype/interval.go +++ /dev/null @@ -1,257 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "strconv" - "strings" - "time" - - "github.com/jackc/pgio" -) - -const ( - microsecondsPerSecond = 1000000 - microsecondsPerMinute = 60 * microsecondsPerSecond - microsecondsPerHour = 60 * microsecondsPerMinute - microsecondsPerDay = 24 * microsecondsPerHour - microsecondsPerMonth = 30 * microsecondsPerDay -) - -type Interval struct { - Microseconds int64 - Days int32 - Months int32 - Status Status -} - -func (dst *Interval) Set(src interface{}) error { - if src == nil { - *dst = Interval{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case time.Duration: - *dst = Interval{Microseconds: int64(value) / 1000, Status: Present} - default: - if originalSrc, ok := underlyingPtrType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Interval", value) - } - - return nil -} - -func (dst Interval) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Interval) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - switch v := dst.(type) { - case *time.Duration: - us := int64(src.Months)*microsecondsPerMonth + int64(src.Days)*microsecondsPerDay + src.Microseconds - *v = time.Duration(us) * time.Microsecond - return nil - default: - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - return fmt.Errorf("unable to assign to %T", dst) - } - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (dst *Interval) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Interval{Status: Null} - return nil - } - - var microseconds int64 - var days int32 - var months int32 - - parts := strings.Split(string(src), " ") - - for i := 0; i < len(parts)-1; i += 2 { - scalar, err := strconv.ParseInt(parts[i], 10, 64) - if err != nil { - return fmt.Errorf("bad interval format") - } - - switch parts[i+1] { - case "year", "years": - months += int32(scalar * 12) - case "mon", "mons": - months += int32(scalar) - case "day", "days": - days = int32(scalar) - } - } - - if len(parts)%2 == 1 { - timeParts := strings.SplitN(parts[len(parts)-1], ":", 3) - if len(timeParts) != 3 { - return fmt.Errorf("bad interval format") - } - - var negative bool - if timeParts[0][0] == '-' { - negative = true - timeParts[0] = timeParts[0][1:] - } - - hours, err := strconv.ParseInt(timeParts[0], 10, 64) - if err != nil { - return fmt.Errorf("bad interval hour format: %s", timeParts[0]) - } - - minutes, err := strconv.ParseInt(timeParts[1], 10, 64) - if err != nil { - return fmt.Errorf("bad interval minute format: %s", timeParts[1]) - } - - secondParts := strings.SplitN(timeParts[2], ".", 2) - - seconds, err := strconv.ParseInt(secondParts[0], 10, 64) - if err != nil { - return fmt.Errorf("bad interval second format: %s", secondParts[0]) - } - - var uSeconds int64 - if len(secondParts) == 2 { - uSeconds, err = strconv.ParseInt(secondParts[1], 10, 64) - if err != nil { - return fmt.Errorf("bad interval decimal format: %s", secondParts[1]) - } - - for i := 0; i < 6-len(secondParts[1]); i++ { - uSeconds *= 10 - } - } - - microseconds = hours * microsecondsPerHour - microseconds += minutes * microsecondsPerMinute - microseconds += seconds * microsecondsPerSecond - microseconds += uSeconds - - if negative { - microseconds = -microseconds - } - } - - *dst = Interval{Months: months, Days: days, Microseconds: microseconds, Status: Present} - return nil -} - -func (dst *Interval) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Interval{Status: Null} - return nil - } - - if len(src) != 16 { - return fmt.Errorf("Received an invalid size for an interval: %d", len(src)) - } - - microseconds := int64(binary.BigEndian.Uint64(src)) - days := int32(binary.BigEndian.Uint32(src[8:])) - months := int32(binary.BigEndian.Uint32(src[12:])) - - *dst = Interval{Microseconds: microseconds, Days: days, Months: months, Status: Present} - return nil -} - -func (src Interval) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if src.Months != 0 { - buf = append(buf, strconv.FormatInt(int64(src.Months), 10)...) - buf = append(buf, " mon "...) - } - - if src.Days != 0 { - buf = append(buf, strconv.FormatInt(int64(src.Days), 10)...) - buf = append(buf, " day "...) - } - - absMicroseconds := src.Microseconds - if absMicroseconds < 0 { - absMicroseconds = -absMicroseconds - buf = append(buf, '-') - } - - hours := absMicroseconds / microsecondsPerHour - minutes := (absMicroseconds % microsecondsPerHour) / microsecondsPerMinute - seconds := (absMicroseconds % microsecondsPerMinute) / microsecondsPerSecond - microseconds := absMicroseconds % microsecondsPerSecond - - timeStr := fmt.Sprintf("%02d:%02d:%02d.%06d", hours, minutes, seconds, microseconds) - return append(buf, timeStr...), nil -} - -// EncodeBinary encodes src into w. -func (src Interval) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = pgio.AppendInt64(buf, src.Microseconds) - buf = pgio.AppendInt32(buf, src.Days) - return pgio.AppendInt32(buf, src.Months), nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Interval) Scan(src interface{}) error { - if src == nil { - *dst = Interval{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Interval) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/json.go b/vendor/github.com/jackc/pgtype/json.go deleted file mode 100644 index a9508bd..0000000 --- a/vendor/github.com/jackc/pgtype/json.go +++ /dev/null @@ -1,209 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/json" - "errors" - "fmt" - "reflect" -) - -type JSON struct { - Bytes []byte - Status Status -} - -func (dst *JSON) Set(src interface{}) error { - if src == nil { - *dst = JSON{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case string: - *dst = JSON{Bytes: []byte(value), Status: Present} - case *string: - if value == nil { - *dst = JSON{Status: Null} - } else { - *dst = JSON{Bytes: []byte(*value), Status: Present} - } - case []byte: - if value == nil { - *dst = JSON{Status: Null} - } else { - *dst = JSON{Bytes: value, Status: Present} - } - // Encode* methods are defined on *JSON. If JSON is passed directly then the - // struct itself would be encoded instead of Bytes. This is clearly a footgun - // so detect and return an error. See https://github.com/jackc/pgx/issues/350. - case JSON: - return errors.New("use pointer to pgtype.JSON instead of value") - // Same as above but for JSONB (because they share implementation) - case JSONB: - return errors.New("use pointer to pgtype.JSONB instead of value") - - default: - buf, err := json.Marshal(value) - if err != nil { - return err - } - *dst = JSON{Bytes: buf, Status: Present} - } - - return nil -} - -func (dst JSON) Get() interface{} { - switch dst.Status { - case Present: - var i interface{} - err := json.Unmarshal(dst.Bytes, &i) - if err != nil { - return dst - } - return i - case Null: - return nil - default: - return dst.Status - } -} - -func (src *JSON) AssignTo(dst interface{}) error { - switch v := dst.(type) { - case *string: - if src.Status == Present { - *v = string(src.Bytes) - } else { - return fmt.Errorf("cannot assign non-present status to %T", dst) - } - case **string: - if src.Status == Present { - s := string(src.Bytes) - *v = &s - return nil - } else { - *v = nil - return nil - } - case *[]byte: - if src.Status != Present { - *v = nil - } else { - buf := make([]byte, len(src.Bytes)) - copy(buf, src.Bytes) - *v = buf - } - default: - data := src.Bytes - if data == nil || src.Status != Present { - data = []byte("null") - } - - p := reflect.ValueOf(dst).Elem() - p.Set(reflect.Zero(p.Type())) - - return json.Unmarshal(data, dst) - } - - return nil -} - -func (JSON) PreferredResultFormat() int16 { - return TextFormatCode -} - -func (dst *JSON) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = JSON{Status: Null} - return nil - } - - *dst = JSON{Bytes: src, Status: Present} - return nil -} - -func (dst *JSON) DecodeBinary(ci *ConnInfo, src []byte) error { - return dst.DecodeText(ci, src) -} - -func (JSON) PreferredParamFormat() int16 { - return TextFormatCode -} - -func (src JSON) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return append(buf, src.Bytes...), nil -} - -func (src JSON) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - return src.EncodeText(ci, buf) -} - -// Scan implements the database/sql Scanner interface. -func (dst *JSON) Scan(src interface{}) error { - if src == nil { - *dst = JSON{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src JSON) Value() (driver.Value, error) { - switch src.Status { - case Present: - return src.Bytes, nil - case Null: - return nil, nil - default: - return nil, errUndefined - } -} - -func (src JSON) MarshalJSON() ([]byte, error) { - switch src.Status { - case Present: - return src.Bytes, nil - case Null: - return []byte("null"), nil - case Undefined: - return nil, errUndefined - } - - return nil, errBadStatus -} - -func (dst *JSON) UnmarshalJSON(b []byte) error { - if b == nil || string(b) == "null" { - *dst = JSON{Status: Null} - } else { - *dst = JSON{Bytes: b, Status: Present} - } - return nil - -} diff --git a/vendor/github.com/jackc/pgtype/json_array.go b/vendor/github.com/jackc/pgtype/json_array.go deleted file mode 100644 index 8d68882..0000000 --- a/vendor/github.com/jackc/pgtype/json_array.go +++ /dev/null @@ -1,546 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "encoding/json" - "fmt" - "reflect" - - "github.com/jackc/pgio" -) - -type JSONArray struct { - Elements []JSON - Dimensions []ArrayDimension - Status Status -} - -func (dst *JSONArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = JSONArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []string: - if value == nil { - *dst = JSONArray{Status: Null} - } else if len(value) == 0 { - *dst = JSONArray{Status: Present} - } else { - elements := make([]JSON, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = JSONArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case [][]byte: - if value == nil { - *dst = JSONArray{Status: Null} - } else if len(value) == 0 { - *dst = JSONArray{Status: Present} - } else { - elements := make([]JSON, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = JSONArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []json.RawMessage: - if value == nil { - *dst = JSONArray{Status: Null} - } else if len(value) == 0 { - *dst = JSONArray{Status: Present} - } else { - elements := make([]JSON, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = JSONArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []JSON: - if value == nil { - *dst = JSONArray{Status: Null} - } else if len(value) == 0 { - *dst = JSONArray{Status: Present} - } else { - *dst = JSONArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = JSONArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for JSONArray", src) - } - if elementsLength == 0 { - *dst = JSONArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to JSONArray", src) - } - - *dst = JSONArray{ - Elements: make([]JSON, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]JSON, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to JSONArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *JSONArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to JSONArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in JSONArray", err) - } - index++ - - return index, nil -} - -func (dst JSONArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *JSONArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]string: - *v = make([]string, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[][]byte: - *v = make([][]byte, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]json.RawMessage: - *v = make([]json.RawMessage, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *JSONArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from JSONArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from JSONArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *JSONArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = JSONArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []JSON - - if len(uta.Elements) > 0 { - elements = make([]JSON, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem JSON - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = JSONArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *JSONArray) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = JSONArray{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = JSONArray{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]JSON, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = JSONArray{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src JSONArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src JSONArray) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("json"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "json") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *JSONArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src JSONArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/jsonb.go b/vendor/github.com/jackc/pgtype/jsonb.go deleted file mode 100644 index c9dafc9..0000000 --- a/vendor/github.com/jackc/pgtype/jsonb.go +++ /dev/null @@ -1,85 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "fmt" -) - -type JSONB JSON - -func (dst *JSONB) Set(src interface{}) error { - return (*JSON)(dst).Set(src) -} - -func (dst JSONB) Get() interface{} { - return (JSON)(dst).Get() -} - -func (src *JSONB) AssignTo(dst interface{}) error { - return (*JSON)(src).AssignTo(dst) -} - -func (JSONB) PreferredResultFormat() int16 { - return TextFormatCode -} - -func (dst *JSONB) DecodeText(ci *ConnInfo, src []byte) error { - return (*JSON)(dst).DecodeText(ci, src) -} - -func (dst *JSONB) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = JSONB{Status: Null} - return nil - } - - if len(src) == 0 { - return fmt.Errorf("jsonb too short") - } - - if src[0] != 1 { - return fmt.Errorf("unknown jsonb version number %d", src[0]) - } - - *dst = JSONB{Bytes: src[1:], Status: Present} - return nil - -} - -func (JSONB) PreferredParamFormat() int16 { - return TextFormatCode -} - -func (src JSONB) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - return (JSON)(src).EncodeText(ci, buf) -} - -func (src JSONB) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = append(buf, 1) - return append(buf, src.Bytes...), nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *JSONB) Scan(src interface{}) error { - return (*JSON)(dst).Scan(src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src JSONB) Value() (driver.Value, error) { - return (JSON)(src).Value() -} - -func (src JSONB) MarshalJSON() ([]byte, error) { - return (JSON)(src).MarshalJSON() -} - -func (dst *JSONB) UnmarshalJSON(b []byte) error { - return (*JSON)(dst).UnmarshalJSON(b) -} diff --git a/vendor/github.com/jackc/pgtype/jsonb_array.go b/vendor/github.com/jackc/pgtype/jsonb_array.go deleted file mode 100644 index e78ad37..0000000 --- a/vendor/github.com/jackc/pgtype/jsonb_array.go +++ /dev/null @@ -1,546 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "encoding/json" - "fmt" - "reflect" - - "github.com/jackc/pgio" -) - -type JSONBArray struct { - Elements []JSONB - Dimensions []ArrayDimension - Status Status -} - -func (dst *JSONBArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = JSONBArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []string: - if value == nil { - *dst = JSONBArray{Status: Null} - } else if len(value) == 0 { - *dst = JSONBArray{Status: Present} - } else { - elements := make([]JSONB, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = JSONBArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case [][]byte: - if value == nil { - *dst = JSONBArray{Status: Null} - } else if len(value) == 0 { - *dst = JSONBArray{Status: Present} - } else { - elements := make([]JSONB, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = JSONBArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []json.RawMessage: - if value == nil { - *dst = JSONBArray{Status: Null} - } else if len(value) == 0 { - *dst = JSONBArray{Status: Present} - } else { - elements := make([]JSONB, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = JSONBArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []JSONB: - if value == nil { - *dst = JSONBArray{Status: Null} - } else if len(value) == 0 { - *dst = JSONBArray{Status: Present} - } else { - *dst = JSONBArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = JSONBArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for JSONBArray", src) - } - if elementsLength == 0 { - *dst = JSONBArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to JSONBArray", src) - } - - *dst = JSONBArray{ - Elements: make([]JSONB, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]JSONB, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to JSONBArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *JSONBArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to JSONBArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in JSONBArray", err) - } - index++ - - return index, nil -} - -func (dst JSONBArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *JSONBArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]string: - *v = make([]string, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[][]byte: - *v = make([][]byte, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]json.RawMessage: - *v = make([]json.RawMessage, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *JSONBArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from JSONBArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from JSONBArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *JSONBArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = JSONBArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []JSONB - - if len(uta.Elements) > 0 { - elements = make([]JSONB, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem JSONB - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = JSONBArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *JSONBArray) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = JSONBArray{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = JSONBArray{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]JSONB, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = JSONBArray{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src JSONBArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src JSONBArray) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("jsonb"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "jsonb") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *JSONBArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src JSONBArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/line.go b/vendor/github.com/jackc/pgtype/line.go deleted file mode 100644 index 3564b17..0000000 --- a/vendor/github.com/jackc/pgtype/line.go +++ /dev/null @@ -1,148 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "math" - "strconv" - "strings" - - "github.com/jackc/pgio" -) - -type Line struct { - A, B, C float64 - Status Status -} - -func (dst *Line) Set(src interface{}) error { - return fmt.Errorf("cannot convert %v to Line", src) -} - -func (dst Line) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Line) AssignTo(dst interface{}) error { - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *Line) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Line{Status: Null} - return nil - } - - if len(src) < 7 { - return fmt.Errorf("invalid length for Line: %v", len(src)) - } - - parts := strings.SplitN(string(src[1:len(src)-1]), ",", 3) - if len(parts) < 3 { - return fmt.Errorf("invalid format for line") - } - - a, err := strconv.ParseFloat(parts[0], 64) - if err != nil { - return err - } - - b, err := strconv.ParseFloat(parts[1], 64) - if err != nil { - return err - } - - c, err := strconv.ParseFloat(parts[2], 64) - if err != nil { - return err - } - - *dst = Line{A: a, B: b, C: c, Status: Present} - return nil -} - -func (dst *Line) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Line{Status: Null} - return nil - } - - if len(src) != 24 { - return fmt.Errorf("invalid length for Line: %v", len(src)) - } - - a := binary.BigEndian.Uint64(src) - b := binary.BigEndian.Uint64(src[8:]) - c := binary.BigEndian.Uint64(src[16:]) - - *dst = Line{ - A: math.Float64frombits(a), - B: math.Float64frombits(b), - C: math.Float64frombits(c), - Status: Present, - } - return nil -} - -func (src Line) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = append(buf, fmt.Sprintf(`{%s,%s,%s}`, - strconv.FormatFloat(src.A, 'f', -1, 64), - strconv.FormatFloat(src.B, 'f', -1, 64), - strconv.FormatFloat(src.C, 'f', -1, 64), - )...) - - return buf, nil -} - -func (src Line) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = pgio.AppendUint64(buf, math.Float64bits(src.A)) - buf = pgio.AppendUint64(buf, math.Float64bits(src.B)) - buf = pgio.AppendUint64(buf, math.Float64bits(src.C)) - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Line) Scan(src interface{}) error { - if src == nil { - *dst = Line{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Line) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/lseg.go b/vendor/github.com/jackc/pgtype/lseg.go deleted file mode 100644 index 894dae8..0000000 --- a/vendor/github.com/jackc/pgtype/lseg.go +++ /dev/null @@ -1,165 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "math" - "strconv" - "strings" - - "github.com/jackc/pgio" -) - -type Lseg struct { - P [2]Vec2 - Status Status -} - -func (dst *Lseg) Set(src interface{}) error { - return fmt.Errorf("cannot convert %v to Lseg", src) -} - -func (dst Lseg) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Lseg) AssignTo(dst interface{}) error { - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *Lseg) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Lseg{Status: Null} - return nil - } - - if len(src) < 11 { - return fmt.Errorf("invalid length for Lseg: %v", len(src)) - } - - str := string(src[2:]) - - var end int - end = strings.IndexByte(str, ',') - - x1, err := strconv.ParseFloat(str[:end], 64) - if err != nil { - return err - } - - str = str[end+1:] - end = strings.IndexByte(str, ')') - - y1, err := strconv.ParseFloat(str[:end], 64) - if err != nil { - return err - } - - str = str[end+3:] - end = strings.IndexByte(str, ',') - - x2, err := strconv.ParseFloat(str[:end], 64) - if err != nil { - return err - } - - str = str[end+1 : len(str)-2] - - y2, err := strconv.ParseFloat(str, 64) - if err != nil { - return err - } - - *dst = Lseg{P: [2]Vec2{{x1, y1}, {x2, y2}}, Status: Present} - return nil -} - -func (dst *Lseg) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Lseg{Status: Null} - return nil - } - - if len(src) != 32 { - return fmt.Errorf("invalid length for Lseg: %v", len(src)) - } - - x1 := binary.BigEndian.Uint64(src) - y1 := binary.BigEndian.Uint64(src[8:]) - x2 := binary.BigEndian.Uint64(src[16:]) - y2 := binary.BigEndian.Uint64(src[24:]) - - *dst = Lseg{ - P: [2]Vec2{ - {math.Float64frombits(x1), math.Float64frombits(y1)}, - {math.Float64frombits(x2), math.Float64frombits(y2)}, - }, - Status: Present, - } - return nil -} - -func (src Lseg) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = append(buf, fmt.Sprintf(`[(%s,%s),(%s,%s)]`, - strconv.FormatFloat(src.P[0].X, 'f', -1, 64), - strconv.FormatFloat(src.P[0].Y, 'f', -1, 64), - strconv.FormatFloat(src.P[1].X, 'f', -1, 64), - strconv.FormatFloat(src.P[1].Y, 'f', -1, 64), - )...) - - return buf, nil -} - -func (src Lseg) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = pgio.AppendUint64(buf, math.Float64bits(src.P[0].X)) - buf = pgio.AppendUint64(buf, math.Float64bits(src.P[0].Y)) - buf = pgio.AppendUint64(buf, math.Float64bits(src.P[1].X)) - buf = pgio.AppendUint64(buf, math.Float64bits(src.P[1].Y)) - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Lseg) Scan(src interface{}) error { - if src == nil { - *dst = Lseg{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Lseg) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/ltree.go b/vendor/github.com/jackc/pgtype/ltree.go deleted file mode 100644 index 8c8d421..0000000 --- a/vendor/github.com/jackc/pgtype/ltree.go +++ /dev/null @@ -1,72 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "fmt" -) - -type Ltree Text - -func (dst *Ltree) Set(src interface{}) error { - return (*Text)(dst).Set(src) -} - -func (dst Ltree) Get() interface{} { - return (Text)(dst).Get() -} - -func (src *Ltree) AssignTo(dst interface{}) error { - return (*Text)(src).AssignTo(dst) -} - -func (src Ltree) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - return (Text)(src).EncodeText(ci, buf) -} - -func (src Ltree) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - buf = append(buf, 1) - return append(buf, src.String...), nil -} - -func (Ltree) PreferredResultFormat() int16 { - return TextFormatCode -} - -func (dst *Ltree) DecodeText(ci *ConnInfo, src []byte) error { - return (*Text)(dst).DecodeText(ci, src) -} - -func (dst *Ltree) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Ltree{Status: Null} - return nil - } - - // Get Ltree version, only 1 is allowed - version := src[0] - if version != 1 { - return fmt.Errorf("unsupported ltree version %d", version) - } - - ltreeStr := string(src[1:]) - *dst = Ltree{String: ltreeStr, Status: Present} - return nil -} - -func (Ltree) PreferredParamFormat() int16 { - return TextFormatCode -} - -func (dst *Ltree) Scan(src interface{}) error { - return (*Text)(dst).Scan(src) -} - -func (src Ltree) Value() (driver.Value, error) { - return (Text)(src).Value() -} diff --git a/vendor/github.com/jackc/pgtype/macaddr.go b/vendor/github.com/jackc/pgtype/macaddr.go deleted file mode 100644 index 1d3cfe7..0000000 --- a/vendor/github.com/jackc/pgtype/macaddr.go +++ /dev/null @@ -1,173 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "fmt" - "net" -) - -type Macaddr struct { - Addr net.HardwareAddr - Status Status -} - -func (dst *Macaddr) Set(src interface{}) error { - if src == nil { - *dst = Macaddr{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case net.HardwareAddr: - addr := make(net.HardwareAddr, len(value)) - copy(addr, value) - *dst = Macaddr{Addr: addr, Status: Present} - case string: - addr, err := net.ParseMAC(value) - if err != nil { - return err - } - *dst = Macaddr{Addr: addr, Status: Present} - case *net.HardwareAddr: - if value == nil { - *dst = Macaddr{Status: Null} - } else { - return dst.Set(*value) - } - case *string: - if value == nil { - *dst = Macaddr{Status: Null} - } else { - return dst.Set(*value) - } - default: - if originalSrc, ok := underlyingPtrType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Macaddr", value) - } - - return nil -} - -func (dst Macaddr) Get() interface{} { - switch dst.Status { - case Present: - return dst.Addr - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Macaddr) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - switch v := dst.(type) { - case *net.HardwareAddr: - *v = make(net.HardwareAddr, len(src.Addr)) - copy(*v, src.Addr) - return nil - case *string: - *v = src.Addr.String() - return nil - default: - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - return fmt.Errorf("unable to assign to %T", dst) - } - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (dst *Macaddr) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Macaddr{Status: Null} - return nil - } - - addr, err := net.ParseMAC(string(src)) - if err != nil { - return err - } - - *dst = Macaddr{Addr: addr, Status: Present} - return nil -} - -func (dst *Macaddr) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Macaddr{Status: Null} - return nil - } - - if len(src) != 6 { - return fmt.Errorf("Received an invalid size for a macaddr: %d", len(src)) - } - - addr := make(net.HardwareAddr, 6) - copy(addr, src) - - *dst = Macaddr{Addr: addr, Status: Present} - - return nil -} - -func (src Macaddr) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return append(buf, src.Addr.String()...), nil -} - -// EncodeBinary encodes src into w. -func (src Macaddr) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return append(buf, src.Addr...), nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Macaddr) Scan(src interface{}) error { - if src == nil { - *dst = Macaddr{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Macaddr) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/macaddr_array.go b/vendor/github.com/jackc/pgtype/macaddr_array.go deleted file mode 100644 index bdb1f20..0000000 --- a/vendor/github.com/jackc/pgtype/macaddr_array.go +++ /dev/null @@ -1,518 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "net" - "reflect" - - "github.com/jackc/pgio" -) - -type MacaddrArray struct { - Elements []Macaddr - Dimensions []ArrayDimension - Status Status -} - -func (dst *MacaddrArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = MacaddrArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []net.HardwareAddr: - if value == nil { - *dst = MacaddrArray{Status: Null} - } else if len(value) == 0 { - *dst = MacaddrArray{Status: Present} - } else { - elements := make([]Macaddr, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = MacaddrArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*net.HardwareAddr: - if value == nil { - *dst = MacaddrArray{Status: Null} - } else if len(value) == 0 { - *dst = MacaddrArray{Status: Present} - } else { - elements := make([]Macaddr, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = MacaddrArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []Macaddr: - if value == nil { - *dst = MacaddrArray{Status: Null} - } else if len(value) == 0 { - *dst = MacaddrArray{Status: Present} - } else { - *dst = MacaddrArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = MacaddrArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for MacaddrArray", src) - } - if elementsLength == 0 { - *dst = MacaddrArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to MacaddrArray", src) - } - - *dst = MacaddrArray{ - Elements: make([]Macaddr, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]Macaddr, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to MacaddrArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *MacaddrArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to MacaddrArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in MacaddrArray", err) - } - index++ - - return index, nil -} - -func (dst MacaddrArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *MacaddrArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]net.HardwareAddr: - *v = make([]net.HardwareAddr, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*net.HardwareAddr: - *v = make([]*net.HardwareAddr, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *MacaddrArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from MacaddrArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from MacaddrArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *MacaddrArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = MacaddrArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []Macaddr - - if len(uta.Elements) > 0 { - elements = make([]Macaddr, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem Macaddr - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = MacaddrArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *MacaddrArray) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = MacaddrArray{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = MacaddrArray{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]Macaddr, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = MacaddrArray{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src MacaddrArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src MacaddrArray) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("macaddr"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "macaddr") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *MacaddrArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src MacaddrArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/multirange.go b/vendor/github.com/jackc/pgtype/multirange.go deleted file mode 100644 index beb11f7..0000000 --- a/vendor/github.com/jackc/pgtype/multirange.go +++ /dev/null @@ -1,83 +0,0 @@ -package pgtype - -import ( - "bytes" - "fmt" -) - -type UntypedTextMultirange struct { - Elements []string -} - -func ParseUntypedTextMultirange(src string) (*UntypedTextMultirange, error) { - utmr := &UntypedTextMultirange{} - utmr.Elements = make([]string, 0) - - buf := bytes.NewBufferString(src) - - skipWhitespace(buf) - - r, _, err := buf.ReadRune() - if err != nil { - return nil, fmt.Errorf("invalid array: %v", err) - } - - if r != '{' { - return nil, fmt.Errorf("invalid multirange, expected '{': %v", err) - } - -parseValueLoop: - for { - r, _, err = buf.ReadRune() - if err != nil { - return nil, fmt.Errorf("invalid multirange: %v", err) - } - - switch r { - case ',': // skip range separator - case '}': - break parseValueLoop - default: - buf.UnreadRune() - value, err := parseRange(buf) - if err != nil { - return nil, fmt.Errorf("invalid multirange value: %v", err) - } - utmr.Elements = append(utmr.Elements, value) - } - } - - skipWhitespace(buf) - - if buf.Len() > 0 { - return nil, fmt.Errorf("unexpected trailing data: %v", buf.String()) - } - - return utmr, nil - -} - -func parseRange(buf *bytes.Buffer) (string, error) { - - s := &bytes.Buffer{} - - boundSepRead := false - for { - r, _, err := buf.ReadRune() - if err != nil { - return "", err - } - - switch r { - case ',', '}': - if r == ',' && !boundSepRead { - boundSepRead = true - break - } - buf.UnreadRune() - return s.String(), nil - } - - s.WriteRune(r) - } -} diff --git a/vendor/github.com/jackc/pgtype/name.go b/vendor/github.com/jackc/pgtype/name.go deleted file mode 100644 index 7ce8d25..0000000 --- a/vendor/github.com/jackc/pgtype/name.go +++ /dev/null @@ -1,58 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" -) - -// Name is a type used for PostgreSQL's special 63-byte -// name data type, used for identifiers like table names. -// The pg_class.relname column is a good example of where the -// name data type is used. -// -// Note that the underlying Go data type of pgx.Name is string, -// so there is no way to enforce the 63-byte length. Inputting -// a longer name into PostgreSQL will result in silent truncation -// to 63 bytes. -// -// Also, if you have custom-compiled PostgreSQL and set -// NAMEDATALEN to a different value, obviously that number of -// bytes applies, rather than the default 63. -type Name Text - -func (dst *Name) Set(src interface{}) error { - return (*Text)(dst).Set(src) -} - -func (dst Name) Get() interface{} { - return (Text)(dst).Get() -} - -func (src *Name) AssignTo(dst interface{}) error { - return (*Text)(src).AssignTo(dst) -} - -func (dst *Name) DecodeText(ci *ConnInfo, src []byte) error { - return (*Text)(dst).DecodeText(ci, src) -} - -func (dst *Name) DecodeBinary(ci *ConnInfo, src []byte) error { - return (*Text)(dst).DecodeBinary(ci, src) -} - -func (src Name) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - return (Text)(src).EncodeText(ci, buf) -} - -func (src Name) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - return (Text)(src).EncodeBinary(ci, buf) -} - -// Scan implements the database/sql Scanner interface. -func (dst *Name) Scan(src interface{}) error { - return (*Text)(dst).Scan(src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Name) Value() (driver.Value, error) { - return (Text)(src).Value() -} diff --git a/vendor/github.com/jackc/pgtype/num_multirange.go b/vendor/github.com/jackc/pgtype/num_multirange.go deleted file mode 100644 index cbabc8a..0000000 --- a/vendor/github.com/jackc/pgtype/num_multirange.go +++ /dev/null @@ -1,239 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - - "github.com/jackc/pgio" -) - -type Nummultirange struct { - Ranges []Numrange - Status Status -} - -func (dst *Nummultirange) Set(src interface{}) error { - //untyped nil and typed nil interfaces are different - if src == nil { - *dst = Nummultirange{Status: Null} - return nil - } - - switch value := src.(type) { - case Nummultirange: - *dst = value - case *Nummultirange: - *dst = *value - case string: - return dst.DecodeText(nil, []byte(value)) - case []Numrange: - if value == nil { - *dst = Nummultirange{Status: Null} - } else if len(value) == 0 { - *dst = Nummultirange{Status: Present} - } else { - elements := make([]Numrange, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Nummultirange{ - Ranges: elements, - Status: Present, - } - } - case []*Numrange: - if value == nil { - *dst = Nummultirange{Status: Null} - } else if len(value) == 0 { - *dst = Nummultirange{Status: Present} - } else { - elements := make([]Numrange, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = Nummultirange{ - Ranges: elements, - Status: Present, - } - } - default: - return fmt.Errorf("cannot convert %v to Nummultirange", src) - } - - return nil - -} - -func (dst Nummultirange) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Nummultirange) AssignTo(dst interface{}) error { - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *Nummultirange) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Nummultirange{Status: Null} - return nil - } - - utmr, err := ParseUntypedTextMultirange(string(src)) - if err != nil { - return err - } - - var elements []Numrange - - if len(utmr.Elements) > 0 { - elements = make([]Numrange, len(utmr.Elements)) - - for i, s := range utmr.Elements { - var elem Numrange - - elemSrc := []byte(s) - - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = Nummultirange{Ranges: elements, Status: Present} - - return nil -} - -func (dst *Nummultirange) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Nummultirange{Status: Null} - return nil - } - - rp := 0 - - numElems := int(binary.BigEndian.Uint32(src[rp:])) - rp += 4 - - if numElems == 0 { - *dst = Nummultirange{Status: Present} - return nil - } - - elements := make([]Numrange, numElems) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err := elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = Nummultirange{Ranges: elements, Status: Present} - return nil -} - -func (src Nummultirange) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = append(buf, '{') - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Ranges { - if i > 0 { - buf = append(buf, ',') - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - return nil, fmt.Errorf("multi-range does not allow null range") - } else { - buf = append(buf, string(elemBuf)...) - } - - } - - buf = append(buf, '}') - - return buf, nil -} - -func (src Nummultirange) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = pgio.AppendInt32(buf, int32(len(src.Ranges))) - - for i := range src.Ranges { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Ranges[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Nummultirange) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Nummultirange) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/numeric.go b/vendor/github.com/jackc/pgtype/numeric.go deleted file mode 100644 index 1f32b36..0000000 --- a/vendor/github.com/jackc/pgtype/numeric.go +++ /dev/null @@ -1,853 +0,0 @@ -package pgtype - -import ( - "bytes" - "database/sql/driver" - "encoding/binary" - "fmt" - "math" - "math/big" - "strconv" - "strings" - - "github.com/jackc/pgio" -) - -// PostgreSQL internal numeric storage uses 16-bit "digits" with base of 10,000 -const nbase = 10000 - -const ( - pgNumericNaN = 0x00000000c0000000 - pgNumericNaNSign = 0xc000 - - pgNumericPosInf = 0x00000000d0000000 - pgNumericPosInfSign = 0xd000 - - pgNumericNegInf = 0x00000000f0000000 - pgNumericNegInfSign = 0xf000 -) - -var big0 *big.Int = big.NewInt(0) -var big1 *big.Int = big.NewInt(1) -var big10 *big.Int = big.NewInt(10) -var big100 *big.Int = big.NewInt(100) -var big1000 *big.Int = big.NewInt(1000) - -var bigMaxInt8 *big.Int = big.NewInt(math.MaxInt8) -var bigMinInt8 *big.Int = big.NewInt(math.MinInt8) -var bigMaxInt16 *big.Int = big.NewInt(math.MaxInt16) -var bigMinInt16 *big.Int = big.NewInt(math.MinInt16) -var bigMaxInt32 *big.Int = big.NewInt(math.MaxInt32) -var bigMinInt32 *big.Int = big.NewInt(math.MinInt32) -var bigMaxInt64 *big.Int = big.NewInt(math.MaxInt64) -var bigMinInt64 *big.Int = big.NewInt(math.MinInt64) -var bigMaxInt *big.Int = big.NewInt(int64(maxInt)) -var bigMinInt *big.Int = big.NewInt(int64(minInt)) - -var bigMaxUint8 *big.Int = big.NewInt(math.MaxUint8) -var bigMaxUint16 *big.Int = big.NewInt(math.MaxUint16) -var bigMaxUint32 *big.Int = big.NewInt(math.MaxUint32) -var bigMaxUint64 *big.Int = (&big.Int{}).SetUint64(uint64(math.MaxUint64)) -var bigMaxUint *big.Int = (&big.Int{}).SetUint64(uint64(maxUint)) - -var bigNBase *big.Int = big.NewInt(nbase) -var bigNBaseX2 *big.Int = big.NewInt(nbase * nbase) -var bigNBaseX3 *big.Int = big.NewInt(nbase * nbase * nbase) -var bigNBaseX4 *big.Int = big.NewInt(nbase * nbase * nbase * nbase) - -type Numeric struct { - Int *big.Int - Exp int32 - Status Status - NaN bool - InfinityModifier InfinityModifier -} - -func (dst *Numeric) Set(src interface{}) error { - if src == nil { - *dst = Numeric{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case float32: - if math.IsNaN(float64(value)) { - *dst = Numeric{Status: Present, NaN: true} - return nil - } else if math.IsInf(float64(value), 1) { - *dst = Numeric{Status: Present, InfinityModifier: Infinity} - return nil - } else if math.IsInf(float64(value), -1) { - *dst = Numeric{Status: Present, InfinityModifier: NegativeInfinity} - return nil - } - num, exp, err := parseNumericString(strconv.FormatFloat(float64(value), 'f', -1, 64)) - if err != nil { - return err - } - *dst = Numeric{Int: num, Exp: exp, Status: Present} - case float64: - if math.IsNaN(value) { - *dst = Numeric{Status: Present, NaN: true} - return nil - } else if math.IsInf(value, 1) { - *dst = Numeric{Status: Present, InfinityModifier: Infinity} - return nil - } else if math.IsInf(value, -1) { - *dst = Numeric{Status: Present, InfinityModifier: NegativeInfinity} - return nil - } - num, exp, err := parseNumericString(strconv.FormatFloat(value, 'f', -1, 64)) - if err != nil { - return err - } - *dst = Numeric{Int: num, Exp: exp, Status: Present} - case int8: - *dst = Numeric{Int: big.NewInt(int64(value)), Status: Present} - case uint8: - *dst = Numeric{Int: big.NewInt(int64(value)), Status: Present} - case int16: - *dst = Numeric{Int: big.NewInt(int64(value)), Status: Present} - case uint16: - *dst = Numeric{Int: big.NewInt(int64(value)), Status: Present} - case int32: - *dst = Numeric{Int: big.NewInt(int64(value)), Status: Present} - case uint32: - *dst = Numeric{Int: big.NewInt(int64(value)), Status: Present} - case int64: - *dst = Numeric{Int: big.NewInt(value), Status: Present} - case uint64: - *dst = Numeric{Int: (&big.Int{}).SetUint64(value), Status: Present} - case int: - *dst = Numeric{Int: big.NewInt(int64(value)), Status: Present} - case uint: - *dst = Numeric{Int: (&big.Int{}).SetUint64(uint64(value)), Status: Present} - case string: - num, exp, err := parseNumericString(value) - if err != nil { - return err - } - *dst = Numeric{Int: num, Exp: exp, Status: Present} - case *float64: - if value == nil { - *dst = Numeric{Status: Null} - } else { - return dst.Set(*value) - } - case *float32: - if value == nil { - *dst = Numeric{Status: Null} - } else { - return dst.Set(*value) - } - case *int8: - if value == nil { - *dst = Numeric{Status: Null} - } else { - return dst.Set(*value) - } - case *uint8: - if value == nil { - *dst = Numeric{Status: Null} - } else { - return dst.Set(*value) - } - case *int16: - if value == nil { - *dst = Numeric{Status: Null} - } else { - return dst.Set(*value) - } - case *uint16: - if value == nil { - *dst = Numeric{Status: Null} - } else { - return dst.Set(*value) - } - case *int32: - if value == nil { - *dst = Numeric{Status: Null} - } else { - return dst.Set(*value) - } - case *uint32: - if value == nil { - *dst = Numeric{Status: Null} - } else { - return dst.Set(*value) - } - case *int64: - if value == nil { - *dst = Numeric{Status: Null} - } else { - return dst.Set(*value) - } - case *uint64: - if value == nil { - *dst = Numeric{Status: Null} - } else { - return dst.Set(*value) - } - case *int: - if value == nil { - *dst = Numeric{Status: Null} - } else { - return dst.Set(*value) - } - case *uint: - if value == nil { - *dst = Numeric{Status: Null} - } else { - return dst.Set(*value) - } - case *string: - if value == nil { - *dst = Numeric{Status: Null} - } else { - return dst.Set(*value) - } - case InfinityModifier: - *dst = Numeric{InfinityModifier: value, Status: Present} - default: - if originalSrc, ok := underlyingNumberType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Numeric", value) - } - - return nil -} - -func (dst Numeric) Get() interface{} { - switch dst.Status { - case Present: - if dst.InfinityModifier != None { - return dst.InfinityModifier - } - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Numeric) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - switch v := dst.(type) { - case *float32: - f, err := src.toFloat64() - if err != nil { - return err - } - return float64AssignTo(f, src.Status, dst) - case *float64: - f, err := src.toFloat64() - if err != nil { - return err - } - return float64AssignTo(f, src.Status, dst) - case *int: - normalizedInt, err := src.toBigInt() - if err != nil { - return err - } - if normalizedInt.Cmp(bigMaxInt) > 0 { - return fmt.Errorf("%v is greater than maximum value for %T", normalizedInt, *v) - } - if normalizedInt.Cmp(bigMinInt) < 0 { - return fmt.Errorf("%v is less than minimum value for %T", normalizedInt, *v) - } - *v = int(normalizedInt.Int64()) - case *int8: - normalizedInt, err := src.toBigInt() - if err != nil { - return err - } - if normalizedInt.Cmp(bigMaxInt8) > 0 { - return fmt.Errorf("%v is greater than maximum value for %T", normalizedInt, *v) - } - if normalizedInt.Cmp(bigMinInt8) < 0 { - return fmt.Errorf("%v is less than minimum value for %T", normalizedInt, *v) - } - *v = int8(normalizedInt.Int64()) - case *int16: - normalizedInt, err := src.toBigInt() - if err != nil { - return err - } - if normalizedInt.Cmp(bigMaxInt16) > 0 { - return fmt.Errorf("%v is greater than maximum value for %T", normalizedInt, *v) - } - if normalizedInt.Cmp(bigMinInt16) < 0 { - return fmt.Errorf("%v is less than minimum value for %T", normalizedInt, *v) - } - *v = int16(normalizedInt.Int64()) - case *int32: - normalizedInt, err := src.toBigInt() - if err != nil { - return err - } - if normalizedInt.Cmp(bigMaxInt32) > 0 { - return fmt.Errorf("%v is greater than maximum value for %T", normalizedInt, *v) - } - if normalizedInt.Cmp(bigMinInt32) < 0 { - return fmt.Errorf("%v is less than minimum value for %T", normalizedInt, *v) - } - *v = int32(normalizedInt.Int64()) - case *int64: - normalizedInt, err := src.toBigInt() - if err != nil { - return err - } - if normalizedInt.Cmp(bigMaxInt64) > 0 { - return fmt.Errorf("%v is greater than maximum value for %T", normalizedInt, *v) - } - if normalizedInt.Cmp(bigMinInt64) < 0 { - return fmt.Errorf("%v is less than minimum value for %T", normalizedInt, *v) - } - *v = normalizedInt.Int64() - case *uint: - normalizedInt, err := src.toBigInt() - if err != nil { - return err - } - if normalizedInt.Cmp(big0) < 0 { - return fmt.Errorf("%d is less than zero for %T", normalizedInt, *v) - } else if normalizedInt.Cmp(bigMaxUint) > 0 { - return fmt.Errorf("%d is greater than maximum value for %T", normalizedInt, *v) - } - *v = uint(normalizedInt.Uint64()) - case *uint8: - normalizedInt, err := src.toBigInt() - if err != nil { - return err - } - if normalizedInt.Cmp(big0) < 0 { - return fmt.Errorf("%d is less than zero for %T", normalizedInt, *v) - } else if normalizedInt.Cmp(bigMaxUint8) > 0 { - return fmt.Errorf("%d is greater than maximum value for %T", normalizedInt, *v) - } - *v = uint8(normalizedInt.Uint64()) - case *uint16: - normalizedInt, err := src.toBigInt() - if err != nil { - return err - } - if normalizedInt.Cmp(big0) < 0 { - return fmt.Errorf("%d is less than zero for %T", normalizedInt, *v) - } else if normalizedInt.Cmp(bigMaxUint16) > 0 { - return fmt.Errorf("%d is greater than maximum value for %T", normalizedInt, *v) - } - *v = uint16(normalizedInt.Uint64()) - case *uint32: - normalizedInt, err := src.toBigInt() - if err != nil { - return err - } - if normalizedInt.Cmp(big0) < 0 { - return fmt.Errorf("%d is less than zero for %T", normalizedInt, *v) - } else if normalizedInt.Cmp(bigMaxUint32) > 0 { - return fmt.Errorf("%d is greater than maximum value for %T", normalizedInt, *v) - } - *v = uint32(normalizedInt.Uint64()) - case *uint64: - normalizedInt, err := src.toBigInt() - if err != nil { - return err - } - if normalizedInt.Cmp(big0) < 0 { - return fmt.Errorf("%d is less than zero for %T", normalizedInt, *v) - } else if normalizedInt.Cmp(bigMaxUint64) > 0 { - return fmt.Errorf("%d is greater than maximum value for %T", normalizedInt, *v) - } - *v = normalizedInt.Uint64() - case *big.Rat: - rat, err := src.toBigRat() - if err != nil { - return err - } - v.Set(rat) - case *string: - buf, err := encodeNumericText(*src, nil) - if err != nil { - return err - } - *v = string(buf) - default: - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - return fmt.Errorf("unable to assign to %T", dst) - } - case Null: - return NullAssignTo(dst) - } - - return nil -} - -func (dst *Numeric) toBigInt() (*big.Int, error) { - if dst.Exp == 0 { - return dst.Int, nil - } - - num := &big.Int{} - num.Set(dst.Int) - if dst.Exp > 0 { - mul := &big.Int{} - mul.Exp(big10, big.NewInt(int64(dst.Exp)), nil) - num.Mul(num, mul) - return num, nil - } - - div := &big.Int{} - div.Exp(big10, big.NewInt(int64(-dst.Exp)), nil) - remainder := &big.Int{} - num.DivMod(num, div, remainder) - if remainder.Cmp(big0) != 0 { - return nil, fmt.Errorf("cannot convert %v to integer", dst) - } - return num, nil -} - -func (dst *Numeric) toBigRat() (*big.Rat, error) { - if dst.NaN { - return nil, fmt.Errorf("%v is not a number", dst) - } else if dst.InfinityModifier == Infinity { - return nil, fmt.Errorf("%v is infinity", dst) - } else if dst.InfinityModifier == NegativeInfinity { - return nil, fmt.Errorf("%v is -infinity", dst) - } - - num := new(big.Rat).SetInt(dst.Int) - if dst.Exp > 0 { - mul := new(big.Int).Exp(big10, big.NewInt(int64(dst.Exp)), nil) - num.Mul(num, new(big.Rat).SetInt(mul)) - } else if dst.Exp < 0 { - mul := new(big.Int).Exp(big10, big.NewInt(int64(-dst.Exp)), nil) - num.Quo(num, new(big.Rat).SetInt(mul)) - } - return num, nil -} - -func (src *Numeric) toFloat64() (float64, error) { - if src.NaN { - return math.NaN(), nil - } else if src.InfinityModifier == Infinity { - return math.Inf(1), nil - } else if src.InfinityModifier == NegativeInfinity { - return math.Inf(-1), nil - } - - buf := make([]byte, 0, 32) - - buf = append(buf, src.Int.String()...) - buf = append(buf, 'e') - buf = append(buf, strconv.FormatInt(int64(src.Exp), 10)...) - - f, err := strconv.ParseFloat(string(buf), 64) - if err != nil { - return 0, err - } - return f, nil -} - -func (dst *Numeric) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Numeric{Status: Null} - return nil - } - - if string(src) == "NaN" { - *dst = Numeric{Status: Present, NaN: true} - return nil - } else if string(src) == "Infinity" { - *dst = Numeric{Status: Present, InfinityModifier: Infinity} - return nil - } else if string(src) == "-Infinity" { - *dst = Numeric{Status: Present, InfinityModifier: NegativeInfinity} - return nil - } - - num, exp, err := parseNumericString(string(src)) - if err != nil { - return err - } - - *dst = Numeric{Int: num, Exp: exp, Status: Present} - return nil -} - -func parseNumericString(str string) (n *big.Int, exp int32, err error) { - parts := strings.SplitN(str, ".", 2) - digits := strings.Join(parts, "") - - if len(parts) > 1 { - exp = int32(-len(parts[1])) - } else { - for len(digits) > 1 && digits[len(digits)-1] == '0' && digits[len(digits)-2] != '-' { - digits = digits[:len(digits)-1] - exp++ - } - } - - accum := &big.Int{} - if _, ok := accum.SetString(digits, 10); !ok { - return nil, 0, fmt.Errorf("%s is not a number", str) - } - - return accum, exp, nil -} - -func (dst *Numeric) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Numeric{Status: Null} - return nil - } - - if len(src) < 8 { - return fmt.Errorf("numeric incomplete %v", src) - } - - rp := 0 - ndigits := binary.BigEndian.Uint16(src[rp:]) - rp += 2 - weight := int16(binary.BigEndian.Uint16(src[rp:])) - rp += 2 - sign := binary.BigEndian.Uint16(src[rp:]) - rp += 2 - dscale := int16(binary.BigEndian.Uint16(src[rp:])) - rp += 2 - - if sign == pgNumericNaNSign { - *dst = Numeric{Status: Present, NaN: true} - return nil - } else if sign == pgNumericPosInfSign { - *dst = Numeric{Status: Present, InfinityModifier: Infinity} - return nil - } else if sign == pgNumericNegInfSign { - *dst = Numeric{Status: Present, InfinityModifier: NegativeInfinity} - return nil - } - - if ndigits == 0 { - *dst = Numeric{Int: big.NewInt(0), Status: Present} - return nil - } - - if len(src[rp:]) < int(ndigits)*2 { - return fmt.Errorf("numeric incomplete %v", src) - } - - accum := &big.Int{} - - for i := 0; i < int(ndigits+3)/4; i++ { - int64accum, bytesRead, digitsRead := nbaseDigitsToInt64(src[rp:]) - rp += bytesRead - - if i > 0 { - var mul *big.Int - switch digitsRead { - case 1: - mul = bigNBase - case 2: - mul = bigNBaseX2 - case 3: - mul = bigNBaseX3 - case 4: - mul = bigNBaseX4 - default: - return fmt.Errorf("invalid digitsRead: %d (this can't happen)", digitsRead) - } - accum.Mul(accum, mul) - } - - accum.Add(accum, big.NewInt(int64accum)) - } - - exp := (int32(weight) - int32(ndigits) + 1) * 4 - - if dscale > 0 { - fracNBaseDigits := int16(int32(ndigits) - int32(weight) - 1) - fracDecimalDigits := fracNBaseDigits * 4 - - if dscale > fracDecimalDigits { - multCount := int(dscale - fracDecimalDigits) - for i := 0; i < multCount; i++ { - accum.Mul(accum, big10) - exp-- - } - } else if dscale < fracDecimalDigits { - divCount := int(fracDecimalDigits - dscale) - for i := 0; i < divCount; i++ { - accum.Div(accum, big10) - exp++ - } - } - } - - reduced := &big.Int{} - remainder := &big.Int{} - if exp >= 0 { - for { - reduced.DivMod(accum, big10, remainder) - if remainder.Cmp(big0) != 0 { - break - } - accum.Set(reduced) - exp++ - } - } - - if sign != 0 { - accum.Neg(accum) - } - - *dst = Numeric{Int: accum, Exp: exp, Status: Present} - - return nil - -} - -func nbaseDigitsToInt64(src []byte) (accum int64, bytesRead, digitsRead int) { - digits := len(src) / 2 - if digits > 4 { - digits = 4 - } - - rp := 0 - - for i := 0; i < digits; i++ { - if i > 0 { - accum *= nbase - } - accum += int64(binary.BigEndian.Uint16(src[rp:])) - rp += 2 - } - - return accum, rp, digits -} - -func (src Numeric) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if src.NaN { - buf = append(buf, "NaN"...) - return buf, nil - } else if src.InfinityModifier == Infinity { - buf = append(buf, "Infinity"...) - return buf, nil - } else if src.InfinityModifier == NegativeInfinity { - buf = append(buf, "-Infinity"...) - return buf, nil - } - - buf = append(buf, src.Int.String()...) - buf = append(buf, 'e') - buf = append(buf, strconv.FormatInt(int64(src.Exp), 10)...) - return buf, nil -} - -func (src Numeric) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if src.NaN { - buf = pgio.AppendUint64(buf, pgNumericNaN) - return buf, nil - } else if src.InfinityModifier == Infinity { - buf = pgio.AppendUint64(buf, pgNumericPosInf) - return buf, nil - } else if src.InfinityModifier == NegativeInfinity { - buf = pgio.AppendUint64(buf, pgNumericNegInf) - return buf, nil - } - - var sign int16 - if src.Int.Cmp(big0) < 0 { - sign = 16384 - } - - absInt := &big.Int{} - wholePart := &big.Int{} - fracPart := &big.Int{} - remainder := &big.Int{} - absInt.Abs(src.Int) - - // Normalize absInt and exp to where exp is always a multiple of 4. This makes - // converting to 16-bit base 10,000 digits easier. - var exp int32 - switch src.Exp % 4 { - case 1, -3: - exp = src.Exp - 1 - absInt.Mul(absInt, big10) - case 2, -2: - exp = src.Exp - 2 - absInt.Mul(absInt, big100) - case 3, -1: - exp = src.Exp - 3 - absInt.Mul(absInt, big1000) - default: - exp = src.Exp - } - - if exp < 0 { - divisor := &big.Int{} - divisor.Exp(big10, big.NewInt(int64(-exp)), nil) - wholePart.DivMod(absInt, divisor, fracPart) - fracPart.Add(fracPart, divisor) - } else { - wholePart = absInt - } - - var wholeDigits, fracDigits []int16 - - for wholePart.Cmp(big0) != 0 { - wholePart.DivMod(wholePart, bigNBase, remainder) - wholeDigits = append(wholeDigits, int16(remainder.Int64())) - } - - if fracPart.Cmp(big0) != 0 { - for fracPart.Cmp(big1) != 0 { - fracPart.DivMod(fracPart, bigNBase, remainder) - fracDigits = append(fracDigits, int16(remainder.Int64())) - } - } - - buf = pgio.AppendInt16(buf, int16(len(wholeDigits)+len(fracDigits))) - - var weight int16 - if len(wholeDigits) > 0 { - weight = int16(len(wholeDigits) - 1) - if exp > 0 { - weight += int16(exp / 4) - } - } else { - weight = int16(exp/4) - 1 + int16(len(fracDigits)) - } - buf = pgio.AppendInt16(buf, weight) - - buf = pgio.AppendInt16(buf, sign) - - var dscale int16 - if src.Exp < 0 { - dscale = int16(-src.Exp) - } - buf = pgio.AppendInt16(buf, dscale) - - for i := len(wholeDigits) - 1; i >= 0; i-- { - buf = pgio.AppendInt16(buf, wholeDigits[i]) - } - - for i := len(fracDigits) - 1; i >= 0; i-- { - buf = pgio.AppendInt16(buf, fracDigits[i]) - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Numeric) Scan(src interface{}) error { - if src == nil { - *dst = Numeric{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Numeric) Value() (driver.Value, error) { - switch src.Status { - case Present: - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - - return string(buf), nil - case Null: - return nil, nil - default: - return nil, errUndefined - } -} - -func encodeNumericText(n Numeric, buf []byte) (newBuf []byte, err error) { - // if !n.Valid { - // return nil, nil - // } - - if n.NaN { - buf = append(buf, "NaN"...) - return buf, nil - } else if n.InfinityModifier == Infinity { - buf = append(buf, "Infinity"...) - return buf, nil - } else if n.InfinityModifier == NegativeInfinity { - buf = append(buf, "-Infinity"...) - return buf, nil - } - - buf = append(buf, n.numberTextBytes()...) - - return buf, nil -} - -// numberString returns a string of the number. undefined if NaN, infinite, or NULL -func (n Numeric) numberTextBytes() []byte { - intStr := n.Int.String() - buf := &bytes.Buffer{} - exp := int(n.Exp) - if exp > 0 { - buf.WriteString(intStr) - for i := 0; i < exp; i++ { - buf.WriteByte('0') - } - } else if exp < 0 { - if len(intStr) <= -exp { - buf.WriteString("0.") - leadingZeros := -exp - len(intStr) - for i := 0; i < leadingZeros; i++ { - buf.WriteByte('0') - } - buf.WriteString(intStr) - } else if len(intStr) > -exp { - dpPos := len(intStr) + exp - buf.WriteString(intStr[:dpPos]) - buf.WriteByte('.') - buf.WriteString(intStr[dpPos:]) - } - } else { - buf.WriteString(intStr) - } - - return buf.Bytes() -} diff --git a/vendor/github.com/jackc/pgtype/numeric_array.go b/vendor/github.com/jackc/pgtype/numeric_array.go deleted file mode 100644 index 31899de..0000000 --- a/vendor/github.com/jackc/pgtype/numeric_array.go +++ /dev/null @@ -1,685 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "reflect" - - "github.com/jackc/pgio" -) - -type NumericArray struct { - Elements []Numeric - Dimensions []ArrayDimension - Status Status -} - -func (dst *NumericArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = NumericArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []float32: - if value == nil { - *dst = NumericArray{Status: Null} - } else if len(value) == 0 { - *dst = NumericArray{Status: Present} - } else { - elements := make([]Numeric, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = NumericArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*float32: - if value == nil { - *dst = NumericArray{Status: Null} - } else if len(value) == 0 { - *dst = NumericArray{Status: Present} - } else { - elements := make([]Numeric, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = NumericArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []float64: - if value == nil { - *dst = NumericArray{Status: Null} - } else if len(value) == 0 { - *dst = NumericArray{Status: Present} - } else { - elements := make([]Numeric, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = NumericArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*float64: - if value == nil { - *dst = NumericArray{Status: Null} - } else if len(value) == 0 { - *dst = NumericArray{Status: Present} - } else { - elements := make([]Numeric, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = NumericArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []int64: - if value == nil { - *dst = NumericArray{Status: Null} - } else if len(value) == 0 { - *dst = NumericArray{Status: Present} - } else { - elements := make([]Numeric, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = NumericArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*int64: - if value == nil { - *dst = NumericArray{Status: Null} - } else if len(value) == 0 { - *dst = NumericArray{Status: Present} - } else { - elements := make([]Numeric, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = NumericArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []uint64: - if value == nil { - *dst = NumericArray{Status: Null} - } else if len(value) == 0 { - *dst = NumericArray{Status: Present} - } else { - elements := make([]Numeric, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = NumericArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*uint64: - if value == nil { - *dst = NumericArray{Status: Null} - } else if len(value) == 0 { - *dst = NumericArray{Status: Present} - } else { - elements := make([]Numeric, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = NumericArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []Numeric: - if value == nil { - *dst = NumericArray{Status: Null} - } else if len(value) == 0 { - *dst = NumericArray{Status: Present} - } else { - *dst = NumericArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = NumericArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for NumericArray", src) - } - if elementsLength == 0 { - *dst = NumericArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to NumericArray", src) - } - - *dst = NumericArray{ - Elements: make([]Numeric, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]Numeric, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to NumericArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *NumericArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to NumericArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in NumericArray", err) - } - index++ - - return index, nil -} - -func (dst NumericArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *NumericArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]float32: - *v = make([]float32, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*float32: - *v = make([]*float32, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]float64: - *v = make([]float64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*float64: - *v = make([]*float64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]int64: - *v = make([]int64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*int64: - *v = make([]*int64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]uint64: - *v = make([]uint64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*uint64: - *v = make([]*uint64, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *NumericArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from NumericArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from NumericArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *NumericArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = NumericArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []Numeric - - if len(uta.Elements) > 0 { - elements = make([]Numeric, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem Numeric - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = NumericArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *NumericArray) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = NumericArray{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = NumericArray{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]Numeric, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = NumericArray{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src NumericArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src NumericArray) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("numeric"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "numeric") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *NumericArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src NumericArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/numrange.go b/vendor/github.com/jackc/pgtype/numrange.go deleted file mode 100644 index 3d5951a..0000000 --- a/vendor/github.com/jackc/pgtype/numrange.go +++ /dev/null @@ -1,267 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "fmt" - - "github.com/jackc/pgio" -) - -type Numrange struct { - Lower Numeric - Upper Numeric - LowerType BoundType - UpperType BoundType - Status Status -} - -func (dst *Numrange) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = Numrange{Status: Null} - return nil - } - - switch value := src.(type) { - case Numrange: - *dst = value - case *Numrange: - *dst = *value - case string: - return dst.DecodeText(nil, []byte(value)) - default: - return fmt.Errorf("cannot convert %v to Numrange", src) - } - - return nil -} - -func (dst Numrange) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Numrange) AssignTo(dst interface{}) error { - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *Numrange) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Numrange{Status: Null} - return nil - } - - utr, err := ParseUntypedTextRange(string(src)) - if err != nil { - return err - } - - *dst = Numrange{Status: Present} - - dst.LowerType = utr.LowerType - dst.UpperType = utr.UpperType - - if dst.LowerType == Empty { - return nil - } - - if dst.LowerType == Inclusive || dst.LowerType == Exclusive { - if err := dst.Lower.DecodeText(ci, []byte(utr.Lower)); err != nil { - return err - } - } - - if dst.UpperType == Inclusive || dst.UpperType == Exclusive { - if err := dst.Upper.DecodeText(ci, []byte(utr.Upper)); err != nil { - return err - } - } - - return nil -} - -func (dst *Numrange) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Numrange{Status: Null} - return nil - } - - ubr, err := ParseUntypedBinaryRange(src) - if err != nil { - return err - } - - *dst = Numrange{Status: Present} - - dst.LowerType = ubr.LowerType - dst.UpperType = ubr.UpperType - - if dst.LowerType == Empty { - return nil - } - - if dst.LowerType == Inclusive || dst.LowerType == Exclusive { - if err := dst.Lower.DecodeBinary(ci, ubr.Lower); err != nil { - return err - } - } - - if dst.UpperType == Inclusive || dst.UpperType == Exclusive { - if err := dst.Upper.DecodeBinary(ci, ubr.Upper); err != nil { - return err - } - } - - return nil -} - -func (src Numrange) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - switch src.LowerType { - case Exclusive, Unbounded: - buf = append(buf, '(') - case Inclusive: - buf = append(buf, '[') - case Empty: - return append(buf, "empty"...), nil - default: - return nil, fmt.Errorf("unknown lower bound type %v", src.LowerType) - } - - var err error - - if src.LowerType != Unbounded { - buf, err = src.Lower.EncodeText(ci, buf) - if err != nil { - return nil, err - } else if buf == nil { - return nil, fmt.Errorf("Lower cannot be null unless LowerType is Unbounded") - } - } - - buf = append(buf, ',') - - if src.UpperType != Unbounded { - buf, err = src.Upper.EncodeText(ci, buf) - if err != nil { - return nil, err - } else if buf == nil { - return nil, fmt.Errorf("Upper cannot be null unless UpperType is Unbounded") - } - } - - switch src.UpperType { - case Exclusive, Unbounded: - buf = append(buf, ')') - case Inclusive: - buf = append(buf, ']') - default: - return nil, fmt.Errorf("unknown upper bound type %v", src.UpperType) - } - - return buf, nil -} - -func (src Numrange) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - var rangeType byte - switch src.LowerType { - case Inclusive: - rangeType |= lowerInclusiveMask - case Unbounded: - rangeType |= lowerUnboundedMask - case Exclusive: - case Empty: - return append(buf, emptyMask), nil - default: - return nil, fmt.Errorf("unknown LowerType: %v", src.LowerType) - } - - switch src.UpperType { - case Inclusive: - rangeType |= upperInclusiveMask - case Unbounded: - rangeType |= upperUnboundedMask - case Exclusive: - default: - return nil, fmt.Errorf("unknown UpperType: %v", src.UpperType) - } - - buf = append(buf, rangeType) - - var err error - - if src.LowerType != Unbounded { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - buf, err = src.Lower.EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if buf == nil { - return nil, fmt.Errorf("Lower cannot be null unless LowerType is Unbounded") - } - - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - - if src.UpperType != Unbounded { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - buf, err = src.Upper.EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if buf == nil { - return nil, fmt.Errorf("Upper cannot be null unless UpperType is Unbounded") - } - - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Numrange) Scan(src interface{}) error { - if src == nil { - *dst = Numrange{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Numrange) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/oid.go b/vendor/github.com/jackc/pgtype/oid.go deleted file mode 100644 index 31677e8..0000000 --- a/vendor/github.com/jackc/pgtype/oid.go +++ /dev/null @@ -1,81 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "strconv" - - "github.com/jackc/pgio" -) - -// OID (Object Identifier Type) is, according to -// https://www.postgresql.org/docs/current/static/datatype-oid.html, used -// internally by PostgreSQL as a primary key for various system tables. It is -// currently implemented as an unsigned four-byte integer. Its definition can be -// found in src/include/postgres_ext.h in the PostgreSQL sources. Because it is -// so frequently required to be in a NOT NULL condition OID cannot be NULL. To -// allow for NULL OIDs use OIDValue. -type OID uint32 - -func (dst *OID) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - return fmt.Errorf("cannot decode nil into OID") - } - - n, err := strconv.ParseUint(string(src), 10, 32) - if err != nil { - return err - } - - *dst = OID(n) - return nil -} - -func (dst *OID) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - return fmt.Errorf("cannot decode nil into OID") - } - - if len(src) != 4 { - return fmt.Errorf("invalid length: %v", len(src)) - } - - n := binary.BigEndian.Uint32(src) - *dst = OID(n) - return nil -} - -func (src OID) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - return append(buf, strconv.FormatUint(uint64(src), 10)...), nil -} - -func (src OID) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - return pgio.AppendUint32(buf, uint32(src)), nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *OID) Scan(src interface{}) error { - if src == nil { - return fmt.Errorf("cannot scan NULL into %T", src) - } - - switch src := src.(type) { - case int64: - *dst = OID(src) - return nil - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src OID) Value() (driver.Value, error) { - return int64(src), nil -} diff --git a/vendor/github.com/jackc/pgtype/oid_value.go b/vendor/github.com/jackc/pgtype/oid_value.go deleted file mode 100644 index 5dc9136..0000000 --- a/vendor/github.com/jackc/pgtype/oid_value.go +++ /dev/null @@ -1,55 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" -) - -// OIDValue (Object Identifier Type) is, according to -// https://www.postgresql.org/docs/current/static/datatype-OIDValue.html, used -// internally by PostgreSQL as a primary key for various system tables. It is -// currently implemented as an unsigned four-byte integer. Its definition can be -// found in src/include/postgres_ext.h in the PostgreSQL sources. -type OIDValue pguint32 - -// Set converts from src to dst. Note that as OIDValue is not a general -// number type Set does not do automatic type conversion as other number -// types do. -func (dst *OIDValue) Set(src interface{}) error { - return (*pguint32)(dst).Set(src) -} - -func (dst OIDValue) Get() interface{} { - return (pguint32)(dst).Get() -} - -// AssignTo assigns from src to dst. Note that as OIDValue is not a general number -// type AssignTo does not do automatic type conversion as other number types do. -func (src *OIDValue) AssignTo(dst interface{}) error { - return (*pguint32)(src).AssignTo(dst) -} - -func (dst *OIDValue) DecodeText(ci *ConnInfo, src []byte) error { - return (*pguint32)(dst).DecodeText(ci, src) -} - -func (dst *OIDValue) DecodeBinary(ci *ConnInfo, src []byte) error { - return (*pguint32)(dst).DecodeBinary(ci, src) -} - -func (src OIDValue) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - return (pguint32)(src).EncodeText(ci, buf) -} - -func (src OIDValue) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - return (pguint32)(src).EncodeBinary(ci, buf) -} - -// Scan implements the database/sql Scanner interface. -func (dst *OIDValue) Scan(src interface{}) error { - return (*pguint32)(dst).Scan(src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src OIDValue) Value() (driver.Value, error) { - return (pguint32)(src).Value() -} diff --git a/vendor/github.com/jackc/pgtype/path.go b/vendor/github.com/jackc/pgtype/path.go deleted file mode 100644 index 9f89969..0000000 --- a/vendor/github.com/jackc/pgtype/path.go +++ /dev/null @@ -1,195 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "math" - "strconv" - "strings" - - "github.com/jackc/pgio" -) - -type Path struct { - P []Vec2 - Closed bool - Status Status -} - -func (dst *Path) Set(src interface{}) error { - return fmt.Errorf("cannot convert %v to Path", src) -} - -func (dst Path) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Path) AssignTo(dst interface{}) error { - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *Path) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Path{Status: Null} - return nil - } - - if len(src) < 7 { - return fmt.Errorf("invalid length for Path: %v", len(src)) - } - - closed := src[0] == '(' - points := make([]Vec2, 0) - - str := string(src[2:]) - - for { - end := strings.IndexByte(str, ',') - x, err := strconv.ParseFloat(str[:end], 64) - if err != nil { - return err - } - - str = str[end+1:] - end = strings.IndexByte(str, ')') - - y, err := strconv.ParseFloat(str[:end], 64) - if err != nil { - return err - } - - points = append(points, Vec2{x, y}) - - if end+3 < len(str) { - str = str[end+3:] - } else { - break - } - } - - *dst = Path{P: points, Closed: closed, Status: Present} - return nil -} - -func (dst *Path) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Path{Status: Null} - return nil - } - - if len(src) < 5 { - return fmt.Errorf("invalid length for Path: %v", len(src)) - } - - closed := src[0] == 1 - pointCount := int(binary.BigEndian.Uint32(src[1:])) - - rp := 5 - - if 5+pointCount*16 != len(src) { - return fmt.Errorf("invalid length for Path with %d points: %v", pointCount, len(src)) - } - - points := make([]Vec2, pointCount) - for i := 0; i < len(points); i++ { - x := binary.BigEndian.Uint64(src[rp:]) - rp += 8 - y := binary.BigEndian.Uint64(src[rp:]) - rp += 8 - points[i] = Vec2{math.Float64frombits(x), math.Float64frombits(y)} - } - - *dst = Path{ - P: points, - Closed: closed, - Status: Present, - } - return nil -} - -func (src Path) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - var startByte, endByte byte - if src.Closed { - startByte = '(' - endByte = ')' - } else { - startByte = '[' - endByte = ']' - } - buf = append(buf, startByte) - - for i, p := range src.P { - if i > 0 { - buf = append(buf, ',') - } - buf = append(buf, fmt.Sprintf(`(%s,%s)`, - strconv.FormatFloat(p.X, 'f', -1, 64), - strconv.FormatFloat(p.Y, 'f', -1, 64), - )...) - } - - return append(buf, endByte), nil -} - -func (src Path) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - var closeByte byte - if src.Closed { - closeByte = 1 - } - buf = append(buf, closeByte) - - buf = pgio.AppendInt32(buf, int32(len(src.P))) - - for _, p := range src.P { - buf = pgio.AppendUint64(buf, math.Float64bits(p.X)) - buf = pgio.AppendUint64(buf, math.Float64bits(p.Y)) - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Path) Scan(src interface{}) error { - if src == nil { - *dst = Path{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Path) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/pgtype.go b/vendor/github.com/jackc/pgtype/pgtype.go deleted file mode 100644 index a52740e..0000000 --- a/vendor/github.com/jackc/pgtype/pgtype.go +++ /dev/null @@ -1,1001 +0,0 @@ -package pgtype - -import ( - "database/sql" - "encoding/binary" - "errors" - "fmt" - "math" - "net" - "reflect" - "time" -) - -// PostgreSQL oids for common types -const ( - BoolOID = 16 - ByteaOID = 17 - QCharOID = 18 - NameOID = 19 - Int8OID = 20 - Int2OID = 21 - Int4OID = 23 - TextOID = 25 - OIDOID = 26 - TIDOID = 27 - XIDOID = 28 - CIDOID = 29 - JSONOID = 114 - JSONArrayOID = 199 - PointOID = 600 - LsegOID = 601 - PathOID = 602 - BoxOID = 603 - PolygonOID = 604 - LineOID = 628 - CIDROID = 650 - CIDRArrayOID = 651 - Float4OID = 700 - Float8OID = 701 - CircleOID = 718 - UnknownOID = 705 - MacaddrOID = 829 - InetOID = 869 - BoolArrayOID = 1000 - Int2ArrayOID = 1005 - Int4ArrayOID = 1007 - TextArrayOID = 1009 - ByteaArrayOID = 1001 - BPCharArrayOID = 1014 - VarcharArrayOID = 1015 - Int8ArrayOID = 1016 - Float4ArrayOID = 1021 - Float8ArrayOID = 1022 - ACLItemOID = 1033 - ACLItemArrayOID = 1034 - InetArrayOID = 1041 - BPCharOID = 1042 - VarcharOID = 1043 - DateOID = 1082 - TimeOID = 1083 - TimestampOID = 1114 - TimestampArrayOID = 1115 - DateArrayOID = 1182 - TimestamptzOID = 1184 - TimestamptzArrayOID = 1185 - IntervalOID = 1186 - NumericArrayOID = 1231 - BitOID = 1560 - VarbitOID = 1562 - NumericOID = 1700 - RecordOID = 2249 - UUIDOID = 2950 - UUIDArrayOID = 2951 - JSONBOID = 3802 - JSONBArrayOID = 3807 - DaterangeOID = 3912 - Int4rangeOID = 3904 - Int4multirangeOID = 4451 - NumrangeOID = 3906 - NummultirangeOID = 4532 - TsrangeOID = 3908 - TsrangeArrayOID = 3909 - TstzrangeOID = 3910 - TstzrangeArrayOID = 3911 - Int8rangeOID = 3926 - Int8multirangeOID = 4536 -) - -type Status byte - -const ( - Undefined Status = iota - Null - Present -) - -type InfinityModifier int8 - -const ( - Infinity InfinityModifier = 1 - None InfinityModifier = 0 - NegativeInfinity InfinityModifier = -Infinity -) - -func (im InfinityModifier) String() string { - switch im { - case None: - return "none" - case Infinity: - return "infinity" - case NegativeInfinity: - return "-infinity" - default: - return "invalid" - } -} - -// PostgreSQL format codes -const ( - TextFormatCode = 0 - BinaryFormatCode = 1 -) - -// Value translates values to and from an internal canonical representation for the type. To actually be usable a type -// that implements Value should also implement some combination of BinaryDecoder, BinaryEncoder, TextDecoder, -// and TextEncoder. -// -// Operations that update a Value (e.g. Set, DecodeText, DecodeBinary) should entirely replace the value. e.g. Internal -// slices should be replaced not resized and reused. This allows Get and AssignTo to return a slice directly rather -// than incur a usually unnecessary copy. -type Value interface { - // Set converts and assigns src to itself. Value takes ownership of src. - Set(src interface{}) error - - // Get returns the simplest representation of Value. Get may return a pointer to an internal value but it must never - // mutate that value. e.g. If Get returns a []byte Value must never change the contents of the []byte. - Get() interface{} - - // AssignTo converts and assigns the Value to dst. AssignTo may a pointer to an internal value but it must never - // mutate that value. e.g. If Get returns a []byte Value must never change the contents of the []byte. - AssignTo(dst interface{}) error -} - -// TypeValue is a Value where instances can represent different PostgreSQL types. This can be useful for -// representing types such as enums, composites, and arrays. -// -// In general, instances of TypeValue should not be used to directly represent a value. It should only be used as an -// encoder and decoder internal to ConnInfo. -type TypeValue interface { - Value - - // NewTypeValue creates a TypeValue including references to internal type information. e.g. the list of members - // in an EnumType. - NewTypeValue() Value - - // TypeName returns the PostgreSQL name of this type. - TypeName() string -} - -// ValueTranscoder is a value that implements the text and binary encoding and decoding interfaces. -type ValueTranscoder interface { - Value - TextEncoder - BinaryEncoder - TextDecoder - BinaryDecoder -} - -// ResultFormatPreferrer allows a type to specify its preferred result format instead of it being inferred from -// whether it is also a BinaryDecoder. -type ResultFormatPreferrer interface { - PreferredResultFormat() int16 -} - -// ParamFormatPreferrer allows a type to specify its preferred param format instead of it being inferred from -// whether it is also a BinaryEncoder. -type ParamFormatPreferrer interface { - PreferredParamFormat() int16 -} - -type BinaryDecoder interface { - // DecodeBinary decodes src into BinaryDecoder. If src is nil then the - // original SQL value is NULL. BinaryDecoder takes ownership of src. The - // caller MUST not use it again. - DecodeBinary(ci *ConnInfo, src []byte) error -} - -type TextDecoder interface { - // DecodeText decodes src into TextDecoder. If src is nil then the original - // SQL value is NULL. TextDecoder takes ownership of src. The caller MUST not - // use it again. - DecodeText(ci *ConnInfo, src []byte) error -} - -// BinaryEncoder is implemented by types that can encode themselves into the -// PostgreSQL binary wire format. -type BinaryEncoder interface { - // EncodeBinary should append the binary format of self to buf. If self is the - // SQL value NULL then append nothing and return (nil, nil). The caller of - // EncodeBinary is responsible for writing the correct NULL value or the - // length of the data written. - EncodeBinary(ci *ConnInfo, buf []byte) (newBuf []byte, err error) -} - -// TextEncoder is implemented by types that can encode themselves into the -// PostgreSQL text wire format. -type TextEncoder interface { - // EncodeText should append the text format of self to buf. If self is the - // SQL value NULL then append nothing and return (nil, nil). The caller of - // EncodeText is responsible for writing the correct NULL value or the - // length of the data written. - EncodeText(ci *ConnInfo, buf []byte) (newBuf []byte, err error) -} - -var errUndefined = errors.New("cannot encode status undefined") -var errBadStatus = errors.New("invalid status") - -type nullAssignmentError struct { - dst interface{} -} - -func (e *nullAssignmentError) Error() string { - return fmt.Sprintf("cannot assign NULL to %T", e.dst) -} - -type DataType struct { - Value Value - - textDecoder TextDecoder - binaryDecoder BinaryDecoder - - Name string - OID uint32 -} - -type ConnInfo struct { - oidToDataType map[uint32]*DataType - nameToDataType map[string]*DataType - reflectTypeToName map[reflect.Type]string - oidToParamFormatCode map[uint32]int16 - oidToResultFormatCode map[uint32]int16 - - reflectTypeToDataType map[reflect.Type]*DataType -} - -func newConnInfo() *ConnInfo { - return &ConnInfo{ - oidToDataType: make(map[uint32]*DataType), - nameToDataType: make(map[string]*DataType), - reflectTypeToName: make(map[reflect.Type]string), - oidToParamFormatCode: make(map[uint32]int16), - oidToResultFormatCode: make(map[uint32]int16), - } -} - -func NewConnInfo() *ConnInfo { - ci := newConnInfo() - - ci.RegisterDataType(DataType{Value: &ACLItemArray{}, Name: "_aclitem", OID: ACLItemArrayOID}) - ci.RegisterDataType(DataType{Value: &BoolArray{}, Name: "_bool", OID: BoolArrayOID}) - ci.RegisterDataType(DataType{Value: &BPCharArray{}, Name: "_bpchar", OID: BPCharArrayOID}) - ci.RegisterDataType(DataType{Value: &ByteaArray{}, Name: "_bytea", OID: ByteaArrayOID}) - ci.RegisterDataType(DataType{Value: &CIDRArray{}, Name: "_cidr", OID: CIDRArrayOID}) - ci.RegisterDataType(DataType{Value: &DateArray{}, Name: "_date", OID: DateArrayOID}) - ci.RegisterDataType(DataType{Value: &Float4Array{}, Name: "_float4", OID: Float4ArrayOID}) - ci.RegisterDataType(DataType{Value: &Float8Array{}, Name: "_float8", OID: Float8ArrayOID}) - ci.RegisterDataType(DataType{Value: &InetArray{}, Name: "_inet", OID: InetArrayOID}) - ci.RegisterDataType(DataType{Value: &Int2Array{}, Name: "_int2", OID: Int2ArrayOID}) - ci.RegisterDataType(DataType{Value: &Int4Array{}, Name: "_int4", OID: Int4ArrayOID}) - ci.RegisterDataType(DataType{Value: &Int8Array{}, Name: "_int8", OID: Int8ArrayOID}) - ci.RegisterDataType(DataType{Value: &NumericArray{}, Name: "_numeric", OID: NumericArrayOID}) - ci.RegisterDataType(DataType{Value: &TextArray{}, Name: "_text", OID: TextArrayOID}) - ci.RegisterDataType(DataType{Value: &TimestampArray{}, Name: "_timestamp", OID: TimestampArrayOID}) - ci.RegisterDataType(DataType{Value: &TimestamptzArray{}, Name: "_timestamptz", OID: TimestamptzArrayOID}) - ci.RegisterDataType(DataType{Value: &UUIDArray{}, Name: "_uuid", OID: UUIDArrayOID}) - ci.RegisterDataType(DataType{Value: &VarcharArray{}, Name: "_varchar", OID: VarcharArrayOID}) - ci.RegisterDataType(DataType{Value: &ACLItem{}, Name: "aclitem", OID: ACLItemOID}) - ci.RegisterDataType(DataType{Value: &Bit{}, Name: "bit", OID: BitOID}) - ci.RegisterDataType(DataType{Value: &Bool{}, Name: "bool", OID: BoolOID}) - ci.RegisterDataType(DataType{Value: &Box{}, Name: "box", OID: BoxOID}) - ci.RegisterDataType(DataType{Value: &BPChar{}, Name: "bpchar", OID: BPCharOID}) - ci.RegisterDataType(DataType{Value: &Bytea{}, Name: "bytea", OID: ByteaOID}) - ci.RegisterDataType(DataType{Value: &QChar{}, Name: "char", OID: QCharOID}) - ci.RegisterDataType(DataType{Value: &CID{}, Name: "cid", OID: CIDOID}) - ci.RegisterDataType(DataType{Value: &CIDR{}, Name: "cidr", OID: CIDROID}) - ci.RegisterDataType(DataType{Value: &Circle{}, Name: "circle", OID: CircleOID}) - ci.RegisterDataType(DataType{Value: &Date{}, Name: "date", OID: DateOID}) - ci.RegisterDataType(DataType{Value: &Daterange{}, Name: "daterange", OID: DaterangeOID}) - ci.RegisterDataType(DataType{Value: &Float4{}, Name: "float4", OID: Float4OID}) - ci.RegisterDataType(DataType{Value: &Float8{}, Name: "float8", OID: Float8OID}) - ci.RegisterDataType(DataType{Value: &Inet{}, Name: "inet", OID: InetOID}) - ci.RegisterDataType(DataType{Value: &Int2{}, Name: "int2", OID: Int2OID}) - ci.RegisterDataType(DataType{Value: &Int4{}, Name: "int4", OID: Int4OID}) - ci.RegisterDataType(DataType{Value: &Int4range{}, Name: "int4range", OID: Int4rangeOID}) - ci.RegisterDataType(DataType{Value: &Int4multirange{}, Name: "int4multirange", OID: Int4multirangeOID}) - ci.RegisterDataType(DataType{Value: &Int8{}, Name: "int8", OID: Int8OID}) - ci.RegisterDataType(DataType{Value: &Int8range{}, Name: "int8range", OID: Int8rangeOID}) - ci.RegisterDataType(DataType{Value: &Int8multirange{}, Name: "int8multirange", OID: Int8multirangeOID}) - ci.RegisterDataType(DataType{Value: &Interval{}, Name: "interval", OID: IntervalOID}) - ci.RegisterDataType(DataType{Value: &JSON{}, Name: "json", OID: JSONOID}) - ci.RegisterDataType(DataType{Value: &JSONArray{}, Name: "_json", OID: JSONArrayOID}) - ci.RegisterDataType(DataType{Value: &JSONB{}, Name: "jsonb", OID: JSONBOID}) - ci.RegisterDataType(DataType{Value: &JSONBArray{}, Name: "_jsonb", OID: JSONBArrayOID}) - ci.RegisterDataType(DataType{Value: &Line{}, Name: "line", OID: LineOID}) - ci.RegisterDataType(DataType{Value: &Lseg{}, Name: "lseg", OID: LsegOID}) - ci.RegisterDataType(DataType{Value: &Macaddr{}, Name: "macaddr", OID: MacaddrOID}) - ci.RegisterDataType(DataType{Value: &Name{}, Name: "name", OID: NameOID}) - ci.RegisterDataType(DataType{Value: &Numeric{}, Name: "numeric", OID: NumericOID}) - ci.RegisterDataType(DataType{Value: &Numrange{}, Name: "numrange", OID: NumrangeOID}) - ci.RegisterDataType(DataType{Value: &Nummultirange{}, Name: "nummultirange", OID: NummultirangeOID}) - ci.RegisterDataType(DataType{Value: &OIDValue{}, Name: "oid", OID: OIDOID}) - ci.RegisterDataType(DataType{Value: &Path{}, Name: "path", OID: PathOID}) - ci.RegisterDataType(DataType{Value: &Point{}, Name: "point", OID: PointOID}) - ci.RegisterDataType(DataType{Value: &Polygon{}, Name: "polygon", OID: PolygonOID}) - ci.RegisterDataType(DataType{Value: &Record{}, Name: "record", OID: RecordOID}) - ci.RegisterDataType(DataType{Value: &Text{}, Name: "text", OID: TextOID}) - ci.RegisterDataType(DataType{Value: &TID{}, Name: "tid", OID: TIDOID}) - ci.RegisterDataType(DataType{Value: &Time{}, Name: "time", OID: TimeOID}) - ci.RegisterDataType(DataType{Value: &Timestamp{}, Name: "timestamp", OID: TimestampOID}) - ci.RegisterDataType(DataType{Value: &Timestamptz{}, Name: "timestamptz", OID: TimestamptzOID}) - ci.RegisterDataType(DataType{Value: &Tsrange{}, Name: "tsrange", OID: TsrangeOID}) - ci.RegisterDataType(DataType{Value: &TsrangeArray{}, Name: "_tsrange", OID: TsrangeArrayOID}) - ci.RegisterDataType(DataType{Value: &Tstzrange{}, Name: "tstzrange", OID: TstzrangeOID}) - ci.RegisterDataType(DataType{Value: &TstzrangeArray{}, Name: "_tstzrange", OID: TstzrangeArrayOID}) - ci.RegisterDataType(DataType{Value: &Unknown{}, Name: "unknown", OID: UnknownOID}) - ci.RegisterDataType(DataType{Value: &UUID{}, Name: "uuid", OID: UUIDOID}) - ci.RegisterDataType(DataType{Value: &Varbit{}, Name: "varbit", OID: VarbitOID}) - ci.RegisterDataType(DataType{Value: &Varchar{}, Name: "varchar", OID: VarcharOID}) - ci.RegisterDataType(DataType{Value: &XID{}, Name: "xid", OID: XIDOID}) - - registerDefaultPgTypeVariants := func(name, arrayName string, value interface{}) { - ci.RegisterDefaultPgType(value, name) - valueType := reflect.TypeOf(value) - - ci.RegisterDefaultPgType(reflect.New(valueType).Interface(), name) - - sliceType := reflect.SliceOf(valueType) - ci.RegisterDefaultPgType(reflect.MakeSlice(sliceType, 0, 0).Interface(), arrayName) - - ci.RegisterDefaultPgType(reflect.New(sliceType).Interface(), arrayName) - } - - // Integer types that directly map to a PostgreSQL type - registerDefaultPgTypeVariants("int2", "_int2", int16(0)) - registerDefaultPgTypeVariants("int4", "_int4", int32(0)) - registerDefaultPgTypeVariants("int8", "_int8", int64(0)) - - // Integer types that do not have a direct match to a PostgreSQL type - registerDefaultPgTypeVariants("int8", "_int8", uint16(0)) - registerDefaultPgTypeVariants("int8", "_int8", uint32(0)) - registerDefaultPgTypeVariants("int8", "_int8", uint64(0)) - registerDefaultPgTypeVariants("int8", "_int8", int(0)) - registerDefaultPgTypeVariants("int8", "_int8", uint(0)) - - registerDefaultPgTypeVariants("float4", "_float4", float32(0)) - registerDefaultPgTypeVariants("float8", "_float8", float64(0)) - - registerDefaultPgTypeVariants("bool", "_bool", false) - registerDefaultPgTypeVariants("timestamptz", "_timestamptz", time.Time{}) - registerDefaultPgTypeVariants("text", "_text", "") - registerDefaultPgTypeVariants("bytea", "_bytea", []byte(nil)) - - registerDefaultPgTypeVariants("inet", "_inet", net.IP{}) - ci.RegisterDefaultPgType((*net.IPNet)(nil), "cidr") - ci.RegisterDefaultPgType([]*net.IPNet(nil), "_cidr") - - return ci -} - -func (ci *ConnInfo) InitializeDataTypes(nameOIDs map[string]uint32) { - for name, oid := range nameOIDs { - var value Value - if t, ok := nameValues[name]; ok { - value = reflect.New(reflect.ValueOf(t).Elem().Type()).Interface().(Value) - } else { - value = &GenericText{} - } - ci.RegisterDataType(DataType{Value: value, Name: name, OID: oid}) - } -} - -func (ci *ConnInfo) RegisterDataType(t DataType) { - t.Value = NewValue(t.Value) - - ci.oidToDataType[t.OID] = &t - ci.nameToDataType[t.Name] = &t - - { - var formatCode int16 - if pfp, ok := t.Value.(ParamFormatPreferrer); ok { - formatCode = pfp.PreferredParamFormat() - } else if _, ok := t.Value.(BinaryEncoder); ok { - formatCode = BinaryFormatCode - } - ci.oidToParamFormatCode[t.OID] = formatCode - } - - { - var formatCode int16 - if rfp, ok := t.Value.(ResultFormatPreferrer); ok { - formatCode = rfp.PreferredResultFormat() - } else if _, ok := t.Value.(BinaryDecoder); ok { - formatCode = BinaryFormatCode - } - ci.oidToResultFormatCode[t.OID] = formatCode - } - - if d, ok := t.Value.(TextDecoder); ok { - t.textDecoder = d - } - - if d, ok := t.Value.(BinaryDecoder); ok { - t.binaryDecoder = d - } - - ci.reflectTypeToDataType = nil // Invalidated by type registration -} - -// RegisterDefaultPgType registers a mapping of a Go type to a PostgreSQL type name. Typically the data type to be -// encoded or decoded is determined by the PostgreSQL OID. But if the OID of a value to be encoded or decoded is -// unknown, this additional mapping will be used by DataTypeForValue to determine a suitable data type. -func (ci *ConnInfo) RegisterDefaultPgType(value interface{}, name string) { - ci.reflectTypeToName[reflect.TypeOf(value)] = name - ci.reflectTypeToDataType = nil // Invalidated by registering a default type -} - -func (ci *ConnInfo) DataTypeForOID(oid uint32) (*DataType, bool) { - dt, ok := ci.oidToDataType[oid] - return dt, ok -} - -func (ci *ConnInfo) DataTypeForName(name string) (*DataType, bool) { - dt, ok := ci.nameToDataType[name] - return dt, ok -} - -func (ci *ConnInfo) buildReflectTypeToDataType() { - ci.reflectTypeToDataType = make(map[reflect.Type]*DataType) - - for _, dt := range ci.oidToDataType { - if _, is := dt.Value.(TypeValue); !is { - ci.reflectTypeToDataType[reflect.ValueOf(dt.Value).Type()] = dt - } - } - - for reflectType, name := range ci.reflectTypeToName { - if dt, ok := ci.nameToDataType[name]; ok { - ci.reflectTypeToDataType[reflectType] = dt - } - } -} - -// DataTypeForValue finds a data type suitable for v. Use RegisterDataType to register types that can encode and decode -// themselves. Use RegisterDefaultPgType to register that can be handled by a registered data type. -func (ci *ConnInfo) DataTypeForValue(v interface{}) (*DataType, bool) { - if ci.reflectTypeToDataType == nil { - ci.buildReflectTypeToDataType() - } - - if tv, ok := v.(TypeValue); ok { - dt, ok := ci.nameToDataType[tv.TypeName()] - return dt, ok - } - - dt, ok := ci.reflectTypeToDataType[reflect.TypeOf(v)] - return dt, ok -} - -func (ci *ConnInfo) ParamFormatCodeForOID(oid uint32) int16 { - fc, ok := ci.oidToParamFormatCode[oid] - if ok { - return fc - } - return TextFormatCode -} - -func (ci *ConnInfo) ResultFormatCodeForOID(oid uint32) int16 { - fc, ok := ci.oidToResultFormatCode[oid] - if ok { - return fc - } - return TextFormatCode -} - -// DeepCopy makes a deep copy of the ConnInfo. -func (ci *ConnInfo) DeepCopy() *ConnInfo { - ci2 := newConnInfo() - - for _, dt := range ci.oidToDataType { - ci2.RegisterDataType(DataType{ - Value: NewValue(dt.Value), - Name: dt.Name, - OID: dt.OID, - }) - } - - for t, n := range ci.reflectTypeToName { - ci2.reflectTypeToName[t] = n - } - - return ci2 -} - -// ScanPlan is a precompiled plan to scan into a type of destination. -type ScanPlan interface { - // Scan scans src into dst. If the dst type has changed in an incompatible way a ScanPlan should automatically - // replan and scan. - Scan(ci *ConnInfo, oid uint32, formatCode int16, src []byte, dst interface{}) error -} - -type scanPlanDstBinaryDecoder struct{} - -func (scanPlanDstBinaryDecoder) Scan(ci *ConnInfo, oid uint32, formatCode int16, src []byte, dst interface{}) error { - if d, ok := (dst).(BinaryDecoder); ok { - return d.DecodeBinary(ci, src) - } - - newPlan := ci.PlanScan(oid, formatCode, dst) - return newPlan.Scan(ci, oid, formatCode, src, dst) -} - -type scanPlanDstTextDecoder struct{} - -func (plan scanPlanDstTextDecoder) Scan(ci *ConnInfo, oid uint32, formatCode int16, src []byte, dst interface{}) error { - if d, ok := (dst).(TextDecoder); ok { - return d.DecodeText(ci, src) - } - - newPlan := ci.PlanScan(oid, formatCode, dst) - return newPlan.Scan(ci, oid, formatCode, src, dst) -} - -type scanPlanDataTypeSQLScanner DataType - -func (plan *scanPlanDataTypeSQLScanner) Scan(ci *ConnInfo, oid uint32, formatCode int16, src []byte, dst interface{}) error { - scanner, ok := dst.(sql.Scanner) - if !ok { - dv := reflect.ValueOf(dst) - if dv.Kind() != reflect.Ptr || !dv.Type().Elem().Implements(scannerType) { - newPlan := ci.PlanScan(oid, formatCode, dst) - return newPlan.Scan(ci, oid, formatCode, src, dst) - } - if src == nil { - // Ensure the pointer points to a zero version of the value - dv.Elem().Set(reflect.Zero(dv.Type().Elem())) - return nil - } - dv = dv.Elem() - // If the pointer is to a nil pointer then set that before scanning - if dv.Kind() == reflect.Ptr && dv.IsNil() { - dv.Set(reflect.New(dv.Type().Elem())) - } - scanner = dv.Interface().(sql.Scanner) - } - - dt := (*DataType)(plan) - var err error - switch formatCode { - case BinaryFormatCode: - err = dt.binaryDecoder.DecodeBinary(ci, src) - case TextFormatCode: - err = dt.textDecoder.DecodeText(ci, src) - } - if err != nil { - return err - } - - sqlSrc, err := DatabaseSQLValue(ci, dt.Value) - if err != nil { - return err - } - return scanner.Scan(sqlSrc) -} - -type scanPlanDataTypeAssignTo DataType - -func (plan *scanPlanDataTypeAssignTo) Scan(ci *ConnInfo, oid uint32, formatCode int16, src []byte, dst interface{}) error { - dt := (*DataType)(plan) - var err error - switch formatCode { - case BinaryFormatCode: - err = dt.binaryDecoder.DecodeBinary(ci, src) - case TextFormatCode: - err = dt.textDecoder.DecodeText(ci, src) - } - if err != nil { - return err - } - - assignToErr := dt.Value.AssignTo(dst) - if assignToErr == nil { - return nil - } - - if dstPtr, ok := dst.(*interface{}); ok { - *dstPtr = dt.Value.Get() - return nil - } - - // assignToErr might have failed because the type of destination has changed - newPlan := ci.PlanScan(oid, formatCode, dst) - if newPlan, sameType := newPlan.(*scanPlanDataTypeAssignTo); !sameType { - return newPlan.Scan(ci, oid, formatCode, src, dst) - } - - return assignToErr -} - -type scanPlanSQLScanner struct{} - -func (scanPlanSQLScanner) Scan(ci *ConnInfo, oid uint32, formatCode int16, src []byte, dst interface{}) error { - scanner, ok := dst.(sql.Scanner) - if !ok { - dv := reflect.ValueOf(dst) - if dv.Kind() != reflect.Ptr || !dv.Type().Elem().Implements(scannerType) { - newPlan := ci.PlanScan(oid, formatCode, dst) - return newPlan.Scan(ci, oid, formatCode, src, dst) - } - if src == nil { - // Ensure the pointer points to a zero version of the value - dv.Elem().Set(reflect.Zero(dv.Elem().Type())) - return nil - } - dv = dv.Elem() - // If the pointer is to a nil pointer then set that before scanning - if dv.Kind() == reflect.Ptr && dv.IsNil() { - dv.Set(reflect.New(dv.Type().Elem())) - } - scanner = dv.Interface().(sql.Scanner) - } - if src == nil { - // This is necessary because interface value []byte:nil does not equal nil:nil for the binary format path and the - // text format path would be converted to empty string. - return scanner.Scan(nil) - } else if formatCode == BinaryFormatCode { - return scanner.Scan(src) - } else { - return scanner.Scan(string(src)) - } -} - -type scanPlanReflection struct{} - -func (scanPlanReflection) Scan(ci *ConnInfo, oid uint32, formatCode int16, src []byte, dst interface{}) error { - // We might be given a pointer to something that implements the decoder interface(s), - // even though the pointer itself doesn't. - refVal := reflect.ValueOf(dst) - if refVal.Kind() == reflect.Ptr && refVal.Type().Elem().Kind() == reflect.Ptr { - // If the database returned NULL, then we set dest as nil to indicate that. - if src == nil { - nilPtr := reflect.Zero(refVal.Type().Elem()) - refVal.Elem().Set(nilPtr) - return nil - } - - // We need to allocate an element, and set the destination to it - // Then we can retry as that element. - elemPtr := reflect.New(refVal.Type().Elem().Elem()) - refVal.Elem().Set(elemPtr) - - plan := ci.PlanScan(oid, formatCode, elemPtr.Interface()) - return plan.Scan(ci, oid, formatCode, src, elemPtr.Interface()) - } - - return scanUnknownType(oid, formatCode, src, dst) -} - -type scanPlanBinaryInt16 struct{} - -func (scanPlanBinaryInt16) Scan(ci *ConnInfo, oid uint32, formatCode int16, src []byte, dst interface{}) error { - if src == nil { - return fmt.Errorf("cannot scan null into %T", dst) - } - - if len(src) != 2 { - return fmt.Errorf("invalid length for int2: %v", len(src)) - } - - if p, ok := (dst).(*int16); ok { - *p = int16(binary.BigEndian.Uint16(src)) - return nil - } - - newPlan := ci.PlanScan(oid, formatCode, dst) - return newPlan.Scan(ci, oid, formatCode, src, dst) -} - -type scanPlanBinaryInt32 struct{} - -func (scanPlanBinaryInt32) Scan(ci *ConnInfo, oid uint32, formatCode int16, src []byte, dst interface{}) error { - if src == nil { - return fmt.Errorf("cannot scan null into %T", dst) - } - - if len(src) != 4 { - return fmt.Errorf("invalid length for int4: %v", len(src)) - } - - if p, ok := (dst).(*int32); ok { - *p = int32(binary.BigEndian.Uint32(src)) - return nil - } - - newPlan := ci.PlanScan(oid, formatCode, dst) - return newPlan.Scan(ci, oid, formatCode, src, dst) -} - -type scanPlanBinaryInt64 struct{} - -func (scanPlanBinaryInt64) Scan(ci *ConnInfo, oid uint32, formatCode int16, src []byte, dst interface{}) error { - if src == nil { - return fmt.Errorf("cannot scan null into %T", dst) - } - - if len(src) != 8 { - return fmt.Errorf("invalid length for int8: %v", len(src)) - } - - if p, ok := (dst).(*int64); ok { - *p = int64(binary.BigEndian.Uint64(src)) - return nil - } - - newPlan := ci.PlanScan(oid, formatCode, dst) - return newPlan.Scan(ci, oid, formatCode, src, dst) -} - -type scanPlanBinaryFloat32 struct{} - -func (scanPlanBinaryFloat32) Scan(ci *ConnInfo, oid uint32, formatCode int16, src []byte, dst interface{}) error { - if src == nil { - return fmt.Errorf("cannot scan null into %T", dst) - } - - if len(src) != 4 { - return fmt.Errorf("invalid length for int4: %v", len(src)) - } - - if p, ok := (dst).(*float32); ok { - n := int32(binary.BigEndian.Uint32(src)) - *p = float32(math.Float32frombits(uint32(n))) - return nil - } - - newPlan := ci.PlanScan(oid, formatCode, dst) - return newPlan.Scan(ci, oid, formatCode, src, dst) -} - -type scanPlanBinaryFloat64 struct{} - -func (scanPlanBinaryFloat64) Scan(ci *ConnInfo, oid uint32, formatCode int16, src []byte, dst interface{}) error { - if src == nil { - return fmt.Errorf("cannot scan null into %T", dst) - } - - if len(src) != 8 { - return fmt.Errorf("invalid length for int8: %v", len(src)) - } - - if p, ok := (dst).(*float64); ok { - n := int64(binary.BigEndian.Uint64(src)) - *p = float64(math.Float64frombits(uint64(n))) - return nil - } - - newPlan := ci.PlanScan(oid, formatCode, dst) - return newPlan.Scan(ci, oid, formatCode, src, dst) -} - -type scanPlanBinaryBytes struct{} - -func (scanPlanBinaryBytes) Scan(ci *ConnInfo, oid uint32, formatCode int16, src []byte, dst interface{}) error { - if p, ok := (dst).(*[]byte); ok { - *p = src - return nil - } - - newPlan := ci.PlanScan(oid, formatCode, dst) - return newPlan.Scan(ci, oid, formatCode, src, dst) -} - -type scanPlanString struct{} - -func (scanPlanString) Scan(ci *ConnInfo, oid uint32, formatCode int16, src []byte, dst interface{}) error { - if src == nil { - return fmt.Errorf("cannot scan null into %T", dst) - } - - if p, ok := (dst).(*string); ok { - *p = string(src) - return nil - } - - newPlan := ci.PlanScan(oid, formatCode, dst) - return newPlan.Scan(ci, oid, formatCode, src, dst) -} - -var scannerType = reflect.TypeOf((*sql.Scanner)(nil)).Elem() - -func isScanner(dst interface{}) bool { - if _, ok := dst.(sql.Scanner); ok { - return true - } - if t := reflect.TypeOf(dst); t != nil && t.Kind() == reflect.Ptr && t.Elem().Implements(scannerType) { - return true - } - return false -} - -// PlanScan prepares a plan to scan a value into dst. -func (ci *ConnInfo) PlanScan(oid uint32, formatCode int16, dst interface{}) ScanPlan { - switch formatCode { - case BinaryFormatCode: - switch dst.(type) { - case *string: - switch oid { - case TextOID, VarcharOID: - return scanPlanString{} - } - case *int16: - if oid == Int2OID { - return scanPlanBinaryInt16{} - } - case *int32: - if oid == Int4OID { - return scanPlanBinaryInt32{} - } - case *int64: - if oid == Int8OID { - return scanPlanBinaryInt64{} - } - case *float32: - if oid == Float4OID { - return scanPlanBinaryFloat32{} - } - case *float64: - if oid == Float8OID { - return scanPlanBinaryFloat64{} - } - case *[]byte: - switch oid { - case ByteaOID, TextOID, VarcharOID, JSONOID: - return scanPlanBinaryBytes{} - } - case BinaryDecoder: - return scanPlanDstBinaryDecoder{} - } - case TextFormatCode: - switch dst.(type) { - case *string: - return scanPlanString{} - case *[]byte: - if oid != ByteaOID { - return scanPlanBinaryBytes{} - } - case TextDecoder: - return scanPlanDstTextDecoder{} - } - } - - var dt *DataType - - if oid == 0 { - if dataType, ok := ci.DataTypeForValue(dst); ok { - dt = dataType - } - } else { - if dataType, ok := ci.DataTypeForOID(oid); ok { - dt = dataType - } - } - - if dt != nil { - if isScanner(dst) { - return (*scanPlanDataTypeSQLScanner)(dt) - } - return (*scanPlanDataTypeAssignTo)(dt) - } - - if isScanner(dst) { - return scanPlanSQLScanner{} - } - - return scanPlanReflection{} -} - -func (ci *ConnInfo) Scan(oid uint32, formatCode int16, src []byte, dst interface{}) error { - if dst == nil { - return nil - } - - plan := ci.PlanScan(oid, formatCode, dst) - return plan.Scan(ci, oid, formatCode, src, dst) -} - -func scanUnknownType(oid uint32, formatCode int16, buf []byte, dest interface{}) error { - switch dest := dest.(type) { - case *string: - if formatCode == BinaryFormatCode { - return fmt.Errorf("unknown oid %d in binary format cannot be scanned into %T", oid, dest) - } - *dest = string(buf) - return nil - case *[]byte: - *dest = buf - return nil - default: - if nextDst, retry := GetAssignToDstType(dest); retry { - return scanUnknownType(oid, formatCode, buf, nextDst) - } - return fmt.Errorf("unknown oid %d cannot be scanned into %T", oid, dest) - } -} - -// NewValue returns a new instance of the same type as v. -func NewValue(v Value) Value { - if tv, ok := v.(TypeValue); ok { - return tv.NewTypeValue() - } else { - return reflect.New(reflect.ValueOf(v).Elem().Type()).Interface().(Value) - } -} - -var nameValues map[string]Value - -func init() { - nameValues = map[string]Value{ - "_aclitem": &ACLItemArray{}, - "_bool": &BoolArray{}, - "_bpchar": &BPCharArray{}, - "_bytea": &ByteaArray{}, - "_cidr": &CIDRArray{}, - "_date": &DateArray{}, - "_float4": &Float4Array{}, - "_float8": &Float8Array{}, - "_inet": &InetArray{}, - "_int2": &Int2Array{}, - "_int4": &Int4Array{}, - "_int8": &Int8Array{}, - "_numeric": &NumericArray{}, - "_text": &TextArray{}, - "_timestamp": &TimestampArray{}, - "_timestamptz": &TimestamptzArray{}, - "_uuid": &UUIDArray{}, - "_varchar": &VarcharArray{}, - "_json": &JSONArray{}, - "_jsonb": &JSONBArray{}, - "aclitem": &ACLItem{}, - "bit": &Bit{}, - "bool": &Bool{}, - "box": &Box{}, - "bpchar": &BPChar{}, - "bytea": &Bytea{}, - "char": &QChar{}, - "cid": &CID{}, - "cidr": &CIDR{}, - "circle": &Circle{}, - "date": &Date{}, - "daterange": &Daterange{}, - "float4": &Float4{}, - "float8": &Float8{}, - "hstore": &Hstore{}, - "inet": &Inet{}, - "int2": &Int2{}, - "int4": &Int4{}, - "int4range": &Int4range{}, - "int4multirange": &Int4multirange{}, - "int8": &Int8{}, - "int8range": &Int8range{}, - "int8multirange": &Int8multirange{}, - "interval": &Interval{}, - "json": &JSON{}, - "jsonb": &JSONB{}, - "line": &Line{}, - "lseg": &Lseg{}, - "ltree": &Ltree{}, - "macaddr": &Macaddr{}, - "name": &Name{}, - "numeric": &Numeric{}, - "numrange": &Numrange{}, - "nummultirange": &Nummultirange{}, - "oid": &OIDValue{}, - "path": &Path{}, - "point": &Point{}, - "polygon": &Polygon{}, - "record": &Record{}, - "text": &Text{}, - "tid": &TID{}, - "timestamp": &Timestamp{}, - "timestamptz": &Timestamptz{}, - "tsrange": &Tsrange{}, - "_tsrange": &TsrangeArray{}, - "tstzrange": &Tstzrange{}, - "_tstzrange": &TstzrangeArray{}, - "unknown": &Unknown{}, - "uuid": &UUID{}, - "varbit": &Varbit{}, - "varchar": &Varchar{}, - "xid": &XID{}, - } -} diff --git a/vendor/github.com/jackc/pgtype/pguint32.go b/vendor/github.com/jackc/pgtype/pguint32.go deleted file mode 100644 index a0e88ca..0000000 --- a/vendor/github.com/jackc/pgtype/pguint32.go +++ /dev/null @@ -1,162 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "math" - "strconv" - - "github.com/jackc/pgio" -) - -// pguint32 is the core type that is used to implement PostgreSQL types such as -// CID and XID. -type pguint32 struct { - Uint uint32 - Status Status -} - -// Set converts from src to dst. Note that as pguint32 is not a general -// number type Set does not do automatic type conversion as other number -// types do. -func (dst *pguint32) Set(src interface{}) error { - switch value := src.(type) { - case int64: - if value < 0 { - return fmt.Errorf("%d is less than minimum value for pguint32", value) - } - if value > math.MaxUint32 { - return fmt.Errorf("%d is greater than maximum value for pguint32", value) - } - *dst = pguint32{Uint: uint32(value), Status: Present} - case uint32: - *dst = pguint32{Uint: value, Status: Present} - default: - return fmt.Errorf("cannot convert %v to pguint32", value) - } - - return nil -} - -func (dst pguint32) Get() interface{} { - switch dst.Status { - case Present: - return dst.Uint - case Null: - return nil - default: - return dst.Status - } -} - -// AssignTo assigns from src to dst. Note that as pguint32 is not a general number -// type AssignTo does not do automatic type conversion as other number types do. -func (src *pguint32) AssignTo(dst interface{}) error { - switch v := dst.(type) { - case *uint32: - if src.Status == Present { - *v = src.Uint - } else { - return fmt.Errorf("cannot assign %v into %T", src, dst) - } - case **uint32: - if src.Status == Present { - n := src.Uint - *v = &n - } else { - *v = nil - } - } - - return nil -} - -func (dst *pguint32) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = pguint32{Status: Null} - return nil - } - - n, err := strconv.ParseUint(string(src), 10, 32) - if err != nil { - return err - } - - *dst = pguint32{Uint: uint32(n), Status: Present} - return nil -} - -func (dst *pguint32) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = pguint32{Status: Null} - return nil - } - - if len(src) != 4 { - return fmt.Errorf("invalid length: %v", len(src)) - } - - n := binary.BigEndian.Uint32(src) - *dst = pguint32{Uint: n, Status: Present} - return nil -} - -func (src pguint32) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return append(buf, strconv.FormatUint(uint64(src.Uint), 10)...), nil -} - -func (src pguint32) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return pgio.AppendUint32(buf, src.Uint), nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *pguint32) Scan(src interface{}) error { - if src == nil { - *dst = pguint32{Status: Null} - return nil - } - - switch src := src.(type) { - case uint32: - *dst = pguint32{Uint: src, Status: Present} - return nil - case int64: - *dst = pguint32{Uint: uint32(src), Status: Present} - return nil - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src pguint32) Value() (driver.Value, error) { - switch src.Status { - case Present: - return int64(src.Uint), nil - case Null: - return nil, nil - default: - return nil, errUndefined - } -} diff --git a/vendor/github.com/jackc/pgtype/point.go b/vendor/github.com/jackc/pgtype/point.go deleted file mode 100644 index 0c79910..0000000 --- a/vendor/github.com/jackc/pgtype/point.go +++ /dev/null @@ -1,214 +0,0 @@ -package pgtype - -import ( - "bytes" - "database/sql/driver" - "encoding/binary" - "fmt" - "math" - "strconv" - "strings" - - "github.com/jackc/pgio" -) - -type Vec2 struct { - X float64 - Y float64 -} - -type Point struct { - P Vec2 - Status Status -} - -func (dst *Point) Set(src interface{}) error { - if src == nil { - dst.Status = Null - return nil - } - err := fmt.Errorf("cannot convert %v to Point", src) - var p *Point - switch value := src.(type) { - case string: - p, err = parsePoint([]byte(value)) - case []byte: - p, err = parsePoint(value) - default: - return err - } - if err != nil { - return err - } - *dst = *p - return nil -} - -func parsePoint(src []byte) (*Point, error) { - if src == nil || bytes.Compare(src, []byte("null")) == 0 { - return &Point{Status: Null}, nil - } - - if len(src) < 5 { - return nil, fmt.Errorf("invalid length for point: %v", len(src)) - } - if src[0] == '"' && src[len(src)-1] == '"' { - src = src[1 : len(src)-1] - } - parts := strings.SplitN(string(src[1:len(src)-1]), ",", 2) - if len(parts) < 2 { - return nil, fmt.Errorf("invalid format for point") - } - - x, err := strconv.ParseFloat(parts[0], 64) - if err != nil { - return nil, err - } - - y, err := strconv.ParseFloat(parts[1], 64) - if err != nil { - return nil, err - } - - return &Point{P: Vec2{x, y}, Status: Present}, nil -} - -func (dst Point) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Point) AssignTo(dst interface{}) error { - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *Point) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Point{Status: Null} - return nil - } - - if len(src) < 5 { - return fmt.Errorf("invalid length for point: %v", len(src)) - } - - parts := strings.SplitN(string(src[1:len(src)-1]), ",", 2) - if len(parts) < 2 { - return fmt.Errorf("invalid format for point") - } - - x, err := strconv.ParseFloat(parts[0], 64) - if err != nil { - return err - } - - y, err := strconv.ParseFloat(parts[1], 64) - if err != nil { - return err - } - - *dst = Point{P: Vec2{x, y}, Status: Present} - return nil -} - -func (dst *Point) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Point{Status: Null} - return nil - } - - if len(src) != 16 { - return fmt.Errorf("invalid length for point: %v", len(src)) - } - - x := binary.BigEndian.Uint64(src) - y := binary.BigEndian.Uint64(src[8:]) - - *dst = Point{ - P: Vec2{math.Float64frombits(x), math.Float64frombits(y)}, - Status: Present, - } - return nil -} - -func (src Point) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return append(buf, fmt.Sprintf(`(%s,%s)`, - strconv.FormatFloat(src.P.X, 'f', -1, 64), - strconv.FormatFloat(src.P.Y, 'f', -1, 64), - )...), nil -} - -func (src Point) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = pgio.AppendUint64(buf, math.Float64bits(src.P.X)) - buf = pgio.AppendUint64(buf, math.Float64bits(src.P.Y)) - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Point) Scan(src interface{}) error { - if src == nil { - *dst = Point{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Point) Value() (driver.Value, error) { - return EncodeValueText(src) -} - -func (src Point) MarshalJSON() ([]byte, error) { - switch src.Status { - case Present: - var buff bytes.Buffer - buff.WriteByte('"') - buff.WriteString(fmt.Sprintf("(%g,%g)", src.P.X, src.P.Y)) - buff.WriteByte('"') - return buff.Bytes(), nil - case Null: - return []byte("null"), nil - case Undefined: - return nil, errUndefined - } - return nil, errBadStatus -} - -func (dst *Point) UnmarshalJSON(point []byte) error { - p, err := parsePoint(point) - if err != nil { - return err - } - *dst = *p - return nil -} diff --git a/vendor/github.com/jackc/pgtype/polygon.go b/vendor/github.com/jackc/pgtype/polygon.go deleted file mode 100644 index 207cadc..0000000 --- a/vendor/github.com/jackc/pgtype/polygon.go +++ /dev/null @@ -1,226 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "math" - "strconv" - "strings" - - "github.com/jackc/pgio" -) - -type Polygon struct { - P []Vec2 - Status Status -} - -// Set converts src to dest. -// -// src can be nil, string, []float64, and []pgtype.Vec2. -// -// If src is string the format must be ((x1,y1),(x2,y2),...,(xn,yn)). -// Important that there are no spaces in it. -func (dst *Polygon) Set(src interface{}) error { - if src == nil { - dst.Status = Null - return nil - } - err := fmt.Errorf("cannot convert %v to Polygon", src) - var p *Polygon - switch value := src.(type) { - case string: - p, err = stringToPolygon(value) - case []Vec2: - p = &Polygon{Status: Present, P: value} - err = nil - case []float64: - p, err = float64ToPolygon(value) - default: - return err - } - if err != nil { - return err - } - *dst = *p - return nil -} - -func stringToPolygon(src string) (*Polygon, error) { - p := &Polygon{} - err := p.DecodeText(nil, []byte(src)) - return p, err -} - -func float64ToPolygon(src []float64) (*Polygon, error) { - p := &Polygon{Status: Null} - if len(src) == 0 { - return p, nil - } - if len(src)%2 != 0 { - p.Status = Undefined - return p, fmt.Errorf("invalid length for polygon: %v", len(src)) - } - p.Status = Present - p.P = make([]Vec2, 0) - for i := 0; i < len(src); i += 2 { - p.P = append(p.P, Vec2{X: src[i], Y: src[i+1]}) - } - return p, nil -} - -func (dst Polygon) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Polygon) AssignTo(dst interface{}) error { - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *Polygon) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Polygon{Status: Null} - return nil - } - - if len(src) < 7 { - return fmt.Errorf("invalid length for Polygon: %v", len(src)) - } - - points := make([]Vec2, 0) - - str := string(src[2:]) - - for { - end := strings.IndexByte(str, ',') - x, err := strconv.ParseFloat(str[:end], 64) - if err != nil { - return err - } - - str = str[end+1:] - end = strings.IndexByte(str, ')') - - y, err := strconv.ParseFloat(str[:end], 64) - if err != nil { - return err - } - - points = append(points, Vec2{x, y}) - - if end+3 < len(str) { - str = str[end+3:] - } else { - break - } - } - - *dst = Polygon{P: points, Status: Present} - return nil -} - -func (dst *Polygon) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Polygon{Status: Null} - return nil - } - - if len(src) < 5 { - return fmt.Errorf("invalid length for Polygon: %v", len(src)) - } - - pointCount := int(binary.BigEndian.Uint32(src)) - rp := 4 - - if 4+pointCount*16 != len(src) { - return fmt.Errorf("invalid length for Polygon with %d points: %v", pointCount, len(src)) - } - - points := make([]Vec2, pointCount) - for i := 0; i < len(points); i++ { - x := binary.BigEndian.Uint64(src[rp:]) - rp += 8 - y := binary.BigEndian.Uint64(src[rp:]) - rp += 8 - points[i] = Vec2{math.Float64frombits(x), math.Float64frombits(y)} - } - - *dst = Polygon{ - P: points, - Status: Present, - } - return nil -} - -func (src Polygon) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = append(buf, '(') - - for i, p := range src.P { - if i > 0 { - buf = append(buf, ',') - } - buf = append(buf, fmt.Sprintf(`(%s,%s)`, - strconv.FormatFloat(p.X, 'f', -1, 64), - strconv.FormatFloat(p.Y, 'f', -1, 64), - )...) - } - - return append(buf, ')'), nil -} - -func (src Polygon) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = pgio.AppendInt32(buf, int32(len(src.P))) - - for _, p := range src.P { - buf = pgio.AppendUint64(buf, math.Float64bits(p.X)) - buf = pgio.AppendUint64(buf, math.Float64bits(p.Y)) - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Polygon) Scan(src interface{}) error { - if src == nil { - *dst = Polygon{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Polygon) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/qchar.go b/vendor/github.com/jackc/pgtype/qchar.go deleted file mode 100644 index 574f606..0000000 --- a/vendor/github.com/jackc/pgtype/qchar.go +++ /dev/null @@ -1,152 +0,0 @@ -package pgtype - -import ( - "fmt" - "math" - "strconv" -) - -// QChar is for PostgreSQL's special 8-bit-only "char" type more akin to the C -// language's char type, or Go's byte type. (Note that the name in PostgreSQL -// itself is "char", in double-quotes, and not char.) It gets used a lot in -// PostgreSQL's system tables to hold a single ASCII character value (eg -// pg_class.relkind). It is named Qchar for quoted char to disambiguate from SQL -// standard type char. -// -// Not all possible values of QChar are representable in the text format. -// Therefore, QChar does not implement TextEncoder and TextDecoder. In -// addition, database/sql Scanner and database/sql/driver Value are not -// implemented. -type QChar struct { - Int int8 - Status Status -} - -func (dst *QChar) Set(src interface{}) error { - if src == nil { - *dst = QChar{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case int8: - *dst = QChar{Int: value, Status: Present} - case uint8: - if value > math.MaxInt8 { - return fmt.Errorf("%d is greater than maximum value for QChar", value) - } - *dst = QChar{Int: int8(value), Status: Present} - case int16: - if value < math.MinInt8 { - return fmt.Errorf("%d is greater than maximum value for QChar", value) - } - if value > math.MaxInt8 { - return fmt.Errorf("%d is greater than maximum value for QChar", value) - } - *dst = QChar{Int: int8(value), Status: Present} - case uint16: - if value > math.MaxInt8 { - return fmt.Errorf("%d is greater than maximum value for QChar", value) - } - *dst = QChar{Int: int8(value), Status: Present} - case int32: - if value < math.MinInt8 { - return fmt.Errorf("%d is greater than maximum value for QChar", value) - } - if value > math.MaxInt8 { - return fmt.Errorf("%d is greater than maximum value for QChar", value) - } - *dst = QChar{Int: int8(value), Status: Present} - case uint32: - if value > math.MaxInt8 { - return fmt.Errorf("%d is greater than maximum value for QChar", value) - } - *dst = QChar{Int: int8(value), Status: Present} - case int64: - if value < math.MinInt8 { - return fmt.Errorf("%d is greater than maximum value for QChar", value) - } - if value > math.MaxInt8 { - return fmt.Errorf("%d is greater than maximum value for QChar", value) - } - *dst = QChar{Int: int8(value), Status: Present} - case uint64: - if value > math.MaxInt8 { - return fmt.Errorf("%d is greater than maximum value for QChar", value) - } - *dst = QChar{Int: int8(value), Status: Present} - case int: - if value < math.MinInt8 { - return fmt.Errorf("%d is greater than maximum value for QChar", value) - } - if value > math.MaxInt8 { - return fmt.Errorf("%d is greater than maximum value for QChar", value) - } - *dst = QChar{Int: int8(value), Status: Present} - case uint: - if value > math.MaxInt8 { - return fmt.Errorf("%d is greater than maximum value for QChar", value) - } - *dst = QChar{Int: int8(value), Status: Present} - case string: - num, err := strconv.ParseInt(value, 10, 8) - if err != nil { - return err - } - *dst = QChar{Int: int8(num), Status: Present} - default: - if originalSrc, ok := underlyingNumberType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to QChar", value) - } - - return nil -} - -func (dst QChar) Get() interface{} { - switch dst.Status { - case Present: - return dst.Int - case Null: - return nil - default: - return dst.Status - } -} - -func (src *QChar) AssignTo(dst interface{}) error { - return int64AssignTo(int64(src.Int), src.Status, dst) -} - -func (dst *QChar) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = QChar{Status: Null} - return nil - } - - if len(src) != 1 { - return fmt.Errorf(`invalid length for "char": %v`, len(src)) - } - - *dst = QChar{Int: int8(src[0]), Status: Present} - return nil -} - -func (src QChar) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return append(buf, byte(src.Int)), nil -} diff --git a/vendor/github.com/jackc/pgtype/record.go b/vendor/github.com/jackc/pgtype/record.go deleted file mode 100644 index 5cf2c93..0000000 --- a/vendor/github.com/jackc/pgtype/record.go +++ /dev/null @@ -1,126 +0,0 @@ -package pgtype - -import ( - "fmt" - "reflect" -) - -// Record is the generic PostgreSQL record type such as is created with the -// "row" function. Record only implements BinaryDecoder and Value. The text -// format output format from PostgreSQL does not include type information and is -// therefore impossible to decode. No encoders are implemented because -// PostgreSQL does not support input of generic records. -type Record struct { - Fields []Value - Status Status -} - -func (dst *Record) Set(src interface{}) error { - if src == nil { - *dst = Record{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case []Value: - *dst = Record{Fields: value, Status: Present} - default: - return fmt.Errorf("cannot convert %v to Record", src) - } - - return nil -} - -func (dst Record) Get() interface{} { - switch dst.Status { - case Present: - return dst.Fields - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Record) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - switch v := dst.(type) { - case *[]Value: - *v = make([]Value, len(src.Fields)) - copy(*v, src.Fields) - return nil - case *[]interface{}: - *v = make([]interface{}, len(src.Fields)) - for i := range *v { - (*v)[i] = src.Fields[i].Get() - } - return nil - default: - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - return fmt.Errorf("unable to assign to %T", dst) - } - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func prepareNewBinaryDecoder(ci *ConnInfo, fieldOID uint32, v *Value) (BinaryDecoder, error) { - var binaryDecoder BinaryDecoder - - if dt, ok := ci.DataTypeForOID(fieldOID); ok { - binaryDecoder, _ = dt.Value.(BinaryDecoder) - } else { - return nil, fmt.Errorf("unknown oid while decoding record: %v", fieldOID) - } - - if binaryDecoder == nil { - return nil, fmt.Errorf("no binary decoder registered for: %v", fieldOID) - } - - // Duplicate struct to scan into - binaryDecoder = reflect.New(reflect.ValueOf(binaryDecoder).Elem().Type()).Interface().(BinaryDecoder) - *v = binaryDecoder.(Value) - return binaryDecoder, nil -} - -func (dst *Record) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Record{Status: Null} - return nil - } - - scanner := NewCompositeBinaryScanner(ci, src) - - fields := make([]Value, scanner.FieldCount()) - - for i := 0; scanner.Next(); i++ { - binaryDecoder, err := prepareNewBinaryDecoder(ci, scanner.OID(), &fields[i]) - if err != nil { - return err - } - - if err = binaryDecoder.DecodeBinary(ci, scanner.Bytes()); err != nil { - return err - } - } - - if scanner.Err() != nil { - return scanner.Err() - } - - *dst = Record{Fields: fields, Status: Present} - - return nil -} diff --git a/vendor/github.com/jackc/pgtype/record_array.go b/vendor/github.com/jackc/pgtype/record_array.go deleted file mode 100644 index 2271717..0000000 --- a/vendor/github.com/jackc/pgtype/record_array.go +++ /dev/null @@ -1,318 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "encoding/binary" - "fmt" - "reflect" -) - -type RecordArray struct { - Elements []Record - Dimensions []ArrayDimension - Status Status -} - -func (dst *RecordArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = RecordArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case [][]Value: - if value == nil { - *dst = RecordArray{Status: Null} - } else if len(value) == 0 { - *dst = RecordArray{Status: Present} - } else { - elements := make([]Record, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = RecordArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []Record: - if value == nil { - *dst = RecordArray{Status: Null} - } else if len(value) == 0 { - *dst = RecordArray{Status: Present} - } else { - *dst = RecordArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = RecordArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for RecordArray", src) - } - if elementsLength == 0 { - *dst = RecordArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to RecordArray", src) - } - - *dst = RecordArray{ - Elements: make([]Record, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]Record, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to RecordArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *RecordArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to RecordArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in RecordArray", err) - } - index++ - - return index, nil -} - -func (dst RecordArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *RecordArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[][]Value: - *v = make([][]Value, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *RecordArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from RecordArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from RecordArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *RecordArray) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = RecordArray{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = RecordArray{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]Record, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = RecordArray{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} diff --git a/vendor/github.com/jackc/pgtype/text.go b/vendor/github.com/jackc/pgtype/text.go deleted file mode 100644 index a01815d..0000000 --- a/vendor/github.com/jackc/pgtype/text.go +++ /dev/null @@ -1,212 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/json" - "fmt" -) - -type Text struct { - String string - Status Status -} - -func (dst *Text) Set(src interface{}) error { - if src == nil { - *dst = Text{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case string: - *dst = Text{String: value, Status: Present} - case *string: - if value == nil { - *dst = Text{Status: Null} - } else { - *dst = Text{String: *value, Status: Present} - } - case []byte: - if value == nil { - *dst = Text{Status: Null} - } else { - *dst = Text{String: string(value), Status: Present} - } - case fmt.Stringer: - if value == fmt.Stringer(nil) { - *dst = Text{Status: Null} - } else { - *dst = Text{String: value.String(), Status: Present} - } - default: - // Cannot be part of the switch: If Value() returns nil on - // non-string, we should still try to checks the underlying type - // using reflection. - // - // For example the struct might implement driver.Valuer with - // pointer receiver and fmt.Stringer with value receiver. - if value, ok := src.(driver.Valuer); ok { - if value == driver.Valuer(nil) { - *dst = Text{Status: Null} - return nil - } else { - v, err := value.Value() - if err != nil { - return fmt.Errorf("driver.Valuer Value() method failed: %w", err) - } - - // Handles also v == nil case. - if s, ok := v.(string); ok { - *dst = Text{String: s, Status: Present} - return nil - } - } - } - - if originalSrc, ok := underlyingStringType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Text", value) - } - - return nil -} - -func (dst Text) Get() interface{} { - switch dst.Status { - case Present: - return dst.String - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Text) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - switch v := dst.(type) { - case *string: - *v = src.String - return nil - case *[]byte: - *v = make([]byte, len(src.String)) - copy(*v, src.String) - return nil - default: - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - return fmt.Errorf("unable to assign to %T", dst) - } - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (Text) PreferredResultFormat() int16 { - return TextFormatCode -} - -func (dst *Text) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Text{Status: Null} - return nil - } - - *dst = Text{String: string(src), Status: Present} - return nil -} - -func (dst *Text) DecodeBinary(ci *ConnInfo, src []byte) error { - return dst.DecodeText(ci, src) -} - -func (Text) PreferredParamFormat() int16 { - return TextFormatCode -} - -func (src Text) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return append(buf, src.String...), nil -} - -func (src Text) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - return src.EncodeText(ci, buf) -} - -// Scan implements the database/sql Scanner interface. -func (dst *Text) Scan(src interface{}) error { - if src == nil { - *dst = Text{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Text) Value() (driver.Value, error) { - switch src.Status { - case Present: - return src.String, nil - case Null: - return nil, nil - default: - return nil, errUndefined - } -} - -func (src Text) MarshalJSON() ([]byte, error) { - switch src.Status { - case Present: - return json.Marshal(src.String) - case Null: - return []byte("null"), nil - case Undefined: - return nil, errUndefined - } - - return nil, errBadStatus -} - -func (dst *Text) UnmarshalJSON(b []byte) error { - var s *string - err := json.Unmarshal(b, &s) - if err != nil { - return err - } - - if s == nil { - *dst = Text{Status: Null} - } else { - *dst = Text{String: *s, Status: Present} - } - - return nil -} diff --git a/vendor/github.com/jackc/pgtype/text_array.go b/vendor/github.com/jackc/pgtype/text_array.go deleted file mode 100644 index 2461966..0000000 --- a/vendor/github.com/jackc/pgtype/text_array.go +++ /dev/null @@ -1,517 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "reflect" - - "github.com/jackc/pgio" -) - -type TextArray struct { - Elements []Text - Dimensions []ArrayDimension - Status Status -} - -func (dst *TextArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = TextArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []string: - if value == nil { - *dst = TextArray{Status: Null} - } else if len(value) == 0 { - *dst = TextArray{Status: Present} - } else { - elements := make([]Text, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = TextArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*string: - if value == nil { - *dst = TextArray{Status: Null} - } else if len(value) == 0 { - *dst = TextArray{Status: Present} - } else { - elements := make([]Text, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = TextArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []Text: - if value == nil { - *dst = TextArray{Status: Null} - } else if len(value) == 0 { - *dst = TextArray{Status: Present} - } else { - *dst = TextArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = TextArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for TextArray", src) - } - if elementsLength == 0 { - *dst = TextArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to TextArray", src) - } - - *dst = TextArray{ - Elements: make([]Text, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]Text, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to TextArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *TextArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to TextArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in TextArray", err) - } - index++ - - return index, nil -} - -func (dst TextArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *TextArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]string: - *v = make([]string, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*string: - *v = make([]*string, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *TextArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from TextArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from TextArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *TextArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = TextArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []Text - - if len(uta.Elements) > 0 { - elements = make([]Text, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem Text - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = TextArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *TextArray) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = TextArray{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = TextArray{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]Text, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = TextArray{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src TextArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src TextArray) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("text"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "text") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *TextArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src TextArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/tid.go b/vendor/github.com/jackc/pgtype/tid.go deleted file mode 100644 index 4bb57f6..0000000 --- a/vendor/github.com/jackc/pgtype/tid.go +++ /dev/null @@ -1,156 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "strconv" - "strings" - - "github.com/jackc/pgio" -) - -// TID is PostgreSQL's Tuple Identifier type. -// -// When one does -// -// select ctid, * from some_table; -// -// it is the data type of the ctid hidden system column. -// -// It is currently implemented as a pair unsigned two byte integers. -// Its conversion functions can be found in src/backend/utils/adt/tid.c -// in the PostgreSQL sources. -type TID struct { - BlockNumber uint32 - OffsetNumber uint16 - Status Status -} - -func (dst *TID) Set(src interface{}) error { - return fmt.Errorf("cannot convert %v to TID", src) -} - -func (dst TID) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *TID) AssignTo(dst interface{}) error { - if src.Status == Present { - switch v := dst.(type) { - case *string: - *v = fmt.Sprintf(`(%d,%d)`, src.BlockNumber, src.OffsetNumber) - return nil - default: - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - return fmt.Errorf("unable to assign to %T", dst) - } - } - - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *TID) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = TID{Status: Null} - return nil - } - - if len(src) < 5 { - return fmt.Errorf("invalid length for tid: %v", len(src)) - } - - parts := strings.SplitN(string(src[1:len(src)-1]), ",", 2) - if len(parts) < 2 { - return fmt.Errorf("invalid format for tid") - } - - blockNumber, err := strconv.ParseUint(parts[0], 10, 32) - if err != nil { - return err - } - - offsetNumber, err := strconv.ParseUint(parts[1], 10, 16) - if err != nil { - return err - } - - *dst = TID{BlockNumber: uint32(blockNumber), OffsetNumber: uint16(offsetNumber), Status: Present} - return nil -} - -func (dst *TID) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = TID{Status: Null} - return nil - } - - if len(src) != 6 { - return fmt.Errorf("invalid length for tid: %v", len(src)) - } - - *dst = TID{ - BlockNumber: binary.BigEndian.Uint32(src), - OffsetNumber: binary.BigEndian.Uint16(src[4:]), - Status: Present, - } - return nil -} - -func (src TID) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = append(buf, fmt.Sprintf(`(%d,%d)`, src.BlockNumber, src.OffsetNumber)...) - return buf, nil -} - -func (src TID) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = pgio.AppendUint32(buf, src.BlockNumber) - buf = pgio.AppendUint16(buf, src.OffsetNumber) - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *TID) Scan(src interface{}) error { - if src == nil { - *dst = TID{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src TID) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/time.go b/vendor/github.com/jackc/pgtype/time.go deleted file mode 100644 index f7a2887..0000000 --- a/vendor/github.com/jackc/pgtype/time.go +++ /dev/null @@ -1,231 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "strconv" - "time" - - "github.com/jackc/pgio" -) - -// Time represents the PostgreSQL time type. The PostgreSQL time is a time of day without time zone. -// -// Time is represented as the number of microseconds since midnight in the same way that PostgreSQL does. Other time -// and date types in pgtype can use time.Time as the underlying representation. However, pgtype.Time type cannot due -// to needing to handle 24:00:00. time.Time converts that to 00:00:00 on the following day. -type Time struct { - Microseconds int64 // Number of microseconds since midnight - Status Status -} - -// Set converts src into a Time and stores in dst. -func (dst *Time) Set(src interface{}) error { - if src == nil { - *dst = Time{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case time.Time: - usec := int64(value.Hour())*microsecondsPerHour + - int64(value.Minute())*microsecondsPerMinute + - int64(value.Second())*microsecondsPerSecond + - int64(value.Nanosecond())/1000 - *dst = Time{Microseconds: usec, Status: Present} - case *time.Time: - if value == nil { - *dst = Time{Status: Null} - } else { - return dst.Set(*value) - } - default: - if originalSrc, ok := underlyingTimeType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Time", value) - } - - return nil -} - -func (dst Time) Get() interface{} { - switch dst.Status { - case Present: - return dst.Microseconds - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Time) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - switch v := dst.(type) { - case *time.Time: - // 24:00:00 is max allowed time in PostgreSQL, but time.Time will normalize that to 00:00:00 the next day. - var maxRepresentableByTime int64 = 24*60*60*1000000 - 1 - if src.Microseconds > maxRepresentableByTime { - return fmt.Errorf("%d microseconds cannot be represented as time.Time", src.Microseconds) - } - - usec := src.Microseconds - hours := usec / microsecondsPerHour - usec -= hours * microsecondsPerHour - minutes := usec / microsecondsPerMinute - usec -= minutes * microsecondsPerMinute - seconds := usec / microsecondsPerSecond - usec -= seconds * microsecondsPerSecond - ns := usec * 1000 - *v = time.Date(2000, 1, 1, int(hours), int(minutes), int(seconds), int(ns), time.UTC) - return nil - default: - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - return fmt.Errorf("unable to assign to %T", dst) - } - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -// DecodeText decodes from src into dst. -func (dst *Time) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Time{Status: Null} - return nil - } - - s := string(src) - - if len(s) < 8 { - return fmt.Errorf("cannot decode %v into Time", s) - } - - hours, err := strconv.ParseInt(s[0:2], 10, 64) - if err != nil { - return fmt.Errorf("cannot decode %v into Time", s) - } - usec := hours * microsecondsPerHour - - minutes, err := strconv.ParseInt(s[3:5], 10, 64) - if err != nil { - return fmt.Errorf("cannot decode %v into Time", s) - } - usec += minutes * microsecondsPerMinute - - seconds, err := strconv.ParseInt(s[6:8], 10, 64) - if err != nil { - return fmt.Errorf("cannot decode %v into Time", s) - } - usec += seconds * microsecondsPerSecond - - if len(s) > 9 { - fraction := s[9:] - n, err := strconv.ParseInt(fraction, 10, 64) - if err != nil { - return fmt.Errorf("cannot decode %v into Time", s) - } - - for i := len(fraction); i < 6; i++ { - n *= 10 - } - - usec += n - } - - *dst = Time{Microseconds: usec, Status: Present} - - return nil -} - -// DecodeBinary decodes from src into dst. -func (dst *Time) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Time{Status: Null} - return nil - } - - if len(src) != 8 { - return fmt.Errorf("invalid length for time: %v", len(src)) - } - - usec := int64(binary.BigEndian.Uint64(src)) - *dst = Time{Microseconds: usec, Status: Present} - - return nil -} - -// EncodeText writes the text encoding of src into w. -func (src Time) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - usec := src.Microseconds - hours := usec / microsecondsPerHour - usec -= hours * microsecondsPerHour - minutes := usec / microsecondsPerMinute - usec -= minutes * microsecondsPerMinute - seconds := usec / microsecondsPerSecond - usec -= seconds * microsecondsPerSecond - - s := fmt.Sprintf("%02d:%02d:%02d.%06d", hours, minutes, seconds, usec) - - return append(buf, s...), nil -} - -// EncodeBinary writes the binary encoding of src into w. If src.Time is not in -// the UTC time zone it returns an error. -func (src Time) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return pgio.AppendInt64(buf, src.Microseconds), nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Time) Scan(src interface{}) error { - if src == nil { - *dst = Time{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - case time.Time: - return dst.Set(src) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Time) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/timestamp.go b/vendor/github.com/jackc/pgtype/timestamp.go deleted file mode 100644 index fce490c..0000000 --- a/vendor/github.com/jackc/pgtype/timestamp.go +++ /dev/null @@ -1,261 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "strings" - "time" - - "github.com/jackc/pgio" -) - -const pgTimestampFormat = "2006-01-02 15:04:05.999999999" - -// Timestamp represents the PostgreSQL timestamp type. The PostgreSQL -// timestamp does not have a time zone. This presents a problem when -// translating to and from time.Time which requires a time zone. It is highly -// recommended to use timestamptz whenever possible. Timestamp methods either -// convert to UTC or return an error on non-UTC times. -type Timestamp struct { - Time time.Time // Time must always be in UTC. - Status Status - InfinityModifier InfinityModifier -} - -// Set converts src into a Timestamp and stores in dst. If src is a -// time.Time in a non-UTC time zone, the time zone is discarded. -func (dst *Timestamp) Set(src interface{}) error { - if src == nil { - *dst = Timestamp{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case time.Time: - *dst = Timestamp{Time: time.Date(value.Year(), value.Month(), value.Day(), value.Hour(), value.Minute(), value.Second(), value.Nanosecond(), time.UTC), Status: Present} - case *time.Time: - if value == nil { - *dst = Timestamp{Status: Null} - } else { - return dst.Set(*value) - } - case string: - return dst.DecodeText(nil, []byte(value)) - case *string: - if value == nil { - *dst = Timestamp{Status: Null} - } else { - return dst.Set(*value) - } - case InfinityModifier: - *dst = Timestamp{InfinityModifier: value, Status: Present} - default: - if originalSrc, ok := underlyingTimeType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Timestamp", value) - } - - return nil -} - -func (dst Timestamp) Get() interface{} { - switch dst.Status { - case Present: - if dst.InfinityModifier != None { - return dst.InfinityModifier - } - return dst.Time - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Timestamp) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - switch v := dst.(type) { - case *time.Time: - if src.InfinityModifier != None { - return fmt.Errorf("cannot assign %v to %T", src, dst) - } - *v = src.Time - return nil - default: - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - return fmt.Errorf("unable to assign to %T", dst) - } - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -// DecodeText decodes from src into dst. The decoded time is considered to -// be in UTC. -func (dst *Timestamp) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Timestamp{Status: Null} - return nil - } - - sbuf := string(src) - switch sbuf { - case "infinity": - *dst = Timestamp{Status: Present, InfinityModifier: Infinity} - case "-infinity": - *dst = Timestamp{Status: Present, InfinityModifier: -Infinity} - default: - if strings.HasSuffix(sbuf, " BC") { - t, err := time.Parse(pgTimestampFormat, strings.TrimRight(sbuf, " BC")) - t2 := time.Date(1-t.Year(), t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), t.Location()) - if err != nil { - return err - } - *dst = Timestamp{Time: t2, Status: Present} - return nil - } - tim, err := time.Parse(pgTimestampFormat, sbuf) - if err != nil { - return err - } - - *dst = Timestamp{Time: tim, Status: Present} - } - - return nil -} - -// DecodeBinary decodes from src into dst. The decoded time is considered to -// be in UTC. -func (dst *Timestamp) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Timestamp{Status: Null} - return nil - } - - if len(src) != 8 { - return fmt.Errorf("invalid length for timestamp: %v", len(src)) - } - - microsecSinceY2K := int64(binary.BigEndian.Uint64(src)) - - switch microsecSinceY2K { - case infinityMicrosecondOffset: - *dst = Timestamp{Status: Present, InfinityModifier: Infinity} - case negativeInfinityMicrosecondOffset: - *dst = Timestamp{Status: Present, InfinityModifier: -Infinity} - default: - tim := time.Unix( - microsecFromUnixEpochToY2K/1000000+microsecSinceY2K/1000000, - (microsecFromUnixEpochToY2K%1000000*1000)+(microsecSinceY2K%1000000*1000), - ).UTC() - *dst = Timestamp{Time: tim, Status: Present} - } - - return nil -} - -// EncodeText writes the text encoding of src into w. If src.Time is not in -// the UTC time zone it returns an error. -func (src Timestamp) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - if src.Time.Location() != time.UTC { - return nil, fmt.Errorf("cannot encode non-UTC time into timestamp") - } - - var s string - - switch src.InfinityModifier { - case None: - s = src.Time.Truncate(time.Microsecond).Format(pgTimestampFormat) - case Infinity: - s = "infinity" - case NegativeInfinity: - s = "-infinity" - } - - return append(buf, s...), nil -} - -// EncodeBinary writes the binary encoding of src into w. If src.Time is not in -// the UTC time zone it returns an error. -func (src Timestamp) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - if src.Time.Location() != time.UTC { - return nil, fmt.Errorf("cannot encode non-UTC time into timestamp") - } - - var microsecSinceY2K int64 - switch src.InfinityModifier { - case None: - microsecSinceUnixEpoch := src.Time.Unix()*1000000 + int64(src.Time.Nanosecond())/1000 - microsecSinceY2K = microsecSinceUnixEpoch - microsecFromUnixEpochToY2K - case Infinity: - microsecSinceY2K = infinityMicrosecondOffset - case NegativeInfinity: - microsecSinceY2K = negativeInfinityMicrosecondOffset - } - - return pgio.AppendInt64(buf, microsecSinceY2K), nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Timestamp) Scan(src interface{}) error { - if src == nil { - *dst = Timestamp{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - case time.Time: - *dst = Timestamp{Time: src, Status: Present} - return nil - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Timestamp) Value() (driver.Value, error) { - switch src.Status { - case Present: - if src.InfinityModifier != None { - return src.InfinityModifier.String(), nil - } - return src.Time, nil - case Null: - return nil, nil - default: - return nil, errUndefined - } -} diff --git a/vendor/github.com/jackc/pgtype/timestamp_array.go b/vendor/github.com/jackc/pgtype/timestamp_array.go deleted file mode 100644 index e12481e..0000000 --- a/vendor/github.com/jackc/pgtype/timestamp_array.go +++ /dev/null @@ -1,518 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "reflect" - "time" - - "github.com/jackc/pgio" -) - -type TimestampArray struct { - Elements []Timestamp - Dimensions []ArrayDimension - Status Status -} - -func (dst *TimestampArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = TimestampArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []time.Time: - if value == nil { - *dst = TimestampArray{Status: Null} - } else if len(value) == 0 { - *dst = TimestampArray{Status: Present} - } else { - elements := make([]Timestamp, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = TimestampArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*time.Time: - if value == nil { - *dst = TimestampArray{Status: Null} - } else if len(value) == 0 { - *dst = TimestampArray{Status: Present} - } else { - elements := make([]Timestamp, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = TimestampArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []Timestamp: - if value == nil { - *dst = TimestampArray{Status: Null} - } else if len(value) == 0 { - *dst = TimestampArray{Status: Present} - } else { - *dst = TimestampArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = TimestampArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for TimestampArray", src) - } - if elementsLength == 0 { - *dst = TimestampArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to TimestampArray", src) - } - - *dst = TimestampArray{ - Elements: make([]Timestamp, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]Timestamp, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to TimestampArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *TimestampArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to TimestampArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in TimestampArray", err) - } - index++ - - return index, nil -} - -func (dst TimestampArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *TimestampArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]time.Time: - *v = make([]time.Time, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*time.Time: - *v = make([]*time.Time, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *TimestampArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from TimestampArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from TimestampArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *TimestampArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = TimestampArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []Timestamp - - if len(uta.Elements) > 0 { - elements = make([]Timestamp, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem Timestamp - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = TimestampArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *TimestampArray) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = TimestampArray{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = TimestampArray{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]Timestamp, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = TimestampArray{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src TimestampArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src TimestampArray) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("timestamp"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "timestamp") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *TimestampArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src TimestampArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/timestamptz.go b/vendor/github.com/jackc/pgtype/timestamptz.go deleted file mode 100644 index 72ae499..0000000 --- a/vendor/github.com/jackc/pgtype/timestamptz.go +++ /dev/null @@ -1,322 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "encoding/json" - "fmt" - "time" - - "github.com/jackc/pgio" -) - -const pgTimestamptzHourFormat = "2006-01-02 15:04:05.999999999Z07" -const pgTimestamptzMinuteFormat = "2006-01-02 15:04:05.999999999Z07:00" -const pgTimestamptzSecondFormat = "2006-01-02 15:04:05.999999999Z07:00:00" -const microsecFromUnixEpochToY2K = 946684800 * 1000000 - -const ( - negativeInfinityMicrosecondOffset = -9223372036854775808 - infinityMicrosecondOffset = 9223372036854775807 -) - -type Timestamptz struct { - Time time.Time - Status Status - InfinityModifier InfinityModifier -} - -func (dst *Timestamptz) Set(src interface{}) error { - if src == nil { - *dst = Timestamptz{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - switch value := src.(type) { - case time.Time: - *dst = Timestamptz{Time: value, Status: Present} - case *time.Time: - if value == nil { - *dst = Timestamptz{Status: Null} - } else { - return dst.Set(*value) - } - case string: - return dst.DecodeText(nil, []byte(value)) - case *string: - if value == nil { - *dst = Timestamptz{Status: Null} - } else { - return dst.Set(*value) - } - case InfinityModifier: - *dst = Timestamptz{InfinityModifier: value, Status: Present} - default: - if originalSrc, ok := underlyingTimeType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to Timestamptz", value) - } - - return nil -} - -func (dst Timestamptz) Get() interface{} { - switch dst.Status { - case Present: - if dst.InfinityModifier != None { - return dst.InfinityModifier - } - return dst.Time - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Timestamptz) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - switch v := dst.(type) { - case *time.Time: - if src.InfinityModifier != None { - return fmt.Errorf("cannot assign %v to %T", src, dst) - } - *v = src.Time - return nil - default: - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - return fmt.Errorf("unable to assign to %T", dst) - } - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (dst *Timestamptz) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Timestamptz{Status: Null} - return nil - } - - sbuf := string(src) - switch sbuf { - case "infinity": - *dst = Timestamptz{Status: Present, InfinityModifier: Infinity} - case "-infinity": - *dst = Timestamptz{Status: Present, InfinityModifier: -Infinity} - default: - var format string - if len(sbuf) >= 9 && (sbuf[len(sbuf)-9] == '-' || sbuf[len(sbuf)-9] == '+') { - format = pgTimestamptzSecondFormat - } else if len(sbuf) >= 6 && (sbuf[len(sbuf)-6] == '-' || sbuf[len(sbuf)-6] == '+') { - format = pgTimestamptzMinuteFormat - } else { - format = pgTimestamptzHourFormat - } - - tim, err := time.Parse(format, sbuf) - if err != nil { - return err - } - - *dst = Timestamptz{Time: normalizePotentialUTC(tim), Status: Present} - } - - return nil -} - -func (dst *Timestamptz) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Timestamptz{Status: Null} - return nil - } - - if len(src) != 8 { - return fmt.Errorf("invalid length for timestamptz: %v", len(src)) - } - - microsecSinceY2K := int64(binary.BigEndian.Uint64(src)) - - switch microsecSinceY2K { - case infinityMicrosecondOffset: - *dst = Timestamptz{Status: Present, InfinityModifier: Infinity} - case negativeInfinityMicrosecondOffset: - *dst = Timestamptz{Status: Present, InfinityModifier: -Infinity} - default: - tim := time.Unix( - microsecFromUnixEpochToY2K/1000000+microsecSinceY2K/1000000, - (microsecFromUnixEpochToY2K%1000000*1000)+(microsecSinceY2K%1000000*1000), - ) - *dst = Timestamptz{Time: tim, Status: Present} - } - - return nil -} - -func (src Timestamptz) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - var s string - - switch src.InfinityModifier { - case None: - s = src.Time.UTC().Truncate(time.Microsecond).Format(pgTimestamptzSecondFormat) - case Infinity: - s = "infinity" - case NegativeInfinity: - s = "-infinity" - } - - return append(buf, s...), nil -} - -func (src Timestamptz) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - var microsecSinceY2K int64 - switch src.InfinityModifier { - case None: - microsecSinceUnixEpoch := src.Time.Unix()*1000000 + int64(src.Time.Nanosecond())/1000 - microsecSinceY2K = microsecSinceUnixEpoch - microsecFromUnixEpochToY2K - case Infinity: - microsecSinceY2K = infinityMicrosecondOffset - case NegativeInfinity: - microsecSinceY2K = negativeInfinityMicrosecondOffset - } - - return pgio.AppendInt64(buf, microsecSinceY2K), nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Timestamptz) Scan(src interface{}) error { - if src == nil { - *dst = Timestamptz{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - case time.Time: - *dst = Timestamptz{Time: src, Status: Present} - return nil - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Timestamptz) Value() (driver.Value, error) { - switch src.Status { - case Present: - if src.InfinityModifier != None { - return src.InfinityModifier.String(), nil - } - if src.Time.Location().String() == time.UTC.String() { - return src.Time.UTC(), nil - } - return src.Time, nil - case Null: - return nil, nil - default: - return nil, errUndefined - } -} - -func (src Timestamptz) MarshalJSON() ([]byte, error) { - switch src.Status { - case Null: - return []byte("null"), nil - case Undefined: - return nil, errUndefined - } - - if src.Status != Present { - return nil, errBadStatus - } - - var s string - - switch src.InfinityModifier { - case None: - s = src.Time.Format(time.RFC3339Nano) - case Infinity: - s = "infinity" - case NegativeInfinity: - s = "-infinity" - } - - return json.Marshal(s) -} - -func (dst *Timestamptz) UnmarshalJSON(b []byte) error { - var s *string - err := json.Unmarshal(b, &s) - if err != nil { - return err - } - - if s == nil { - *dst = Timestamptz{Status: Null} - return nil - } - - switch *s { - case "infinity": - *dst = Timestamptz{Status: Present, InfinityModifier: Infinity} - case "-infinity": - *dst = Timestamptz{Status: Present, InfinityModifier: -Infinity} - default: - // PostgreSQL uses ISO 8601 for to_json function and casting from a string to timestamptz - tim, err := time.Parse(time.RFC3339Nano, *s) - if err != nil { - return err - } - - *dst = Timestamptz{Time: normalizePotentialUTC(tim), Status: Present} - } - - return nil -} - -// Normalize timestamps in UTC location to behave similarly to how the Golang -// standard library does it: UTC timestamps lack a .loc value. -// -// Reason for this: when comparing two timestamps with reflect.DeepEqual (generally -// speaking not a good idea, but several testing libraries (for example testify) -// does this), their location data needs to be equal for them to be considered -// equal. -func normalizePotentialUTC(timestamp time.Time) time.Time { - if timestamp.Location().String() != time.UTC.String() { - return timestamp - } - - return timestamp.UTC() -} diff --git a/vendor/github.com/jackc/pgtype/timestamptz_array.go b/vendor/github.com/jackc/pgtype/timestamptz_array.go deleted file mode 100644 index a3b4b26..0000000 --- a/vendor/github.com/jackc/pgtype/timestamptz_array.go +++ /dev/null @@ -1,518 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "reflect" - "time" - - "github.com/jackc/pgio" -) - -type TimestamptzArray struct { - Elements []Timestamptz - Dimensions []ArrayDimension - Status Status -} - -func (dst *TimestamptzArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = TimestamptzArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []time.Time: - if value == nil { - *dst = TimestamptzArray{Status: Null} - } else if len(value) == 0 { - *dst = TimestamptzArray{Status: Present} - } else { - elements := make([]Timestamptz, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = TimestamptzArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*time.Time: - if value == nil { - *dst = TimestamptzArray{Status: Null} - } else if len(value) == 0 { - *dst = TimestamptzArray{Status: Present} - } else { - elements := make([]Timestamptz, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = TimestamptzArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []Timestamptz: - if value == nil { - *dst = TimestamptzArray{Status: Null} - } else if len(value) == 0 { - *dst = TimestamptzArray{Status: Present} - } else { - *dst = TimestamptzArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = TimestamptzArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for TimestamptzArray", src) - } - if elementsLength == 0 { - *dst = TimestamptzArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to TimestamptzArray", src) - } - - *dst = TimestamptzArray{ - Elements: make([]Timestamptz, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]Timestamptz, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to TimestamptzArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *TimestamptzArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to TimestamptzArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in TimestamptzArray", err) - } - index++ - - return index, nil -} - -func (dst TimestamptzArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *TimestamptzArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]time.Time: - *v = make([]time.Time, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*time.Time: - *v = make([]*time.Time, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *TimestamptzArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from TimestamptzArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from TimestamptzArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *TimestamptzArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = TimestamptzArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []Timestamptz - - if len(uta.Elements) > 0 { - elements = make([]Timestamptz, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem Timestamptz - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = TimestamptzArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *TimestamptzArray) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = TimestamptzArray{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = TimestamptzArray{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]Timestamptz, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = TimestamptzArray{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src TimestamptzArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src TimestamptzArray) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("timestamptz"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "timestamptz") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *TimestamptzArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src TimestamptzArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/tsrange.go b/vendor/github.com/jackc/pgtype/tsrange.go deleted file mode 100644 index 19ecf44..0000000 --- a/vendor/github.com/jackc/pgtype/tsrange.go +++ /dev/null @@ -1,267 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "fmt" - - "github.com/jackc/pgio" -) - -type Tsrange struct { - Lower Timestamp - Upper Timestamp - LowerType BoundType - UpperType BoundType - Status Status -} - -func (dst *Tsrange) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = Tsrange{Status: Null} - return nil - } - - switch value := src.(type) { - case Tsrange: - *dst = value - case *Tsrange: - *dst = *value - case string: - return dst.DecodeText(nil, []byte(value)) - default: - return fmt.Errorf("cannot convert %v to Tsrange", src) - } - - return nil -} - -func (dst Tsrange) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Tsrange) AssignTo(dst interface{}) error { - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *Tsrange) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Tsrange{Status: Null} - return nil - } - - utr, err := ParseUntypedTextRange(string(src)) - if err != nil { - return err - } - - *dst = Tsrange{Status: Present} - - dst.LowerType = utr.LowerType - dst.UpperType = utr.UpperType - - if dst.LowerType == Empty { - return nil - } - - if dst.LowerType == Inclusive || dst.LowerType == Exclusive { - if err := dst.Lower.DecodeText(ci, []byte(utr.Lower)); err != nil { - return err - } - } - - if dst.UpperType == Inclusive || dst.UpperType == Exclusive { - if err := dst.Upper.DecodeText(ci, []byte(utr.Upper)); err != nil { - return err - } - } - - return nil -} - -func (dst *Tsrange) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Tsrange{Status: Null} - return nil - } - - ubr, err := ParseUntypedBinaryRange(src) - if err != nil { - return err - } - - *dst = Tsrange{Status: Present} - - dst.LowerType = ubr.LowerType - dst.UpperType = ubr.UpperType - - if dst.LowerType == Empty { - return nil - } - - if dst.LowerType == Inclusive || dst.LowerType == Exclusive { - if err := dst.Lower.DecodeBinary(ci, ubr.Lower); err != nil { - return err - } - } - - if dst.UpperType == Inclusive || dst.UpperType == Exclusive { - if err := dst.Upper.DecodeBinary(ci, ubr.Upper); err != nil { - return err - } - } - - return nil -} - -func (src Tsrange) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - switch src.LowerType { - case Exclusive, Unbounded: - buf = append(buf, '(') - case Inclusive: - buf = append(buf, '[') - case Empty: - return append(buf, "empty"...), nil - default: - return nil, fmt.Errorf("unknown lower bound type %v", src.LowerType) - } - - var err error - - if src.LowerType != Unbounded { - buf, err = src.Lower.EncodeText(ci, buf) - if err != nil { - return nil, err - } else if buf == nil { - return nil, fmt.Errorf("Lower cannot be null unless LowerType is Unbounded") - } - } - - buf = append(buf, ',') - - if src.UpperType != Unbounded { - buf, err = src.Upper.EncodeText(ci, buf) - if err != nil { - return nil, err - } else if buf == nil { - return nil, fmt.Errorf("Upper cannot be null unless UpperType is Unbounded") - } - } - - switch src.UpperType { - case Exclusive, Unbounded: - buf = append(buf, ')') - case Inclusive: - buf = append(buf, ']') - default: - return nil, fmt.Errorf("unknown upper bound type %v", src.UpperType) - } - - return buf, nil -} - -func (src Tsrange) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - var rangeType byte - switch src.LowerType { - case Inclusive: - rangeType |= lowerInclusiveMask - case Unbounded: - rangeType |= lowerUnboundedMask - case Exclusive: - case Empty: - return append(buf, emptyMask), nil - default: - return nil, fmt.Errorf("unknown LowerType: %v", src.LowerType) - } - - switch src.UpperType { - case Inclusive: - rangeType |= upperInclusiveMask - case Unbounded: - rangeType |= upperUnboundedMask - case Exclusive: - default: - return nil, fmt.Errorf("unknown UpperType: %v", src.UpperType) - } - - buf = append(buf, rangeType) - - var err error - - if src.LowerType != Unbounded { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - buf, err = src.Lower.EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if buf == nil { - return nil, fmt.Errorf("Lower cannot be null unless LowerType is Unbounded") - } - - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - - if src.UpperType != Unbounded { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - buf, err = src.Upper.EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if buf == nil { - return nil, fmt.Errorf("Upper cannot be null unless UpperType is Unbounded") - } - - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Tsrange) Scan(src interface{}) error { - if src == nil { - *dst = Tsrange{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Tsrange) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/tsrange_array.go b/vendor/github.com/jackc/pgtype/tsrange_array.go deleted file mode 100644 index c64048e..0000000 --- a/vendor/github.com/jackc/pgtype/tsrange_array.go +++ /dev/null @@ -1,470 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "reflect" - - "github.com/jackc/pgio" -) - -type TsrangeArray struct { - Elements []Tsrange - Dimensions []ArrayDimension - Status Status -} - -func (dst *TsrangeArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = TsrangeArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []Tsrange: - if value == nil { - *dst = TsrangeArray{Status: Null} - } else if len(value) == 0 { - *dst = TsrangeArray{Status: Present} - } else { - *dst = TsrangeArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = TsrangeArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for TsrangeArray", src) - } - if elementsLength == 0 { - *dst = TsrangeArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to TsrangeArray", src) - } - - *dst = TsrangeArray{ - Elements: make([]Tsrange, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]Tsrange, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to TsrangeArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *TsrangeArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to TsrangeArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in TsrangeArray", err) - } - index++ - - return index, nil -} - -func (dst TsrangeArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *TsrangeArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]Tsrange: - *v = make([]Tsrange, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *TsrangeArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from TsrangeArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from TsrangeArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *TsrangeArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = TsrangeArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []Tsrange - - if len(uta.Elements) > 0 { - elements = make([]Tsrange, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem Tsrange - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = TsrangeArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *TsrangeArray) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = TsrangeArray{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = TsrangeArray{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]Tsrange, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = TsrangeArray{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src TsrangeArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src TsrangeArray) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("tsrange"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "tsrange") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *TsrangeArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src TsrangeArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/tstzrange.go b/vendor/github.com/jackc/pgtype/tstzrange.go deleted file mode 100644 index 2557630..0000000 --- a/vendor/github.com/jackc/pgtype/tstzrange.go +++ /dev/null @@ -1,267 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "fmt" - - "github.com/jackc/pgio" -) - -type Tstzrange struct { - Lower Timestamptz - Upper Timestamptz - LowerType BoundType - UpperType BoundType - Status Status -} - -func (dst *Tstzrange) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = Tstzrange{Status: Null} - return nil - } - - switch value := src.(type) { - case Tstzrange: - *dst = value - case *Tstzrange: - *dst = *value - case string: - return dst.DecodeText(nil, []byte(value)) - default: - return fmt.Errorf("cannot convert %v to Tstzrange", src) - } - - return nil -} - -func (dst Tstzrange) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Tstzrange) AssignTo(dst interface{}) error { - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *Tstzrange) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Tstzrange{Status: Null} - return nil - } - - utr, err := ParseUntypedTextRange(string(src)) - if err != nil { - return err - } - - *dst = Tstzrange{Status: Present} - - dst.LowerType = utr.LowerType - dst.UpperType = utr.UpperType - - if dst.LowerType == Empty { - return nil - } - - if dst.LowerType == Inclusive || dst.LowerType == Exclusive { - if err := dst.Lower.DecodeText(ci, []byte(utr.Lower)); err != nil { - return err - } - } - - if dst.UpperType == Inclusive || dst.UpperType == Exclusive { - if err := dst.Upper.DecodeText(ci, []byte(utr.Upper)); err != nil { - return err - } - } - - return nil -} - -func (dst *Tstzrange) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Tstzrange{Status: Null} - return nil - } - - ubr, err := ParseUntypedBinaryRange(src) - if err != nil { - return err - } - - *dst = Tstzrange{Status: Present} - - dst.LowerType = ubr.LowerType - dst.UpperType = ubr.UpperType - - if dst.LowerType == Empty { - return nil - } - - if dst.LowerType == Inclusive || dst.LowerType == Exclusive { - if err := dst.Lower.DecodeBinary(ci, ubr.Lower); err != nil { - return err - } - } - - if dst.UpperType == Inclusive || dst.UpperType == Exclusive { - if err := dst.Upper.DecodeBinary(ci, ubr.Upper); err != nil { - return err - } - } - - return nil -} - -func (src Tstzrange) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - switch src.LowerType { - case Exclusive, Unbounded: - buf = append(buf, '(') - case Inclusive: - buf = append(buf, '[') - case Empty: - return append(buf, "empty"...), nil - default: - return nil, fmt.Errorf("unknown lower bound type %v", src.LowerType) - } - - var err error - - if src.LowerType != Unbounded { - buf, err = src.Lower.EncodeText(ci, buf) - if err != nil { - return nil, err - } else if buf == nil { - return nil, fmt.Errorf("Lower cannot be null unless LowerType is Unbounded") - } - } - - buf = append(buf, ',') - - if src.UpperType != Unbounded { - buf, err = src.Upper.EncodeText(ci, buf) - if err != nil { - return nil, err - } else if buf == nil { - return nil, fmt.Errorf("Upper cannot be null unless UpperType is Unbounded") - } - } - - switch src.UpperType { - case Exclusive, Unbounded: - buf = append(buf, ')') - case Inclusive: - buf = append(buf, ']') - default: - return nil, fmt.Errorf("unknown upper bound type %v", src.UpperType) - } - - return buf, nil -} - -func (src Tstzrange) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - var rangeType byte - switch src.LowerType { - case Inclusive: - rangeType |= lowerInclusiveMask - case Unbounded: - rangeType |= lowerUnboundedMask - case Exclusive: - case Empty: - return append(buf, emptyMask), nil - default: - return nil, fmt.Errorf("unknown LowerType: %v", src.LowerType) - } - - switch src.UpperType { - case Inclusive: - rangeType |= upperInclusiveMask - case Unbounded: - rangeType |= upperUnboundedMask - case Exclusive: - default: - return nil, fmt.Errorf("unknown UpperType: %v", src.UpperType) - } - - buf = append(buf, rangeType) - - var err error - - if src.LowerType != Unbounded { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - buf, err = src.Lower.EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if buf == nil { - return nil, fmt.Errorf("Lower cannot be null unless LowerType is Unbounded") - } - - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - - if src.UpperType != Unbounded { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - buf, err = src.Upper.EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if buf == nil { - return nil, fmt.Errorf("Upper cannot be null unless UpperType is Unbounded") - } - - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Tstzrange) Scan(src interface{}) error { - if src == nil { - *dst = Tstzrange{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Tstzrange) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/tstzrange_array.go b/vendor/github.com/jackc/pgtype/tstzrange_array.go deleted file mode 100644 index a216820..0000000 --- a/vendor/github.com/jackc/pgtype/tstzrange_array.go +++ /dev/null @@ -1,470 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "reflect" - - "github.com/jackc/pgio" -) - -type TstzrangeArray struct { - Elements []Tstzrange - Dimensions []ArrayDimension - Status Status -} - -func (dst *TstzrangeArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = TstzrangeArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []Tstzrange: - if value == nil { - *dst = TstzrangeArray{Status: Null} - } else if len(value) == 0 { - *dst = TstzrangeArray{Status: Present} - } else { - *dst = TstzrangeArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = TstzrangeArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for TstzrangeArray", src) - } - if elementsLength == 0 { - *dst = TstzrangeArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to TstzrangeArray", src) - } - - *dst = TstzrangeArray{ - Elements: make([]Tstzrange, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]Tstzrange, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to TstzrangeArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *TstzrangeArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to TstzrangeArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in TstzrangeArray", err) - } - index++ - - return index, nil -} - -func (dst TstzrangeArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *TstzrangeArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]Tstzrange: - *v = make([]Tstzrange, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *TstzrangeArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from TstzrangeArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from TstzrangeArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *TstzrangeArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = TstzrangeArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []Tstzrange - - if len(uta.Elements) > 0 { - elements = make([]Tstzrange, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem Tstzrange - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = TstzrangeArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *TstzrangeArray) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = TstzrangeArray{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = TstzrangeArray{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]Tstzrange, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = TstzrangeArray{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src TstzrangeArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src TstzrangeArray) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("tstzrange"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "tstzrange") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *TstzrangeArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src TstzrangeArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/typed_array.go.erb b/vendor/github.com/jackc/pgtype/typed_array.go.erb deleted file mode 100644 index e8433c0..0000000 --- a/vendor/github.com/jackc/pgtype/typed_array.go.erb +++ /dev/null @@ -1,512 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -<% - # defaults when not explicitly set on command line - - binary_format ||= "true" - text_format ||= "true" - - text_null ||= "NULL" - - encode_binary ||= binary_format - decode_binary ||= binary_format -%> - -package pgtype - -import ( - "bytes" - "fmt" - "io" - - "github.com/jackc/pgio" -) - -type <%= pgtype_array_type %> struct { - Elements []<%= pgtype_element_type %> - Dimensions []ArrayDimension - Status Status -} - -func (dst *<%= pgtype_array_type %>) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = <%= pgtype_array_type %>{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - <% go_array_types.split(",").each do |t| %> - <% if t != "[]#{pgtype_element_type}" %> - case <%= t %>: - if value == nil { - *dst = <%= pgtype_array_type %>{Status: Null} - } else if len(value) == 0 { - *dst = <%= pgtype_array_type %>{Status: Present} - } else { - elements := make([]<%= pgtype_element_type %>, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = <%= pgtype_array_type %>{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - <% end %> - <% end %> - case []<%= pgtype_element_type %>: - if value == nil { - *dst = <%= pgtype_array_type %>{Status: Null} - } else if len(value) == 0 { - *dst = <%= pgtype_array_type %>{Status: Present} - } else { - *dst = <%= pgtype_array_type %>{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status : Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = <%= pgtype_array_type %>{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for <%= pgtype_array_type %>", src) - } - if elementsLength == 0 { - *dst = <%= pgtype_array_type %>{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to <%= pgtype_array_type %>", src) - } - - *dst = <%= pgtype_array_type %> { - Elements: make([]<%= pgtype_element_type %>, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]<%= pgtype_element_type %>, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to <%= pgtype_array_type %>, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *<%= pgtype_array_type %>) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to <%= pgtype_array_type %>") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in <%= pgtype_array_type %>", err) - } - index++ - - return index, nil -} - -func (dst <%= pgtype_array_type %>) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *<%= pgtype_array_type %>) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1{ - // Attempt to match to select common types: - switch v := dst.(type) { - <% go_array_types.split(",").each do |t| %> - case *<%= t %>: - *v = make(<%= t %>, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - <% end %> - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *<%= pgtype_array_type %>) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr(){ - return 0, fmt.Errorf("cannot assign all values from <%= pgtype_array_type %>") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from <%= pgtype_array_type %>") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -<% if text_format == "true" %> -func (dst *<%= pgtype_array_type %>) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = <%= pgtype_array_type %>{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []<%= pgtype_element_type %> - - if len(uta.Elements) > 0 { - elements = make([]<%= pgtype_element_type %>, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem <%= pgtype_element_type %> - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = <%= pgtype_array_type %>{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} -<% end %> - -<% if decode_binary == "true" %> -func (dst *<%= pgtype_array_type %>) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = <%= pgtype_array_type %>{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = <%= pgtype_array_type %>{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]<%= pgtype_element_type %>, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp:rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = <%= pgtype_array_type %>{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} -<% end %> - -<% if text_format == "true" %> -func (src <%= pgtype_array_type %>) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `<%= text_null %>`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} -<% end %> - -<% if encode_binary == "true" %> - func (src <%= pgtype_array_type %>) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("<%= element_type_name %>"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "<%= element_type_name %>") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil - } -<% end %> - -<% if text_format == "true" %> -// Scan implements the database/sql Scanner interface. -func (dst *<%= pgtype_array_type %>) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src <%= pgtype_array_type %>) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} -<% end %> diff --git a/vendor/github.com/jackc/pgtype/typed_array_gen.sh b/vendor/github.com/jackc/pgtype/typed_array_gen.sh deleted file mode 100644 index 9ec768b..0000000 --- a/vendor/github.com/jackc/pgtype/typed_array_gen.sh +++ /dev/null @@ -1,31 +0,0 @@ -erb pgtype_array_type=Int2Array pgtype_element_type=Int2 go_array_types=[]int16,[]*int16,[]uint16,[]*uint16,[]int32,[]*int32,[]uint32,[]*uint32,[]int64,[]*int64,[]uint64,[]*uint64,[]int,[]*int,[]uint,[]*uint element_type_name=int2 typed_array.go.erb > int2_array.go -erb pgtype_array_type=Int4Array pgtype_element_type=Int4 go_array_types=[]int16,[]*int16,[]uint16,[]*uint16,[]int32,[]*int32,[]uint32,[]*uint32,[]int64,[]*int64,[]uint64,[]*uint64,[]int,[]*int,[]uint,[]*uint element_type_name=int4 typed_array.go.erb > int4_array.go -erb pgtype_array_type=Int8Array pgtype_element_type=Int8 go_array_types=[]int16,[]*int16,[]uint16,[]*uint16,[]int32,[]*int32,[]uint32,[]*uint32,[]int64,[]*int64,[]uint64,[]*uint64,[]int,[]*int,[]uint,[]*uint element_type_name=int8 typed_array.go.erb > int8_array.go -erb pgtype_array_type=BoolArray pgtype_element_type=Bool go_array_types=[]bool,[]*bool element_type_name=bool typed_array.go.erb > bool_array.go -erb pgtype_array_type=DateArray pgtype_element_type=Date go_array_types=[]time.Time,[]*time.Time element_type_name=date typed_array.go.erb > date_array.go -erb pgtype_array_type=TimestamptzArray pgtype_element_type=Timestamptz go_array_types=[]time.Time,[]*time.Time element_type_name=timestamptz typed_array.go.erb > timestamptz_array.go -erb pgtype_array_type=TstzrangeArray pgtype_element_type=Tstzrange go_array_types=[]Tstzrange element_type_name=tstzrange typed_array.go.erb > tstzrange_array.go -erb pgtype_array_type=TsrangeArray pgtype_element_type=Tsrange go_array_types=[]Tsrange element_type_name=tsrange typed_array.go.erb > tsrange_array.go -erb pgtype_array_type=TimestampArray pgtype_element_type=Timestamp go_array_types=[]time.Time,[]*time.Time element_type_name=timestamp typed_array.go.erb > timestamp_array.go -erb pgtype_array_type=Float4Array pgtype_element_type=Float4 go_array_types=[]float32,[]*float32 element_type_name=float4 typed_array.go.erb > float4_array.go -erb pgtype_array_type=Float8Array pgtype_element_type=Float8 go_array_types=[]float64,[]*float64 element_type_name=float8 typed_array.go.erb > float8_array.go -erb pgtype_array_type=InetArray pgtype_element_type=Inet go_array_types=[]*net.IPNet,[]net.IP,[]*net.IP element_type_name=inet typed_array.go.erb > inet_array.go -erb pgtype_array_type=MacaddrArray pgtype_element_type=Macaddr go_array_types=[]net.HardwareAddr,[]*net.HardwareAddr element_type_name=macaddr typed_array.go.erb > macaddr_array.go -erb pgtype_array_type=CIDRArray pgtype_element_type=CIDR go_array_types=[]*net.IPNet,[]net.IP,[]*net.IP element_type_name=cidr typed_array.go.erb > cidr_array.go -erb pgtype_array_type=TextArray pgtype_element_type=Text go_array_types=[]string,[]*string element_type_name=text typed_array.go.erb > text_array.go -erb pgtype_array_type=VarcharArray pgtype_element_type=Varchar go_array_types=[]string,[]*string element_type_name=varchar typed_array.go.erb > varchar_array.go -erb pgtype_array_type=BPCharArray pgtype_element_type=BPChar go_array_types=[]string,[]*string element_type_name=bpchar typed_array.go.erb > bpchar_array.go -erb pgtype_array_type=ByteaArray pgtype_element_type=Bytea go_array_types=[][]byte element_type_name=bytea typed_array.go.erb > bytea_array.go -erb pgtype_array_type=ACLItemArray pgtype_element_type=ACLItem go_array_types=[]string,[]*string element_type_name=aclitem binary_format=false typed_array.go.erb > aclitem_array.go -erb pgtype_array_type=HstoreArray pgtype_element_type=Hstore go_array_types=[]map[string]string element_type_name=hstore typed_array.go.erb > hstore_array.go -erb pgtype_array_type=NumericArray pgtype_element_type=Numeric go_array_types=[]float32,[]*float32,[]float64,[]*float64,[]int64,[]*int64,[]uint64,[]*uint64 element_type_name=numeric typed_array.go.erb > numeric_array.go -erb pgtype_array_type=UUIDArray pgtype_element_type=UUID go_array_types=[][16]byte,[][]byte,[]string,[]*string element_type_name=uuid typed_array.go.erb > uuid_array.go -erb pgtype_array_type=JSONArray pgtype_element_type=JSON go_array_types=[]string,[][]byte,[]json.RawMessage element_type_name=json typed_array.go.erb > json_array.go -erb pgtype_array_type=JSONBArray pgtype_element_type=JSONB go_array_types=[]string,[][]byte,[]json.RawMessage element_type_name=jsonb typed_array.go.erb > jsonb_array.go - -# While the binary format is theoretically possible it is only practical to use the text format. -erb pgtype_array_type=EnumArray pgtype_element_type=GenericText go_array_types=[]string,[]*string binary_format=false typed_array.go.erb > enum_array.go - -erb pgtype_array_type=RecordArray pgtype_element_type=Record go_array_types=[][]Value element_type_name=record text_null=NULL encode_binary=false text_format=false typed_array.go.erb > record_array.go - -goimports -w *_array.go diff --git a/vendor/github.com/jackc/pgtype/typed_multirange.go.erb b/vendor/github.com/jackc/pgtype/typed_multirange.go.erb deleted file mode 100644 index 84c8299..0000000 --- a/vendor/github.com/jackc/pgtype/typed_multirange.go.erb +++ /dev/null @@ -1,239 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - - "github.com/jackc/pgio" -) - -type <%= multirange_type %> struct { - Ranges []<%= range_type %> - Status Status -} - -func (dst *<%= multirange_type %>) Set(src interface{}) error { - //untyped nil and typed nil interfaces are different - if src == nil { - *dst = <%= multirange_type %>{Status: Null} - return nil - } - - switch value := src.(type) { - case <%= multirange_type %>: - *dst = value - case *<%= multirange_type %>: - *dst = *value - case string: - return dst.DecodeText(nil, []byte(value)) - case []<%= range_type %>: - if value == nil { - *dst = <%= multirange_type %>{Status: Null} - } else if len(value) == 0 { - *dst = <%= multirange_type %>{Status: Present} - } else { - elements := make([]<%= range_type %>, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = <%= multirange_type %>{ - Ranges: elements, - Status: Present, - } - } - case []*<%= range_type %>: - if value == nil { - *dst = <%= multirange_type %>{Status: Null} - } else if len(value) == 0 { - *dst = <%= multirange_type %>{Status: Present} - } else { - elements := make([]<%= range_type %>, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = <%= multirange_type %>{ - Ranges: elements, - Status: Present, - } - } - default: - return fmt.Errorf("cannot convert %v to <%= multirange_type %>", src) - } - - return nil - -} - -func (dst <%= multirange_type %>) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *<%= multirange_type %>) AssignTo(dst interface{}) error { - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *<%= multirange_type %>) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = <%= multirange_type %>{Status: Null} - return nil - } - - utmr, err := ParseUntypedTextMultirange(string(src)) - if err != nil { - return err - } - - var elements []<%= range_type %> - - if len(utmr.Elements) > 0 { - elements = make([]<%= range_type %>, len(utmr.Elements)) - - for i, s := range utmr.Elements { - var elem <%= range_type %> - - elemSrc := []byte(s) - - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = <%= multirange_type %>{Ranges: elements, Status: Present} - - return nil -} - -func (dst *<%= multirange_type %>) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = <%= multirange_type %>{Status: Null} - return nil - } - - rp := 0 - - numElems := int(binary.BigEndian.Uint32(src[rp:])) - rp += 4 - - if numElems == 0 { - *dst = <%= multirange_type %>{Status: Present} - return nil - } - - elements := make([]<%= range_type %>, numElems) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err := elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = <%= multirange_type %>{Ranges: elements, Status: Present} - return nil -} - -func (src <%= multirange_type %>) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = append(buf, '{') - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Ranges { - if i > 0 { - buf = append(buf, ',') - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - return nil, fmt.Errorf("multi-range does not allow null range") - } else { - buf = append(buf, string(elemBuf)...) - } - - } - - buf = append(buf, '}') - - return buf, nil -} - -func (src <%= multirange_type %>) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = pgio.AppendInt32(buf, int32(len(src.Ranges))) - - for i := range src.Ranges { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Ranges[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *<%= multirange_type %>) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src <%= multirange_type %>) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/typed_multirange_gen.sh b/vendor/github.com/jackc/pgtype/typed_multirange_gen.sh deleted file mode 100644 index 610f40a..0000000 --- a/vendor/github.com/jackc/pgtype/typed_multirange_gen.sh +++ /dev/null @@ -1,8 +0,0 @@ -erb range_type=Numrange multirange_type=Nummultirange typed_multirange.go.erb > num_multirange.go -erb range_type=Int4range multirange_type=Int4multirange typed_multirange.go.erb > int4_multirange.go -erb range_type=Int8range multirange_type=Int8multirange typed_multirange.go.erb > int8_multirange.go -# TODO -# erb range_type=Tsrange multirange_type=Tsmultirange typed_multirange.go.erb > ts_multirange.go -# erb range_type=Tstzrange multirange_type=Tstzmultirange typed_multirange.go.erb > tstz_multirange.go -# erb range_type=Daterange multirange_type=Datemultirange typed_multirange.go.erb > date_multirange.go -goimports -w *multirange.go \ No newline at end of file diff --git a/vendor/github.com/jackc/pgtype/typed_range.go.erb b/vendor/github.com/jackc/pgtype/typed_range.go.erb deleted file mode 100644 index 5625587..0000000 --- a/vendor/github.com/jackc/pgtype/typed_range.go.erb +++ /dev/null @@ -1,269 +0,0 @@ -package pgtype - -import ( - "bytes" - "database/sql/driver" - "fmt" - "io" - - "github.com/jackc/pgio" -) - -type <%= range_type %> struct { - Lower <%= element_type %> - Upper <%= element_type %> - LowerType BoundType - UpperType BoundType - Status Status -} - -func (dst *<%= range_type %>) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = <%= range_type %>{Status: Null} - return nil - } - - switch value := src.(type) { - case <%= range_type %>: - *dst = value - case *<%= range_type %>: - *dst = *value - case string: - return dst.DecodeText(nil, []byte(value)) - default: - return fmt.Errorf("cannot convert %v to <%= range_type %>", src) - } - - return nil -} - -func (dst <%= range_type %>) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *<%= range_type %>) AssignTo(dst interface{}) error { - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *<%= range_type %>) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = <%= range_type %>{Status: Null} - return nil - } - - utr, err := ParseUntypedTextRange(string(src)) - if err != nil { - return err - } - - *dst = <%= range_type %>{Status: Present} - - dst.LowerType = utr.LowerType - dst.UpperType = utr.UpperType - - if dst.LowerType == Empty { - return nil - } - - if dst.LowerType == Inclusive || dst.LowerType == Exclusive { - if err := dst.Lower.DecodeText(ci, []byte(utr.Lower)); err != nil { - return err - } - } - - if dst.UpperType == Inclusive || dst.UpperType == Exclusive { - if err := dst.Upper.DecodeText(ci, []byte(utr.Upper)); err != nil { - return err - } - } - - return nil -} - -func (dst *<%= range_type %>) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = <%= range_type %>{Status: Null} - return nil - } - - ubr, err := ParseUntypedBinaryRange(src) - if err != nil { - return err - } - - *dst = <%= range_type %>{Status: Present} - - dst.LowerType = ubr.LowerType - dst.UpperType = ubr.UpperType - - if dst.LowerType == Empty { - return nil - } - - if dst.LowerType == Inclusive || dst.LowerType == Exclusive { - if err := dst.Lower.DecodeBinary(ci, ubr.Lower); err != nil { - return err - } - } - - if dst.UpperType == Inclusive || dst.UpperType == Exclusive { - if err := dst.Upper.DecodeBinary(ci, ubr.Upper); err != nil { - return err - } - } - - return nil -} - -func (src <%= range_type %>) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - switch src.LowerType { - case Exclusive, Unbounded: - buf = append(buf, '(') - case Inclusive: - buf = append(buf, '[') - case Empty: - return append(buf, "empty"...), nil - default: - return nil, fmt.Errorf("unknown lower bound type %v", src.LowerType) - } - - var err error - - if src.LowerType != Unbounded { - buf, err = src.Lower.EncodeText(ci, buf) - if err != nil { - return nil, err - } else if buf == nil { - return nil, fmt.Errorf("Lower cannot be null unless LowerType is Unbounded") - } - } - - buf = append(buf, ',') - - if src.UpperType != Unbounded { - buf, err = src.Upper.EncodeText(ci, buf) - if err != nil { - return nil, err - } else if buf == nil { - return nil, fmt.Errorf("Upper cannot be null unless UpperType is Unbounded") - } - } - - switch src.UpperType { - case Exclusive, Unbounded: - buf = append(buf, ')') - case Inclusive: - buf = append(buf, ']') - default: - return nil, fmt.Errorf("unknown upper bound type %v", src.UpperType) - } - - return buf, nil -} - -func (src <%= range_type %>) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - var rangeType byte - switch src.LowerType { - case Inclusive: - rangeType |= lowerInclusiveMask - case Unbounded: - rangeType |= lowerUnboundedMask - case Exclusive: - case Empty: - return append(buf, emptyMask), nil - default: - return nil, fmt.Errorf("unknown LowerType: %v", src.LowerType) - } - - switch src.UpperType { - case Inclusive: - rangeType |= upperInclusiveMask - case Unbounded: - rangeType |= upperUnboundedMask - case Exclusive: - default: - return nil, fmt.Errorf("unknown UpperType: %v", src.UpperType) - } - - buf = append(buf, rangeType) - - var err error - - if src.LowerType != Unbounded { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - buf, err = src.Lower.EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if buf == nil { - return nil, fmt.Errorf("Lower cannot be null unless LowerType is Unbounded") - } - - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - - if src.UpperType != Unbounded { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - buf, err = src.Upper.EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if buf == nil { - return nil, fmt.Errorf("Upper cannot be null unless UpperType is Unbounded") - } - - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *<%= range_type %>) Scan(src interface{}) error { - if src == nil { - *dst = <%= range_type %>{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src <%= range_type %>) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/typed_range_gen.sh b/vendor/github.com/jackc/pgtype/typed_range_gen.sh deleted file mode 100644 index bedda29..0000000 --- a/vendor/github.com/jackc/pgtype/typed_range_gen.sh +++ /dev/null @@ -1,7 +0,0 @@ -erb range_type=Int4range element_type=Int4 typed_range.go.erb > int4range.go -erb range_type=Int8range element_type=Int8 typed_range.go.erb > int8range.go -erb range_type=Tsrange element_type=Timestamp typed_range.go.erb > tsrange.go -erb range_type=Tstzrange element_type=Timestamptz typed_range.go.erb > tstzrange.go -erb range_type=Daterange element_type=Date typed_range.go.erb > daterange.go -erb range_type=Numrange element_type=Numeric typed_range.go.erb > numrange.go -goimports -w *range.go diff --git a/vendor/github.com/jackc/pgtype/unknown.go b/vendor/github.com/jackc/pgtype/unknown.go deleted file mode 100644 index c591b70..0000000 --- a/vendor/github.com/jackc/pgtype/unknown.go +++ /dev/null @@ -1,44 +0,0 @@ -package pgtype - -import "database/sql/driver" - -// Unknown represents the PostgreSQL unknown type. It is either a string literal -// or NULL. It is used when PostgreSQL does not know the type of a value. In -// general, this will only be used in pgx when selecting a null value without -// type information. e.g. SELECT NULL; -type Unknown struct { - String string - Status Status -} - -func (dst *Unknown) Set(src interface{}) error { - return (*Text)(dst).Set(src) -} - -func (dst Unknown) Get() interface{} { - return (Text)(dst).Get() -} - -// AssignTo assigns from src to dst. Note that as Unknown is not a general number -// type AssignTo does not do automatic type conversion as other number types do. -func (src *Unknown) AssignTo(dst interface{}) error { - return (*Text)(src).AssignTo(dst) -} - -func (dst *Unknown) DecodeText(ci *ConnInfo, src []byte) error { - return (*Text)(dst).DecodeText(ci, src) -} - -func (dst *Unknown) DecodeBinary(ci *ConnInfo, src []byte) error { - return (*Text)(dst).DecodeBinary(ci, src) -} - -// Scan implements the database/sql Scanner interface. -func (dst *Unknown) Scan(src interface{}) error { - return (*Text)(dst).Scan(src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Unknown) Value() (driver.Value, error) { - return (Text)(src).Value() -} diff --git a/vendor/github.com/jackc/pgtype/uuid.go b/vendor/github.com/jackc/pgtype/uuid.go deleted file mode 100644 index 6839c05..0000000 --- a/vendor/github.com/jackc/pgtype/uuid.go +++ /dev/null @@ -1,231 +0,0 @@ -package pgtype - -import ( - "bytes" - "database/sql/driver" - "encoding/hex" - "fmt" -) - -type UUID struct { - Bytes [16]byte - Status Status -} - -func (dst *UUID) Set(src interface{}) error { - if src == nil { - *dst = UUID{Status: Null} - return nil - } - - switch value := src.(type) { - case interface{ Get() interface{} }: - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - case fmt.Stringer: - value2 := value.String() - return dst.Set(value2) - case [16]byte: - *dst = UUID{Bytes: value, Status: Present} - case []byte: - if value != nil { - if len(value) != 16 { - return fmt.Errorf("[]byte must be 16 bytes to convert to UUID: %d", len(value)) - } - *dst = UUID{Status: Present} - copy(dst.Bytes[:], value) - } else { - *dst = UUID{Status: Null} - } - case string: - uuid, err := parseUUID(value) - if err != nil { - return err - } - *dst = UUID{Bytes: uuid, Status: Present} - case *string: - if value == nil { - *dst = UUID{Status: Null} - } else { - return dst.Set(*value) - } - default: - if originalSrc, ok := underlyingUUIDType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to UUID", value) - } - - return nil -} - -func (dst UUID) Get() interface{} { - switch dst.Status { - case Present: - return dst.Bytes - case Null: - return nil - default: - return dst.Status - } -} - -func (src *UUID) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - switch v := dst.(type) { - case *[16]byte: - *v = src.Bytes - return nil - case *[]byte: - *v = make([]byte, 16) - copy(*v, src.Bytes[:]) - return nil - case *string: - *v = encodeUUID(src.Bytes) - return nil - default: - if nextDst, retry := GetAssignToDstType(v); retry { - return src.AssignTo(nextDst) - } - } - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot assign %v into %T", src, dst) -} - -// parseUUID converts a string UUID in standard form to a byte array. -func parseUUID(src string) (dst [16]byte, err error) { - switch len(src) { - case 36: - src = src[0:8] + src[9:13] + src[14:18] + src[19:23] + src[24:] - case 32: - // dashes already stripped, assume valid - default: - // assume invalid. - return dst, fmt.Errorf("cannot parse UUID %v", src) - } - - buf, err := hex.DecodeString(src) - if err != nil { - return dst, err - } - - copy(dst[:], buf) - return dst, err -} - -// encodeUUID converts a uuid byte array to UUID standard string form. -func encodeUUID(src [16]byte) string { - return fmt.Sprintf("%x-%x-%x-%x-%x", src[0:4], src[4:6], src[6:8], src[8:10], src[10:16]) -} - -func (dst *UUID) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = UUID{Status: Null} - return nil - } - - if len(src) != 36 { - return fmt.Errorf("invalid length for UUID: %v", len(src)) - } - - buf, err := parseUUID(string(src)) - if err != nil { - return err - } - - *dst = UUID{Bytes: buf, Status: Present} - return nil -} - -func (dst *UUID) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = UUID{Status: Null} - return nil - } - - if len(src) != 16 { - return fmt.Errorf("invalid length for UUID: %v", len(src)) - } - - *dst = UUID{Status: Present} - copy(dst.Bytes[:], src) - return nil -} - -func (src UUID) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return append(buf, encodeUUID(src.Bytes)...), nil -} - -func (src UUID) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - return append(buf, src.Bytes[:]...), nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *UUID) Scan(src interface{}) error { - if src == nil { - *dst = UUID{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src UUID) Value() (driver.Value, error) { - return EncodeValueText(src) -} - -func (src UUID) MarshalJSON() ([]byte, error) { - switch src.Status { - case Present: - var buff bytes.Buffer - buff.WriteByte('"') - buff.WriteString(encodeUUID(src.Bytes)) - buff.WriteByte('"') - return buff.Bytes(), nil - case Null: - return []byte("null"), nil - case Undefined: - return nil, errUndefined - } - return nil, errBadStatus -} - -func (dst *UUID) UnmarshalJSON(src []byte) error { - if bytes.Compare(src, []byte("null")) == 0 { - return dst.Set(nil) - } - if len(src) != 38 { - return fmt.Errorf("invalid length for UUID: %v", len(src)) - } - return dst.Set(string(src[1 : len(src)-1])) -} diff --git a/vendor/github.com/jackc/pgtype/uuid_array.go b/vendor/github.com/jackc/pgtype/uuid_array.go deleted file mode 100644 index 00721ef..0000000 --- a/vendor/github.com/jackc/pgtype/uuid_array.go +++ /dev/null @@ -1,573 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "reflect" - - "github.com/jackc/pgio" -) - -type UUIDArray struct { - Elements []UUID - Dimensions []ArrayDimension - Status Status -} - -func (dst *UUIDArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = UUIDArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case [][16]byte: - if value == nil { - *dst = UUIDArray{Status: Null} - } else if len(value) == 0 { - *dst = UUIDArray{Status: Present} - } else { - elements := make([]UUID, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = UUIDArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case [][]byte: - if value == nil { - *dst = UUIDArray{Status: Null} - } else if len(value) == 0 { - *dst = UUIDArray{Status: Present} - } else { - elements := make([]UUID, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = UUIDArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []string: - if value == nil { - *dst = UUIDArray{Status: Null} - } else if len(value) == 0 { - *dst = UUIDArray{Status: Present} - } else { - elements := make([]UUID, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = UUIDArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*string: - if value == nil { - *dst = UUIDArray{Status: Null} - } else if len(value) == 0 { - *dst = UUIDArray{Status: Present} - } else { - elements := make([]UUID, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = UUIDArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []UUID: - if value == nil { - *dst = UUIDArray{Status: Null} - } else if len(value) == 0 { - *dst = UUIDArray{Status: Present} - } else { - *dst = UUIDArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = UUIDArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for UUIDArray", src) - } - if elementsLength == 0 { - *dst = UUIDArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to UUIDArray", src) - } - - *dst = UUIDArray{ - Elements: make([]UUID, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]UUID, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to UUIDArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *UUIDArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to UUIDArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in UUIDArray", err) - } - index++ - - return index, nil -} - -func (dst UUIDArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *UUIDArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[][16]byte: - *v = make([][16]byte, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[][]byte: - *v = make([][]byte, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]string: - *v = make([]string, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*string: - *v = make([]*string, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *UUIDArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from UUIDArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from UUIDArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *UUIDArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = UUIDArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []UUID - - if len(uta.Elements) > 0 { - elements = make([]UUID, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem UUID - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = UUIDArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *UUIDArray) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = UUIDArray{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = UUIDArray{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]UUID, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = UUIDArray{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src UUIDArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src UUIDArray) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("uuid"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "uuid") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *UUIDArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src UUIDArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/varbit.go b/vendor/github.com/jackc/pgtype/varbit.go deleted file mode 100644 index f24dc5b..0000000 --- a/vendor/github.com/jackc/pgtype/varbit.go +++ /dev/null @@ -1,133 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - - "github.com/jackc/pgio" -) - -type Varbit struct { - Bytes []byte - Len int32 // Number of bits - Status Status -} - -func (dst *Varbit) Set(src interface{}) error { - return fmt.Errorf("cannot convert %v to Varbit", src) -} - -func (dst Varbit) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *Varbit) AssignTo(dst interface{}) error { - return fmt.Errorf("cannot assign %v to %T", src, dst) -} - -func (dst *Varbit) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Varbit{Status: Null} - return nil - } - - bitLen := len(src) - byteLen := bitLen / 8 - if bitLen%8 > 0 { - byteLen++ - } - buf := make([]byte, byteLen) - - for i, b := range src { - if b == '1' { - byteIdx := i / 8 - bitIdx := uint(i % 8) - buf[byteIdx] = buf[byteIdx] | (128 >> bitIdx) - } - } - - *dst = Varbit{Bytes: buf, Len: int32(bitLen), Status: Present} - return nil -} - -func (dst *Varbit) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = Varbit{Status: Null} - return nil - } - - if len(src) < 4 { - return fmt.Errorf("invalid length for varbit: %v", len(src)) - } - - bitLen := int32(binary.BigEndian.Uint32(src)) - rp := 4 - - *dst = Varbit{Bytes: src[rp:], Len: bitLen, Status: Present} - return nil -} - -func (src Varbit) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - for i := int32(0); i < src.Len; i++ { - byteIdx := i / 8 - bitMask := byte(128 >> byte(i%8)) - char := byte('0') - if src.Bytes[byteIdx]&bitMask > 0 { - char = '1' - } - buf = append(buf, char) - } - - return buf, nil -} - -func (src Varbit) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - buf = pgio.AppendInt32(buf, src.Len) - return append(buf, src.Bytes...), nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *Varbit) Scan(src interface{}) error { - if src == nil { - *dst = Varbit{Status: Null} - return nil - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Varbit) Value() (driver.Value, error) { - return EncodeValueText(src) -} diff --git a/vendor/github.com/jackc/pgtype/varchar.go b/vendor/github.com/jackc/pgtype/varchar.go deleted file mode 100644 index fea31d1..0000000 --- a/vendor/github.com/jackc/pgtype/varchar.go +++ /dev/null @@ -1,66 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" -) - -type Varchar Text - -// Set converts from src to dst. Note that as Varchar is not a general -// number type Set does not do automatic type conversion as other number -// types do. -func (dst *Varchar) Set(src interface{}) error { - return (*Text)(dst).Set(src) -} - -func (dst Varchar) Get() interface{} { - return (Text)(dst).Get() -} - -// AssignTo assigns from src to dst. Note that as Varchar is not a general number -// type AssignTo does not do automatic type conversion as other number types do. -func (src *Varchar) AssignTo(dst interface{}) error { - return (*Text)(src).AssignTo(dst) -} - -func (Varchar) PreferredResultFormat() int16 { - return TextFormatCode -} - -func (dst *Varchar) DecodeText(ci *ConnInfo, src []byte) error { - return (*Text)(dst).DecodeText(ci, src) -} - -func (dst *Varchar) DecodeBinary(ci *ConnInfo, src []byte) error { - return (*Text)(dst).DecodeBinary(ci, src) -} - -func (Varchar) PreferredParamFormat() int16 { - return TextFormatCode -} - -func (src Varchar) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - return (Text)(src).EncodeText(ci, buf) -} - -func (src Varchar) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - return (Text)(src).EncodeBinary(ci, buf) -} - -// Scan implements the database/sql Scanner interface. -func (dst *Varchar) Scan(src interface{}) error { - return (*Text)(dst).Scan(src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src Varchar) Value() (driver.Value, error) { - return (Text)(src).Value() -} - -func (src Varchar) MarshalJSON() ([]byte, error) { - return (Text)(src).MarshalJSON() -} - -func (dst *Varchar) UnmarshalJSON(b []byte) error { - return (*Text)(dst).UnmarshalJSON(b) -} diff --git a/vendor/github.com/jackc/pgtype/varchar_array.go b/vendor/github.com/jackc/pgtype/varchar_array.go deleted file mode 100644 index 8a309a3..0000000 --- a/vendor/github.com/jackc/pgtype/varchar_array.go +++ /dev/null @@ -1,517 +0,0 @@ -// Code generated by erb. DO NOT EDIT. - -package pgtype - -import ( - "database/sql/driver" - "encoding/binary" - "fmt" - "reflect" - - "github.com/jackc/pgio" -) - -type VarcharArray struct { - Elements []Varchar - Dimensions []ArrayDimension - Status Status -} - -func (dst *VarcharArray) Set(src interface{}) error { - // untyped nil and typed nil interfaces are different - if src == nil { - *dst = VarcharArray{Status: Null} - return nil - } - - if value, ok := src.(interface{ Get() interface{} }); ok { - value2 := value.Get() - if value2 != value { - return dst.Set(value2) - } - } - - // Attempt to match to select common types: - switch value := src.(type) { - - case []string: - if value == nil { - *dst = VarcharArray{Status: Null} - } else if len(value) == 0 { - *dst = VarcharArray{Status: Present} - } else { - elements := make([]Varchar, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = VarcharArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []*string: - if value == nil { - *dst = VarcharArray{Status: Null} - } else if len(value) == 0 { - *dst = VarcharArray{Status: Present} - } else { - elements := make([]Varchar, len(value)) - for i := range value { - if err := elements[i].Set(value[i]); err != nil { - return err - } - } - *dst = VarcharArray{ - Elements: elements, - Dimensions: []ArrayDimension{{Length: int32(len(elements)), LowerBound: 1}}, - Status: Present, - } - } - - case []Varchar: - if value == nil { - *dst = VarcharArray{Status: Null} - } else if len(value) == 0 { - *dst = VarcharArray{Status: Present} - } else { - *dst = VarcharArray{ - Elements: value, - Dimensions: []ArrayDimension{{Length: int32(len(value)), LowerBound: 1}}, - Status: Present, - } - } - default: - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - reflectedValue := reflect.ValueOf(src) - if !reflectedValue.IsValid() || reflectedValue.IsZero() { - *dst = VarcharArray{Status: Null} - return nil - } - - dimensions, elementsLength, ok := findDimensionsFromValue(reflectedValue, nil, 0) - if !ok { - return fmt.Errorf("cannot find dimensions of %v for VarcharArray", src) - } - if elementsLength == 0 { - *dst = VarcharArray{Status: Present} - return nil - } - if len(dimensions) == 0 { - if originalSrc, ok := underlyingSliceType(src); ok { - return dst.Set(originalSrc) - } - return fmt.Errorf("cannot convert %v to VarcharArray", src) - } - - *dst = VarcharArray{ - Elements: make([]Varchar, elementsLength), - Dimensions: dimensions, - Status: Present, - } - elementCount, err := dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - // Maybe the target was one dimension too far, try again: - if len(dst.Dimensions) > 1 { - dst.Dimensions = dst.Dimensions[:len(dst.Dimensions)-1] - elementsLength = 0 - for _, dim := range dst.Dimensions { - if elementsLength == 0 { - elementsLength = int(dim.Length) - } else { - elementsLength *= int(dim.Length) - } - } - dst.Elements = make([]Varchar, elementsLength) - elementCount, err = dst.setRecursive(reflectedValue, 0, 0) - if err != nil { - return err - } - } else { - return err - } - } - if elementCount != len(dst.Elements) { - return fmt.Errorf("cannot convert %v to VarcharArray, expected %d dst.Elements, but got %d instead", src, len(dst.Elements), elementCount) - } - } - - return nil -} - -func (dst *VarcharArray) setRecursive(value reflect.Value, index, dimension int) (int, error) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(dst.Dimensions) == dimension { - break - } - - valueLen := value.Len() - if int32(valueLen) != dst.Dimensions[dimension].Length { - return 0, fmt.Errorf("multidimensional arrays must have array expressions with matching dimensions") - } - for i := 0; i < valueLen; i++ { - var err error - index, err = dst.setRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if !value.CanInterface() { - return 0, fmt.Errorf("cannot convert all values to VarcharArray") - } - if err := dst.Elements[index].Set(value.Interface()); err != nil { - return 0, fmt.Errorf("%v in VarcharArray", err) - } - index++ - - return index, nil -} - -func (dst VarcharArray) Get() interface{} { - switch dst.Status { - case Present: - return dst - case Null: - return nil - default: - return dst.Status - } -} - -func (src *VarcharArray) AssignTo(dst interface{}) error { - switch src.Status { - case Present: - if len(src.Dimensions) <= 1 { - // Attempt to match to select common types: - switch v := dst.(type) { - - case *[]string: - *v = make([]string, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - case *[]*string: - *v = make([]*string, len(src.Elements)) - for i := range src.Elements { - if err := src.Elements[i].AssignTo(&((*v)[i])); err != nil { - return err - } - } - return nil - - } - } - - // Try to convert to something AssignTo can use directly. - if nextDst, retry := GetAssignToDstType(dst); retry { - return src.AssignTo(nextDst) - } - - // Fallback to reflection if an optimised match was not found. - // The reflection is necessary for arrays and multidimensional slices, - // but it comes with a 20-50% performance penalty for large arrays/slices - value := reflect.ValueOf(dst) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - switch value.Kind() { - case reflect.Array, reflect.Slice: - default: - return fmt.Errorf("cannot assign %T to %T", src, dst) - } - - if len(src.Elements) == 0 { - if value.Kind() == reflect.Slice { - value.Set(reflect.MakeSlice(value.Type(), 0, 0)) - return nil - } - } - - elementCount, err := src.assignToRecursive(value, 0, 0) - if err != nil { - return err - } - if elementCount != len(src.Elements) { - return fmt.Errorf("cannot assign %v, needed to assign %d elements, but only assigned %d", dst, len(src.Elements), elementCount) - } - - return nil - case Null: - return NullAssignTo(dst) - } - - return fmt.Errorf("cannot decode %#v into %T", src, dst) -} - -func (src *VarcharArray) assignToRecursive(value reflect.Value, index, dimension int) (int, error) { - switch kind := value.Kind(); kind { - case reflect.Array: - fallthrough - case reflect.Slice: - if len(src.Dimensions) == dimension { - break - } - - length := int(src.Dimensions[dimension].Length) - if reflect.Array == kind { - typ := value.Type() - if typ.Len() != length { - return 0, fmt.Errorf("expected size %d array, but %s has size %d array", length, typ, typ.Len()) - } - value.Set(reflect.New(typ).Elem()) - } else { - value.Set(reflect.MakeSlice(value.Type(), length, length)) - } - - var err error - for i := 0; i < length; i++ { - index, err = src.assignToRecursive(value.Index(i), index, dimension+1) - if err != nil { - return 0, err - } - } - - return index, nil - } - if len(src.Dimensions) != dimension { - return 0, fmt.Errorf("incorrect dimensions, expected %d, found %d", len(src.Dimensions), dimension) - } - if !value.CanAddr() { - return 0, fmt.Errorf("cannot assign all values from VarcharArray") - } - addr := value.Addr() - if !addr.CanInterface() { - return 0, fmt.Errorf("cannot assign all values from VarcharArray") - } - if err := src.Elements[index].AssignTo(addr.Interface()); err != nil { - return 0, err - } - index++ - return index, nil -} - -func (dst *VarcharArray) DecodeText(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = VarcharArray{Status: Null} - return nil - } - - uta, err := ParseUntypedTextArray(string(src)) - if err != nil { - return err - } - - var elements []Varchar - - if len(uta.Elements) > 0 { - elements = make([]Varchar, len(uta.Elements)) - - for i, s := range uta.Elements { - var elem Varchar - var elemSrc []byte - if s != "NULL" || uta.Quoted[i] { - elemSrc = []byte(s) - } - err = elem.DecodeText(ci, elemSrc) - if err != nil { - return err - } - - elements[i] = elem - } - } - - *dst = VarcharArray{Elements: elements, Dimensions: uta.Dimensions, Status: Present} - - return nil -} - -func (dst *VarcharArray) DecodeBinary(ci *ConnInfo, src []byte) error { - if src == nil { - *dst = VarcharArray{Status: Null} - return nil - } - - var arrayHeader ArrayHeader - rp, err := arrayHeader.DecodeBinary(ci, src) - if err != nil { - return err - } - - if len(arrayHeader.Dimensions) == 0 { - *dst = VarcharArray{Dimensions: arrayHeader.Dimensions, Status: Present} - return nil - } - - elementCount := arrayHeader.Dimensions[0].Length - for _, d := range arrayHeader.Dimensions[1:] { - elementCount *= d.Length - } - - elements := make([]Varchar, elementCount) - - for i := range elements { - elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) - rp += 4 - var elemSrc []byte - if elemLen >= 0 { - elemSrc = src[rp : rp+elemLen] - rp += elemLen - } - err = elements[i].DecodeBinary(ci, elemSrc) - if err != nil { - return err - } - } - - *dst = VarcharArray{Elements: elements, Dimensions: arrayHeader.Dimensions, Status: Present} - return nil -} - -func (src VarcharArray) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - if len(src.Dimensions) == 0 { - return append(buf, '{', '}'), nil - } - - buf = EncodeTextArrayDimensions(buf, src.Dimensions) - - // dimElemCounts is the multiples of elements that each array lies on. For - // example, a single dimension array of length 4 would have a dimElemCounts of - // [4]. A multi-dimensional array of lengths [3,5,2] would have a - // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' - // or '}'. - dimElemCounts := make([]int, len(src.Dimensions)) - dimElemCounts[len(src.Dimensions)-1] = int(src.Dimensions[len(src.Dimensions)-1].Length) - for i := len(src.Dimensions) - 2; i > -1; i-- { - dimElemCounts[i] = int(src.Dimensions[i].Length) * dimElemCounts[i+1] - } - - inElemBuf := make([]byte, 0, 32) - for i, elem := range src.Elements { - if i > 0 { - buf = append(buf, ',') - } - - for _, dec := range dimElemCounts { - if i%dec == 0 { - buf = append(buf, '{') - } - } - - elemBuf, err := elem.EncodeText(ci, inElemBuf) - if err != nil { - return nil, err - } - if elemBuf == nil { - buf = append(buf, `NULL`...) - } else { - buf = append(buf, QuoteArrayElementIfNeeded(string(elemBuf))...) - } - - for _, dec := range dimElemCounts { - if (i+1)%dec == 0 { - buf = append(buf, '}') - } - } - } - - return buf, nil -} - -func (src VarcharArray) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - switch src.Status { - case Null: - return nil, nil - case Undefined: - return nil, errUndefined - } - - arrayHeader := ArrayHeader{ - Dimensions: src.Dimensions, - } - - if dt, ok := ci.DataTypeForName("varchar"); ok { - arrayHeader.ElementOID = int32(dt.OID) - } else { - return nil, fmt.Errorf("unable to find oid for type name %v", "varchar") - } - - for i := range src.Elements { - if src.Elements[i].Status == Null { - arrayHeader.ContainsNull = true - break - } - } - - buf = arrayHeader.EncodeBinary(ci, buf) - - for i := range src.Elements { - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - - elemBuf, err := src.Elements[i].EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if elemBuf != nil { - buf = elemBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - } - - return buf, nil -} - -// Scan implements the database/sql Scanner interface. -func (dst *VarcharArray) Scan(src interface{}) error { - if src == nil { - return dst.DecodeText(nil, nil) - } - - switch src := src.(type) { - case string: - return dst.DecodeText(nil, []byte(src)) - case []byte: - srcCopy := make([]byte, len(src)) - copy(srcCopy, src) - return dst.DecodeText(nil, srcCopy) - } - - return fmt.Errorf("cannot scan %T", src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src VarcharArray) Value() (driver.Value, error) { - buf, err := src.EncodeText(nil, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - - return string(buf), nil -} diff --git a/vendor/github.com/jackc/pgtype/xid.go b/vendor/github.com/jackc/pgtype/xid.go deleted file mode 100644 index f6d6b22..0000000 --- a/vendor/github.com/jackc/pgtype/xid.go +++ /dev/null @@ -1,64 +0,0 @@ -package pgtype - -import ( - "database/sql/driver" -) - -// XID is PostgreSQL's Transaction ID type. -// -// In later versions of PostgreSQL, it is the type used for the backend_xid -// and backend_xmin columns of the pg_stat_activity system view. -// -// Also, when one does -// -// select xmin, xmax, * from some_table; -// -// it is the data type of the xmin and xmax hidden system columns. -// -// It is currently implemented as an unsigned four byte integer. -// Its definition can be found in src/include/postgres_ext.h as TransactionId -// in the PostgreSQL sources. -type XID pguint32 - -// Set converts from src to dst. Note that as XID is not a general -// number type Set does not do automatic type conversion as other number -// types do. -func (dst *XID) Set(src interface{}) error { - return (*pguint32)(dst).Set(src) -} - -func (dst XID) Get() interface{} { - return (pguint32)(dst).Get() -} - -// AssignTo assigns from src to dst. Note that as XID is not a general number -// type AssignTo does not do automatic type conversion as other number types do. -func (src *XID) AssignTo(dst interface{}) error { - return (*pguint32)(src).AssignTo(dst) -} - -func (dst *XID) DecodeText(ci *ConnInfo, src []byte) error { - return (*pguint32)(dst).DecodeText(ci, src) -} - -func (dst *XID) DecodeBinary(ci *ConnInfo, src []byte) error { - return (*pguint32)(dst).DecodeBinary(ci, src) -} - -func (src XID) EncodeText(ci *ConnInfo, buf []byte) ([]byte, error) { - return (pguint32)(src).EncodeText(ci, buf) -} - -func (src XID) EncodeBinary(ci *ConnInfo, buf []byte) ([]byte, error) { - return (pguint32)(src).EncodeBinary(ci, buf) -} - -// Scan implements the database/sql Scanner interface. -func (dst *XID) Scan(src interface{}) error { - return (*pguint32)(dst).Scan(src) -} - -// Value implements the database/sql/driver Valuer interface. -func (src XID) Value() (driver.Value, error) { - return (pguint32)(src).Value() -} diff --git a/vendor/github.com/jackc/pgx/v4/CHANGELOG.md b/vendor/github.com/jackc/pgx/v4/CHANGELOG.md deleted file mode 100644 index 17d29cc..0000000 --- a/vendor/github.com/jackc/pgx/v4/CHANGELOG.md +++ /dev/null @@ -1,304 +0,0 @@ -# 4.18.2 (March 4, 2024) - -Fix CVE-2024-27289 - -SQL injection can occur when all of the following conditions are met: - -1. The non-default simple protocol is used. -2. A placeholder for a numeric value must be immediately preceded by a minus. -3. There must be a second placeholder for a string value after the first placeholder; both must be on the same line. -4. Both parameter values must be user-controlled. - -Thanks to Paul Gerste for reporting this issue. - -Fix CVE-2024-27304 - -SQL injection can occur if an attacker can cause a single query or bind message to exceed 4 GB in size. An integer -overflow in the calculated message size can cause the one large message to be sent as multiple messages under the -attacker's control. - -Thanks to Paul Gerste for reporting this issue. - -* Fix *dbTx.Exec not checking if it is already closed - -# 4.18.1 (February 27, 2023) - -* Fix: Support pgx v4 and v5 stdlib in same program (Tomáš Procházka) - -# 4.18.0 (February 11, 2023) - -* Upgrade pgconn to v1.14.0 -* Upgrade pgproto3 to v2.3.2 -* Upgrade pgtype to v1.14.0 -* Fix query sanitizer when query text contains Unicode replacement character -* Fix context with value in BeforeConnect (David Harju) -* Support pgx v4 and v5 stdlib in same program (Vitalii Solodilov) - -# 4.17.2 (September 3, 2022) - -* Fix panic when logging batch error (Tom Möller) - -# 4.17.1 (August 27, 2022) - -* Upgrade puddle to v1.3.0 - fixes context failing to cancel Acquire when acquire is creating resource which was introduced in v4.17.0 (James Hartig) -* Fix atomic alignment on 32-bit platforms - -# 4.17.0 (August 6, 2022) - -* Upgrade pgconn to v1.13.0 -* Upgrade pgproto3 to v2.3.1 -* Upgrade pgtype to v1.12.0 -* Allow background pool connections to continue even if cause is canceled (James Hartig) -* Add LoggerFunc (Gabor Szabad) -* pgxpool: health check should avoid going below minConns (James Hartig) -* Add pgxpool.Conn.Hijack() -* Logging improvements (Stepan Rabotkin) - -# 4.16.1 (May 7, 2022) - -* Upgrade pgconn to v1.12.1 -* Fix explicitly prepared statements with describe statement cache mode - -# 4.16.0 (April 21, 2022) - -* Upgrade pgconn to v1.12.0 -* Upgrade pgproto3 to v2.3.0 -* Upgrade pgtype to v1.11.0 -* Fix: Do not panic when context cancelled while getting statement from cache. -* Fix: Less memory pinning from old Rows. -* Fix: Support '\r' line ending when sanitizing SQL comment. -* Add pluggable GSSAPI support (Oliver Tan) - -# 4.15.0 (February 7, 2022) - -* Upgrade to pgconn v1.11.0 -* Upgrade to pgtype v1.10.0 -* Upgrade puddle to v1.2.1 -* Make BatchResults.Close safe to be called multiple times - -# 4.14.1 (November 28, 2021) - -* Upgrade pgtype to v1.9.1 (fixes unintentional change to timestamp binary decoding) -* Start pgxpool background health check after initial connections - -# 4.14.0 (November 20, 2021) - -* Upgrade pgconn to v1.10.1 -* Upgrade pgproto3 to v2.2.0 -* Upgrade pgtype to v1.9.0 -* Upgrade puddle to v1.2.0 -* Add QueryFunc to BatchResults -* Add context options to zerologadapter (Thomas Frössman) -* Add zerologadapter.NewContextLogger (urso) -* Eager initialize minpoolsize on connect (Daniel) -* Unpin memory used by large queries immediately after use - -# 4.13.0 (July 24, 2021) - -* Trimmed pseudo-dependencies in Go modules from other packages tests -* Upgrade pgconn -- context cancellation no longer will return a net.Error -* Support time durations for simple protocol (Michael Darr) - -# 4.12.0 (July 10, 2021) - -* ResetSession hook is called before a connection is reused from pool for another query (Dmytro Haranzha) -* stdlib: Add RandomizeHostOrderFunc (dkinder) -* stdlib: add OptionBeforeConnect (dkinder) -* stdlib: Do not reuse ConnConfig strings (Andrew Kimball) -* stdlib: implement Conn.ResetSession (Jonathan Amsterdam) -* Upgrade pgconn to v1.9.0 -* Upgrade pgtype to v1.8.0 - -# 4.11.0 (March 25, 2021) - -* Add BeforeConnect callback to pgxpool.Config (Robert Froehlich) -* Add Ping method to pgxpool.Conn (davidsbond) -* Added a kitlog level log adapter (Fabrice Aneche) -* Make ScanArgError public to allow identification of offending column (Pau Sanchez) -* Add *pgxpool.AcquireFunc -* Add BeginFunc and BeginTxFunc -* Add prefer_simple_protocol to connection string -* Add logging on CopyFrom (Patrick Hemmer) -* Add comment support when sanitizing SQL queries (Rusakow Andrew) -* Do not panic on double close of pgxpool.Pool (Matt Schultz) -* Avoid panic on SendBatch on closed Tx (Matt Schultz) -* Update pgconn to v1.8.1 -* Update pgtype to v1.7.0 - -# 4.10.1 (December 19, 2020) - -* Fix panic on Query error with nil stmtcache. - -# 4.10.0 (December 3, 2020) - -* Add CopyFromSlice to simplify CopyFrom usage (Egon Elbre) -* Remove broken prepared statements from stmtcache (Ethan Pailes) -* stdlib: consider any Ping error as fatal -* Update puddle to v1.1.3 - this fixes an issue where concurrent Acquires can hang when a connection cannot be established -* Update pgtype to v1.6.2 - -# 4.9.2 (November 3, 2020) - -The underlying library updates fix an issue where appending to a scanned slice could corrupt other data. - -* Update pgconn to v1.7.2 -* Update pgproto3 to v2.0.6 - -# 4.9.1 (October 31, 2020) - -* Update pgconn to v1.7.1 -* Update pgtype to v1.6.1 -* Fix SendBatch of all prepared statements with statement cache disabled - -# 4.9.0 (September 26, 2020) - -* pgxpool now waits for connection cleanup to finish before making room in pool for another connection. This prevents temporarily exceeding max pool size. -* Fix when scanning a column to nil to skip it on the first row but scanning it to a real value on a subsequent row. -* Fix prefer simple protocol with prepared statements. (Jinzhu) -* Fix FieldDescriptions not being available on Rows before calling Next the first time. -* Various minor fixes in updated versions of pgconn, pgtype, and puddle. - -# 4.8.1 (July 29, 2020) - -* Update pgconn to v1.6.4 - * Fix deadlock on error after CommandComplete but before ReadyForQuery - * Fix panic on parsing DSN with trailing '=' - -# 4.8.0 (July 22, 2020) - -* All argument types supported by native pgx should now also work through database/sql -* Update pgconn to v1.6.3 -* Update pgtype to v1.4.2 - -# 4.7.2 (July 14, 2020) - -* Improve performance of Columns() (zikaeroh) -* Fix fatal Commit() failure not being considered fatal -* Update pgconn to v1.6.2 -* Update pgtype to v1.4.1 - -# 4.7.1 (June 29, 2020) - -* Fix stdlib decoding error with certain order and combination of fields - -# 4.7.0 (June 27, 2020) - -* Update pgtype to v1.4.0 -* Update pgconn to v1.6.1 -* Update puddle to v1.1.1 -* Fix context propagation with Tx commit and Rollback (georgysavva) -* Add lazy connect option to pgxpool (georgysavva) -* Fix connection leak if pgxpool.BeginTx() fail (Jean-Baptiste Bronisz) -* Add native Go slice support for strings and numbers to simple protocol -* stdlib add default timeouts for Conn.Close() and Stmt.Close() (georgysavva) -* Assorted performance improvements especially with large result sets -* Fix close pool on not lazy connect failure (Yegor Myskin) -* Add Config copy (georgysavva) -* Support SendBatch with Simple Protocol (Jordan Lewis) -* Better error logging on rows close (Igor V. Kozinov) -* Expose stdlib.Conn.Conn() to enable database/sql.Conn.Raw() -* Improve unknown type support for database/sql -* Fix transaction commit failure closing connection - -# 4.6.0 (March 30, 2020) - -* stdlib: Bail early if preloading rows.Next() results in rows.Err() (Bas van Beek) -* Sanitize time to microsecond accuracy (Andrew Nicoll) -* Update pgtype to v1.3.0 -* Update pgconn to v1.5.0 - * Update golang.org/x/crypto for security fix - * Implement "verify-ca" SSL mode - -# 4.5.0 (March 7, 2020) - -* Update to pgconn v1.4.0 - * Fixes QueryRow with empty SQL - * Adds PostgreSQL service file support -* Add Len() to *pgx.Batch (WGH) -* Better logging for individual batch items (Ben Bader) - -# 4.4.1 (February 14, 2020) - -* Update pgconn to v1.3.2 - better default read buffer size -* Fix race in CopyFrom - -# 4.4.0 (February 5, 2020) - -* Update puddle to v1.1.0 - fixes possible deadlock when acquire is cancelled -* Update pgconn to v1.3.1 - fixes CopyFrom deadlock when multiple NoticeResponse received during copy -* Update pgtype to v1.2.0 -* Add MaxConnIdleTime to pgxpool (Patrick Ellul) -* Add MinConns to pgxpool (Patrick Ellul) -* Fix: stdlib.ReleaseConn closes connections left in invalid state - -# 4.3.0 (January 23, 2020) - -* Fix Rows.Values panic when unable to decode -* Add Rows.Values support for unknown types -* Add DriverContext support for stdlib (Alex Gaynor) -* Update pgproto3 to v2.0.1 to never return an io.EOF as it would be misinterpreted by database/sql. Instead return io.UnexpectedEOF. - -# 4.2.1 (January 13, 2020) - -* Update pgconn to v1.2.1 (fixes context cancellation data race introduced in v1.2.0)) - -# 4.2.0 (January 11, 2020) - -* Update pgconn to v1.2.0. -* Update pgtype to v1.1.0. -* Return error instead of panic when wrong number of arguments passed to Exec. (malstoun) -* Fix large objects functionality when PreferSimpleProtocol = true. -* Restore GetDefaultDriver which existed in v3. (Johan Brandhorst) -* Add RegisterConnConfig to stdlib which replaces the removed RegisterDriverConfig from v3. - -# 4.1.2 (October 22, 2019) - -* Fix dbSavepoint.Begin recursive self call -* Upgrade pgtype to v1.0.2 - fix scan pointer to pointer - -# 4.1.1 (October 21, 2019) - -* Fix pgxpool Rows.CommandTag() infinite loop / typo - -# 4.1.0 (October 12, 2019) - -## Potentially Breaking Changes - -Technically, two changes are breaking changes, but in practice these are extremely unlikely to break existing code. - -* Conn.Begin and Conn.BeginTx return a Tx interface instead of the internal dbTx struct. This is necessary for the Conn.Begin method to signature as other methods that begin a transaction. -* Add Conn() to Tx interface. This is necessary to allow code using a Tx to access the *Conn (and pgconn.PgConn) on which the Tx is executing. - -## Fixes - -* Releasing a busy connection closes the connection instead of returning an unusable connection to the pool -* Do not mutate config.Config.OnNotification in connect - -# 4.0.1 (September 19, 2019) - -* Fix statement cache cleanup. -* Corrected daterange OID. -* Fix Tx when committing or rolling back multiple times in certain cases. -* Improve documentation. - -# 4.0.0 (September 14, 2019) - -v4 is a major release with many significant changes some of which are breaking changes. The most significant are -included below. - -* Simplified establishing a connection with a connection string. -* All potentially blocking operations now require a context.Context. The non-context aware functions have been removed. -* OIDs are hard-coded for known types. This saves the query on connection. -* Context cancellations while network activity is in progress is now always fatal. Previously, it was sometimes recoverable. This led to increased complexity in pgx itself and in application code. -* Go modules are required. -* Errors are now implemented in the Go 1.13 style. -* `Rows` and `Tx` are now interfaces. -* The connection pool as been decoupled from pgx and is now a separate, included package (github.com/jackc/pgx/v4/pgxpool). -* pgtype has been spun off to a separate package (github.com/jackc/pgtype). -* pgproto3 has been spun off to a separate package (github.com/jackc/pgproto3/v2). -* Logical replication support has been spun off to a separate package (github.com/jackc/pglogrepl). -* Lower level PostgreSQL functionality is now implemented in a separate package (github.com/jackc/pgconn). -* Tests are now configured with environment variables. -* Conn has an automatic statement cache by default. -* Batch interface has been simplified. -* QueryArgs has been removed. diff --git a/vendor/github.com/jackc/pgx/v4/LICENSE b/vendor/github.com/jackc/pgx/v4/LICENSE deleted file mode 100644 index 5c486c3..0000000 --- a/vendor/github.com/jackc/pgx/v4/LICENSE +++ /dev/null @@ -1,22 +0,0 @@ -Copyright (c) 2013-2021 Jack Christensen - -MIT License - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/vendor/github.com/jackc/pgx/v4/README.md b/vendor/github.com/jackc/pgx/v4/README.md deleted file mode 100644 index 46b9c55..0000000 --- a/vendor/github.com/jackc/pgx/v4/README.md +++ /dev/null @@ -1,196 +0,0 @@ -[![](https://godoc.org/github.com/jackc/pgx?status.svg)](https://pkg.go.dev/github.com/jackc/pgx/v4) -[![Build Status](https://travis-ci.org/jackc/pgx.svg)](https://travis-ci.org/jackc/pgx) - ---- - -This is the previous stable `v4` release. `v5` been released. - ---- -# pgx - PostgreSQL Driver and Toolkit - -pgx is a pure Go driver and toolkit for PostgreSQL. - -pgx aims to be low-level, fast, and performant, while also enabling PostgreSQL-specific features that the standard `database/sql` package does not allow for. - -The driver component of pgx can be used alongside the standard `database/sql` package. - -The toolkit component is a related set of packages that implement PostgreSQL functionality such as parsing the wire protocol -and type mapping between PostgreSQL and Go. These underlying packages can be used to implement alternative drivers, -proxies, load balancers, logical replication clients, etc. - -The current release of `pgx v4` requires Go modules. To use the previous version, checkout and vendor the `v3` branch. - -## Example Usage - -```go -package main - -import ( - "context" - "fmt" - "os" - - "github.com/jackc/pgx/v4" -) - -func main() { - // urlExample := "postgres://username:password@localhost:5432/database_name" - conn, err := pgx.Connect(context.Background(), os.Getenv("DATABASE_URL")) - if err != nil { - fmt.Fprintf(os.Stderr, "Unable to connect to database: %v\n", err) - os.Exit(1) - } - defer conn.Close(context.Background()) - - var name string - var weight int64 - err = conn.QueryRow(context.Background(), "select name, weight from widgets where id=$1", 42).Scan(&name, &weight) - if err != nil { - fmt.Fprintf(os.Stderr, "QueryRow failed: %v\n", err) - os.Exit(1) - } - - fmt.Println(name, weight) -} -``` - -See the [getting started guide](https://github.com/jackc/pgx/wiki/Getting-started-with-pgx) for more information. - -## Choosing Between the pgx and database/sql Interfaces - -It is recommended to use the pgx interface if: -1. The application only targets PostgreSQL. -2. No other libraries that require `database/sql` are in use. - -The pgx interface is faster and exposes more features. - -The `database/sql` interface only allows the underlying driver to return or receive the following types: `int64`, -`float64`, `bool`, `[]byte`, `string`, `time.Time`, or `nil`. Handling other types requires implementing the -`database/sql.Scanner` and the `database/sql/driver/driver.Valuer` interfaces which require transmission of values in text format. The binary format can be substantially faster, which is what the pgx interface uses. - -## Features - -pgx supports many features beyond what is available through `database/sql`: - -* Support for approximately 70 different PostgreSQL types -* Automatic statement preparation and caching -* Batch queries -* Single-round trip query mode -* Full TLS connection control -* Binary format support for custom types (allows for much quicker encoding/decoding) -* COPY protocol support for faster bulk data loads -* Extendable logging support including built-in support for `log15adapter`, [`logrus`](https://github.com/sirupsen/logrus), [`zap`](https://github.com/uber-go/zap), and [`zerolog`](https://github.com/rs/zerolog) -* Connection pool with after-connect hook for arbitrary connection setup -* Listen / notify -* Conversion of PostgreSQL arrays to Go slice mappings for integers, floats, and strings -* Hstore support -* JSON and JSONB support -* Maps `inet` and `cidr` PostgreSQL types to `net.IPNet` and `net.IP` -* Large object support -* NULL mapping to Null* struct or pointer to pointer -* Supports `database/sql.Scanner` and `database/sql/driver.Valuer` interfaces for custom types -* Notice response handling -* Simulated nested transactions with savepoints - -## Performance - -There are three areas in particular where pgx can provide a significant performance advantage over the standard -`database/sql` interface and other drivers: - -1. PostgreSQL specific types - Types such as arrays can be parsed much quicker because pgx uses the binary format. -2. Automatic statement preparation and caching - pgx will prepare and cache statements by default. This can provide an - significant free improvement to code that does not explicitly use prepared statements. Under certain workloads, it can - perform nearly 3x the number of queries per second. -3. Batched queries - Multiple queries can be batched together to minimize network round trips. - -## Testing - -pgx tests naturally require a PostgreSQL database. It will connect to the database specified in the `PGX_TEST_DATABASE` environment -variable. The `PGX_TEST_DATABASE` environment variable can either be a URL or DSN. In addition, the standard `PG*` environment -variables will be respected. Consider using [direnv](https://github.com/direnv/direnv) to simplify environment variable -handling. - -### Example Test Environment - -Connect to your PostgreSQL server and run: - -``` -create database pgx_test; -``` - -Connect to the newly-created database and run: - -``` -create domain uint64 as numeric(20,0); -``` - -Now, you can run the tests: - -``` -PGX_TEST_DATABASE="host=/var/run/postgresql database=pgx_test" go test ./... -``` - -In addition, there are tests specific for PgBouncer that will be executed if `PGX_TEST_PGBOUNCER_CONN_STRING` is set. - -## Supported Go and PostgreSQL Versions - -pgx supports the same versions of Go and PostgreSQL that are supported by their respective teams. For [Go](https://golang.org/doc/devel/release.html#policy) that is the two most recent major releases and for [PostgreSQL](https://www.postgresql.org/support/versioning/) the major releases in the last 5 years. This means pgx supports Go 1.17 and higher and PostgreSQL 10 and higher. pgx also is tested against the latest version of [CockroachDB](https://www.cockroachlabs.com/product/). - -## Version Policy - -pgx follows semantic versioning for the documented public API on stable releases. `v4` is the latest stable major version. - -## PGX Family Libraries - -pgx is the head of a family of PostgreSQL libraries. Many of these can be used independently. Many can also be accessed -from pgx for lower-level control. - -### [github.com/jackc/pgconn](https://github.com/jackc/pgconn) - -`pgconn` is a lower-level PostgreSQL database driver that operates at nearly the same level as the C library `libpq`. - -### [github.com/jackc/pgx/v4/pgxpool](https://github.com/jackc/pgx/tree/master/pgxpool) - -`pgxpool` is a connection pool for pgx. pgx is entirely decoupled from its default pool implementation. This means that pgx can be used with a different pool or without any pool at all. - -### [github.com/jackc/pgx/v4/stdlib](https://github.com/jackc/pgx/tree/master/stdlib) - -This is a `database/sql` compatibility layer for pgx. pgx can be used as a normal `database/sql` driver, but at any time, the native interface can be acquired for more performance or PostgreSQL specific functionality. - -### [github.com/jackc/pgtype](https://github.com/jackc/pgtype) - -Over 70 PostgreSQL types are supported including `uuid`, `hstore`, `json`, `bytea`, `numeric`, `interval`, `inet`, and arrays. These types support `database/sql` interfaces and are usable outside of pgx. They are fully tested in pgx and pq. They also support a higher performance interface when used with the pgx driver. - -### [github.com/jackc/pgproto3](https://github.com/jackc/pgproto3) - -pgproto3 provides standalone encoding and decoding of the PostgreSQL v3 wire protocol. This is useful for implementing very low level PostgreSQL tooling. - -### [github.com/jackc/pglogrepl](https://github.com/jackc/pglogrepl) - -pglogrepl provides functionality to act as a client for PostgreSQL logical replication. - -### [github.com/jackc/pgmock](https://github.com/jackc/pgmock) - -pgmock offers the ability to create a server that mocks the PostgreSQL wire protocol. This is used internally to test pgx by purposely inducing unusual errors. pgproto3 and pgmock together provide most of the foundational tooling required to implement a PostgreSQL proxy or MitM (such as for a custom connection pooler). - -### [github.com/jackc/tern](https://github.com/jackc/tern) - -tern is a stand-alone SQL migration system. - -### [github.com/jackc/pgerrcode](https://github.com/jackc/pgerrcode) - -pgerrcode contains constants for the PostgreSQL error codes. - -## 3rd Party Libraries with PGX Support - -### [github.com/georgysavva/scany](https://github.com/georgysavva/scany) - -Library for scanning data from a database into Go structs and more. - -### [https://github.com/otan/gopgkrb5](https://github.com/otan/gopgkrb5) - -Adds GSSAPI / Kerberos authentication support. - -### [https://github.com/vgarvardt/pgx-google-uuid](https://github.com/vgarvardt/pgx-google-uuid) - -Adds support for [`github.com/google/uuid`](https://github.com/google/uuid). diff --git a/vendor/github.com/jackc/pgx/v4/batch.go b/vendor/github.com/jackc/pgx/v4/batch.go deleted file mode 100644 index 7f86ad5..0000000 --- a/vendor/github.com/jackc/pgx/v4/batch.go +++ /dev/null @@ -1,228 +0,0 @@ -package pgx - -import ( - "context" - "errors" - "fmt" - - "github.com/jackc/pgconn" -) - -type batchItem struct { - query string - arguments []interface{} -} - -// Batch queries are a way of bundling multiple queries together to avoid -// unnecessary network round trips. -type Batch struct { - items []*batchItem -} - -// Queue queues a query to batch b. query can be an SQL query or the name of a prepared statement. -func (b *Batch) Queue(query string, arguments ...interface{}) { - b.items = append(b.items, &batchItem{ - query: query, - arguments: arguments, - }) -} - -// Len returns number of queries that have been queued so far. -func (b *Batch) Len() int { - return len(b.items) -} - -type BatchResults interface { - // Exec reads the results from the next query in the batch as if the query has been sent with Conn.Exec. - Exec() (pgconn.CommandTag, error) - - // Query reads the results from the next query in the batch as if the query has been sent with Conn.Query. - Query() (Rows, error) - - // QueryRow reads the results from the next query in the batch as if the query has been sent with Conn.QueryRow. - QueryRow() Row - - // QueryFunc reads the results from the next query in the batch as if the query has been sent with Conn.QueryFunc. - QueryFunc(scans []interface{}, f func(QueryFuncRow) error) (pgconn.CommandTag, error) - - // Close closes the batch operation. This must be called before the underlying connection can be used again. Any error - // that occurred during a batch operation may have made it impossible to resyncronize the connection with the server. - // In this case the underlying connection will have been closed. Close is safe to call multiple times. - Close() error -} - -type batchResults struct { - ctx context.Context - conn *Conn - mrr *pgconn.MultiResultReader - err error - b *Batch - ix int - closed bool -} - -// Exec reads the results from the next query in the batch as if the query has been sent with Exec. -func (br *batchResults) Exec() (pgconn.CommandTag, error) { - if br.err != nil { - return nil, br.err - } - if br.closed { - return nil, fmt.Errorf("batch already closed") - } - - query, arguments, _ := br.nextQueryAndArgs() - - if !br.mrr.NextResult() { - err := br.mrr.Close() - if err == nil { - err = errors.New("no result") - } - if br.conn.shouldLog(LogLevelError) { - br.conn.log(br.ctx, LogLevelError, "BatchResult.Exec", map[string]interface{}{ - "sql": query, - "args": logQueryArgs(arguments), - "err": err, - }) - } - return nil, err - } - - commandTag, err := br.mrr.ResultReader().Close() - - if err != nil { - if br.conn.shouldLog(LogLevelError) { - br.conn.log(br.ctx, LogLevelError, "BatchResult.Exec", map[string]interface{}{ - "sql": query, - "args": logQueryArgs(arguments), - "err": err, - }) - } - } else if br.conn.shouldLog(LogLevelInfo) { - br.conn.log(br.ctx, LogLevelInfo, "BatchResult.Exec", map[string]interface{}{ - "sql": query, - "args": logQueryArgs(arguments), - "commandTag": commandTag, - }) - } - - return commandTag, err -} - -// Query reads the results from the next query in the batch as if the query has been sent with Query. -func (br *batchResults) Query() (Rows, error) { - query, arguments, ok := br.nextQueryAndArgs() - if !ok { - query = "batch query" - } - - if br.err != nil { - return &connRows{err: br.err, closed: true}, br.err - } - - if br.closed { - alreadyClosedErr := fmt.Errorf("batch already closed") - return &connRows{err: alreadyClosedErr, closed: true}, alreadyClosedErr - } - - rows := br.conn.getRows(br.ctx, query, arguments) - - if !br.mrr.NextResult() { - rows.err = br.mrr.Close() - if rows.err == nil { - rows.err = errors.New("no result") - } - rows.closed = true - - if br.conn.shouldLog(LogLevelError) { - br.conn.log(br.ctx, LogLevelError, "BatchResult.Query", map[string]interface{}{ - "sql": query, - "args": logQueryArgs(arguments), - "err": rows.err, - }) - } - - return rows, rows.err - } - - rows.resultReader = br.mrr.ResultReader() - return rows, nil -} - -// QueryFunc reads the results from the next query in the batch as if the query has been sent with Conn.QueryFunc. -func (br *batchResults) QueryFunc(scans []interface{}, f func(QueryFuncRow) error) (pgconn.CommandTag, error) { - if br.closed { - return nil, fmt.Errorf("batch already closed") - } - - rows, err := br.Query() - if err != nil { - return nil, err - } - defer rows.Close() - - for rows.Next() { - err = rows.Scan(scans...) - if err != nil { - return nil, err - } - - err = f(rows) - if err != nil { - return nil, err - } - } - - if err := rows.Err(); err != nil { - return nil, err - } - - return rows.CommandTag(), nil -} - -// QueryRow reads the results from the next query in the batch as if the query has been sent with QueryRow. -func (br *batchResults) QueryRow() Row { - rows, _ := br.Query() - return (*connRow)(rows.(*connRows)) - -} - -// Close closes the batch operation. Any error that occurred during a batch operation may have made it impossible to -// resyncronize the connection with the server. In this case the underlying connection will have been closed. -func (br *batchResults) Close() error { - if br.err != nil { - return br.err - } - - if br.closed { - return nil - } - br.closed = true - - // log any queries that haven't yet been logged by Exec or Query - for { - query, args, ok := br.nextQueryAndArgs() - if !ok { - break - } - - if br.conn.shouldLog(LogLevelInfo) { - br.conn.log(br.ctx, LogLevelInfo, "BatchResult.Close", map[string]interface{}{ - "sql": query, - "args": logQueryArgs(args), - }) - } - } - - return br.mrr.Close() -} - -func (br *batchResults) nextQueryAndArgs() (query string, args []interface{}, ok bool) { - if br.b != nil && br.ix < len(br.b.items) { - bi := br.b.items[br.ix] - query = bi.query - args = bi.arguments - ok = true - br.ix++ - } - return -} diff --git a/vendor/github.com/jackc/pgx/v4/conn.go b/vendor/github.com/jackc/pgx/v4/conn.go deleted file mode 100644 index 6f83f49..0000000 --- a/vendor/github.com/jackc/pgx/v4/conn.go +++ /dev/null @@ -1,857 +0,0 @@ -package pgx - -import ( - "context" - "errors" - "fmt" - "strconv" - "strings" - "time" - - "github.com/jackc/pgconn" - "github.com/jackc/pgconn/stmtcache" - "github.com/jackc/pgproto3/v2" - "github.com/jackc/pgtype" - "github.com/jackc/pgx/v4/internal/sanitize" -) - -// ConnConfig contains all the options used to establish a connection. It must be created by ParseConfig and -// then it can be modified. A manually initialized ConnConfig will cause ConnectConfig to panic. -type ConnConfig struct { - pgconn.Config - Logger Logger - LogLevel LogLevel - - // Original connection string that was parsed into config. - connString string - - // BuildStatementCache creates the stmtcache.Cache implementation for connections created with this config. Set - // to nil to disable automatic prepared statements. - BuildStatementCache BuildStatementCacheFunc - - // PreferSimpleProtocol disables implicit prepared statement usage. By default pgx automatically uses the extended - // protocol. This can improve performance due to being able to use the binary format. It also does not rely on client - // side parameter sanitization. However, it does incur two round-trips per query (unless using a prepared statement) - // and may be incompatible proxies such as PGBouncer. Setting PreferSimpleProtocol causes the simple protocol to be - // used by default. The same functionality can be controlled on a per query basis by setting - // QueryExOptions.SimpleProtocol. - PreferSimpleProtocol bool - - createdByParseConfig bool // Used to enforce created by ParseConfig rule. -} - -// Copy returns a deep copy of the config that is safe to use and modify. -// The only exception is the tls.Config: -// according to the tls.Config docs it must not be modified after creation. -func (cc *ConnConfig) Copy() *ConnConfig { - newConfig := new(ConnConfig) - *newConfig = *cc - newConfig.Config = *newConfig.Config.Copy() - return newConfig -} - -// ConnString returns the connection string as parsed by pgx.ParseConfig into pgx.ConnConfig. -func (cc *ConnConfig) ConnString() string { return cc.connString } - -// BuildStatementCacheFunc is a function that can be used to create a stmtcache.Cache implementation for connection. -type BuildStatementCacheFunc func(conn *pgconn.PgConn) stmtcache.Cache - -// Conn is a PostgreSQL connection handle. It is not safe for concurrent usage. Use a connection pool to manage access -// to multiple database connections from multiple goroutines. -type Conn struct { - pgConn *pgconn.PgConn - config *ConnConfig // config used when establishing this connection - preparedStatements map[string]*pgconn.StatementDescription - stmtcache stmtcache.Cache - logger Logger - logLevel LogLevel - - notifications []*pgconn.Notification - - doneChan chan struct{} - closedChan chan error - - connInfo *pgtype.ConnInfo - - wbuf []byte - eqb extendedQueryBuilder -} - -// Identifier a PostgreSQL identifier or name. Identifiers can be composed of -// multiple parts such as ["schema", "table"] or ["table", "column"]. -type Identifier []string - -// Sanitize returns a sanitized string safe for SQL interpolation. -func (ident Identifier) Sanitize() string { - parts := make([]string, len(ident)) - for i := range ident { - s := strings.ReplaceAll(ident[i], string([]byte{0}), "") - parts[i] = `"` + strings.ReplaceAll(s, `"`, `""`) + `"` - } - return strings.Join(parts, ".") -} - -// ErrNoRows occurs when rows are expected but none are returned. -var ErrNoRows = errors.New("no rows in result set") - -// ErrInvalidLogLevel occurs on attempt to set an invalid log level. -var ErrInvalidLogLevel = errors.New("invalid log level") - -// Connect establishes a connection with a PostgreSQL server with a connection string. See -// pgconn.Connect for details. -func Connect(ctx context.Context, connString string) (*Conn, error) { - connConfig, err := ParseConfig(connString) - if err != nil { - return nil, err - } - return connect(ctx, connConfig) -} - -// ConnectConfig establishes a connection with a PostgreSQL server with a configuration struct. -// connConfig must have been created by ParseConfig. -func ConnectConfig(ctx context.Context, connConfig *ConnConfig) (*Conn, error) { - return connect(ctx, connConfig) -} - -// ParseConfig creates a ConnConfig from a connection string. ParseConfig handles all options that pgconn.ParseConfig -// does. In addition, it accepts the following options: -// -// statement_cache_capacity -// The maximum size of the automatic statement cache. Set to 0 to disable automatic statement caching. Default: 512. -// -// statement_cache_mode -// Possible values: "prepare" and "describe". "prepare" will create prepared statements on the PostgreSQL server. -// "describe" will use the anonymous prepared statement to describe a statement without creating a statement on the -// server. "describe" is primarily useful when the environment does not allow prepared statements such as when -// running a connection pooler like PgBouncer. Default: "prepare" -// -// prefer_simple_protocol -// Possible values: "true" and "false". Use the simple protocol instead of extended protocol. Default: false -func ParseConfig(connString string) (*ConnConfig, error) { - config, err := pgconn.ParseConfig(connString) - if err != nil { - return nil, err - } - - var buildStatementCache BuildStatementCacheFunc - statementCacheCapacity := 512 - statementCacheMode := stmtcache.ModePrepare - if s, ok := config.RuntimeParams["statement_cache_capacity"]; ok { - delete(config.RuntimeParams, "statement_cache_capacity") - n, err := strconv.ParseInt(s, 10, 32) - if err != nil { - return nil, fmt.Errorf("cannot parse statement_cache_capacity: %w", err) - } - statementCacheCapacity = int(n) - } - - if s, ok := config.RuntimeParams["statement_cache_mode"]; ok { - delete(config.RuntimeParams, "statement_cache_mode") - switch s { - case "prepare": - statementCacheMode = stmtcache.ModePrepare - case "describe": - statementCacheMode = stmtcache.ModeDescribe - default: - return nil, fmt.Errorf("invalid statement_cache_mod: %s", s) - } - } - - if statementCacheCapacity > 0 { - buildStatementCache = func(conn *pgconn.PgConn) stmtcache.Cache { - return stmtcache.New(conn, statementCacheMode, statementCacheCapacity) - } - } - - preferSimpleProtocol := false - if s, ok := config.RuntimeParams["prefer_simple_protocol"]; ok { - delete(config.RuntimeParams, "prefer_simple_protocol") - if b, err := strconv.ParseBool(s); err == nil { - preferSimpleProtocol = b - } else { - return nil, fmt.Errorf("invalid prefer_simple_protocol: %v", err) - } - } - - connConfig := &ConnConfig{ - Config: *config, - createdByParseConfig: true, - LogLevel: LogLevelInfo, - BuildStatementCache: buildStatementCache, - PreferSimpleProtocol: preferSimpleProtocol, - connString: connString, - } - - return connConfig, nil -} - -func connect(ctx context.Context, config *ConnConfig) (c *Conn, err error) { - // Default values are set in ParseConfig. Enforce initial creation by ParseConfig rather than setting defaults from - // zero values. - if !config.createdByParseConfig { - panic("config must be created by ParseConfig") - } - originalConfig := config - - // This isn't really a deep copy. But it is enough to avoid the config.Config.OnNotification mutation from affecting - // other connections with the same config. See https://github.com/jackc/pgx/issues/618. - { - configCopy := *config - config = &configCopy - } - - c = &Conn{ - config: originalConfig, - connInfo: pgtype.NewConnInfo(), - logLevel: config.LogLevel, - logger: config.Logger, - } - - // Only install pgx notification system if no other callback handler is present. - if config.Config.OnNotification == nil { - config.Config.OnNotification = c.bufferNotifications - } else { - if c.shouldLog(LogLevelDebug) { - c.log(ctx, LogLevelDebug, "pgx notification handler disabled by application supplied OnNotification", map[string]interface{}{"host": config.Config.Host}) - } - } - - if c.shouldLog(LogLevelInfo) { - c.log(ctx, LogLevelInfo, "Dialing PostgreSQL server", map[string]interface{}{"host": config.Config.Host}) - } - c.pgConn, err = pgconn.ConnectConfig(ctx, &config.Config) - if err != nil { - if c.shouldLog(LogLevelError) { - c.log(ctx, LogLevelError, "connect failed", map[string]interface{}{"err": err}) - } - return nil, err - } - - c.preparedStatements = make(map[string]*pgconn.StatementDescription) - c.doneChan = make(chan struct{}) - c.closedChan = make(chan error) - c.wbuf = make([]byte, 0, 1024) - - if c.config.BuildStatementCache != nil { - c.stmtcache = c.config.BuildStatementCache(c.pgConn) - } - - // Replication connections can't execute the queries to - // populate the c.PgTypes and c.pgsqlAfInet - if _, ok := config.Config.RuntimeParams["replication"]; ok { - return c, nil - } - - return c, nil -} - -// Close closes a connection. It is safe to call Close on a already closed -// connection. -func (c *Conn) Close(ctx context.Context) error { - if c.IsClosed() { - return nil - } - - err := c.pgConn.Close(ctx) - if c.shouldLog(LogLevelInfo) { - c.log(ctx, LogLevelInfo, "closed connection", nil) - } - return err -} - -// Prepare creates a prepared statement with name and sql. sql can contain placeholders -// for bound parameters. These placeholders are referenced positional as $1, $2, etc. -// -// Prepare is idempotent; i.e. it is safe to call Prepare multiple times with the same -// name and sql arguments. This allows a code path to Prepare and Query/Exec without -// concern for if the statement has already been prepared. -func (c *Conn) Prepare(ctx context.Context, name, sql string) (sd *pgconn.StatementDescription, err error) { - if name != "" { - var ok bool - if sd, ok = c.preparedStatements[name]; ok && sd.SQL == sql { - return sd, nil - } - } - - if c.shouldLog(LogLevelError) { - defer func() { - if err != nil { - c.log(ctx, LogLevelError, "Prepare failed", map[string]interface{}{"err": err, "name": name, "sql": sql}) - } - }() - } - - sd, err = c.pgConn.Prepare(ctx, name, sql, nil) - if err != nil { - return nil, err - } - - if name != "" { - c.preparedStatements[name] = sd - } - - return sd, nil -} - -// Deallocate released a prepared statement -func (c *Conn) Deallocate(ctx context.Context, name string) error { - delete(c.preparedStatements, name) - _, err := c.pgConn.Exec(ctx, "deallocate "+quoteIdentifier(name)).ReadAll() - return err -} - -func (c *Conn) bufferNotifications(_ *pgconn.PgConn, n *pgconn.Notification) { - c.notifications = append(c.notifications, n) -} - -// WaitForNotification waits for a PostgreSQL notification. It wraps the underlying pgconn notification system in a -// slightly more convenient form. -func (c *Conn) WaitForNotification(ctx context.Context) (*pgconn.Notification, error) { - var n *pgconn.Notification - - // Return already received notification immediately - if len(c.notifications) > 0 { - n = c.notifications[0] - c.notifications = c.notifications[1:] - return n, nil - } - - err := c.pgConn.WaitForNotification(ctx) - if len(c.notifications) > 0 { - n = c.notifications[0] - c.notifications = c.notifications[1:] - } - return n, err -} - -// IsClosed reports if the connection has been closed. -func (c *Conn) IsClosed() bool { - return c.pgConn.IsClosed() -} - -func (c *Conn) die(err error) { - if c.IsClosed() { - return - } - - ctx, cancel := context.WithCancel(context.Background()) - cancel() // force immediate hard cancel - c.pgConn.Close(ctx) -} - -func (c *Conn) shouldLog(lvl LogLevel) bool { - return c.logger != nil && c.logLevel >= lvl -} - -func (c *Conn) log(ctx context.Context, lvl LogLevel, msg string, data map[string]interface{}) { - if data == nil { - data = map[string]interface{}{} - } - if c.pgConn != nil && c.pgConn.PID() != 0 { - data["pid"] = c.pgConn.PID() - } - - c.logger.Log(ctx, lvl, msg, data) -} - -func quoteIdentifier(s string) string { - return `"` + strings.ReplaceAll(s, `"`, `""`) + `"` -} - -// Ping executes an empty sql statement against the *Conn -// If the sql returns without error, the database Ping is considered successful, otherwise, the error is returned. -func (c *Conn) Ping(ctx context.Context) error { - _, err := c.Exec(ctx, ";") - return err -} - -// PgConn returns the underlying *pgconn.PgConn. This is an escape hatch method that allows lower level access to the -// PostgreSQL connection than pgx exposes. -// -// It is strongly recommended that the connection be idle (no in-progress queries) before the underlying *pgconn.PgConn -// is used and the connection must be returned to the same state before any *pgx.Conn methods are again used. -func (c *Conn) PgConn() *pgconn.PgConn { return c.pgConn } - -// StatementCache returns the statement cache used for this connection. -func (c *Conn) StatementCache() stmtcache.Cache { return c.stmtcache } - -// ConnInfo returns the connection info used for this connection. -func (c *Conn) ConnInfo() *pgtype.ConnInfo { return c.connInfo } - -// Config returns a copy of config that was used to establish this connection. -func (c *Conn) Config() *ConnConfig { return c.config.Copy() } - -// Exec executes sql. sql can be either a prepared statement name or an SQL string. arguments should be referenced -// positionally from the sql string as $1, $2, etc. -func (c *Conn) Exec(ctx context.Context, sql string, arguments ...interface{}) (pgconn.CommandTag, error) { - startTime := time.Now() - - commandTag, err := c.exec(ctx, sql, arguments...) - if err != nil { - if c.shouldLog(LogLevelError) { - endTime := time.Now() - c.log(ctx, LogLevelError, "Exec", map[string]interface{}{"sql": sql, "args": logQueryArgs(arguments), "err": err, "time": endTime.Sub(startTime)}) - } - return commandTag, err - } - - if c.shouldLog(LogLevelInfo) { - endTime := time.Now() - c.log(ctx, LogLevelInfo, "Exec", map[string]interface{}{"sql": sql, "args": logQueryArgs(arguments), "time": endTime.Sub(startTime), "commandTag": commandTag}) - } - - return commandTag, err -} - -func (c *Conn) exec(ctx context.Context, sql string, arguments ...interface{}) (commandTag pgconn.CommandTag, err error) { - simpleProtocol := c.config.PreferSimpleProtocol - -optionLoop: - for len(arguments) > 0 { - switch arg := arguments[0].(type) { - case QuerySimpleProtocol: - simpleProtocol = bool(arg) - arguments = arguments[1:] - default: - break optionLoop - } - } - - if sd, ok := c.preparedStatements[sql]; ok { - return c.execPrepared(ctx, sd, arguments) - } - - if simpleProtocol { - return c.execSimpleProtocol(ctx, sql, arguments) - } - - if len(arguments) == 0 { - return c.execSimpleProtocol(ctx, sql, arguments) - } - - if c.stmtcache != nil { - sd, err := c.stmtcache.Get(ctx, sql) - if err != nil { - return nil, err - } - - if c.stmtcache.Mode() == stmtcache.ModeDescribe { - return c.execParams(ctx, sd, arguments) - } - return c.execPrepared(ctx, sd, arguments) - } - - sd, err := c.Prepare(ctx, "", sql) - if err != nil { - return nil, err - } - return c.execPrepared(ctx, sd, arguments) -} - -func (c *Conn) execSimpleProtocol(ctx context.Context, sql string, arguments []interface{}) (commandTag pgconn.CommandTag, err error) { - if len(arguments) > 0 { - sql, err = c.sanitizeForSimpleQuery(sql, arguments...) - if err != nil { - return nil, err - } - } - - mrr := c.pgConn.Exec(ctx, sql) - for mrr.NextResult() { - commandTag, err = mrr.ResultReader().Close() - } - err = mrr.Close() - return commandTag, err -} - -func (c *Conn) execParamsAndPreparedPrefix(sd *pgconn.StatementDescription, arguments []interface{}) error { - if len(sd.ParamOIDs) != len(arguments) { - return fmt.Errorf("expected %d arguments, got %d", len(sd.ParamOIDs), len(arguments)) - } - - c.eqb.Reset() - - args, err := convertDriverValuers(arguments) - if err != nil { - return err - } - - for i := range args { - err = c.eqb.AppendParam(c.connInfo, sd.ParamOIDs[i], args[i]) - if err != nil { - return err - } - } - - for i := range sd.Fields { - c.eqb.AppendResultFormat(c.ConnInfo().ResultFormatCodeForOID(sd.Fields[i].DataTypeOID)) - } - - return nil -} - -func (c *Conn) execParams(ctx context.Context, sd *pgconn.StatementDescription, arguments []interface{}) (pgconn.CommandTag, error) { - err := c.execParamsAndPreparedPrefix(sd, arguments) - if err != nil { - return nil, err - } - - result := c.pgConn.ExecParams(ctx, sd.SQL, c.eqb.paramValues, sd.ParamOIDs, c.eqb.paramFormats, c.eqb.resultFormats).Read() - c.eqb.Reset() // Allow c.eqb internal memory to be GC'ed as soon as possible. - return result.CommandTag, result.Err -} - -func (c *Conn) execPrepared(ctx context.Context, sd *pgconn.StatementDescription, arguments []interface{}) (pgconn.CommandTag, error) { - err := c.execParamsAndPreparedPrefix(sd, arguments) - if err != nil { - return nil, err - } - - result := c.pgConn.ExecPrepared(ctx, sd.Name, c.eqb.paramValues, c.eqb.paramFormats, c.eqb.resultFormats).Read() - c.eqb.Reset() // Allow c.eqb internal memory to be GC'ed as soon as possible. - return result.CommandTag, result.Err -} - -func (c *Conn) getRows(ctx context.Context, sql string, args []interface{}) *connRows { - r := &connRows{} - - r.ctx = ctx - r.logger = c - r.connInfo = c.connInfo - r.startTime = time.Now() - r.sql = sql - r.args = args - r.conn = c - - return r -} - -// QuerySimpleProtocol controls whether the simple or extended protocol is used to send the query. -type QuerySimpleProtocol bool - -// QueryResultFormats controls the result format (text=0, binary=1) of a query by result column position. -type QueryResultFormats []int16 - -// QueryResultFormatsByOID controls the result format (text=0, binary=1) of a query by the result column OID. -type QueryResultFormatsByOID map[uint32]int16 - -// Query sends a query to the server and returns a Rows to read the results. Only errors encountered sending the query -// and initializing Rows will be returned. Err() on the returned Rows must be checked after the Rows is closed to -// determine if the query executed successfully. -// -// The returned Rows must be closed before the connection can be used again. It is safe to attempt to read from the -// returned Rows even if an error is returned. The error will be the available in rows.Err() after rows are closed. It -// is allowed to ignore the error returned from Query and handle it in Rows. -// -// Err() on the returned Rows must be checked after the Rows is closed to determine if the query executed successfully -// as some errors can only be detected by reading the entire response. e.g. A divide by zero error on the last row. -// -// For extra control over how the query is executed, the types QuerySimpleProtocol, QueryResultFormats, and -// QueryResultFormatsByOID may be used as the first args to control exactly how the query is executed. This is rarely -// needed. See the documentation for those types for details. -func (c *Conn) Query(ctx context.Context, sql string, args ...interface{}) (Rows, error) { - var resultFormats QueryResultFormats - var resultFormatsByOID QueryResultFormatsByOID - simpleProtocol := c.config.PreferSimpleProtocol - -optionLoop: - for len(args) > 0 { - switch arg := args[0].(type) { - case QueryResultFormats: - resultFormats = arg - args = args[1:] - case QueryResultFormatsByOID: - resultFormatsByOID = arg - args = args[1:] - case QuerySimpleProtocol: - simpleProtocol = bool(arg) - args = args[1:] - default: - break optionLoop - } - } - - rows := c.getRows(ctx, sql, args) - - var err error - sd, ok := c.preparedStatements[sql] - - if simpleProtocol && !ok { - sql, err = c.sanitizeForSimpleQuery(sql, args...) - if err != nil { - rows.fatal(err) - return rows, err - } - - mrr := c.pgConn.Exec(ctx, sql) - if mrr.NextResult() { - rows.resultReader = mrr.ResultReader() - rows.multiResultReader = mrr - } else { - err = mrr.Close() - rows.fatal(err) - return rows, err - } - - return rows, nil - } - - c.eqb.Reset() - - if !ok { - if c.stmtcache != nil { - sd, err = c.stmtcache.Get(ctx, sql) - if err != nil { - rows.fatal(err) - return rows, rows.err - } - } else { - sd, err = c.pgConn.Prepare(ctx, "", sql, nil) - if err != nil { - rows.fatal(err) - return rows, rows.err - } - } - } - if len(sd.ParamOIDs) != len(args) { - rows.fatal(fmt.Errorf("expected %d arguments, got %d", len(sd.ParamOIDs), len(args))) - return rows, rows.err - } - - rows.sql = sd.SQL - - args, err = convertDriverValuers(args) - if err != nil { - rows.fatal(err) - return rows, rows.err - } - - for i := range args { - err = c.eqb.AppendParam(c.connInfo, sd.ParamOIDs[i], args[i]) - if err != nil { - rows.fatal(err) - return rows, rows.err - } - } - - if resultFormatsByOID != nil { - resultFormats = make([]int16, len(sd.Fields)) - for i := range resultFormats { - resultFormats[i] = resultFormatsByOID[uint32(sd.Fields[i].DataTypeOID)] - } - } - - if resultFormats == nil { - for i := range sd.Fields { - c.eqb.AppendResultFormat(c.ConnInfo().ResultFormatCodeForOID(sd.Fields[i].DataTypeOID)) - } - - resultFormats = c.eqb.resultFormats - } - - if c.stmtcache != nil && c.stmtcache.Mode() == stmtcache.ModeDescribe && !ok { - rows.resultReader = c.pgConn.ExecParams(ctx, sql, c.eqb.paramValues, sd.ParamOIDs, c.eqb.paramFormats, resultFormats) - } else { - rows.resultReader = c.pgConn.ExecPrepared(ctx, sd.Name, c.eqb.paramValues, c.eqb.paramFormats, resultFormats) - } - - c.eqb.Reset() // Allow c.eqb internal memory to be GC'ed as soon as possible. - - return rows, rows.err -} - -// QueryRow is a convenience wrapper over Query. Any error that occurs while -// querying is deferred until calling Scan on the returned Row. That Row will -// error with ErrNoRows if no rows are returned. -func (c *Conn) QueryRow(ctx context.Context, sql string, args ...interface{}) Row { - rows, _ := c.Query(ctx, sql, args...) - return (*connRow)(rows.(*connRows)) -} - -// QueryFuncRow is the argument to the QueryFunc callback function. -// -// QueryFuncRow is an interface instead of a struct to allow tests to mock QueryFunc. However, adding a method to an -// interface is technically a breaking change. Because of this the QueryFuncRow interface is partially excluded from -// semantic version requirements. Methods will not be removed or changed, but new methods may be added. -type QueryFuncRow interface { - FieldDescriptions() []pgproto3.FieldDescription - - // RawValues returns the unparsed bytes of the row values. The returned [][]byte is only valid during the current - // function call. However, the underlying byte data is safe to retain a reference to and mutate. - RawValues() [][]byte -} - -// QueryFunc executes sql with args. For each row returned by the query the values will scanned into the elements of -// scans and f will be called. If any row fails to scan or f returns an error the query will be aborted and the error -// will be returned. -func (c *Conn) QueryFunc(ctx context.Context, sql string, args []interface{}, scans []interface{}, f func(QueryFuncRow) error) (pgconn.CommandTag, error) { - rows, err := c.Query(ctx, sql, args...) - if err != nil { - return nil, err - } - defer rows.Close() - - for rows.Next() { - err = rows.Scan(scans...) - if err != nil { - return nil, err - } - - err = f(rows) - if err != nil { - return nil, err - } - } - - if err := rows.Err(); err != nil { - return nil, err - } - - return rows.CommandTag(), nil -} - -// SendBatch sends all queued queries to the server at once. All queries are run in an implicit transaction unless -// explicit transaction control statements are executed. The returned BatchResults must be closed before the connection -// is used again. -func (c *Conn) SendBatch(ctx context.Context, b *Batch) BatchResults { - startTime := time.Now() - - simpleProtocol := c.config.PreferSimpleProtocol - var sb strings.Builder - if simpleProtocol { - for i, bi := range b.items { - if i > 0 { - sb.WriteByte(';') - } - sql, err := c.sanitizeForSimpleQuery(bi.query, bi.arguments...) - if err != nil { - return &batchResults{ctx: ctx, conn: c, err: err} - } - sb.WriteString(sql) - } - mrr := c.pgConn.Exec(ctx, sb.String()) - return &batchResults{ - ctx: ctx, - conn: c, - mrr: mrr, - b: b, - ix: 0, - } - } - - distinctUnpreparedQueries := map[string]struct{}{} - - for _, bi := range b.items { - if _, ok := c.preparedStatements[bi.query]; ok { - continue - } - distinctUnpreparedQueries[bi.query] = struct{}{} - } - - var stmtCache stmtcache.Cache - if len(distinctUnpreparedQueries) > 0 { - if c.stmtcache != nil && c.stmtcache.Cap() >= len(distinctUnpreparedQueries) { - stmtCache = c.stmtcache - } else { - stmtCache = stmtcache.New(c.pgConn, stmtcache.ModeDescribe, len(distinctUnpreparedQueries)) - } - - for sql, _ := range distinctUnpreparedQueries { - _, err := stmtCache.Get(ctx, sql) - if err != nil { - return &batchResults{ctx: ctx, conn: c, err: err} - } - } - } - - batch := &pgconn.Batch{} - - for _, bi := range b.items { - c.eqb.Reset() - - sd := c.preparedStatements[bi.query] - if sd == nil { - var err error - sd, err = stmtCache.Get(ctx, bi.query) - if err != nil { - return c.logBatchResults(ctx, startTime, &batchResults{ctx: ctx, conn: c, err: err}) - } - } - - if len(sd.ParamOIDs) != len(bi.arguments) { - return c.logBatchResults(ctx, startTime, &batchResults{ctx: ctx, conn: c, err: fmt.Errorf("mismatched param and argument count")}) - } - - args, err := convertDriverValuers(bi.arguments) - if err != nil { - return c.logBatchResults(ctx, startTime, &batchResults{ctx: ctx, conn: c, err: err}) - } - - for i := range args { - err = c.eqb.AppendParam(c.connInfo, sd.ParamOIDs[i], args[i]) - if err != nil { - return c.logBatchResults(ctx, startTime, &batchResults{ctx: ctx, conn: c, err: err}) - } - } - - for i := range sd.Fields { - c.eqb.AppendResultFormat(c.ConnInfo().ResultFormatCodeForOID(sd.Fields[i].DataTypeOID)) - } - - if sd.Name == "" { - batch.ExecParams(bi.query, c.eqb.paramValues, sd.ParamOIDs, c.eqb.paramFormats, c.eqb.resultFormats) - } else { - batch.ExecPrepared(sd.Name, c.eqb.paramValues, c.eqb.paramFormats, c.eqb.resultFormats) - } - } - - c.eqb.Reset() // Allow c.eqb internal memory to be GC'ed as soon as possible. - - mrr := c.pgConn.ExecBatch(ctx, batch) - - return c.logBatchResults(ctx, startTime, &batchResults{ - ctx: ctx, - conn: c, - mrr: mrr, - b: b, - ix: 0, - }) -} - -func (c *Conn) logBatchResults(ctx context.Context, startTime time.Time, results *batchResults) BatchResults { - if results.err != nil { - if c.shouldLog(LogLevelError) { - endTime := time.Now() - c.log(ctx, LogLevelError, "SendBatch", map[string]interface{}{"err": results.err, "time": endTime.Sub(startTime)}) - } - return results - } - - if c.shouldLog(LogLevelInfo) { - endTime := time.Now() - c.log(ctx, LogLevelInfo, "SendBatch", map[string]interface{}{"batchLen": results.b.Len(), "time": endTime.Sub(startTime)}) - } - - return results -} - -func (c *Conn) sanitizeForSimpleQuery(sql string, args ...interface{}) (string, error) { - if c.pgConn.ParameterStatus("standard_conforming_strings") != "on" { - return "", errors.New("simple protocol queries must be run with standard_conforming_strings=on") - } - - if c.pgConn.ParameterStatus("client_encoding") != "UTF8" { - return "", errors.New("simple protocol queries must be run with client_encoding=UTF8") - } - - var err error - valueArgs := make([]interface{}, len(args)) - for i, a := range args { - valueArgs[i], err = convertSimpleArgument(c.connInfo, a) - if err != nil { - return "", err - } - } - - return sanitize.SanitizeSQL(sql, valueArgs...) -} diff --git a/vendor/github.com/jackc/pgx/v4/doc.go b/vendor/github.com/jackc/pgx/v4/doc.go deleted file mode 100644 index 222f904..0000000 --- a/vendor/github.com/jackc/pgx/v4/doc.go +++ /dev/null @@ -1,340 +0,0 @@ -// Package pgx is a PostgreSQL database driver. -/* -pgx provides lower level access to PostgreSQL than the standard database/sql. It remains as similar to the database/sql -interface as possible while providing better speed and access to PostgreSQL specific features. Import -github.com/jackc/pgx/v4/stdlib to use pgx as a database/sql compatible driver. - -Establishing a Connection - -The primary way of establishing a connection is with `pgx.Connect`. - - conn, err := pgx.Connect(context.Background(), os.Getenv("DATABASE_URL")) - -The database connection string can be in URL or DSN format. Both PostgreSQL settings and pgx settings can be specified -here. In addition, a config struct can be created by `ParseConfig` and modified before establishing the connection with -`ConnectConfig`. - - config, err := pgx.ParseConfig(os.Getenv("DATABASE_URL")) - if err != nil { - // ... - } - config.Logger = log15adapter.NewLogger(log.New("module", "pgx")) - - conn, err := pgx.ConnectConfig(context.Background(), config) - -Connection Pool - -`*pgx.Conn` represents a single connection to the database and is not concurrency safe. Use sub-package pgxpool for a -concurrency safe connection pool. - -Query Interface - -pgx implements Query and Scan in the familiar database/sql style. - - var sum int32 - - // Send the query to the server. The returned rows MUST be closed - // before conn can be used again. - rows, err := conn.Query(context.Background(), "select generate_series(1,$1)", 10) - if err != nil { - return err - } - - // rows.Close is called by rows.Next when all rows are read - // or an error occurs in Next or Scan. So it may optionally be - // omitted if nothing in the rows.Next loop can panic. It is - // safe to close rows multiple times. - defer rows.Close() - - // Iterate through the result set - for rows.Next() { - var n int32 - err = rows.Scan(&n) - if err != nil { - return err - } - sum += n - } - - // Any errors encountered by rows.Next or rows.Scan will be returned here - if rows.Err() != nil { - return rows.Err() - } - - // No errors found - do something with sum - -pgx also implements QueryRow in the same style as database/sql. - - var name string - var weight int64 - err := conn.QueryRow(context.Background(), "select name, weight from widgets where id=$1", 42).Scan(&name, &weight) - if err != nil { - return err - } - -Use Exec to execute a query that does not return a result set. - - commandTag, err := conn.Exec(context.Background(), "delete from widgets where id=$1", 42) - if err != nil { - return err - } - if commandTag.RowsAffected() != 1 { - return errors.New("No row found to delete") - } - -QueryFunc can be used to execute a callback function for every row. This is often easier to use than Query. - - var sum, n int32 - _, err = conn.QueryFunc( - context.Background(), - "select generate_series(1,$1)", - []interface{}{10}, - []interface{}{&n}, - func(pgx.QueryFuncRow) error { - sum += n - return nil - }, - ) - if err != nil { - return err - } - -Base Type Mapping - -pgx maps between all common base types directly between Go and PostgreSQL. In particular: - - Go PostgreSQL - ----------------------- - string varchar - text - - // Integers are automatically be converted to any other integer type if - // it can be done without overflow or underflow. - int8 - int16 smallint - int32 int - int64 bigint - int - uint8 - uint16 - uint32 - uint64 - uint - - // Floats are strict and do not automatically convert like integers. - float32 float4 - float64 float8 - - time.Time date - timestamp - timestamptz - - []byte bytea - - -Null Mapping - -pgx can map nulls in two ways. The first is package pgtype provides types that have a data field and a status field. -They work in a similar fashion to database/sql. The second is to use a pointer to a pointer. - - var foo pgtype.Varchar - var bar *string - err := conn.QueryRow("select foo, bar from widgets where id=$1", 42).Scan(&foo, &bar) - if err != nil { - return err - } - -Array Mapping - -pgx maps between int16, int32, int64, float32, float64, and string Go slices and the equivalent PostgreSQL array type. -Go slices of native types do not support nulls, so if a PostgreSQL array that contains a null is read into a native Go -slice an error will occur. The pgtype package includes many more array types for PostgreSQL types that do not directly -map to native Go types. - -JSON and JSONB Mapping - -pgx includes built-in support to marshal and unmarshal between Go types and the PostgreSQL JSON and JSONB. - -Inet and CIDR Mapping - -pgx encodes from net.IPNet to and from inet and cidr PostgreSQL types. In addition, as a convenience pgx will encode -from a net.IP; it will assume a /32 netmask for IPv4 and a /128 for IPv6. - -Custom Type Support - -pgx includes support for the common data types like integers, floats, strings, dates, and times that have direct -mappings between Go and SQL. In addition, pgx uses the github.com/jackc/pgtype library to support more types. See -documention for that library for instructions on how to implement custom types. - -See example_custom_type_test.go for an example of a custom type for the PostgreSQL point type. - -pgx also includes support for custom types implementing the database/sql.Scanner and database/sql/driver.Valuer -interfaces. - -If pgx does cannot natively encode a type and that type is a renamed type (e.g. type MyTime time.Time) pgx will attempt -to encode the underlying type. While this is usually desired behavior it can produce surprising behavior if one the -underlying type and the renamed type each implement database/sql interfaces and the other implements pgx interfaces. It -is recommended that this situation be avoided by implementing pgx interfaces on the renamed type. - -Composite types and row values - -Row values and composite types are represented as pgtype.Record (https://pkg.go.dev/github.com/jackc/pgtype?tab=doc#Record). -It is possible to get values of your custom type by implementing DecodeBinary interface. Decoding into -pgtype.Record first can simplify process by avoiding dealing with raw protocol directly. - -For example: - - type MyType struct { - a int // NULL will cause decoding error - b *string // there can be NULL in this position in SQL - } - - func (t *MyType) DecodeBinary(ci *pgtype.ConnInfo, src []byte) error { - r := pgtype.Record{ - Fields: []pgtype.Value{&pgtype.Int4{}, &pgtype.Text{}}, - } - - if err := r.DecodeBinary(ci, src); err != nil { - return err - } - - if r.Status != pgtype.Present { - return errors.New("BUG: decoding should not be called on NULL value") - } - - a := r.Fields[0].(*pgtype.Int4) - b := r.Fields[1].(*pgtype.Text) - - // type compatibility is checked by AssignTo - // only lossless assignments will succeed - if err := a.AssignTo(&t.a); err != nil { - return err - } - - // AssignTo also deals with null value handling - if err := b.AssignTo(&t.b); err != nil { - return err - } - return nil - } - - result := MyType{} - err := conn.QueryRow(context.Background(), "select row(1, 'foo'::text)", pgx.QueryResultFormats{pgx.BinaryFormatCode}).Scan(&r) - -Raw Bytes Mapping - -[]byte passed as arguments to Query, QueryRow, and Exec are passed unmodified to PostgreSQL. - -Transactions - -Transactions are started by calling Begin. - - tx, err := conn.Begin(context.Background()) - if err != nil { - return err - } - // Rollback is safe to call even if the tx is already closed, so if - // the tx commits successfully, this is a no-op - defer tx.Rollback(context.Background()) - - _, err = tx.Exec(context.Background(), "insert into foo(id) values (1)") - if err != nil { - return err - } - - err = tx.Commit(context.Background()) - if err != nil { - return err - } - -The Tx returned from Begin also implements the Begin method. This can be used to implement pseudo nested transactions. -These are internally implemented with savepoints. - -Use BeginTx to control the transaction mode. - -BeginFunc and BeginTxFunc are variants that begin a transaction, execute a function, and commit or rollback the -transaction depending on the return value of the function. These can be simpler and less error prone to use. - - err = conn.BeginFunc(context.Background(), func(tx pgx.Tx) error { - _, err := tx.Exec(context.Background(), "insert into foo(id) values (1)") - return err - }) - if err != nil { - return err - } - -Prepared Statements - -Prepared statements can be manually created with the Prepare method. However, this is rarely necessary because pgx -includes an automatic statement cache by default. Queries run through the normal Query, QueryRow, and Exec functions are -automatically prepared on first execution and the prepared statement is reused on subsequent executions. See ParseConfig -for information on how to customize or disable the statement cache. - -Copy Protocol - -Use CopyFrom to efficiently insert multiple rows at a time using the PostgreSQL copy protocol. CopyFrom accepts a -CopyFromSource interface. If the data is already in a [][]interface{} use CopyFromRows to wrap it in a CopyFromSource -interface. Or implement CopyFromSource to avoid buffering the entire data set in memory. - - rows := [][]interface{}{ - {"John", "Smith", int32(36)}, - {"Jane", "Doe", int32(29)}, - } - - copyCount, err := conn.CopyFrom( - context.Background(), - pgx.Identifier{"people"}, - []string{"first_name", "last_name", "age"}, - pgx.CopyFromRows(rows), - ) - -When you already have a typed array using CopyFromSlice can be more convenient. - - rows := []User{ - {"John", "Smith", 36}, - {"Jane", "Doe", 29}, - } - - copyCount, err := conn.CopyFrom( - context.Background(), - pgx.Identifier{"people"}, - []string{"first_name", "last_name", "age"}, - pgx.CopyFromSlice(len(rows), func(i int) ([]interface{}, error) { - return []interface{}{rows[i].FirstName, rows[i].LastName, rows[i].Age}, nil - }), - ) - -CopyFrom can be faster than an insert with as few as 5 rows. - -Listen and Notify - -pgx can listen to the PostgreSQL notification system with the `Conn.WaitForNotification` method. It blocks until a -notification is received or the context is canceled. - - _, err := conn.Exec(context.Background(), "listen channelname") - if err != nil { - return nil - } - - if notification, err := conn.WaitForNotification(context.Background()); err != nil { - // do something with notification - } - - -Logging - -pgx defines a simple logger interface. Connections optionally accept a logger that satisfies this interface. Set -LogLevel to control logging verbosity. Adapters for github.com/inconshreveable/log15, github.com/sirupsen/logrus, -go.uber.org/zap, github.com/rs/zerolog, and the testing log are provided in the log directory. - -Lower Level PostgreSQL Functionality - -pgx is implemented on top of github.com/jackc/pgconn a lower level PostgreSQL driver. The Conn.PgConn() method can be -used to access this lower layer. - -PgBouncer - -pgx is compatible with PgBouncer in two modes. One is when the connection has a statement cache in "describe" mode. The -other is when the connection is using the simple protocol. This can be set with the PreferSimpleProtocol config option. -*/ -package pgx diff --git a/vendor/github.com/jackc/pgx/v4/extended_query_builder.go b/vendor/github.com/jackc/pgx/v4/extended_query_builder.go deleted file mode 100644 index d06f63f..0000000 --- a/vendor/github.com/jackc/pgx/v4/extended_query_builder.go +++ /dev/null @@ -1,161 +0,0 @@ -package pgx - -import ( - "database/sql/driver" - "fmt" - "reflect" - - "github.com/jackc/pgtype" -) - -type extendedQueryBuilder struct { - paramValues [][]byte - paramValueBytes []byte - paramFormats []int16 - resultFormats []int16 -} - -func (eqb *extendedQueryBuilder) AppendParam(ci *pgtype.ConnInfo, oid uint32, arg interface{}) error { - f := chooseParameterFormatCode(ci, oid, arg) - eqb.paramFormats = append(eqb.paramFormats, f) - - v, err := eqb.encodeExtendedParamValue(ci, oid, f, arg) - if err != nil { - return err - } - eqb.paramValues = append(eqb.paramValues, v) - - return nil -} - -func (eqb *extendedQueryBuilder) AppendResultFormat(f int16) { - eqb.resultFormats = append(eqb.resultFormats, f) -} - -// Reset readies eqb to build another query. -func (eqb *extendedQueryBuilder) Reset() { - eqb.paramValues = eqb.paramValues[0:0] - eqb.paramValueBytes = eqb.paramValueBytes[0:0] - eqb.paramFormats = eqb.paramFormats[0:0] - eqb.resultFormats = eqb.resultFormats[0:0] - - if cap(eqb.paramValues) > 64 { - eqb.paramValues = make([][]byte, 0, 64) - } - - if cap(eqb.paramValueBytes) > 256 { - eqb.paramValueBytes = make([]byte, 0, 256) - } - - if cap(eqb.paramFormats) > 64 { - eqb.paramFormats = make([]int16, 0, 64) - } - if cap(eqb.resultFormats) > 64 { - eqb.resultFormats = make([]int16, 0, 64) - } -} - -func (eqb *extendedQueryBuilder) encodeExtendedParamValue(ci *pgtype.ConnInfo, oid uint32, formatCode int16, arg interface{}) ([]byte, error) { - if arg == nil { - return nil, nil - } - - refVal := reflect.ValueOf(arg) - argIsPtr := refVal.Kind() == reflect.Ptr - - if argIsPtr && refVal.IsNil() { - return nil, nil - } - - if eqb.paramValueBytes == nil { - eqb.paramValueBytes = make([]byte, 0, 128) - } - - var err error - var buf []byte - pos := len(eqb.paramValueBytes) - - if arg, ok := arg.(string); ok { - return []byte(arg), nil - } - - if formatCode == TextFormatCode { - if arg, ok := arg.(pgtype.TextEncoder); ok { - buf, err = arg.EncodeText(ci, eqb.paramValueBytes) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - eqb.paramValueBytes = buf - return eqb.paramValueBytes[pos:], nil - } - } else if formatCode == BinaryFormatCode { - if arg, ok := arg.(pgtype.BinaryEncoder); ok { - buf, err = arg.EncodeBinary(ci, eqb.paramValueBytes) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - eqb.paramValueBytes = buf - return eqb.paramValueBytes[pos:], nil - } - } - - if argIsPtr { - // We have already checked that arg is not pointing to nil, - // so it is safe to dereference here. - arg = refVal.Elem().Interface() - return eqb.encodeExtendedParamValue(ci, oid, formatCode, arg) - } - - if dt, ok := ci.DataTypeForOID(oid); ok { - value := dt.Value - err := value.Set(arg) - if err != nil { - { - if arg, ok := arg.(driver.Valuer); ok { - v, err := callValuerValue(arg) - if err != nil { - return nil, err - } - return eqb.encodeExtendedParamValue(ci, oid, formatCode, v) - } - } - - return nil, err - } - - return eqb.encodeExtendedParamValue(ci, oid, formatCode, value) - } - - // There is no data type registered for the destination OID, but maybe there is data type registered for the arg - // type. If so use it's text encoder (if available). - if dt, ok := ci.DataTypeForValue(arg); ok { - value := dt.Value - if textEncoder, ok := value.(pgtype.TextEncoder); ok { - err := value.Set(arg) - if err != nil { - return nil, err - } - - buf, err = textEncoder.EncodeText(ci, eqb.paramValueBytes) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - eqb.paramValueBytes = buf - return eqb.paramValueBytes[pos:], nil - } - } - - if strippedArg, ok := stripNamedType(&refVal); ok { - return eqb.encodeExtendedParamValue(ci, oid, formatCode, strippedArg) - } - return nil, SerializationError(fmt.Sprintf("Cannot encode %T into oid %v - %T must implement Encoder or be converted to a string", arg, oid, arg)) -} diff --git a/vendor/github.com/jackc/pgx/v4/go_stdlib.go b/vendor/github.com/jackc/pgx/v4/go_stdlib.go deleted file mode 100644 index 9372f9e..0000000 --- a/vendor/github.com/jackc/pgx/v4/go_stdlib.go +++ /dev/null @@ -1,61 +0,0 @@ -package pgx - -import ( - "database/sql/driver" - "reflect" -) - -// This file contains code copied from the Go standard library due to the -// required function not being public. - -// Copyright (c) 2009 The Go Authors. All rights reserved. - -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: - -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. - -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// From database/sql/convert.go - -var valuerReflectType = reflect.TypeOf((*driver.Valuer)(nil)).Elem() - -// callValuerValue returns vr.Value(), with one exception: -// If vr.Value is an auto-generated method on a pointer type and the -// pointer is nil, it would panic at runtime in the panicwrap -// method. Treat it like nil instead. -// Issue 8415. -// -// This is so people can implement driver.Value on value types and -// still use nil pointers to those types to mean nil/NULL, just like -// string/*string. -// -// This function is mirrored in the database/sql/driver package. -func callValuerValue(vr driver.Valuer) (v driver.Value, err error) { - if rv := reflect.ValueOf(vr); rv.Kind() == reflect.Ptr && - rv.IsNil() && - rv.Type().Elem().Implements(valuerReflectType) { - return nil, nil - } - return vr.Value() -} diff --git a/vendor/github.com/jackc/pgx/v4/logger.go b/vendor/github.com/jackc/pgx/v4/logger.go deleted file mode 100644 index 41f8b7e..0000000 --- a/vendor/github.com/jackc/pgx/v4/logger.go +++ /dev/null @@ -1,107 +0,0 @@ -package pgx - -import ( - "context" - "encoding/hex" - "errors" - "fmt" -) - -// The values for log levels are chosen such that the zero value means that no -// log level was specified. -const ( - LogLevelTrace = 6 - LogLevelDebug = 5 - LogLevelInfo = 4 - LogLevelWarn = 3 - LogLevelError = 2 - LogLevelNone = 1 -) - -// LogLevel represents the pgx logging level. See LogLevel* constants for -// possible values. -type LogLevel int - -func (ll LogLevel) String() string { - switch ll { - case LogLevelTrace: - return "trace" - case LogLevelDebug: - return "debug" - case LogLevelInfo: - return "info" - case LogLevelWarn: - return "warn" - case LogLevelError: - return "error" - case LogLevelNone: - return "none" - default: - return fmt.Sprintf("invalid level %d", ll) - } -} - -// Logger is the interface used to get logging from pgx internals. -type Logger interface { - // Log a message at the given level with data key/value pairs. data may be nil. - Log(ctx context.Context, level LogLevel, msg string, data map[string]interface{}) -} - -// LoggerFunc is a wrapper around a function to satisfy the pgx.Logger interface -type LoggerFunc func(ctx context.Context, level LogLevel, msg string, data map[string]interface{}) - -// Log delegates the logging request to the wrapped function -func (f LoggerFunc) Log(ctx context.Context, level LogLevel, msg string, data map[string]interface{}) { - f(ctx, level, msg, data) -} - -// LogLevelFromString converts log level string to constant -// -// Valid levels: -// -// trace -// debug -// info -// warn -// error -// none -func LogLevelFromString(s string) (LogLevel, error) { - switch s { - case "trace": - return LogLevelTrace, nil - case "debug": - return LogLevelDebug, nil - case "info": - return LogLevelInfo, nil - case "warn": - return LogLevelWarn, nil - case "error": - return LogLevelError, nil - case "none": - return LogLevelNone, nil - default: - return 0, errors.New("invalid log level") - } -} - -func logQueryArgs(args []interface{}) []interface{} { - logArgs := make([]interface{}, 0, len(args)) - - for _, a := range args { - switch v := a.(type) { - case []byte: - if len(v) < 64 { - a = hex.EncodeToString(v) - } else { - a = fmt.Sprintf("%x (truncated %d bytes)", v[:64], len(v)-64) - } - case string: - if len(v) > 64 { - a = fmt.Sprintf("%s (truncated %d bytes)", v[:64], len(v)-64) - } - } - logArgs = append(logArgs, a) - } - - return logArgs -} diff --git a/vendor/github.com/jackc/pgx/v4/messages.go b/vendor/github.com/jackc/pgx/v4/messages.go deleted file mode 100644 index 5324cbb..0000000 --- a/vendor/github.com/jackc/pgx/v4/messages.go +++ /dev/null @@ -1,23 +0,0 @@ -package pgx - -import ( - "database/sql/driver" - - "github.com/jackc/pgtype" -) - -func convertDriverValuers(args []interface{}) ([]interface{}, error) { - for i, arg := range args { - switch arg := arg.(type) { - case pgtype.BinaryEncoder: - case pgtype.TextEncoder: - case driver.Valuer: - v, err := callValuerValue(arg) - if err != nil { - return nil, err - } - args[i] = v - } - } - return args, nil -} diff --git a/vendor/github.com/jackc/pgx/v4/rows.go b/vendor/github.com/jackc/pgx/v4/rows.go deleted file mode 100644 index 4749ead..0000000 --- a/vendor/github.com/jackc/pgx/v4/rows.go +++ /dev/null @@ -1,351 +0,0 @@ -package pgx - -import ( - "context" - "errors" - "fmt" - "time" - - "github.com/jackc/pgconn" - "github.com/jackc/pgproto3/v2" - "github.com/jackc/pgtype" -) - -// Rows is the result set returned from *Conn.Query. Rows must be closed before -// the *Conn can be used again. Rows are closed by explicitly calling Close(), -// calling Next() until it returns false, or when a fatal error occurs. -// -// Once a Rows is closed the only methods that may be called are Close(), Err(), and CommandTag(). -// -// Rows is an interface instead of a struct to allow tests to mock Query. However, -// adding a method to an interface is technically a breaking change. Because of this -// the Rows interface is partially excluded from semantic version requirements. -// Methods will not be removed or changed, but new methods may be added. -type Rows interface { - // Close closes the rows, making the connection ready for use again. It is safe - // to call Close after rows is already closed. - Close() - - // Err returns any error that occurred while reading. - Err() error - - // CommandTag returns the command tag from this query. It is only available after Rows is closed. - CommandTag() pgconn.CommandTag - - FieldDescriptions() []pgproto3.FieldDescription - - // Next prepares the next row for reading. It returns true if there is another - // row and false if no more rows are available. It automatically closes rows - // when all rows are read. - Next() bool - - // Scan reads the values from the current row into dest values positionally. - // dest can include pointers to core types, values implementing the Scanner - // interface, and nil. nil will skip the value entirely. It is an error to - // call Scan without first calling Next() and checking that it returned true. - Scan(dest ...interface{}) error - - // Values returns the decoded row values. As with Scan(), it is an error to - // call Values without first calling Next() and checking that it returned - // true. - Values() ([]interface{}, error) - - // RawValues returns the unparsed bytes of the row values. The returned [][]byte is only valid until the next Next - // call or the Rows is closed. However, the underlying byte data is safe to retain a reference to and mutate. - RawValues() [][]byte -} - -// Row is a convenience wrapper over Rows that is returned by QueryRow. -// -// Row is an interface instead of a struct to allow tests to mock QueryRow. However, -// adding a method to an interface is technically a breaking change. Because of this -// the Row interface is partially excluded from semantic version requirements. -// Methods will not be removed or changed, but new methods may be added. -type Row interface { - // Scan works the same as Rows. with the following exceptions. If no - // rows were found it returns ErrNoRows. If multiple rows are returned it - // ignores all but the first. - Scan(dest ...interface{}) error -} - -// connRow implements the Row interface for Conn.QueryRow. -type connRow connRows - -func (r *connRow) Scan(dest ...interface{}) (err error) { - rows := (*connRows)(r) - - if rows.Err() != nil { - return rows.Err() - } - - if !rows.Next() { - if rows.Err() == nil { - return ErrNoRows - } - return rows.Err() - } - - rows.Scan(dest...) - rows.Close() - return rows.Err() -} - -type rowLog interface { - shouldLog(lvl LogLevel) bool - log(ctx context.Context, lvl LogLevel, msg string, data map[string]interface{}) -} - -// connRows implements the Rows interface for Conn.Query. -type connRows struct { - ctx context.Context - logger rowLog - connInfo *pgtype.ConnInfo - values [][]byte - rowCount int - err error - commandTag pgconn.CommandTag - startTime time.Time - sql string - args []interface{} - closed bool - conn *Conn - - resultReader *pgconn.ResultReader - multiResultReader *pgconn.MultiResultReader - - scanPlans []pgtype.ScanPlan -} - -func (rows *connRows) FieldDescriptions() []pgproto3.FieldDescription { - return rows.resultReader.FieldDescriptions() -} - -func (rows *connRows) Close() { - if rows.closed { - return - } - - rows.closed = true - - if rows.resultReader != nil { - var closeErr error - rows.commandTag, closeErr = rows.resultReader.Close() - if rows.err == nil { - rows.err = closeErr - } - } - - if rows.multiResultReader != nil { - closeErr := rows.multiResultReader.Close() - if rows.err == nil { - rows.err = closeErr - } - } - - if rows.logger != nil { - endTime := time.Now() - - if rows.err == nil { - if rows.logger.shouldLog(LogLevelInfo) { - rows.logger.log(rows.ctx, LogLevelInfo, "Query", map[string]interface{}{"sql": rows.sql, "args": logQueryArgs(rows.args), "time": endTime.Sub(rows.startTime), "rowCount": rows.rowCount}) - } - } else { - if rows.logger.shouldLog(LogLevelError) { - rows.logger.log(rows.ctx, LogLevelError, "Query", map[string]interface{}{"err": rows.err, "sql": rows.sql, "time": endTime.Sub(rows.startTime), "args": logQueryArgs(rows.args)}) - } - if rows.err != nil && rows.conn.stmtcache != nil { - rows.conn.stmtcache.StatementErrored(rows.sql, rows.err) - } - } - } -} - -func (rows *connRows) CommandTag() pgconn.CommandTag { - return rows.commandTag -} - -func (rows *connRows) Err() error { - return rows.err -} - -// fatal signals an error occurred after the query was sent to the server. It -// closes the rows automatically. -func (rows *connRows) fatal(err error) { - if rows.err != nil { - return - } - - rows.err = err - rows.Close() -} - -func (rows *connRows) Next() bool { - if rows.closed { - return false - } - - if rows.resultReader.NextRow() { - rows.rowCount++ - rows.values = rows.resultReader.Values() - return true - } else { - rows.Close() - return false - } -} - -func (rows *connRows) Scan(dest ...interface{}) error { - ci := rows.connInfo - fieldDescriptions := rows.FieldDescriptions() - values := rows.values - - if len(fieldDescriptions) != len(values) { - err := fmt.Errorf("number of field descriptions must equal number of values, got %d and %d", len(fieldDescriptions), len(values)) - rows.fatal(err) - return err - } - if len(fieldDescriptions) != len(dest) { - err := fmt.Errorf("number of field descriptions must equal number of destinations, got %d and %d", len(fieldDescriptions), len(dest)) - rows.fatal(err) - return err - } - - if rows.scanPlans == nil { - rows.scanPlans = make([]pgtype.ScanPlan, len(values)) - for i := range dest { - rows.scanPlans[i] = ci.PlanScan(fieldDescriptions[i].DataTypeOID, fieldDescriptions[i].Format, dest[i]) - } - } - - for i, dst := range dest { - if dst == nil { - continue - } - - err := rows.scanPlans[i].Scan(ci, fieldDescriptions[i].DataTypeOID, fieldDescriptions[i].Format, values[i], dst) - if err != nil { - err = ScanArgError{ColumnIndex: i, Err: err} - rows.fatal(err) - return err - } - } - - return nil -} - -func (rows *connRows) Values() ([]interface{}, error) { - if rows.closed { - return nil, errors.New("rows is closed") - } - - values := make([]interface{}, 0, len(rows.FieldDescriptions())) - - for i := range rows.FieldDescriptions() { - buf := rows.values[i] - fd := &rows.FieldDescriptions()[i] - - if buf == nil { - values = append(values, nil) - continue - } - - if dt, ok := rows.connInfo.DataTypeForOID(fd.DataTypeOID); ok { - value := dt.Value - - switch fd.Format { - case TextFormatCode: - decoder, ok := value.(pgtype.TextDecoder) - if !ok { - decoder = &pgtype.GenericText{} - } - err := decoder.DecodeText(rows.connInfo, buf) - if err != nil { - rows.fatal(err) - } - values = append(values, decoder.(pgtype.Value).Get()) - case BinaryFormatCode: - decoder, ok := value.(pgtype.BinaryDecoder) - if !ok { - decoder = &pgtype.GenericBinary{} - } - err := decoder.DecodeBinary(rows.connInfo, buf) - if err != nil { - rows.fatal(err) - } - values = append(values, value.Get()) - default: - rows.fatal(errors.New("Unknown format code")) - } - } else { - switch fd.Format { - case TextFormatCode: - decoder := &pgtype.GenericText{} - err := decoder.DecodeText(rows.connInfo, buf) - if err != nil { - rows.fatal(err) - } - values = append(values, decoder.Get()) - case BinaryFormatCode: - decoder := &pgtype.GenericBinary{} - err := decoder.DecodeBinary(rows.connInfo, buf) - if err != nil { - rows.fatal(err) - } - values = append(values, decoder.Get()) - default: - rows.fatal(errors.New("Unknown format code")) - } - } - - if rows.Err() != nil { - return nil, rows.Err() - } - } - - return values, rows.Err() -} - -func (rows *connRows) RawValues() [][]byte { - return rows.values -} - -type ScanArgError struct { - ColumnIndex int - Err error -} - -func (e ScanArgError) Error() string { - return fmt.Sprintf("can't scan into dest[%d]: %v", e.ColumnIndex, e.Err) -} - -func (e ScanArgError) Unwrap() error { - return e.Err -} - -// ScanRow decodes raw row data into dest. It can be used to scan rows read from the lower level pgconn interface. -// -// connInfo - OID to Go type mapping. -// fieldDescriptions - OID and format of values -// values - the raw data as returned from the PostgreSQL server -// dest - the destination that values will be decoded into -func ScanRow(connInfo *pgtype.ConnInfo, fieldDescriptions []pgproto3.FieldDescription, values [][]byte, dest ...interface{}) error { - if len(fieldDescriptions) != len(values) { - return fmt.Errorf("number of field descriptions must equal number of values, got %d and %d", len(fieldDescriptions), len(values)) - } - if len(fieldDescriptions) != len(dest) { - return fmt.Errorf("number of field descriptions must equal number of destinations, got %d and %d", len(fieldDescriptions), len(dest)) - } - - for i, d := range dest { - if d == nil { - continue - } - - err := connInfo.Scan(fieldDescriptions[i].DataTypeOID, fieldDescriptions[i].Format, values[i], d) - if err != nil { - return ScanArgError{ColumnIndex: i, Err: err} - } - } - - return nil -} diff --git a/vendor/github.com/jackc/pgx/v4/values.go b/vendor/github.com/jackc/pgx/v4/values.go deleted file mode 100644 index 1a94547..0000000 --- a/vendor/github.com/jackc/pgx/v4/values.go +++ /dev/null @@ -1,280 +0,0 @@ -package pgx - -import ( - "database/sql/driver" - "fmt" - "math" - "reflect" - "time" - - "github.com/jackc/pgio" - "github.com/jackc/pgtype" -) - -// PostgreSQL format codes -const ( - TextFormatCode = 0 - BinaryFormatCode = 1 -) - -// SerializationError occurs on failure to encode or decode a value -type SerializationError string - -func (e SerializationError) Error() string { - return string(e) -} - -func convertSimpleArgument(ci *pgtype.ConnInfo, arg interface{}) (interface{}, error) { - if arg == nil { - return nil, nil - } - - refVal := reflect.ValueOf(arg) - if refVal.Kind() == reflect.Ptr && refVal.IsNil() { - return nil, nil - } - - switch arg := arg.(type) { - - // https://github.com/jackc/pgx/issues/409 Changed JSON and JSONB to surface - // []byte to database/sql instead of string. But that caused problems with the - // simple protocol because the driver.Valuer case got taken before the - // pgtype.TextEncoder case. And driver.Valuer needed to be first in the usual - // case because of https://github.com/jackc/pgx/issues/339. So instead we - // special case JSON and JSONB. - case *pgtype.JSON: - buf, err := arg.EncodeText(ci, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - return string(buf), nil - case *pgtype.JSONB: - buf, err := arg.EncodeText(ci, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - return string(buf), nil - - case driver.Valuer: - return callValuerValue(arg) - case pgtype.TextEncoder: - buf, err := arg.EncodeText(ci, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - return string(buf), nil - case float32: - return float64(arg), nil - case float64: - return arg, nil - case bool: - return arg, nil - case time.Duration: - return fmt.Sprintf("%d microsecond", int64(arg)/1000), nil - case time.Time: - return arg, nil - case string: - return arg, nil - case []byte: - return arg, nil - case int8: - return int64(arg), nil - case int16: - return int64(arg), nil - case int32: - return int64(arg), nil - case int64: - return arg, nil - case int: - return int64(arg), nil - case uint8: - return int64(arg), nil - case uint16: - return int64(arg), nil - case uint32: - return int64(arg), nil - case uint64: - if arg > math.MaxInt64 { - return nil, fmt.Errorf("arg too big for int64: %v", arg) - } - return int64(arg), nil - case uint: - if uint64(arg) > math.MaxInt64 { - return nil, fmt.Errorf("arg too big for int64: %v", arg) - } - return int64(arg), nil - } - - if dt, found := ci.DataTypeForValue(arg); found { - v := dt.Value - err := v.Set(arg) - if err != nil { - return nil, err - } - buf, err := v.(pgtype.TextEncoder).EncodeText(ci, nil) - if err != nil { - return nil, err - } - if buf == nil { - return nil, nil - } - return string(buf), nil - } - - if refVal.Kind() == reflect.Ptr { - arg = refVal.Elem().Interface() - return convertSimpleArgument(ci, arg) - } - - if strippedArg, ok := stripNamedType(&refVal); ok { - return convertSimpleArgument(ci, strippedArg) - } - return nil, SerializationError(fmt.Sprintf("Cannot encode %T in simple protocol - %T must implement driver.Valuer, pgtype.TextEncoder, or be a native type", arg, arg)) -} - -func encodePreparedStatementArgument(ci *pgtype.ConnInfo, buf []byte, oid uint32, arg interface{}) ([]byte, error) { - if arg == nil { - return pgio.AppendInt32(buf, -1), nil - } - - switch arg := arg.(type) { - case pgtype.BinaryEncoder: - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - argBuf, err := arg.EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if argBuf != nil { - buf = argBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - return buf, nil - case pgtype.TextEncoder: - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - argBuf, err := arg.EncodeText(ci, buf) - if err != nil { - return nil, err - } - if argBuf != nil { - buf = argBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - return buf, nil - case string: - buf = pgio.AppendInt32(buf, int32(len(arg))) - buf = append(buf, arg...) - return buf, nil - } - - refVal := reflect.ValueOf(arg) - - if refVal.Kind() == reflect.Ptr { - if refVal.IsNil() { - return pgio.AppendInt32(buf, -1), nil - } - arg = refVal.Elem().Interface() - return encodePreparedStatementArgument(ci, buf, oid, arg) - } - - if dt, ok := ci.DataTypeForOID(oid); ok { - value := dt.Value - err := value.Set(arg) - if err != nil { - { - if arg, ok := arg.(driver.Valuer); ok { - v, err := callValuerValue(arg) - if err != nil { - return nil, err - } - return encodePreparedStatementArgument(ci, buf, oid, v) - } - } - - return nil, err - } - - sp := len(buf) - buf = pgio.AppendInt32(buf, -1) - argBuf, err := value.(pgtype.BinaryEncoder).EncodeBinary(ci, buf) - if err != nil { - return nil, err - } - if argBuf != nil { - buf = argBuf - pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) - } - return buf, nil - } - - if strippedArg, ok := stripNamedType(&refVal); ok { - return encodePreparedStatementArgument(ci, buf, oid, strippedArg) - } - return nil, SerializationError(fmt.Sprintf("Cannot encode %T into oid %v - %T must implement Encoder or be converted to a string", arg, oid, arg)) -} - -// chooseParameterFormatCode determines the correct format code for an -// argument to a prepared statement. It defaults to TextFormatCode if no -// determination can be made. -func chooseParameterFormatCode(ci *pgtype.ConnInfo, oid uint32, arg interface{}) int16 { - switch arg := arg.(type) { - case pgtype.ParamFormatPreferrer: - return arg.PreferredParamFormat() - case pgtype.BinaryEncoder: - return BinaryFormatCode - case string, *string, pgtype.TextEncoder: - return TextFormatCode - } - - return ci.ParamFormatCodeForOID(oid) -} - -func stripNamedType(val *reflect.Value) (interface{}, bool) { - switch val.Kind() { - case reflect.Int: - convVal := int(val.Int()) - return convVal, reflect.TypeOf(convVal) != val.Type() - case reflect.Int8: - convVal := int8(val.Int()) - return convVal, reflect.TypeOf(convVal) != val.Type() - case reflect.Int16: - convVal := int16(val.Int()) - return convVal, reflect.TypeOf(convVal) != val.Type() - case reflect.Int32: - convVal := int32(val.Int()) - return convVal, reflect.TypeOf(convVal) != val.Type() - case reflect.Int64: - convVal := int64(val.Int()) - return convVal, reflect.TypeOf(convVal) != val.Type() - case reflect.Uint: - convVal := uint(val.Uint()) - return convVal, reflect.TypeOf(convVal) != val.Type() - case reflect.Uint8: - convVal := uint8(val.Uint()) - return convVal, reflect.TypeOf(convVal) != val.Type() - case reflect.Uint16: - convVal := uint16(val.Uint()) - return convVal, reflect.TypeOf(convVal) != val.Type() - case reflect.Uint32: - convVal := uint32(val.Uint()) - return convVal, reflect.TypeOf(convVal) != val.Type() - case reflect.Uint64: - convVal := uint64(val.Uint()) - return convVal, reflect.TypeOf(convVal) != val.Type() - case reflect.String: - convVal := val.String() - return convVal, reflect.TypeOf(convVal) != val.Type() - } - - return nil, false -} diff --git a/vendor/github.com/jackc/pgx/v4/.gitignore b/vendor/github.com/jackc/pgx/v5/.gitignore similarity index 92% rename from vendor/github.com/jackc/pgx/v4/.gitignore rename to vendor/github.com/jackc/pgx/v5/.gitignore index 39175a9..a2ebbe9 100644 --- a/vendor/github.com/jackc/pgx/v4/.gitignore +++ b/vendor/github.com/jackc/pgx/v5/.gitignore @@ -22,3 +22,6 @@ _testmain.go *.exe .envrc +/.testdb + +.DS_Store diff --git a/vendor/github.com/jackc/pgx/v5/CHANGELOG.md b/vendor/github.com/jackc/pgx/v5/CHANGELOG.md new file mode 100644 index 0000000..a0ff9ba --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/CHANGELOG.md @@ -0,0 +1,422 @@ +# 5.7.1 (September 10, 2024) + +* Fix data race in tracelog.TraceLog +* Update puddle to v2.2.2. This removes the import of nanotime via linkname. +* Update golang.org/x/crypto and golang.org/x/text + +# 5.7.0 (September 7, 2024) + +* Add support for sslrootcert=system (Yann Soubeyrand) +* Add LoadTypes to load multiple types in a single SQL query (Nick Farrell) +* Add XMLCodec supports encoding + scanning XML column type like json (nickcruess-soda) +* Add MultiTrace (Stepan Rabotkin) +* Add TraceLogConfig with customizable TimeKey (stringintech) +* pgx.ErrNoRows wraps sql.ErrNoRows to aid in database/sql compatibility with native pgx functions (merlin) +* Support scanning binary formatted uint32 into string / TextScanner (jennifersp) +* Fix interval encoding to allow 0s and avoid extra spaces (Carlos Pérez-Aradros Herce) +* Update pgservicefile - fixes panic when parsing invalid file +* Better error message when reading past end of batch +* Don't print url when url.Parse returns an error (Kevin Biju) +* Fix snake case name normalization collision in RowToStructByName with db tag (nolandseigler) +* Fix: Scan and encode types with underlying types of arrays + +# 5.6.0 (May 25, 2024) + +* Add StrictNamedArgs (Tomas Zahradnicek) +* Add support for macaddr8 type (Carlos Pérez-Aradros Herce) +* Add SeverityUnlocalized field to PgError / Notice +* Performance optimization of RowToStructByPos/Name (Zach Olstein) +* Allow customizing context canceled behavior for pgconn +* Add ScanLocation to pgtype.Timestamp[tz]Codec +* Add custom data to pgconn.PgConn +* Fix ResultReader.Read() to handle nil values +* Do not encode interval microseconds when they are 0 (Carlos Pérez-Aradros Herce) +* pgconn.SafeToRetry checks for wrapped errors (tjasko) +* Failed connection attempts include all errors +* Optimize LargeObject.Read (Mitar) +* Add tracing for connection acquire and release from pool (ngavinsir) +* Fix encode driver.Valuer not called when nil +* Add support for custom JSON marshal and unmarshal (Mitar) +* Use Go default keepalive for TCP connections (Hans-Joachim Kliemeck) + +# 5.5.5 (March 9, 2024) + +Use spaces instead of parentheses for SQL sanitization. + +This still solves the problem of negative numbers creating a line comment, but this avoids breaking edge cases such as +`set foo to $1` where the substitution is taking place in a location where an arbitrary expression is not allowed. + +# 5.5.4 (March 4, 2024) + +Fix CVE-2024-27304 + +SQL injection can occur if an attacker can cause a single query or bind message to exceed 4 GB in size. An integer +overflow in the calculated message size can cause the one large message to be sent as multiple messages under the +attacker's control. + +Thanks to Paul Gerste for reporting this issue. + +* Fix behavior of CollectRows to return empty slice if Rows are empty (Felix) +* Fix simple protocol encoding of json.RawMessage +* Fix *Pipeline.getResults should close pipeline on error +* Fix panic in TryFindUnderlyingTypeScanPlan (David Kurman) +* Fix deallocation of invalidated cached statements in a transaction +* Handle invalid sslkey file +* Fix scan float4 into sql.Scanner +* Fix pgtype.Bits not making copy of data from read buffer. This would cause the data to be corrupted by future reads. + +# 5.5.3 (February 3, 2024) + +* Fix: prepared statement already exists +* Improve CopyFrom auto-conversion of text-ish values +* Add ltree type support (Florent Viel) +* Make some properties of Batch and QueuedQuery public (Pavlo Golub) +* Add AppendRows function (Edoardo Spadolini) +* Optimize convert UUID [16]byte to string (Kirill Malikov) +* Fix: LargeObject Read and Write of more than ~1GB at a time (Mitar) + +# 5.5.2 (January 13, 2024) + +* Allow NamedArgs to start with underscore +* pgproto3: Maximum message body length support (jeremy.spriet) +* Upgrade golang.org/x/crypto to v0.17.0 +* Add snake_case support to RowToStructByName (Tikhon Fedulov) +* Fix: update description cache after exec prepare (James Hartig) +* Fix: pipeline checks if it is closed (James Hartig and Ryan Fowler) +* Fix: normalize timeout / context errors during TLS startup (Samuel Stauffer) +* Add OnPgError for easier centralized error handling (James Hartig) + +# 5.5.1 (December 9, 2023) + +* Add CopyFromFunc helper function. (robford) +* Add PgConn.Deallocate method that uses PostgreSQL protocol Close message. +* pgx uses new PgConn.Deallocate method. This allows deallocating statements to work in a failed transaction. This fixes a case where the prepared statement map could become invalid. +* Fix: Prefer driver.Valuer over json.Marshaler for json fields. (Jacopo) +* Fix: simple protocol SQL sanitizer previously panicked if an invalid $0 placeholder was used. This now returns an error instead. (maksymnevajdev) +* Add pgtype.Numeric.ScanScientific (Eshton Robateau) + +# 5.5.0 (November 4, 2023) + +* Add CollectExactlyOneRow. (Julien GOTTELAND) +* Add OpenDBFromPool to create *database/sql.DB from *pgxpool.Pool. (Lev Zakharov) +* Prepare can automatically choose statement name based on sql. This makes it easier to explicitly manage prepared statements. +* Statement cache now uses deterministic, stable statement names. +* database/sql prepared statement names are deterministically generated. +* Fix: SendBatch wasn't respecting context cancellation. +* Fix: Timeout error from pipeline is now normalized. +* Fix: database/sql encoding json.RawMessage to []byte. +* CancelRequest: Wait for the cancel request to be acknowledged by the server. This should improve PgBouncer compatibility. (Anton Levakin) +* stdlib: Use Ping instead of CheckConn in ResetSession +* Add json.Marshaler and json.Unmarshaler for Float4, Float8 (Kirill Mironov) + +# 5.4.3 (August 5, 2023) + +* Fix: QCharArrayOID was defined with the wrong OID (Christoph Engelbert) +* Fix: connect_timeout for sslmode=allow|prefer (smaher-edb) +* Fix: pgxpool: background health check cannot overflow pool +* Fix: Check for nil in defer when sending batch (recover properly from panic) +* Fix: json scan of non-string pointer to pointer +* Fix: zeronull.Timestamptz should use pgtype.Timestamptz +* Fix: NewConnsCount was not correctly counting connections created by Acquire directly. (James Hartig) +* RowTo(AddrOf)StructByPos ignores fields with "-" db tag +* Optimization: improve text format numeric parsing (horpto) + +# 5.4.2 (July 11, 2023) + +* Fix: RowScanner errors are fatal to Rows +* Fix: Enable failover efforts when pg_hba.conf disallows non-ssl connections (Brandon Kauffman) +* Hstore text codec internal improvements (Evan Jones) +* Fix: Stop timers for background reader when not in use. Fixes memory leak when closing connections (Adrian-Stefan Mares) +* Fix: Stop background reader as soon as possible. +* Add PgConn.SyncConn(). This combined with the above fix makes it safe to directly use the underlying net.Conn. + +# 5.4.1 (June 18, 2023) + +* Fix: concurrency bug with pgtypeDefaultMap and simple protocol (Lev Zakharov) +* Add TxOptions.BeginQuery to allow overriding the default BEGIN query + +# 5.4.0 (June 14, 2023) + +* Replace platform specific syscalls for non-blocking IO with more traditional goroutines and deadlines. This returns to the v4 approach with some additional improvements and fixes. This restores the ability to use a pgx.Conn over an ssh.Conn as well as other non-TCP or Unix socket connections. In addition, it is a significantly simpler implementation that is less likely to have cross platform issues. +* Optimization: The default type registrations are now shared among all connections. This saves about 100KB of memory per connection. `pgtype.Type` and `pgtype.Codec` values are now required to be immutable after registration. This was already necessary in most cases but wasn't documented until now. (Lev Zakharov) +* Fix: Ensure pgxpool.Pool.QueryRow.Scan releases connection on panic +* CancelRequest: don't try to read the reply (Nicola Murino) +* Fix: correctly handle bool type aliases (Wichert Akkerman) +* Fix: pgconn.CancelRequest: Fix unix sockets: don't use RemoteAddr() +* Fix: pgx.Conn memory leak with prepared statement caching (Evan Jones) +* Add BeforeClose to pgxpool.Pool (Evan Cordell) +* Fix: various hstore fixes and optimizations (Evan Jones) +* Fix: RowToStructByPos with embedded unexported struct +* Support different bool string representations (Lev Zakharov) +* Fix: error when using BatchResults.Exec on a select that returns an error after some rows. +* Fix: pipelineBatchResults.Exec() not returning error from ResultReader +* Fix: pipeline batch results not closing pipeline when error occurs while reading directly from results instead of using + a callback. +* Fix: scanning a table type into a struct +* Fix: scan array of record to pointer to slice of struct +* Fix: handle null for json (Cemre Mengu) +* Batch Query callback is called even when there is an error +* Add RowTo(AddrOf)StructByNameLax (Audi P. Risa P) + +# 5.3.1 (February 27, 2023) + +* Fix: Support v4 and v5 stdlib in same program (Tomáš Procházka) +* Fix: sql.Scanner not being used in certain cases +* Add text format jsonpath support +* Fix: fake non-blocking read adaptive wait time + +# 5.3.0 (February 11, 2023) + +* Fix: json values work with sql.Scanner +* Fixed / improved error messages (Mark Chambers and Yevgeny Pats) +* Fix: support scan into single dimensional arrays +* Fix: MaxConnLifetimeJitter setting actually jitter (Ben Weintraub) +* Fix: driver.Value representation of bytea should be []byte not string +* Fix: better handling of unregistered OIDs +* CopyFrom can use query cache to avoid extra round trip to get OIDs (Alejandro Do Nascimento Mora) +* Fix: encode to json ignoring driver.Valuer +* Support sql.Scanner on renamed base type +* Fix: pgtype.Numeric text encoding of negative numbers (Mark Chambers) +* Fix: connect with multiple hostnames when one can't be resolved +* Upgrade puddle to remove dependency on uber/atomic and fix alignment issue on 32-bit platform +* Fix: scanning json column into **string +* Multiple reductions in memory allocations +* Fake non-blocking read adapts its max wait time +* Improve CopyFrom performance and reduce memory usage +* Fix: encode []any to array +* Fix: LoadType for composite with dropped attributes (Felix Röhrich) +* Support v4 and v5 stdlib in same program +* Fix: text format array decoding with string of "NULL" +* Prefer binary format for arrays + +# 5.2.0 (December 5, 2022) + +* `tracelog.TraceLog` implements the pgx.PrepareTracer interface. (Vitalii Solodilov) +* Optimize creating begin transaction SQL string (Petr Evdokimov and ksco) +* `Conn.LoadType` supports range and multirange types (Vitalii Solodilov) +* Fix scan `uint` and `uint64` `ScanNumeric`. This resolves a PostgreSQL `numeric` being incorrectly scanned into `uint` and `uint64`. + +# 5.1.1 (November 17, 2022) + +* Fix simple query sanitizer where query text contains a Unicode replacement character. +* Remove erroneous `name` argument from `DeallocateAll()`. Technically, this is a breaking change, but given that method was only added 5 days ago this change was accepted. (Bodo Kaiser) + +# 5.1.0 (November 12, 2022) + +* Update puddle to v2.1.2. This resolves a race condition and a deadlock in pgxpool. +* `QueryRewriter.RewriteQuery` now returns an error. Technically, this is a breaking change for any external implementers, but given the minimal likelihood that there are actually any external implementers this change was accepted. +* Expose `GetSSLPassword` support to pgx. +* Fix encode `ErrorResponse` unknown field handling. This would only affect pgproto3 being used directly as a proxy with a non-PostgreSQL server that included additional error fields. +* Fix date text format encoding with 5 digit years. +* Fix date values passed to a `sql.Scanner` as `string` instead of `time.Time`. +* DateCodec.DecodeValue can return `pgtype.InfinityModifier` instead of `string` for infinite values. This now matches the behavior of the timestamp types. +* Add domain type support to `Conn.LoadType()`. +* Add `RowToStructByName` and `RowToAddrOfStructByName`. (Pavlo Golub) +* Add `Conn.DeallocateAll()` to clear all prepared statements including the statement cache. (Bodo Kaiser) + +# 5.0.4 (October 24, 2022) + +* Fix: CollectOneRow prefers PostgreSQL error over pgx.ErrorNoRows +* Fix: some reflect Kind checks to first check for nil +* Bump golang.org/x/text dependency to placate snyk +* Fix: RowToStructByPos on structs with multiple anonymous sub-structs (Baptiste Fontaine) +* Fix: Exec checks if tx is closed + +# 5.0.3 (October 14, 2022) + +* Fix `driver.Valuer` handling edge cases that could cause infinite loop or crash + +# v5.0.2 (October 8, 2022) + +* Fix date encoding in text format to always use 2 digits for month and day +* Prefer driver.Valuer over wrap plans when encoding +* Fix scan to pointer to pointer to renamed type +* Allow scanning NULL even if PG and Go types are incompatible + +# v5.0.1 (September 24, 2022) + +* Fix 32-bit atomic usage +* Add MarshalJSON for Float8 (yogipristiawan) +* Add `[` and `]` to text encoding of `Lseg` +* Fix sqlScannerWrapper NULL handling + +# v5.0.0 (September 17, 2022) + +## Merged Packages + +`github.com/jackc/pgtype`, `github.com/jackc/pgconn`, and `github.com/jackc/pgproto3` are now included in the main +`github.com/jackc/pgx` repository. Previously there was confusion as to where issues should be reported, additional +release work due to releasing multiple packages, and less clear changelogs. + +## pgconn + +`CommandTag` is now an opaque type instead of directly exposing an underlying `[]byte`. + +The return value `ResultReader.Values()` is no longer safe to retain a reference to after a subsequent call to `NextRow()` or `Close()`. + +`Trace()` method adds low level message tracing similar to the `PQtrace` function in `libpq`. + +pgconn now uses non-blocking IO. This is a significant internal restructuring, but it should not cause any visible changes on its own. However, it is important in implementing other new features. + +`CheckConn()` checks a connection's liveness by doing a non-blocking read. This can be used to detect database restarts or network interruptions without executing a query or a ping. + +pgconn now supports pipeline mode. + +`*PgConn.ReceiveResults` removed. Use pipeline mode instead. + +`Timeout()` no longer considers `context.Canceled` as a timeout error. `context.DeadlineExceeded` still is considered a timeout error. + +## pgxpool + +`Connect` and `ConnectConfig` have been renamed to `New` and `NewWithConfig` respectively. The `LazyConnect` option has been removed. Pools always lazily connect. + +## pgtype + +The `pgtype` package has been significantly changed. + +### NULL Representation + +Previously, types had a `Status` field that could be `Undefined`, `Null`, or `Present`. This has been changed to a +`Valid` `bool` field to harmonize with how `database/sql` represents `NULL` and to make the zero value useable. + +Previously, a type that implemented `driver.Valuer` would have the `Value` method called even on a nil pointer. All nils +whether typed or untyped now represent `NULL`. + +### Codec and Value Split + +Previously, the type system combined decoding and encoding values with the value types. e.g. Type `Int8` both handled +encoding and decoding the PostgreSQL representation and acted as a value object. This caused some difficulties when +there was not an exact 1 to 1 relationship between the Go types and the PostgreSQL types For example, scanning a +PostgreSQL binary `numeric` into a Go `float64` was awkward (see https://github.com/jackc/pgtype/issues/147). This +concepts have been separated. A `Codec` only has responsibility for encoding and decoding values. Value types are +generally defined by implementing an interface that a particular `Codec` understands (e.g. `PointScanner` and +`PointValuer` for the PostgreSQL `point` type). + +### Array Types + +All array types are now handled by `ArrayCodec` instead of using code generation for each new array type. This also +means that less common array types such as `point[]` are now supported. `Array[T]` supports PostgreSQL multi-dimensional +arrays. + +### Composite Types + +Composite types must be registered before use. `CompositeFields` may still be used to construct and destruct composite +values, but any type may now implement `CompositeIndexGetter` and `CompositeIndexScanner` to be used as a composite. + +### Range Types + +Range types are now handled with types `RangeCodec` and `Range[T]`. This allows additional user defined range types to +easily be handled. Multirange types are handled similarly with `MultirangeCodec` and `Multirange[T]`. + +### pgxtype + +`LoadDataType` moved to `*Conn` as `LoadType`. + +### Bytea + +The `Bytea` and `GenericBinary` types have been replaced. Use the following instead: + +* `[]byte` - For normal usage directly use `[]byte`. +* `DriverBytes` - Uses driver memory only available until next database method call. Avoids a copy and an allocation. +* `PreallocBytes` - Uses preallocated byte slice to avoid an allocation. +* `UndecodedBytes` - Avoids any decoding. Allows working with raw bytes. + +### Dropped lib/pq Support + +`pgtype` previously supported and was tested against [lib/pq](https://github.com/lib/pq). While it will continue to work +in most cases this is no longer supported. + +### database/sql Scan + +Previously, most `Scan` implementations would convert `[]byte` to `string` automatically to decode a text value. Now +only `string` is handled. This is to allow the possibility of future binary support in `database/sql` mode by +considering `[]byte` to be binary format and `string` text format. This change should have no effect for any use with +`pgx`. The previous behavior was only necessary for `lib/pq` compatibility. + +Added `*Map.SQLScanner` to create a `sql.Scanner` for types such as `[]int32` and `Range[T]` that do not implement +`sql.Scanner` directly. + +### Number Type Fields Include Bit size + +`Int2`, `Int4`, `Int8`, `Float4`, `Float8`, and `Uint32` fields now include bit size. e.g. `Int` is renamed to `Int64`. +This matches the convention set by `database/sql`. In addition, for comparable types like `pgtype.Int8` and +`sql.NullInt64` the structures are identical. This means they can be directly converted one to another. + +### 3rd Party Type Integrations + +* Extracted integrations with https://github.com/shopspring/decimal and https://github.com/gofrs/uuid to + https://github.com/jackc/pgx-shopspring-decimal and https://github.com/jackc/pgx-gofrs-uuid respectively. This trims + the pgx dependency tree. + +### Other Changes + +* `Bit` and `Varbit` are both replaced by the `Bits` type. +* `CID`, `OID`, `OIDValue`, and `XID` are replaced by the `Uint32` type. +* `Hstore` is now defined as `map[string]*string`. +* `JSON` and `JSONB` types removed. Use `[]byte` or `string` directly. +* `QChar` type removed. Use `rune` or `byte` directly. +* `Inet` and `Cidr` types removed. Use `netip.Addr` and `netip.Prefix` directly. These types are more memory efficient than the previous `net.IPNet`. +* `Macaddr` type removed. Use `net.HardwareAddr` directly. +* Renamed `pgtype.ConnInfo` to `pgtype.Map`. +* Renamed `pgtype.DataType` to `pgtype.Type`. +* Renamed `pgtype.None` to `pgtype.Finite`. +* `RegisterType` now accepts a `*Type` instead of `Type`. +* Assorted array helper methods and types made private. + +## stdlib + +* Removed `AcquireConn` and `ReleaseConn` as that functionality has been built in since Go 1.13. + +## Reduced Memory Usage by Reusing Read Buffers + +Previously, the connection read buffer would allocate large chunks of memory and never reuse them. This allowed +transferring ownership to anything such as scanned values without incurring an additional allocation and memory copy. +However, this came at the cost of overall increased memory allocation size. But worse it was also possible to pin large +chunks of memory by retaining a reference to a small value that originally came directly from the read buffer. Now +ownership remains with the read buffer and anything needing to retain a value must make a copy. + +## Query Execution Modes + +Control over automatic prepared statement caching and simple protocol use are now combined into query execution mode. +See documentation for `QueryExecMode`. + +## QueryRewriter Interface and NamedArgs + +pgx now supports named arguments with the `NamedArgs` type. This is implemented via the new `QueryRewriter` interface which +allows arbitrary rewriting of query SQL and arguments. + +## RowScanner Interface + +The `RowScanner` interface allows a single argument to Rows.Scan to scan the entire row. + +## Rows Result Helpers + +* `CollectRows` and `RowTo*` functions simplify collecting results into a slice. +* `CollectOneRow` collects one row using `RowTo*` functions. +* `ForEachRow` simplifies scanning each row and executing code using the scanned values. `ForEachRow` replaces `QueryFunc`. + +## Tx Helpers + +Rather than every type that implemented `Begin` or `BeginTx` methods also needing to implement `BeginFunc` and +`BeginTxFunc` these methods have been converted to functions that take a db that implements `Begin` or `BeginTx`. + +## Improved Batch Query Ergonomics + +Previously, the code for building a batch went in one place before the call to `SendBatch`, and the code for reading the +results went in one place after the call to `SendBatch`. This could make it difficult to match up the query and the code +to handle the results. Now `Queue` returns a `QueuedQuery` which has methods `Query`, `QueryRow`, and `Exec` which can +be used to register a callback function that will handle the result. Callback functions are called automatically when +`BatchResults.Close` is called. + +## SendBatch Uses Pipeline Mode When Appropriate + +Previously, a batch with 10 unique parameterized statements executed 100 times would entail 11 network round trips. 1 +for each prepare / describe and 1 for executing them all. Now pipeline mode is used to prepare / describe all statements +in a single network round trip. So it would only take 2 round trips. + +## Tracing and Logging + +Internal logging support has been replaced with tracing hooks. This allows custom tracing integration with tools like OpenTelemetry. Package tracelog provides an adapter for pgx v4 loggers to act as a tracer. + +All integrations with 3rd party loggers have been extracted to separate repositories. This trims the pgx dependency +tree. diff --git a/vendor/github.com/jackc/pgx/v5/CONTRIBUTING.md b/vendor/github.com/jackc/pgx/v5/CONTRIBUTING.md new file mode 100644 index 0000000..c975a93 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/CONTRIBUTING.md @@ -0,0 +1,121 @@ +# Contributing + +## Discuss Significant Changes + +Before you invest a significant amount of time on a change, please create a discussion or issue describing your +proposal. This will help to ensure your proposed change has a reasonable chance of being merged. + +## Avoid Dependencies + +Adding a dependency is a big deal. While on occasion a new dependency may be accepted, the default answer to any change +that adds a dependency is no. + +## Development Environment Setup + +pgx tests naturally require a PostgreSQL database. It will connect to the database specified in the `PGX_TEST_DATABASE` +environment variable. The `PGX_TEST_DATABASE` environment variable can either be a URL or key-value pairs. In addition, +the standard `PG*` environment variables will be respected. Consider using [direnv](https://github.com/direnv/direnv) to +simplify environment variable handling. + +### Using an Existing PostgreSQL Cluster + +If you already have a PostgreSQL development server this is the quickest way to start and run the majority of the pgx +test suite. Some tests will be skipped that require server configuration changes (e.g. those testing different +authentication methods). + +Create and setup a test database: + +``` +export PGDATABASE=pgx_test +createdb +psql -c 'create extension hstore;' +psql -c 'create extension ltree;' +psql -c 'create domain uint64 as numeric(20,0);' +``` + +Ensure a `postgres` user exists. This happens by default in normal PostgreSQL installs, but some installation methods +such as Homebrew do not. + +``` +createuser -s postgres +``` + +Ensure your `PGX_TEST_DATABASE` environment variable points to the database you just created and run the tests. + +``` +export PGX_TEST_DATABASE="host=/private/tmp database=pgx_test" +go test ./... +``` + +This will run the vast majority of the tests, but some tests will be skipped (e.g. those testing different connection methods). + +### Creating a New PostgreSQL Cluster Exclusively for Testing + +The following environment variables need to be set both for initial setup and whenever the tests are run. (direnv is +highly recommended). Depending on your platform, you may need to change the host for `PGX_TEST_UNIX_SOCKET_CONN_STRING`. + +``` +export PGPORT=5015 +export PGUSER=postgres +export PGDATABASE=pgx_test +export POSTGRESQL_DATA_DIR=postgresql + +export PGX_TEST_DATABASE="host=127.0.0.1 database=pgx_test user=pgx_md5 password=secret" +export PGX_TEST_UNIX_SOCKET_CONN_STRING="host=/private/tmp database=pgx_test" +export PGX_TEST_TCP_CONN_STRING="host=127.0.0.1 database=pgx_test user=pgx_md5 password=secret" +export PGX_TEST_SCRAM_PASSWORD_CONN_STRING="host=127.0.0.1 user=pgx_scram password=secret database=pgx_test" +export PGX_TEST_MD5_PASSWORD_CONN_STRING="host=127.0.0.1 database=pgx_test user=pgx_md5 password=secret" +export PGX_TEST_PLAIN_PASSWORD_CONN_STRING="host=127.0.0.1 user=pgx_pw password=secret" +export PGX_TEST_TLS_CONN_STRING="host=localhost user=pgx_ssl password=secret sslmode=verify-full sslrootcert=`pwd`/.testdb/ca.pem" +export PGX_SSL_PASSWORD=certpw +export PGX_TEST_TLS_CLIENT_CONN_STRING="host=localhost user=pgx_sslcert sslmode=verify-full sslrootcert=`pwd`/.testdb/ca.pem database=pgx_test sslcert=`pwd`/.testdb/pgx_sslcert.crt sslkey=`pwd`/.testdb/pgx_sslcert.key" +``` + +Create a new database cluster. + +``` +initdb --locale=en_US -E UTF-8 --username=postgres .testdb/$POSTGRESQL_DATA_DIR + +echo "listen_addresses = '127.0.0.1'" >> .testdb/$POSTGRESQL_DATA_DIR/postgresql.conf +echo "port = $PGPORT" >> .testdb/$POSTGRESQL_DATA_DIR/postgresql.conf +cat testsetup/postgresql_ssl.conf >> .testdb/$POSTGRESQL_DATA_DIR/postgresql.conf +cp testsetup/pg_hba.conf .testdb/$POSTGRESQL_DATA_DIR/pg_hba.conf + +cd .testdb + +# Generate CA, server, and encrypted client certificates. +go run ../testsetup/generate_certs.go + +# Copy certificates to server directory and set permissions. +cp ca.pem $POSTGRESQL_DATA_DIR/root.crt +cp localhost.key $POSTGRESQL_DATA_DIR/server.key +chmod 600 $POSTGRESQL_DATA_DIR/server.key +cp localhost.crt $POSTGRESQL_DATA_DIR/server.crt + +cd .. +``` + + +Start the new cluster. This will be necessary whenever you are running pgx tests. + +``` +postgres -D .testdb/$POSTGRESQL_DATA_DIR +``` + +Setup the test database in the new cluster. + +``` +createdb +psql --no-psqlrc -f testsetup/postgresql_setup.sql +``` + +### PgBouncer + +There are tests specific for PgBouncer that will be executed if `PGX_TEST_PGBOUNCER_CONN_STRING` is set. + +### Optional Tests + +pgx supports multiple connection types and means of authentication. These tests are optional. They will only run if the +appropriate environment variables are set. In addition, there may be tests specific to particular PostgreSQL versions, +non-PostgreSQL servers (e.g. CockroachDB), or connection poolers (e.g. PgBouncer). `go test ./... -v | grep SKIP` to see +if any tests are being skipped. diff --git a/vendor/github.com/jackc/pgtype/LICENSE b/vendor/github.com/jackc/pgx/v5/LICENSE similarity index 100% rename from vendor/github.com/jackc/pgtype/LICENSE rename to vendor/github.com/jackc/pgx/v5/LICENSE diff --git a/vendor/github.com/jackc/pgx/v5/README.md b/vendor/github.com/jackc/pgx/v5/README.md new file mode 100644 index 0000000..0cf2c29 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/README.md @@ -0,0 +1,174 @@ +[![Go Reference](https://pkg.go.dev/badge/github.com/jackc/pgx/v5.svg)](https://pkg.go.dev/github.com/jackc/pgx/v5) +[![Build Status](https://github.com/jackc/pgx/actions/workflows/ci.yml/badge.svg)](https://github.com/jackc/pgx/actions/workflows/ci.yml) + +# pgx - PostgreSQL Driver and Toolkit + +pgx is a pure Go driver and toolkit for PostgreSQL. + +The pgx driver is a low-level, high performance interface that exposes PostgreSQL-specific features such as `LISTEN` / +`NOTIFY` and `COPY`. It also includes an adapter for the standard `database/sql` interface. + +The toolkit component is a related set of packages that implement PostgreSQL functionality such as parsing the wire protocol +and type mapping between PostgreSQL and Go. These underlying packages can be used to implement alternative drivers, +proxies, load balancers, logical replication clients, etc. + +## Example Usage + +```go +package main + +import ( + "context" + "fmt" + "os" + + "github.com/jackc/pgx/v5" +) + +func main() { + // urlExample := "postgres://username:password@localhost:5432/database_name" + conn, err := pgx.Connect(context.Background(), os.Getenv("DATABASE_URL")) + if err != nil { + fmt.Fprintf(os.Stderr, "Unable to connect to database: %v\n", err) + os.Exit(1) + } + defer conn.Close(context.Background()) + + var name string + var weight int64 + err = conn.QueryRow(context.Background(), "select name, weight from widgets where id=$1", 42).Scan(&name, &weight) + if err != nil { + fmt.Fprintf(os.Stderr, "QueryRow failed: %v\n", err) + os.Exit(1) + } + + fmt.Println(name, weight) +} +``` + +See the [getting started guide](https://github.com/jackc/pgx/wiki/Getting-started-with-pgx) for more information. + +## Features + +* Support for approximately 70 different PostgreSQL types +* Automatic statement preparation and caching +* Batch queries +* Single-round trip query mode +* Full TLS connection control +* Binary format support for custom types (allows for much quicker encoding/decoding) +* `COPY` protocol support for faster bulk data loads +* Tracing and logging support +* Connection pool with after-connect hook for arbitrary connection setup +* `LISTEN` / `NOTIFY` +* Conversion of PostgreSQL arrays to Go slice mappings for integers, floats, and strings +* `hstore` support +* `json` and `jsonb` support +* Maps `inet` and `cidr` PostgreSQL types to `netip.Addr` and `netip.Prefix` +* Large object support +* NULL mapping to pointer to pointer +* Supports `database/sql.Scanner` and `database/sql/driver.Valuer` interfaces for custom types +* Notice response handling +* Simulated nested transactions with savepoints + +## Choosing Between the pgx and database/sql Interfaces + +The pgx interface is faster. Many PostgreSQL specific features such as `LISTEN` / `NOTIFY` and `COPY` are not available +through the `database/sql` interface. + +The pgx interface is recommended when: + +1. The application only targets PostgreSQL. +2. No other libraries that require `database/sql` are in use. + +It is also possible to use the `database/sql` interface and convert a connection to the lower-level pgx interface as needed. + +## Testing + +See CONTRIBUTING.md for setup instructions. + +## Architecture + +See the presentation at Golang Estonia, [PGX Top to Bottom](https://www.youtube.com/watch?v=sXMSWhcHCf8) for a description of pgx architecture. + +## Supported Go and PostgreSQL Versions + +pgx supports the same versions of Go and PostgreSQL that are supported by their respective teams. For [Go](https://golang.org/doc/devel/release.html#policy) that is the two most recent major releases and for [PostgreSQL](https://www.postgresql.org/support/versioning/) the major releases in the last 5 years. This means pgx supports Go 1.21 and higher and PostgreSQL 12 and higher. pgx also is tested against the latest version of [CockroachDB](https://www.cockroachlabs.com/product/). + +## Version Policy + +pgx follows semantic versioning for the documented public API on stable releases. `v5` is the latest stable major version. + +## PGX Family Libraries + +### [github.com/jackc/pglogrepl](https://github.com/jackc/pglogrepl) + +pglogrepl provides functionality to act as a client for PostgreSQL logical replication. + +### [github.com/jackc/pgmock](https://github.com/jackc/pgmock) + +pgmock offers the ability to create a server that mocks the PostgreSQL wire protocol. This is used internally to test pgx by purposely inducing unusual errors. pgproto3 and pgmock together provide most of the foundational tooling required to implement a PostgreSQL proxy or MitM (such as for a custom connection pooler). + +### [github.com/jackc/tern](https://github.com/jackc/tern) + +tern is a stand-alone SQL migration system. + +### [github.com/jackc/pgerrcode](https://github.com/jackc/pgerrcode) + +pgerrcode contains constants for the PostgreSQL error codes. + +## Adapters for 3rd Party Types + +* [github.com/jackc/pgx-gofrs-uuid](https://github.com/jackc/pgx-gofrs-uuid) +* [github.com/jackc/pgx-shopspring-decimal](https://github.com/jackc/pgx-shopspring-decimal) +* [github.com/twpayne/pgx-geos](https://github.com/twpayne/pgx-geos) ([PostGIS](https://postgis.net/) and [GEOS](https://libgeos.org/) via [go-geos](https://github.com/twpayne/go-geos)) +* [github.com/vgarvardt/pgx-google-uuid](https://github.com/vgarvardt/pgx-google-uuid) + + +## Adapters for 3rd Party Tracers + +* [https://github.com/jackhopner/pgx-xray-tracer](https://github.com/jackhopner/pgx-xray-tracer) + +## Adapters for 3rd Party Loggers + +These adapters can be used with the tracelog package. + +* [github.com/jackc/pgx-go-kit-log](https://github.com/jackc/pgx-go-kit-log) +* [github.com/jackc/pgx-log15](https://github.com/jackc/pgx-log15) +* [github.com/jackc/pgx-logrus](https://github.com/jackc/pgx-logrus) +* [github.com/jackc/pgx-zap](https://github.com/jackc/pgx-zap) +* [github.com/jackc/pgx-zerolog](https://github.com/jackc/pgx-zerolog) +* [github.com/mcosta74/pgx-slog](https://github.com/mcosta74/pgx-slog) +* [github.com/kataras/pgx-golog](https://github.com/kataras/pgx-golog) + +## 3rd Party Libraries with PGX Support + +### [github.com/pashagolub/pgxmock](https://github.com/pashagolub/pgxmock) + +pgxmock is a mock library implementing pgx interfaces. +pgxmock has one and only purpose - to simulate pgx behavior in tests, without needing a real database connection. + +### [github.com/georgysavva/scany](https://github.com/georgysavva/scany) + +Library for scanning data from a database into Go structs and more. + +### [github.com/vingarcia/ksql](https://github.com/vingarcia/ksql) + +A carefully designed SQL client for making using SQL easier, +more productive, and less error-prone on Golang. + +### [https://github.com/otan/gopgkrb5](https://github.com/otan/gopgkrb5) + +Adds GSSAPI / Kerberos authentication support. + +### [github.com/wcamarao/pmx](https://github.com/wcamarao/pmx) + +Explicit data mapping and scanning library for Go structs and slices. + +### [github.com/stephenafamo/scan](https://github.com/stephenafamo/scan) + +Type safe and flexible package for scanning database data into Go types. +Supports, structs, maps, slices and custom mapping functions. + +### [https://github.com/z0ne-dev/mgx](https://github.com/z0ne-dev/mgx) + +Code first migration library for native pgx (no database/sql abstraction). diff --git a/vendor/github.com/jackc/pgx/v5/Rakefile b/vendor/github.com/jackc/pgx/v5/Rakefile new file mode 100644 index 0000000..d957573 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/Rakefile @@ -0,0 +1,18 @@ +require "erb" + +rule '.go' => '.go.erb' do |task| + erb = ERB.new(File.read(task.source)) + File.write(task.name, "// Do not edit. Generated from #{task.source}\n" + erb.result(binding)) + sh "goimports", "-w", task.name +end + +generated_code_files = [ + "pgtype/int.go", + "pgtype/int_test.go", + "pgtype/integration_benchmark_test.go", + "pgtype/zeronull/int.go", + "pgtype/zeronull/int_test.go" +] + +desc "Generate code" +task generate: generated_code_files diff --git a/vendor/github.com/jackc/pgx/v5/batch.go b/vendor/github.com/jackc/pgx/v5/batch.go new file mode 100644 index 0000000..c3c2834 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/batch.go @@ -0,0 +1,443 @@ +package pgx + +import ( + "context" + "errors" + "fmt" + + "github.com/jackc/pgx/v5/pgconn" +) + +// QueuedQuery is a query that has been queued for execution via a Batch. +type QueuedQuery struct { + SQL string + Arguments []any + Fn batchItemFunc + sd *pgconn.StatementDescription +} + +type batchItemFunc func(br BatchResults) error + +// Query sets fn to be called when the response to qq is received. +func (qq *QueuedQuery) Query(fn func(rows Rows) error) { + qq.Fn = func(br BatchResults) error { + rows, _ := br.Query() + defer rows.Close() + + err := fn(rows) + if err != nil { + return err + } + rows.Close() + + return rows.Err() + } +} + +// Query sets fn to be called when the response to qq is received. +func (qq *QueuedQuery) QueryRow(fn func(row Row) error) { + qq.Fn = func(br BatchResults) error { + row := br.QueryRow() + return fn(row) + } +} + +// Exec sets fn to be called when the response to qq is received. +func (qq *QueuedQuery) Exec(fn func(ct pgconn.CommandTag) error) { + qq.Fn = func(br BatchResults) error { + ct, err := br.Exec() + if err != nil { + return err + } + + return fn(ct) + } +} + +// Batch queries are a way of bundling multiple queries together to avoid +// unnecessary network round trips. A Batch must only be sent once. +type Batch struct { + QueuedQueries []*QueuedQuery +} + +// Queue queues a query to batch b. query can be an SQL query or the name of a prepared statement. The only pgx option +// argument that is supported is QueryRewriter. Queries are executed using the connection's DefaultQueryExecMode. +// +// While query can contain multiple statements if the connection's DefaultQueryExecMode is QueryModeSimple, this should +// be avoided. QueuedQuery.Fn must not be set as it will only be called for the first query. That is, QueuedQuery.Query, +// QueuedQuery.QueryRow, and QueuedQuery.Exec must not be called. In addition, any error messages or tracing that +// include the current query may reference the wrong query. +func (b *Batch) Queue(query string, arguments ...any) *QueuedQuery { + qq := &QueuedQuery{ + SQL: query, + Arguments: arguments, + } + b.QueuedQueries = append(b.QueuedQueries, qq) + return qq +} + +// Len returns number of queries that have been queued so far. +func (b *Batch) Len() int { + return len(b.QueuedQueries) +} + +type BatchResults interface { + // Exec reads the results from the next query in the batch as if the query has been sent with Conn.Exec. Prefer + // calling Exec on the QueuedQuery. + Exec() (pgconn.CommandTag, error) + + // Query reads the results from the next query in the batch as if the query has been sent with Conn.Query. Prefer + // calling Query on the QueuedQuery. + Query() (Rows, error) + + // QueryRow reads the results from the next query in the batch as if the query has been sent with Conn.QueryRow. + // Prefer calling QueryRow on the QueuedQuery. + QueryRow() Row + + // Close closes the batch operation. All unread results are read and any callback functions registered with + // QueuedQuery.Query, QueuedQuery.QueryRow, or QueuedQuery.Exec will be called. If a callback function returns an + // error or the batch encounters an error subsequent callback functions will not be called. + // + // Close must be called before the underlying connection can be used again. Any error that occurred during a batch + // operation may have made it impossible to resyncronize the connection with the server. In this case the underlying + // connection will have been closed. + // + // Close is safe to call multiple times. If it returns an error subsequent calls will return the same error. Callback + // functions will not be rerun. + Close() error +} + +type batchResults struct { + ctx context.Context + conn *Conn + mrr *pgconn.MultiResultReader + err error + b *Batch + qqIdx int + closed bool + endTraced bool +} + +// Exec reads the results from the next query in the batch as if the query has been sent with Exec. +func (br *batchResults) Exec() (pgconn.CommandTag, error) { + if br.err != nil { + return pgconn.CommandTag{}, br.err + } + if br.closed { + return pgconn.CommandTag{}, fmt.Errorf("batch already closed") + } + + query, arguments, _ := br.nextQueryAndArgs() + + if !br.mrr.NextResult() { + err := br.mrr.Close() + if err == nil { + err = errors.New("no more results in batch") + } + if br.conn.batchTracer != nil { + br.conn.batchTracer.TraceBatchQuery(br.ctx, br.conn, TraceBatchQueryData{ + SQL: query, + Args: arguments, + Err: err, + }) + } + return pgconn.CommandTag{}, err + } + + commandTag, err := br.mrr.ResultReader().Close() + if err != nil { + br.err = err + br.mrr.Close() + } + + if br.conn.batchTracer != nil { + br.conn.batchTracer.TraceBatchQuery(br.ctx, br.conn, TraceBatchQueryData{ + SQL: query, + Args: arguments, + CommandTag: commandTag, + Err: br.err, + }) + } + + return commandTag, br.err +} + +// Query reads the results from the next query in the batch as if the query has been sent with Query. +func (br *batchResults) Query() (Rows, error) { + query, arguments, ok := br.nextQueryAndArgs() + if !ok { + query = "batch query" + } + + if br.err != nil { + return &baseRows{err: br.err, closed: true}, br.err + } + + if br.closed { + alreadyClosedErr := fmt.Errorf("batch already closed") + return &baseRows{err: alreadyClosedErr, closed: true}, alreadyClosedErr + } + + rows := br.conn.getRows(br.ctx, query, arguments) + rows.batchTracer = br.conn.batchTracer + + if !br.mrr.NextResult() { + rows.err = br.mrr.Close() + if rows.err == nil { + rows.err = errors.New("no more results in batch") + } + rows.closed = true + + if br.conn.batchTracer != nil { + br.conn.batchTracer.TraceBatchQuery(br.ctx, br.conn, TraceBatchQueryData{ + SQL: query, + Args: arguments, + Err: rows.err, + }) + } + + return rows, rows.err + } + + rows.resultReader = br.mrr.ResultReader() + return rows, nil +} + +// QueryRow reads the results from the next query in the batch as if the query has been sent with QueryRow. +func (br *batchResults) QueryRow() Row { + rows, _ := br.Query() + return (*connRow)(rows.(*baseRows)) + +} + +// Close closes the batch operation. Any error that occurred during a batch operation may have made it impossible to +// resyncronize the connection with the server. In this case the underlying connection will have been closed. +func (br *batchResults) Close() error { + defer func() { + if !br.endTraced { + if br.conn != nil && br.conn.batchTracer != nil { + br.conn.batchTracer.TraceBatchEnd(br.ctx, br.conn, TraceBatchEndData{Err: br.err}) + } + br.endTraced = true + } + }() + + if br.err != nil { + return br.err + } + + if br.closed { + return nil + } + + // Read and run fn for all remaining items + for br.err == nil && !br.closed && br.b != nil && br.qqIdx < len(br.b.QueuedQueries) { + if br.b.QueuedQueries[br.qqIdx].Fn != nil { + err := br.b.QueuedQueries[br.qqIdx].Fn(br) + if err != nil { + br.err = err + } + } else { + br.Exec() + } + } + + br.closed = true + + err := br.mrr.Close() + if br.err == nil { + br.err = err + } + + return br.err +} + +func (br *batchResults) earlyError() error { + return br.err +} + +func (br *batchResults) nextQueryAndArgs() (query string, args []any, ok bool) { + if br.b != nil && br.qqIdx < len(br.b.QueuedQueries) { + bi := br.b.QueuedQueries[br.qqIdx] + query = bi.SQL + args = bi.Arguments + ok = true + br.qqIdx++ + } + return +} + +type pipelineBatchResults struct { + ctx context.Context + conn *Conn + pipeline *pgconn.Pipeline + lastRows *baseRows + err error + b *Batch + qqIdx int + closed bool + endTraced bool +} + +// Exec reads the results from the next query in the batch as if the query has been sent with Exec. +func (br *pipelineBatchResults) Exec() (pgconn.CommandTag, error) { + if br.err != nil { + return pgconn.CommandTag{}, br.err + } + if br.closed { + return pgconn.CommandTag{}, fmt.Errorf("batch already closed") + } + if br.lastRows != nil && br.lastRows.err != nil { + return pgconn.CommandTag{}, br.err + } + + query, arguments, err := br.nextQueryAndArgs() + if err != nil { + return pgconn.CommandTag{}, err + } + + results, err := br.pipeline.GetResults() + if err != nil { + br.err = err + return pgconn.CommandTag{}, br.err + } + var commandTag pgconn.CommandTag + switch results := results.(type) { + case *pgconn.ResultReader: + commandTag, br.err = results.Close() + default: + return pgconn.CommandTag{}, fmt.Errorf("unexpected pipeline result: %T", results) + } + + if br.conn.batchTracer != nil { + br.conn.batchTracer.TraceBatchQuery(br.ctx, br.conn, TraceBatchQueryData{ + SQL: query, + Args: arguments, + CommandTag: commandTag, + Err: br.err, + }) + } + + return commandTag, br.err +} + +// Query reads the results from the next query in the batch as if the query has been sent with Query. +func (br *pipelineBatchResults) Query() (Rows, error) { + if br.err != nil { + return &baseRows{err: br.err, closed: true}, br.err + } + + if br.closed { + alreadyClosedErr := fmt.Errorf("batch already closed") + return &baseRows{err: alreadyClosedErr, closed: true}, alreadyClosedErr + } + + if br.lastRows != nil && br.lastRows.err != nil { + br.err = br.lastRows.err + return &baseRows{err: br.err, closed: true}, br.err + } + + query, arguments, err := br.nextQueryAndArgs() + if err != nil { + return &baseRows{err: err, closed: true}, err + } + + rows := br.conn.getRows(br.ctx, query, arguments) + rows.batchTracer = br.conn.batchTracer + br.lastRows = rows + + results, err := br.pipeline.GetResults() + if err != nil { + br.err = err + rows.err = err + rows.closed = true + + if br.conn.batchTracer != nil { + br.conn.batchTracer.TraceBatchQuery(br.ctx, br.conn, TraceBatchQueryData{ + SQL: query, + Args: arguments, + Err: err, + }) + } + } else { + switch results := results.(type) { + case *pgconn.ResultReader: + rows.resultReader = results + default: + err = fmt.Errorf("unexpected pipeline result: %T", results) + br.err = err + rows.err = err + rows.closed = true + } + } + + return rows, rows.err +} + +// QueryRow reads the results from the next query in the batch as if the query has been sent with QueryRow. +func (br *pipelineBatchResults) QueryRow() Row { + rows, _ := br.Query() + return (*connRow)(rows.(*baseRows)) + +} + +// Close closes the batch operation. Any error that occurred during a batch operation may have made it impossible to +// resyncronize the connection with the server. In this case the underlying connection will have been closed. +func (br *pipelineBatchResults) Close() error { + defer func() { + if !br.endTraced { + if br.conn.batchTracer != nil { + br.conn.batchTracer.TraceBatchEnd(br.ctx, br.conn, TraceBatchEndData{Err: br.err}) + } + br.endTraced = true + } + }() + + if br.err == nil && br.lastRows != nil && br.lastRows.err != nil { + br.err = br.lastRows.err + return br.err + } + + if br.closed { + return br.err + } + + // Read and run fn for all remaining items + for br.err == nil && !br.closed && br.b != nil && br.qqIdx < len(br.b.QueuedQueries) { + if br.b.QueuedQueries[br.qqIdx].Fn != nil { + err := br.b.QueuedQueries[br.qqIdx].Fn(br) + if err != nil { + br.err = err + } + } else { + br.Exec() + } + } + + br.closed = true + + err := br.pipeline.Close() + if br.err == nil { + br.err = err + } + + return br.err +} + +func (br *pipelineBatchResults) earlyError() error { + return br.err +} + +func (br *pipelineBatchResults) nextQueryAndArgs() (query string, args []any, err error) { + if br.b == nil { + return "", nil, errors.New("no reference to batch") + } + + if br.qqIdx >= len(br.b.QueuedQueries) { + return "", nil, errors.New("no more results in batch") + } + + bi := br.b.QueuedQueries[br.qqIdx] + br.qqIdx++ + return bi.SQL, bi.Arguments, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/conn.go b/vendor/github.com/jackc/pgx/v5/conn.go new file mode 100644 index 0000000..187b3dd --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/conn.go @@ -0,0 +1,1413 @@ +package pgx + +import ( + "context" + "crypto/sha256" + "database/sql" + "encoding/hex" + "errors" + "fmt" + "strconv" + "strings" + "time" + + "github.com/jackc/pgx/v5/internal/sanitize" + "github.com/jackc/pgx/v5/internal/stmtcache" + "github.com/jackc/pgx/v5/pgconn" + "github.com/jackc/pgx/v5/pgtype" +) + +// ConnConfig contains all the options used to establish a connection. It must be created by ParseConfig and +// then it can be modified. A manually initialized ConnConfig will cause ConnectConfig to panic. +type ConnConfig struct { + pgconn.Config + + Tracer QueryTracer + + // Original connection string that was parsed into config. + connString string + + // StatementCacheCapacity is maximum size of the statement cache used when executing a query with "cache_statement" + // query exec mode. + StatementCacheCapacity int + + // DescriptionCacheCapacity is the maximum size of the description cache used when executing a query with + // "cache_describe" query exec mode. + DescriptionCacheCapacity int + + // DefaultQueryExecMode controls the default mode for executing queries. By default pgx uses the extended protocol + // and automatically prepares and caches prepared statements. However, this may be incompatible with proxies such as + // PGBouncer. In this case it may be preferable to use QueryExecModeExec or QueryExecModeSimpleProtocol. The same + // functionality can be controlled on a per query basis by passing a QueryExecMode as the first query argument. + DefaultQueryExecMode QueryExecMode + + createdByParseConfig bool // Used to enforce created by ParseConfig rule. +} + +// ParseConfigOptions contains options that control how a config is built such as getsslpassword. +type ParseConfigOptions struct { + pgconn.ParseConfigOptions +} + +// Copy returns a deep copy of the config that is safe to use and modify. +// The only exception is the tls.Config: +// according to the tls.Config docs it must not be modified after creation. +func (cc *ConnConfig) Copy() *ConnConfig { + newConfig := new(ConnConfig) + *newConfig = *cc + newConfig.Config = *newConfig.Config.Copy() + return newConfig +} + +// ConnString returns the connection string as parsed by pgx.ParseConfig into pgx.ConnConfig. +func (cc *ConnConfig) ConnString() string { return cc.connString } + +// Conn is a PostgreSQL connection handle. It is not safe for concurrent usage. Use a connection pool to manage access +// to multiple database connections from multiple goroutines. +type Conn struct { + pgConn *pgconn.PgConn + config *ConnConfig // config used when establishing this connection + preparedStatements map[string]*pgconn.StatementDescription + statementCache stmtcache.Cache + descriptionCache stmtcache.Cache + + queryTracer QueryTracer + batchTracer BatchTracer + copyFromTracer CopyFromTracer + prepareTracer PrepareTracer + + notifications []*pgconn.Notification + + doneChan chan struct{} + closedChan chan error + + typeMap *pgtype.Map + + wbuf []byte + eqb ExtendedQueryBuilder +} + +// Identifier a PostgreSQL identifier or name. Identifiers can be composed of +// multiple parts such as ["schema", "table"] or ["table", "column"]. +type Identifier []string + +// Sanitize returns a sanitized string safe for SQL interpolation. +func (ident Identifier) Sanitize() string { + parts := make([]string, len(ident)) + for i := range ident { + s := strings.ReplaceAll(ident[i], string([]byte{0}), "") + parts[i] = `"` + strings.ReplaceAll(s, `"`, `""`) + `"` + } + return strings.Join(parts, ".") +} + +var ( + // ErrNoRows occurs when rows are expected but none are returned. + ErrNoRows = newProxyErr(sql.ErrNoRows, "no rows in result set") + // ErrTooManyRows occurs when more rows than expected are returned. + ErrTooManyRows = errors.New("too many rows in result set") +) + +func newProxyErr(background error, msg string) error { + return &proxyError{ + msg: msg, + background: background, + } +} + +type proxyError struct { + msg string + background error +} + +func (err *proxyError) Error() string { return err.msg } + +func (err *proxyError) Unwrap() error { return err.background } + +var ( + errDisabledStatementCache = fmt.Errorf("cannot use QueryExecModeCacheStatement with disabled statement cache") + errDisabledDescriptionCache = fmt.Errorf("cannot use QueryExecModeCacheDescribe with disabled description cache") +) + +// Connect establishes a connection with a PostgreSQL server with a connection string. See +// pgconn.Connect for details. +func Connect(ctx context.Context, connString string) (*Conn, error) { + connConfig, err := ParseConfig(connString) + if err != nil { + return nil, err + } + return connect(ctx, connConfig) +} + +// ConnectWithOptions behaves exactly like Connect with the addition of options. At the present options is only used to +// provide a GetSSLPassword function. +func ConnectWithOptions(ctx context.Context, connString string, options ParseConfigOptions) (*Conn, error) { + connConfig, err := ParseConfigWithOptions(connString, options) + if err != nil { + return nil, err + } + return connect(ctx, connConfig) +} + +// ConnectConfig establishes a connection with a PostgreSQL server with a configuration struct. +// connConfig must have been created by ParseConfig. +func ConnectConfig(ctx context.Context, connConfig *ConnConfig) (*Conn, error) { + // In general this improves safety. In particular avoid the config.Config.OnNotification mutation from affecting other + // connections with the same config. See https://github.com/jackc/pgx/issues/618. + connConfig = connConfig.Copy() + + return connect(ctx, connConfig) +} + +// ParseConfigWithOptions behaves exactly as ParseConfig does with the addition of options. At the present options is +// only used to provide a GetSSLPassword function. +func ParseConfigWithOptions(connString string, options ParseConfigOptions) (*ConnConfig, error) { + config, err := pgconn.ParseConfigWithOptions(connString, options.ParseConfigOptions) + if err != nil { + return nil, err + } + + statementCacheCapacity := 512 + if s, ok := config.RuntimeParams["statement_cache_capacity"]; ok { + delete(config.RuntimeParams, "statement_cache_capacity") + n, err := strconv.ParseInt(s, 10, 32) + if err != nil { + return nil, fmt.Errorf("cannot parse statement_cache_capacity: %w", err) + } + statementCacheCapacity = int(n) + } + + descriptionCacheCapacity := 512 + if s, ok := config.RuntimeParams["description_cache_capacity"]; ok { + delete(config.RuntimeParams, "description_cache_capacity") + n, err := strconv.ParseInt(s, 10, 32) + if err != nil { + return nil, fmt.Errorf("cannot parse description_cache_capacity: %w", err) + } + descriptionCacheCapacity = int(n) + } + + defaultQueryExecMode := QueryExecModeCacheStatement + if s, ok := config.RuntimeParams["default_query_exec_mode"]; ok { + delete(config.RuntimeParams, "default_query_exec_mode") + switch s { + case "cache_statement": + defaultQueryExecMode = QueryExecModeCacheStatement + case "cache_describe": + defaultQueryExecMode = QueryExecModeCacheDescribe + case "describe_exec": + defaultQueryExecMode = QueryExecModeDescribeExec + case "exec": + defaultQueryExecMode = QueryExecModeExec + case "simple_protocol": + defaultQueryExecMode = QueryExecModeSimpleProtocol + default: + return nil, fmt.Errorf("invalid default_query_exec_mode: %s", s) + } + } + + connConfig := &ConnConfig{ + Config: *config, + createdByParseConfig: true, + StatementCacheCapacity: statementCacheCapacity, + DescriptionCacheCapacity: descriptionCacheCapacity, + DefaultQueryExecMode: defaultQueryExecMode, + connString: connString, + } + + return connConfig, nil +} + +// ParseConfig creates a ConnConfig from a connection string. ParseConfig handles all options that [pgconn.ParseConfig] +// does. In addition, it accepts the following options: +// +// - default_query_exec_mode. +// Possible values: "cache_statement", "cache_describe", "describe_exec", "exec", and "simple_protocol". See +// QueryExecMode constant documentation for the meaning of these values. Default: "cache_statement". +// +// - statement_cache_capacity. +// The maximum size of the statement cache used when executing a query with "cache_statement" query exec mode. +// Default: 512. +// +// - description_cache_capacity. +// The maximum size of the description cache used when executing a query with "cache_describe" query exec mode. +// Default: 512. +func ParseConfig(connString string) (*ConnConfig, error) { + return ParseConfigWithOptions(connString, ParseConfigOptions{}) +} + +// connect connects to a database. connect takes ownership of config. The caller must not use or access it again. +func connect(ctx context.Context, config *ConnConfig) (c *Conn, err error) { + if connectTracer, ok := config.Tracer.(ConnectTracer); ok { + ctx = connectTracer.TraceConnectStart(ctx, TraceConnectStartData{ConnConfig: config}) + defer func() { + connectTracer.TraceConnectEnd(ctx, TraceConnectEndData{Conn: c, Err: err}) + }() + } + + // Default values are set in ParseConfig. Enforce initial creation by ParseConfig rather than setting defaults from + // zero values. + if !config.createdByParseConfig { + panic("config must be created by ParseConfig") + } + + c = &Conn{ + config: config, + typeMap: pgtype.NewMap(), + queryTracer: config.Tracer, + } + + if t, ok := c.queryTracer.(BatchTracer); ok { + c.batchTracer = t + } + if t, ok := c.queryTracer.(CopyFromTracer); ok { + c.copyFromTracer = t + } + if t, ok := c.queryTracer.(PrepareTracer); ok { + c.prepareTracer = t + } + + // Only install pgx notification system if no other callback handler is present. + if config.Config.OnNotification == nil { + config.Config.OnNotification = c.bufferNotifications + } + + c.pgConn, err = pgconn.ConnectConfig(ctx, &config.Config) + if err != nil { + return nil, err + } + + c.preparedStatements = make(map[string]*pgconn.StatementDescription) + c.doneChan = make(chan struct{}) + c.closedChan = make(chan error) + c.wbuf = make([]byte, 0, 1024) + + if c.config.StatementCacheCapacity > 0 { + c.statementCache = stmtcache.NewLRUCache(c.config.StatementCacheCapacity) + } + + if c.config.DescriptionCacheCapacity > 0 { + c.descriptionCache = stmtcache.NewLRUCache(c.config.DescriptionCacheCapacity) + } + + return c, nil +} + +// Close closes a connection. It is safe to call Close on an already closed +// connection. +func (c *Conn) Close(ctx context.Context) error { + if c.IsClosed() { + return nil + } + + err := c.pgConn.Close(ctx) + return err +} + +// Prepare creates a prepared statement with name and sql. sql can contain placeholders for bound parameters. These +// placeholders are referenced positionally as $1, $2, etc. name can be used instead of sql with Query, QueryRow, and +// Exec to execute the statement. It can also be used with Batch.Queue. +// +// The underlying PostgreSQL identifier for the prepared statement will be name if name != sql or a digest of sql if +// name == sql. +// +// Prepare is idempotent; i.e. it is safe to call Prepare multiple times with the same name and sql arguments. This +// allows a code path to Prepare and Query/Exec without concern for if the statement has already been prepared. +func (c *Conn) Prepare(ctx context.Context, name, sql string) (sd *pgconn.StatementDescription, err error) { + if c.prepareTracer != nil { + ctx = c.prepareTracer.TracePrepareStart(ctx, c, TracePrepareStartData{Name: name, SQL: sql}) + } + + if name != "" { + var ok bool + if sd, ok = c.preparedStatements[name]; ok && sd.SQL == sql { + if c.prepareTracer != nil { + c.prepareTracer.TracePrepareEnd(ctx, c, TracePrepareEndData{AlreadyPrepared: true}) + } + return sd, nil + } + } + + if c.prepareTracer != nil { + defer func() { + c.prepareTracer.TracePrepareEnd(ctx, c, TracePrepareEndData{Err: err}) + }() + } + + var psName, psKey string + if name == sql { + digest := sha256.Sum256([]byte(sql)) + psName = "stmt_" + hex.EncodeToString(digest[0:24]) + psKey = sql + } else { + psName = name + psKey = name + } + + sd, err = c.pgConn.Prepare(ctx, psName, sql, nil) + if err != nil { + return nil, err + } + + if psKey != "" { + c.preparedStatements[psKey] = sd + } + + return sd, nil +} + +// Deallocate releases a prepared statement. Calling Deallocate on a non-existent prepared statement will succeed. +func (c *Conn) Deallocate(ctx context.Context, name string) error { + var psName string + sd := c.preparedStatements[name] + if sd != nil { + psName = sd.Name + } else { + psName = name + } + + err := c.pgConn.Deallocate(ctx, psName) + if err != nil { + return err + } + + if sd != nil { + delete(c.preparedStatements, name) + } + + return nil +} + +// DeallocateAll releases all previously prepared statements from the server and client, where it also resets the statement and description cache. +func (c *Conn) DeallocateAll(ctx context.Context) error { + c.preparedStatements = map[string]*pgconn.StatementDescription{} + if c.config.StatementCacheCapacity > 0 { + c.statementCache = stmtcache.NewLRUCache(c.config.StatementCacheCapacity) + } + if c.config.DescriptionCacheCapacity > 0 { + c.descriptionCache = stmtcache.NewLRUCache(c.config.DescriptionCacheCapacity) + } + _, err := c.pgConn.Exec(ctx, "deallocate all").ReadAll() + return err +} + +func (c *Conn) bufferNotifications(_ *pgconn.PgConn, n *pgconn.Notification) { + c.notifications = append(c.notifications, n) +} + +// WaitForNotification waits for a PostgreSQL notification. It wraps the underlying pgconn notification system in a +// slightly more convenient form. +func (c *Conn) WaitForNotification(ctx context.Context) (*pgconn.Notification, error) { + var n *pgconn.Notification + + // Return already received notification immediately + if len(c.notifications) > 0 { + n = c.notifications[0] + c.notifications = c.notifications[1:] + return n, nil + } + + err := c.pgConn.WaitForNotification(ctx) + if len(c.notifications) > 0 { + n = c.notifications[0] + c.notifications = c.notifications[1:] + } + return n, err +} + +// IsClosed reports if the connection has been closed. +func (c *Conn) IsClosed() bool { + return c.pgConn.IsClosed() +} + +func (c *Conn) die(err error) { + if c.IsClosed() { + return + } + + ctx, cancel := context.WithCancel(context.Background()) + cancel() // force immediate hard cancel + c.pgConn.Close(ctx) +} + +func quoteIdentifier(s string) string { + return `"` + strings.ReplaceAll(s, `"`, `""`) + `"` +} + +// Ping delegates to the underlying *pgconn.PgConn.Ping. +func (c *Conn) Ping(ctx context.Context) error { + return c.pgConn.Ping(ctx) +} + +// PgConn returns the underlying *pgconn.PgConn. This is an escape hatch method that allows lower level access to the +// PostgreSQL connection than pgx exposes. +// +// It is strongly recommended that the connection be idle (no in-progress queries) before the underlying *pgconn.PgConn +// is used and the connection must be returned to the same state before any *pgx.Conn methods are again used. +func (c *Conn) PgConn() *pgconn.PgConn { return c.pgConn } + +// TypeMap returns the connection info used for this connection. +func (c *Conn) TypeMap() *pgtype.Map { return c.typeMap } + +// Config returns a copy of config that was used to establish this connection. +func (c *Conn) Config() *ConnConfig { return c.config.Copy() } + +// Exec executes sql. sql can be either a prepared statement name or an SQL string. arguments should be referenced +// positionally from the sql string as $1, $2, etc. +func (c *Conn) Exec(ctx context.Context, sql string, arguments ...any) (pgconn.CommandTag, error) { + if c.queryTracer != nil { + ctx = c.queryTracer.TraceQueryStart(ctx, c, TraceQueryStartData{SQL: sql, Args: arguments}) + } + + if err := c.deallocateInvalidatedCachedStatements(ctx); err != nil { + return pgconn.CommandTag{}, err + } + + commandTag, err := c.exec(ctx, sql, arguments...) + + if c.queryTracer != nil { + c.queryTracer.TraceQueryEnd(ctx, c, TraceQueryEndData{CommandTag: commandTag, Err: err}) + } + + return commandTag, err +} + +func (c *Conn) exec(ctx context.Context, sql string, arguments ...any) (commandTag pgconn.CommandTag, err error) { + mode := c.config.DefaultQueryExecMode + var queryRewriter QueryRewriter + +optionLoop: + for len(arguments) > 0 { + switch arg := arguments[0].(type) { + case QueryExecMode: + mode = arg + arguments = arguments[1:] + case QueryRewriter: + queryRewriter = arg + arguments = arguments[1:] + default: + break optionLoop + } + } + + if queryRewriter != nil { + sql, arguments, err = queryRewriter.RewriteQuery(ctx, c, sql, arguments) + if err != nil { + return pgconn.CommandTag{}, fmt.Errorf("rewrite query failed: %w", err) + } + } + + // Always use simple protocol when there are no arguments. + if len(arguments) == 0 { + mode = QueryExecModeSimpleProtocol + } + + if sd, ok := c.preparedStatements[sql]; ok { + return c.execPrepared(ctx, sd, arguments) + } + + switch mode { + case QueryExecModeCacheStatement: + if c.statementCache == nil { + return pgconn.CommandTag{}, errDisabledStatementCache + } + sd := c.statementCache.Get(sql) + if sd == nil { + sd, err = c.Prepare(ctx, stmtcache.StatementName(sql), sql) + if err != nil { + return pgconn.CommandTag{}, err + } + c.statementCache.Put(sd) + } + + return c.execPrepared(ctx, sd, arguments) + case QueryExecModeCacheDescribe: + if c.descriptionCache == nil { + return pgconn.CommandTag{}, errDisabledDescriptionCache + } + sd := c.descriptionCache.Get(sql) + if sd == nil { + sd, err = c.Prepare(ctx, "", sql) + if err != nil { + return pgconn.CommandTag{}, err + } + c.descriptionCache.Put(sd) + } + + return c.execParams(ctx, sd, arguments) + case QueryExecModeDescribeExec: + sd, err := c.Prepare(ctx, "", sql) + if err != nil { + return pgconn.CommandTag{}, err + } + return c.execPrepared(ctx, sd, arguments) + case QueryExecModeExec: + return c.execSQLParams(ctx, sql, arguments) + case QueryExecModeSimpleProtocol: + return c.execSimpleProtocol(ctx, sql, arguments) + default: + return pgconn.CommandTag{}, fmt.Errorf("unknown QueryExecMode: %v", mode) + } +} + +func (c *Conn) execSimpleProtocol(ctx context.Context, sql string, arguments []any) (commandTag pgconn.CommandTag, err error) { + if len(arguments) > 0 { + sql, err = c.sanitizeForSimpleQuery(sql, arguments...) + if err != nil { + return pgconn.CommandTag{}, err + } + } + + mrr := c.pgConn.Exec(ctx, sql) + for mrr.NextResult() { + commandTag, _ = mrr.ResultReader().Close() + } + err = mrr.Close() + return commandTag, err +} + +func (c *Conn) execParams(ctx context.Context, sd *pgconn.StatementDescription, arguments []any) (pgconn.CommandTag, error) { + err := c.eqb.Build(c.typeMap, sd, arguments) + if err != nil { + return pgconn.CommandTag{}, err + } + + result := c.pgConn.ExecParams(ctx, sd.SQL, c.eqb.ParamValues, sd.ParamOIDs, c.eqb.ParamFormats, c.eqb.ResultFormats).Read() + c.eqb.reset() // Allow c.eqb internal memory to be GC'ed as soon as possible. + return result.CommandTag, result.Err +} + +func (c *Conn) execPrepared(ctx context.Context, sd *pgconn.StatementDescription, arguments []any) (pgconn.CommandTag, error) { + err := c.eqb.Build(c.typeMap, sd, arguments) + if err != nil { + return pgconn.CommandTag{}, err + } + + result := c.pgConn.ExecPrepared(ctx, sd.Name, c.eqb.ParamValues, c.eqb.ParamFormats, c.eqb.ResultFormats).Read() + c.eqb.reset() // Allow c.eqb internal memory to be GC'ed as soon as possible. + return result.CommandTag, result.Err +} + +type unknownArgumentTypeQueryExecModeExecError struct { + arg any +} + +func (e *unknownArgumentTypeQueryExecModeExecError) Error() string { + return fmt.Sprintf("cannot use unregistered type %T as query argument in QueryExecModeExec", e.arg) +} + +func (c *Conn) execSQLParams(ctx context.Context, sql string, args []any) (pgconn.CommandTag, error) { + err := c.eqb.Build(c.typeMap, nil, args) + if err != nil { + return pgconn.CommandTag{}, err + } + + result := c.pgConn.ExecParams(ctx, sql, c.eqb.ParamValues, nil, c.eqb.ParamFormats, c.eqb.ResultFormats).Read() + c.eqb.reset() // Allow c.eqb internal memory to be GC'ed as soon as possible. + return result.CommandTag, result.Err +} + +func (c *Conn) getRows(ctx context.Context, sql string, args []any) *baseRows { + r := &baseRows{} + + r.ctx = ctx + r.queryTracer = c.queryTracer + r.typeMap = c.typeMap + r.startTime = time.Now() + r.sql = sql + r.args = args + r.conn = c + + return r +} + +type QueryExecMode int32 + +const ( + _ QueryExecMode = iota + + // Automatically prepare and cache statements. This uses the extended protocol. Queries are executed in a single round + // trip after the statement is cached. This is the default. If the database schema is modified or the search_path is + // changed after a statement is cached then the first execution of a previously cached query may fail. e.g. If the + // number of columns returned by a "SELECT *" changes or the type of a column is changed. + QueryExecModeCacheStatement + + // Cache statement descriptions (i.e. argument and result types) and assume they do not change. This uses the extended + // protocol. Queries are executed in a single round trip after the description is cached. If the database schema is + // modified or the search_path is changed after a statement is cached then the first execution of a previously cached + // query may fail. e.g. If the number of columns returned by a "SELECT *" changes or the type of a column is changed. + QueryExecModeCacheDescribe + + // Get the statement description on every execution. This uses the extended protocol. Queries require two round trips + // to execute. It does not use named prepared statements. But it does use the unnamed prepared statement to get the + // statement description on the first round trip and then uses it to execute the query on the second round trip. This + // may cause problems with connection poolers that switch the underlying connection between round trips. It is safe + // even when the database schema is modified concurrently. + QueryExecModeDescribeExec + + // Assume the PostgreSQL query parameter types based on the Go type of the arguments. This uses the extended protocol + // with text formatted parameters and results. Queries are executed in a single round trip. Type mappings can be + // registered with pgtype.Map.RegisterDefaultPgType. Queries will be rejected that have arguments that are + // unregistered or ambiguous. e.g. A map[string]string may have the PostgreSQL type json or hstore. Modes that know + // the PostgreSQL type can use a map[string]string directly as an argument. This mode cannot. + QueryExecModeExec + + // Use the simple protocol. Assume the PostgreSQL query parameter types based on the Go type of the arguments. + // Queries are executed in a single round trip. Type mappings can be registered with + // pgtype.Map.RegisterDefaultPgType. Queries will be rejected that have arguments that are unregistered or ambiguous. + // e.g. A map[string]string may have the PostgreSQL type json or hstore. Modes that know the PostgreSQL type can use + // a map[string]string directly as an argument. This mode cannot. + // + // QueryExecModeSimpleProtocol should have the user application visible behavior as QueryExecModeExec with minor + // exceptions such as behavior when multiple result returning queries are erroneously sent in a single string. + // + // QueryExecModeSimpleProtocol uses client side parameter interpolation. All values are quoted and escaped. Prefer + // QueryExecModeExec over QueryExecModeSimpleProtocol whenever possible. In general QueryExecModeSimpleProtocol + // should only be used if connecting to a proxy server, connection pool server, or non-PostgreSQL server that does + // not support the extended protocol. + QueryExecModeSimpleProtocol +) + +func (m QueryExecMode) String() string { + switch m { + case QueryExecModeCacheStatement: + return "cache statement" + case QueryExecModeCacheDescribe: + return "cache describe" + case QueryExecModeDescribeExec: + return "describe exec" + case QueryExecModeExec: + return "exec" + case QueryExecModeSimpleProtocol: + return "simple protocol" + default: + return "invalid" + } +} + +// QueryResultFormats controls the result format (text=0, binary=1) of a query by result column position. +type QueryResultFormats []int16 + +// QueryResultFormatsByOID controls the result format (text=0, binary=1) of a query by the result column OID. +type QueryResultFormatsByOID map[uint32]int16 + +// QueryRewriter rewrites a query when used as the first arguments to a query method. +type QueryRewriter interface { + RewriteQuery(ctx context.Context, conn *Conn, sql string, args []any) (newSQL string, newArgs []any, err error) +} + +// Query sends a query to the server and returns a Rows to read the results. Only errors encountered sending the query +// and initializing Rows will be returned. Err() on the returned Rows must be checked after the Rows is closed to +// determine if the query executed successfully. +// +// The returned Rows must be closed before the connection can be used again. It is safe to attempt to read from the +// returned Rows even if an error is returned. The error will be the available in rows.Err() after rows are closed. It +// is allowed to ignore the error returned from Query and handle it in Rows. +// +// It is possible for a call of FieldDescriptions on the returned Rows to return nil even if the Query call did not +// return an error. +// +// It is possible for a query to return one or more rows before encountering an error. In most cases the rows should be +// collected before processing rather than processed while receiving each row. This avoids the possibility of the +// application processing rows from a query that the server rejected. The CollectRows function is useful here. +// +// An implementor of QueryRewriter may be passed as the first element of args. It can rewrite the sql and change or +// replace args. For example, NamedArgs is QueryRewriter that implements named arguments. +// +// For extra control over how the query is executed, the types QueryExecMode, QueryResultFormats, and +// QueryResultFormatsByOID may be used as the first args to control exactly how the query is executed. This is rarely +// needed. See the documentation for those types for details. +func (c *Conn) Query(ctx context.Context, sql string, args ...any) (Rows, error) { + if c.queryTracer != nil { + ctx = c.queryTracer.TraceQueryStart(ctx, c, TraceQueryStartData{SQL: sql, Args: args}) + } + + if err := c.deallocateInvalidatedCachedStatements(ctx); err != nil { + if c.queryTracer != nil { + c.queryTracer.TraceQueryEnd(ctx, c, TraceQueryEndData{Err: err}) + } + return &baseRows{err: err, closed: true}, err + } + + var resultFormats QueryResultFormats + var resultFormatsByOID QueryResultFormatsByOID + mode := c.config.DefaultQueryExecMode + var queryRewriter QueryRewriter + +optionLoop: + for len(args) > 0 { + switch arg := args[0].(type) { + case QueryResultFormats: + resultFormats = arg + args = args[1:] + case QueryResultFormatsByOID: + resultFormatsByOID = arg + args = args[1:] + case QueryExecMode: + mode = arg + args = args[1:] + case QueryRewriter: + queryRewriter = arg + args = args[1:] + default: + break optionLoop + } + } + + if queryRewriter != nil { + var err error + originalSQL := sql + originalArgs := args + sql, args, err = queryRewriter.RewriteQuery(ctx, c, sql, args) + if err != nil { + rows := c.getRows(ctx, originalSQL, originalArgs) + err = fmt.Errorf("rewrite query failed: %w", err) + rows.fatal(err) + return rows, err + } + } + + // Bypass any statement caching. + if sql == "" { + mode = QueryExecModeSimpleProtocol + } + + c.eqb.reset() + rows := c.getRows(ctx, sql, args) + + var err error + sd, explicitPreparedStatement := c.preparedStatements[sql] + if sd != nil || mode == QueryExecModeCacheStatement || mode == QueryExecModeCacheDescribe || mode == QueryExecModeDescribeExec { + if sd == nil { + sd, err = c.getStatementDescription(ctx, mode, sql) + if err != nil { + rows.fatal(err) + return rows, err + } + } + + if len(sd.ParamOIDs) != len(args) { + rows.fatal(fmt.Errorf("expected %d arguments, got %d", len(sd.ParamOIDs), len(args))) + return rows, rows.err + } + + rows.sql = sd.SQL + + err = c.eqb.Build(c.typeMap, sd, args) + if err != nil { + rows.fatal(err) + return rows, rows.err + } + + if resultFormatsByOID != nil { + resultFormats = make([]int16, len(sd.Fields)) + for i := range resultFormats { + resultFormats[i] = resultFormatsByOID[uint32(sd.Fields[i].DataTypeOID)] + } + } + + if resultFormats == nil { + resultFormats = c.eqb.ResultFormats + } + + if !explicitPreparedStatement && mode == QueryExecModeCacheDescribe { + rows.resultReader = c.pgConn.ExecParams(ctx, sql, c.eqb.ParamValues, sd.ParamOIDs, c.eqb.ParamFormats, resultFormats) + } else { + rows.resultReader = c.pgConn.ExecPrepared(ctx, sd.Name, c.eqb.ParamValues, c.eqb.ParamFormats, resultFormats) + } + } else if mode == QueryExecModeExec { + err := c.eqb.Build(c.typeMap, nil, args) + if err != nil { + rows.fatal(err) + return rows, rows.err + } + + rows.resultReader = c.pgConn.ExecParams(ctx, sql, c.eqb.ParamValues, nil, c.eqb.ParamFormats, c.eqb.ResultFormats) + } else if mode == QueryExecModeSimpleProtocol { + sql, err = c.sanitizeForSimpleQuery(sql, args...) + if err != nil { + rows.fatal(err) + return rows, err + } + + mrr := c.pgConn.Exec(ctx, sql) + if mrr.NextResult() { + rows.resultReader = mrr.ResultReader() + rows.multiResultReader = mrr + } else { + err = mrr.Close() + rows.fatal(err) + return rows, err + } + + return rows, nil + } else { + err = fmt.Errorf("unknown QueryExecMode: %v", mode) + rows.fatal(err) + return rows, rows.err + } + + c.eqb.reset() // Allow c.eqb internal memory to be GC'ed as soon as possible. + + return rows, rows.err +} + +// getStatementDescription returns the statement description of the sql query +// according to the given mode. +// +// If the mode is one that doesn't require to know the param and result OIDs +// then nil is returned without error. +func (c *Conn) getStatementDescription( + ctx context.Context, + mode QueryExecMode, + sql string, +) (sd *pgconn.StatementDescription, err error) { + switch mode { + case QueryExecModeCacheStatement: + if c.statementCache == nil { + return nil, errDisabledStatementCache + } + sd = c.statementCache.Get(sql) + if sd == nil { + sd, err = c.Prepare(ctx, stmtcache.StatementName(sql), sql) + if err != nil { + return nil, err + } + c.statementCache.Put(sd) + } + case QueryExecModeCacheDescribe: + if c.descriptionCache == nil { + return nil, errDisabledDescriptionCache + } + sd = c.descriptionCache.Get(sql) + if sd == nil { + sd, err = c.Prepare(ctx, "", sql) + if err != nil { + return nil, err + } + c.descriptionCache.Put(sd) + } + case QueryExecModeDescribeExec: + return c.Prepare(ctx, "", sql) + } + return sd, err +} + +// QueryRow is a convenience wrapper over Query. Any error that occurs while +// querying is deferred until calling Scan on the returned Row. That Row will +// error with ErrNoRows if no rows are returned. +func (c *Conn) QueryRow(ctx context.Context, sql string, args ...any) Row { + rows, _ := c.Query(ctx, sql, args...) + return (*connRow)(rows.(*baseRows)) +} + +// SendBatch sends all queued queries to the server at once. All queries are run in an implicit transaction unless +// explicit transaction control statements are executed. The returned BatchResults must be closed before the connection +// is used again. +func (c *Conn) SendBatch(ctx context.Context, b *Batch) (br BatchResults) { + if c.batchTracer != nil { + ctx = c.batchTracer.TraceBatchStart(ctx, c, TraceBatchStartData{Batch: b}) + defer func() { + err := br.(interface{ earlyError() error }).earlyError() + if err != nil { + c.batchTracer.TraceBatchEnd(ctx, c, TraceBatchEndData{Err: err}) + } + }() + } + + if err := c.deallocateInvalidatedCachedStatements(ctx); err != nil { + return &batchResults{ctx: ctx, conn: c, err: err} + } + + for _, bi := range b.QueuedQueries { + var queryRewriter QueryRewriter + sql := bi.SQL + arguments := bi.Arguments + + optionLoop: + for len(arguments) > 0 { + // Update Batch.Queue function comment when additional options are implemented + switch arg := arguments[0].(type) { + case QueryRewriter: + queryRewriter = arg + arguments = arguments[1:] + default: + break optionLoop + } + } + + if queryRewriter != nil { + var err error + sql, arguments, err = queryRewriter.RewriteQuery(ctx, c, sql, arguments) + if err != nil { + return &batchResults{ctx: ctx, conn: c, err: fmt.Errorf("rewrite query failed: %w", err)} + } + } + + bi.SQL = sql + bi.Arguments = arguments + } + + // TODO: changing mode per batch? Update Batch.Queue function comment when implemented + mode := c.config.DefaultQueryExecMode + if mode == QueryExecModeSimpleProtocol { + return c.sendBatchQueryExecModeSimpleProtocol(ctx, b) + } + + // All other modes use extended protocol and thus can use prepared statements. + for _, bi := range b.QueuedQueries { + if sd, ok := c.preparedStatements[bi.SQL]; ok { + bi.sd = sd + } + } + + switch mode { + case QueryExecModeExec: + return c.sendBatchQueryExecModeExec(ctx, b) + case QueryExecModeCacheStatement: + return c.sendBatchQueryExecModeCacheStatement(ctx, b) + case QueryExecModeCacheDescribe: + return c.sendBatchQueryExecModeCacheDescribe(ctx, b) + case QueryExecModeDescribeExec: + return c.sendBatchQueryExecModeDescribeExec(ctx, b) + default: + panic("unknown QueryExecMode") + } +} + +func (c *Conn) sendBatchQueryExecModeSimpleProtocol(ctx context.Context, b *Batch) *batchResults { + var sb strings.Builder + for i, bi := range b.QueuedQueries { + if i > 0 { + sb.WriteByte(';') + } + sql, err := c.sanitizeForSimpleQuery(bi.SQL, bi.Arguments...) + if err != nil { + return &batchResults{ctx: ctx, conn: c, err: err} + } + sb.WriteString(sql) + } + mrr := c.pgConn.Exec(ctx, sb.String()) + return &batchResults{ + ctx: ctx, + conn: c, + mrr: mrr, + b: b, + qqIdx: 0, + } +} + +func (c *Conn) sendBatchQueryExecModeExec(ctx context.Context, b *Batch) *batchResults { + batch := &pgconn.Batch{} + + for _, bi := range b.QueuedQueries { + sd := bi.sd + if sd != nil { + err := c.eqb.Build(c.typeMap, sd, bi.Arguments) + if err != nil { + return &batchResults{ctx: ctx, conn: c, err: err} + } + + batch.ExecPrepared(sd.Name, c.eqb.ParamValues, c.eqb.ParamFormats, c.eqb.ResultFormats) + } else { + err := c.eqb.Build(c.typeMap, nil, bi.Arguments) + if err != nil { + return &batchResults{ctx: ctx, conn: c, err: err} + } + batch.ExecParams(bi.SQL, c.eqb.ParamValues, nil, c.eqb.ParamFormats, c.eqb.ResultFormats) + } + } + + c.eqb.reset() // Allow c.eqb internal memory to be GC'ed as soon as possible. + + mrr := c.pgConn.ExecBatch(ctx, batch) + + return &batchResults{ + ctx: ctx, + conn: c, + mrr: mrr, + b: b, + qqIdx: 0, + } +} + +func (c *Conn) sendBatchQueryExecModeCacheStatement(ctx context.Context, b *Batch) (pbr *pipelineBatchResults) { + if c.statementCache == nil { + return &pipelineBatchResults{ctx: ctx, conn: c, err: errDisabledStatementCache, closed: true} + } + + distinctNewQueries := []*pgconn.StatementDescription{} + distinctNewQueriesIdxMap := make(map[string]int) + + for _, bi := range b.QueuedQueries { + if bi.sd == nil { + sd := c.statementCache.Get(bi.SQL) + if sd != nil { + bi.sd = sd + } else { + if idx, present := distinctNewQueriesIdxMap[bi.SQL]; present { + bi.sd = distinctNewQueries[idx] + } else { + sd = &pgconn.StatementDescription{ + Name: stmtcache.StatementName(bi.SQL), + SQL: bi.SQL, + } + distinctNewQueriesIdxMap[sd.SQL] = len(distinctNewQueries) + distinctNewQueries = append(distinctNewQueries, sd) + bi.sd = sd + } + } + } + } + + return c.sendBatchExtendedWithDescription(ctx, b, distinctNewQueries, c.statementCache) +} + +func (c *Conn) sendBatchQueryExecModeCacheDescribe(ctx context.Context, b *Batch) (pbr *pipelineBatchResults) { + if c.descriptionCache == nil { + return &pipelineBatchResults{ctx: ctx, conn: c, err: errDisabledDescriptionCache, closed: true} + } + + distinctNewQueries := []*pgconn.StatementDescription{} + distinctNewQueriesIdxMap := make(map[string]int) + + for _, bi := range b.QueuedQueries { + if bi.sd == nil { + sd := c.descriptionCache.Get(bi.SQL) + if sd != nil { + bi.sd = sd + } else { + if idx, present := distinctNewQueriesIdxMap[bi.SQL]; present { + bi.sd = distinctNewQueries[idx] + } else { + sd = &pgconn.StatementDescription{ + SQL: bi.SQL, + } + distinctNewQueriesIdxMap[sd.SQL] = len(distinctNewQueries) + distinctNewQueries = append(distinctNewQueries, sd) + bi.sd = sd + } + } + } + } + + return c.sendBatchExtendedWithDescription(ctx, b, distinctNewQueries, c.descriptionCache) +} + +func (c *Conn) sendBatchQueryExecModeDescribeExec(ctx context.Context, b *Batch) (pbr *pipelineBatchResults) { + distinctNewQueries := []*pgconn.StatementDescription{} + distinctNewQueriesIdxMap := make(map[string]int) + + for _, bi := range b.QueuedQueries { + if bi.sd == nil { + if idx, present := distinctNewQueriesIdxMap[bi.SQL]; present { + bi.sd = distinctNewQueries[idx] + } else { + sd := &pgconn.StatementDescription{ + SQL: bi.SQL, + } + distinctNewQueriesIdxMap[sd.SQL] = len(distinctNewQueries) + distinctNewQueries = append(distinctNewQueries, sd) + bi.sd = sd + } + } + } + + return c.sendBatchExtendedWithDescription(ctx, b, distinctNewQueries, nil) +} + +func (c *Conn) sendBatchExtendedWithDescription(ctx context.Context, b *Batch, distinctNewQueries []*pgconn.StatementDescription, sdCache stmtcache.Cache) (pbr *pipelineBatchResults) { + pipeline := c.pgConn.StartPipeline(ctx) + defer func() { + if pbr != nil && pbr.err != nil { + pipeline.Close() + } + }() + + // Prepare any needed queries + if len(distinctNewQueries) > 0 { + for _, sd := range distinctNewQueries { + pipeline.SendPrepare(sd.Name, sd.SQL, nil) + } + + err := pipeline.Sync() + if err != nil { + return &pipelineBatchResults{ctx: ctx, conn: c, err: err, closed: true} + } + + for _, sd := range distinctNewQueries { + results, err := pipeline.GetResults() + if err != nil { + return &pipelineBatchResults{ctx: ctx, conn: c, err: err, closed: true} + } + + resultSD, ok := results.(*pgconn.StatementDescription) + if !ok { + return &pipelineBatchResults{ctx: ctx, conn: c, err: fmt.Errorf("expected statement description, got %T", results), closed: true} + } + + // Fill in the previously empty / pending statement descriptions. + sd.ParamOIDs = resultSD.ParamOIDs + sd.Fields = resultSD.Fields + } + + results, err := pipeline.GetResults() + if err != nil { + return &pipelineBatchResults{ctx: ctx, conn: c, err: err, closed: true} + } + + _, ok := results.(*pgconn.PipelineSync) + if !ok { + return &pipelineBatchResults{ctx: ctx, conn: c, err: fmt.Errorf("expected sync, got %T", results), closed: true} + } + } + + // Put all statements into the cache. It's fine if it overflows because HandleInvalidated will clean them up later. + if sdCache != nil { + for _, sd := range distinctNewQueries { + sdCache.Put(sd) + } + } + + // Queue the queries. + for _, bi := range b.QueuedQueries { + err := c.eqb.Build(c.typeMap, bi.sd, bi.Arguments) + if err != nil { + // we wrap the error so we the user can understand which query failed inside the batch + err = fmt.Errorf("error building query %s: %w", bi.SQL, err) + return &pipelineBatchResults{ctx: ctx, conn: c, err: err, closed: true} + } + + if bi.sd.Name == "" { + pipeline.SendQueryParams(bi.sd.SQL, c.eqb.ParamValues, bi.sd.ParamOIDs, c.eqb.ParamFormats, c.eqb.ResultFormats) + } else { + pipeline.SendQueryPrepared(bi.sd.Name, c.eqb.ParamValues, c.eqb.ParamFormats, c.eqb.ResultFormats) + } + } + + err := pipeline.Sync() + if err != nil { + return &pipelineBatchResults{ctx: ctx, conn: c, err: err, closed: true} + } + + return &pipelineBatchResults{ + ctx: ctx, + conn: c, + pipeline: pipeline, + b: b, + } +} + +func (c *Conn) sanitizeForSimpleQuery(sql string, args ...any) (string, error) { + if c.pgConn.ParameterStatus("standard_conforming_strings") != "on" { + return "", errors.New("simple protocol queries must be run with standard_conforming_strings=on") + } + + if c.pgConn.ParameterStatus("client_encoding") != "UTF8" { + return "", errors.New("simple protocol queries must be run with client_encoding=UTF8") + } + + var err error + valueArgs := make([]any, len(args)) + for i, a := range args { + valueArgs[i], err = convertSimpleArgument(c.typeMap, a) + if err != nil { + return "", err + } + } + + return sanitize.SanitizeSQL(sql, valueArgs...) +} + +// LoadType inspects the database for typeName and produces a pgtype.Type suitable for registration. typeName must be +// the name of a type where the underlying type(s) is already understood by pgx. It is for derived types. In particular, +// typeName must be one of the following: +// - An array type name of a type that is already registered. e.g. "_foo" when "foo" is registered. +// - A composite type name where all field types are already registered. +// - A domain type name where the base type is already registered. +// - An enum type name. +// - A range type name where the element type is already registered. +// - A multirange type name where the element type is already registered. +func (c *Conn) LoadType(ctx context.Context, typeName string) (*pgtype.Type, error) { + var oid uint32 + + err := c.QueryRow(ctx, "select $1::text::regtype::oid;", typeName).Scan(&oid) + if err != nil { + return nil, err + } + + var typtype string + var typbasetype uint32 + + err = c.QueryRow(ctx, "select typtype::text, typbasetype from pg_type where oid=$1", oid).Scan(&typtype, &typbasetype) + if err != nil { + return nil, err + } + + switch typtype { + case "b": // array + elementOID, err := c.getArrayElementOID(ctx, oid) + if err != nil { + return nil, err + } + + dt, ok := c.TypeMap().TypeForOID(elementOID) + if !ok { + return nil, errors.New("array element OID not registered") + } + + return &pgtype.Type{Name: typeName, OID: oid, Codec: &pgtype.ArrayCodec{ElementType: dt}}, nil + case "c": // composite + fields, err := c.getCompositeFields(ctx, oid) + if err != nil { + return nil, err + } + + return &pgtype.Type{Name: typeName, OID: oid, Codec: &pgtype.CompositeCodec{Fields: fields}}, nil + case "d": // domain + dt, ok := c.TypeMap().TypeForOID(typbasetype) + if !ok { + return nil, errors.New("domain base type OID not registered") + } + + return &pgtype.Type{Name: typeName, OID: oid, Codec: dt.Codec}, nil + case "e": // enum + return &pgtype.Type{Name: typeName, OID: oid, Codec: &pgtype.EnumCodec{}}, nil + case "r": // range + elementOID, err := c.getRangeElementOID(ctx, oid) + if err != nil { + return nil, err + } + + dt, ok := c.TypeMap().TypeForOID(elementOID) + if !ok { + return nil, errors.New("range element OID not registered") + } + + return &pgtype.Type{Name: typeName, OID: oid, Codec: &pgtype.RangeCodec{ElementType: dt}}, nil + case "m": // multirange + elementOID, err := c.getMultiRangeElementOID(ctx, oid) + if err != nil { + return nil, err + } + + dt, ok := c.TypeMap().TypeForOID(elementOID) + if !ok { + return nil, errors.New("multirange element OID not registered") + } + + return &pgtype.Type{Name: typeName, OID: oid, Codec: &pgtype.MultirangeCodec{ElementType: dt}}, nil + default: + return &pgtype.Type{}, errors.New("unknown typtype") + } +} + +func (c *Conn) getArrayElementOID(ctx context.Context, oid uint32) (uint32, error) { + var typelem uint32 + + err := c.QueryRow(ctx, "select typelem from pg_type where oid=$1", oid).Scan(&typelem) + if err != nil { + return 0, err + } + + return typelem, nil +} + +func (c *Conn) getRangeElementOID(ctx context.Context, oid uint32) (uint32, error) { + var typelem uint32 + + err := c.QueryRow(ctx, "select rngsubtype from pg_range where rngtypid=$1", oid).Scan(&typelem) + if err != nil { + return 0, err + } + + return typelem, nil +} + +func (c *Conn) getMultiRangeElementOID(ctx context.Context, oid uint32) (uint32, error) { + var typelem uint32 + + err := c.QueryRow(ctx, "select rngtypid from pg_range where rngmultitypid=$1", oid).Scan(&typelem) + if err != nil { + return 0, err + } + + return typelem, nil +} + +func (c *Conn) getCompositeFields(ctx context.Context, oid uint32) ([]pgtype.CompositeCodecField, error) { + var typrelid uint32 + + err := c.QueryRow(ctx, "select typrelid from pg_type where oid=$1", oid).Scan(&typrelid) + if err != nil { + return nil, err + } + + var fields []pgtype.CompositeCodecField + var fieldName string + var fieldOID uint32 + rows, _ := c.Query(ctx, `select attname, atttypid +from pg_attribute +where attrelid=$1 + and not attisdropped + and attnum > 0 +order by attnum`, + typrelid, + ) + _, err = ForEachRow(rows, []any{&fieldName, &fieldOID}, func() error { + dt, ok := c.TypeMap().TypeForOID(fieldOID) + if !ok { + return fmt.Errorf("unknown composite type field OID: %v", fieldOID) + } + fields = append(fields, pgtype.CompositeCodecField{Name: fieldName, Type: dt}) + return nil + }) + if err != nil { + return nil, err + } + + return fields, nil +} + +func (c *Conn) deallocateInvalidatedCachedStatements(ctx context.Context) error { + if txStatus := c.pgConn.TxStatus(); txStatus != 'I' && txStatus != 'T' { + return nil + } + + if c.descriptionCache != nil { + c.descriptionCache.RemoveInvalidated() + } + + var invalidatedStatements []*pgconn.StatementDescription + if c.statementCache != nil { + invalidatedStatements = c.statementCache.GetInvalidated() + } + + if len(invalidatedStatements) == 0 { + return nil + } + + pipeline := c.pgConn.StartPipeline(ctx) + defer pipeline.Close() + + for _, sd := range invalidatedStatements { + pipeline.SendDeallocate(sd.Name) + } + + err := pipeline.Sync() + if err != nil { + return fmt.Errorf("failed to deallocate cached statement(s): %w", err) + } + + err = pipeline.Close() + if err != nil { + return fmt.Errorf("failed to deallocate cached statement(s): %w", err) + } + + c.statementCache.RemoveInvalidated() + for _, sd := range invalidatedStatements { + delete(c.preparedStatements, sd.Name) + } + + return nil +} diff --git a/vendor/github.com/jackc/pgx/v4/copy_from.go b/vendor/github.com/jackc/pgx/v5/copy_from.go similarity index 51% rename from vendor/github.com/jackc/pgx/v4/copy_from.go rename to vendor/github.com/jackc/pgx/v5/copy_from.go index 49139d0..abcd223 100644 --- a/vendor/github.com/jackc/pgx/v4/copy_from.go +++ b/vendor/github.com/jackc/pgx/v5/copy_from.go @@ -5,20 +5,19 @@ import ( "context" "fmt" "io" - "time" - "github.com/jackc/pgconn" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" + "github.com/jackc/pgx/v5/pgconn" ) // CopyFromRows returns a CopyFromSource interface over the provided rows slice // making it usable by *Conn.CopyFrom. -func CopyFromRows(rows [][]interface{}) CopyFromSource { +func CopyFromRows(rows [][]any) CopyFromSource { return ©FromRows{rows: rows, idx: -1} } type copyFromRows struct { - rows [][]interface{} + rows [][]any idx int } @@ -27,7 +26,7 @@ func (ctr *copyFromRows) Next() bool { return ctr.idx < len(ctr.rows) } -func (ctr *copyFromRows) Values() ([]interface{}, error) { +func (ctr *copyFromRows) Values() ([]any, error) { return ctr.rows[ctr.idx], nil } @@ -37,12 +36,12 @@ func (ctr *copyFromRows) Err() error { // CopyFromSlice returns a CopyFromSource interface over a dynamic func // making it usable by *Conn.CopyFrom. -func CopyFromSlice(length int, next func(int) ([]interface{}, error)) CopyFromSource { +func CopyFromSlice(length int, next func(int) ([]any, error)) CopyFromSource { return ©FromSlice{next: next, idx: -1, len: length} } type copyFromSlice struct { - next func(int) ([]interface{}, error) + next func(int) ([]any, error) idx int len int err error @@ -53,7 +52,7 @@ func (cts *copyFromSlice) Next() bool { return cts.idx < cts.len } -func (cts *copyFromSlice) Values() ([]interface{}, error) { +func (cts *copyFromSlice) Values() ([]any, error) { values, err := cts.next(cts.idx) if err != nil { cts.err = err @@ -65,6 +64,33 @@ func (cts *copyFromSlice) Err() error { return cts.err } +// CopyFromFunc returns a CopyFromSource interface that relies on nxtf for values. +// nxtf returns rows until it either signals an 'end of data' by returning row=nil and err=nil, +// or it returns an error. If nxtf returns an error, the copy is aborted. +func CopyFromFunc(nxtf func() (row []any, err error)) CopyFromSource { + return ©FromFunc{next: nxtf} +} + +type copyFromFunc struct { + next func() ([]any, error) + valueRow []any + err error +} + +func (g *copyFromFunc) Next() bool { + g.valueRow, g.err = g.next() + // only return true if valueRow exists and no error + return g.valueRow != nil && g.err == nil +} + +func (g *copyFromFunc) Values() ([]any, error) { + return g.valueRow, g.err +} + +func (g *copyFromFunc) Err() error { + return g.err +} + // CopyFromSource is the interface used by *Conn.CopyFrom as the source for copy data. type CopyFromSource interface { // Next returns true if there is another row and makes the next row data @@ -73,7 +99,7 @@ type CopyFromSource interface { Next() bool // Values returns the values for the current row. - Values() ([]interface{}, error) + Values() ([]any, error) // Err returns any error that has been encountered by the CopyFromSource. If // this is not nil *Conn.CopyFrom will abort the copy. @@ -86,9 +112,17 @@ type copyFrom struct { columnNames []string rowSrc CopyFromSource readerErrChan chan error + mode QueryExecMode } func (ct *copyFrom) run(ctx context.Context) (int64, error) { + if ct.conn.copyFromTracer != nil { + ctx = ct.conn.copyFromTracer.TraceCopyFromStart(ctx, ct.conn, TraceCopyFromStartData{ + TableName: ct.tableName, + ColumnNames: ct.columnNames, + }) + } + quotedTableName := ct.tableName.Sanitize() cbuf := &bytes.Buffer{} for i, cn := range ct.columnNames { @@ -99,9 +133,29 @@ func (ct *copyFrom) run(ctx context.Context) (int64, error) { } quotedColumnNames := cbuf.String() - sd, err := ct.conn.Prepare(ctx, "", fmt.Sprintf("select %s from %s", quotedColumnNames, quotedTableName)) - if err != nil { - return 0, err + var sd *pgconn.StatementDescription + switch ct.mode { + case QueryExecModeExec, QueryExecModeSimpleProtocol: + // These modes don't support the binary format. Before the inclusion of the + // QueryExecModes, Conn.Prepare was called on every COPY operation to get + // the OIDs. These prepared statements were not cached. + // + // Since that's the same behavior provided by QueryExecModeDescribeExec, + // we'll default to that mode. + ct.mode = QueryExecModeDescribeExec + fallthrough + case QueryExecModeCacheStatement, QueryExecModeCacheDescribe, QueryExecModeDescribeExec: + var err error + sd, err = ct.conn.getStatementDescription( + ctx, + ct.mode, + fmt.Sprintf("select %s from %s", quotedColumnNames, quotedTableName), + ) + if err != nil { + return 0, fmt.Errorf("statement description failed: %w", err) + } + default: + return 0, fmt.Errorf("unknown QueryExecMode: %v", ct.mode) } r, w := io.Pipe() @@ -145,29 +199,29 @@ func (ct *copyFrom) run(ctx context.Context) (int64, error) { w.Close() }() - startTime := time.Now() - commandTag, err := ct.conn.pgConn.CopyFrom(ctx, r, fmt.Sprintf("copy %s ( %s ) from stdin binary;", quotedTableName, quotedColumnNames)) r.Close() <-doneChan - rowsAffected := commandTag.RowsAffected() - endTime := time.Now() - if err == nil { - if ct.conn.shouldLog(LogLevelInfo) { - ct.conn.log(ctx, LogLevelInfo, "CopyFrom", map[string]interface{}{"tableName": ct.tableName, "columnNames": ct.columnNames, "time": endTime.Sub(startTime), "rowCount": rowsAffected}) - } - } else if ct.conn.shouldLog(LogLevelError) { - ct.conn.log(ctx, LogLevelError, "CopyFrom", map[string]interface{}{"err": err, "tableName": ct.tableName, "columnNames": ct.columnNames, "time": endTime.Sub(startTime)}) + if ct.conn.copyFromTracer != nil { + ct.conn.copyFromTracer.TraceCopyFromEnd(ctx, ct.conn, TraceCopyFromEndData{ + CommandTag: commandTag, + Err: err, + }) } - return rowsAffected, err + return commandTag.RowsAffected(), err } func (ct *copyFrom) buildCopyBuf(buf []byte, sd *pgconn.StatementDescription) (bool, []byte, error) { + const sendBufSize = 65536 - 5 // The packet has a 5-byte header + lastBufLen := 0 + largestRowLen := 0 for ct.rowSrc.Next() { + lastBufLen = len(buf) + values, err := ct.rowSrc.Values() if err != nil { return false, nil, err @@ -178,13 +232,21 @@ func (ct *copyFrom) buildCopyBuf(buf []byte, sd *pgconn.StatementDescription) (b buf = pgio.AppendInt16(buf, int16(len(ct.columnNames))) for i, val := range values { - buf, err = encodePreparedStatementArgument(ct.conn.connInfo, buf, sd.Fields[i].DataTypeOID, val) + buf, err = encodeCopyValue(ct.conn.typeMap, buf, sd.Fields[i].DataTypeOID, val) if err != nil { return false, nil, err } } - if len(buf) > 65536 { + rowLen := len(buf) - lastBufLen + if rowLen > largestRowLen { + largestRowLen = rowLen + } + + // Try not to overflow size of the buffer PgConn.CopyFrom will be reading into. If that happens then the nature of + // io.Pipe means that the next Read will be short. This can lead to pathological send sizes such as 65531, 13, 65531 + // 13, 65531, 13, 65531, 13. + if len(buf) > sendBufSize-largestRowLen { return true, buf, nil } } @@ -192,12 +254,14 @@ func (ct *copyFrom) buildCopyBuf(buf []byte, sd *pgconn.StatementDescription) (b return false, buf, nil } -// CopyFrom uses the PostgreSQL copy protocol to perform bulk data insertion. -// It returns the number of rows copied and an error. +// CopyFrom uses the PostgreSQL copy protocol to perform bulk data insertion. It returns the number of rows copied and +// an error. +// +// CopyFrom requires all values use the binary format. A pgtype.Type that supports the binary format must be registered +// for the type of each column. Almost all types implemented by pgx support the binary format. // -// CopyFrom requires all values use the binary format. Almost all types -// implemented by pgx use the binary format by default. Types implementing -// Encoder can only be used if they encode to the binary format. +// Even though enum types appear to be strings they still must be registered to use with CopyFrom. This can be done with +// Conn.LoadType and pgtype.Map.RegisterType. func (c *Conn) CopyFrom(ctx context.Context, tableName Identifier, columnNames []string, rowSrc CopyFromSource) (int64, error) { ct := ©From{ conn: c, @@ -205,6 +269,7 @@ func (c *Conn) CopyFrom(ctx context.Context, tableName Identifier, columnNames [ columnNames: columnNames, rowSrc: rowSrc, readerErrChan: make(chan error), + mode: c.config.DefaultQueryExecMode, } return ct.run(ctx) diff --git a/vendor/github.com/jackc/pgx/v5/derived_types.go b/vendor/github.com/jackc/pgx/v5/derived_types.go new file mode 100644 index 0000000..22ab069 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/derived_types.go @@ -0,0 +1,262 @@ +package pgx + +import ( + "context" + "fmt" + "regexp" + "strconv" + "strings" + + "github.com/jackc/pgx/v5/pgtype" +) + +/* +buildLoadDerivedTypesSQL generates the correct query for retrieving type information. + + pgVersion: the major version of the PostgreSQL server + typeNames: the names of the types to load. If nil, load all types. +*/ +func buildLoadDerivedTypesSQL(pgVersion int64, typeNames []string) string { + supportsMultirange := (pgVersion >= 14) + var typeNamesClause string + + if typeNames == nil { + // This should not occur; this will not return any types + typeNamesClause = "= ''" + } else { + typeNamesClause = "= ANY($1)" + } + parts := make([]string, 0, 10) + + // Each of the type names provided might be found in pg_class or pg_type. + // Additionally, it may or may not include a schema portion. + parts = append(parts, ` +WITH RECURSIVE +-- find the OIDs in pg_class which match one of the provided type names +selected_classes(oid,reltype) AS ( + -- this query uses the namespace search path, so will match type names without a schema prefix + SELECT pg_class.oid, pg_class.reltype + FROM pg_catalog.pg_class + LEFT JOIN pg_catalog.pg_namespace n ON n.oid = pg_class.relnamespace + WHERE pg_catalog.pg_table_is_visible(pg_class.oid) + AND relname `, typeNamesClause, ` +UNION ALL + -- this query will only match type names which include the schema prefix + SELECT pg_class.oid, pg_class.reltype + FROM pg_class + INNER JOIN pg_namespace ON (pg_class.relnamespace = pg_namespace.oid) + WHERE nspname || '.' || relname `, typeNamesClause, ` +), +selected_types(oid) AS ( + -- collect the OIDs from pg_types which correspond to the selected classes + SELECT reltype AS oid + FROM selected_classes +UNION ALL + -- as well as any other type names which match our criteria + SELECT pg_type.oid + FROM pg_type + LEFT OUTER JOIN pg_namespace ON (pg_type.typnamespace = pg_namespace.oid) + WHERE typname `, typeNamesClause, ` + OR nspname || '.' || typname `, typeNamesClause, ` +), +-- this builds a parent/child mapping of objects, allowing us to know +-- all the child (ie: dependent) types that a parent (type) requires +-- As can be seen, there are 3 ways this can occur (the last of which +-- is due to being a composite class, where the composite fields are children) +pc(parent, child) AS ( + SELECT parent.oid, parent.typelem + FROM pg_type parent + WHERE parent.typtype = 'b' AND parent.typelem != 0 +UNION ALL + SELECT parent.oid, parent.typbasetype + FROM pg_type parent + WHERE parent.typtypmod = -1 AND parent.typbasetype != 0 +UNION ALL + SELECT pg_type.oid, atttypid + FROM pg_attribute + INNER JOIN pg_class ON (pg_class.oid = pg_attribute.attrelid) + INNER JOIN pg_type ON (pg_type.oid = pg_class.reltype) + WHERE NOT attisdropped + AND attnum > 0 +), +-- Now construct a recursive query which includes a 'depth' element. +-- This is used to ensure that the "youngest" children are registered before +-- their parents. +relationships(parent, child, depth) AS ( + SELECT DISTINCT 0::OID, selected_types.oid, 0 + FROM selected_types +UNION ALL + SELECT pg_type.oid AS parent, pg_attribute.atttypid AS child, 1 + FROM selected_classes c + inner join pg_type ON (c.reltype = pg_type.oid) + inner join pg_attribute on (c.oid = pg_attribute.attrelid) +UNION ALL + SELECT pc.parent, pc.child, relationships.depth + 1 + FROM pc + INNER JOIN relationships ON (pc.parent = relationships.child) +), +-- composite fields need to be encapsulated as a couple of arrays to provide the required information for registration +composite AS ( + SELECT pg_type.oid, ARRAY_AGG(attname ORDER BY attnum) AS attnames, ARRAY_AGG(atttypid ORDER BY ATTNUM) AS atttypids + FROM pg_attribute + INNER JOIN pg_class ON (pg_class.oid = pg_attribute.attrelid) + INNER JOIN pg_type ON (pg_type.oid = pg_class.reltype) + WHERE NOT attisdropped + AND attnum > 0 + GROUP BY pg_type.oid +) +-- Bring together this information, showing all the information which might possibly be required +-- to complete the registration, applying filters to only show the items which relate to the selected +-- types/classes. +SELECT typname, + pg_namespace.nspname, + typtype, + typbasetype, + typelem, + pg_type.oid,`) + if supportsMultirange { + parts = append(parts, ` + COALESCE(multirange.rngtypid, 0) AS rngtypid,`) + } else { + parts = append(parts, ` + 0 AS rngtypid,`) + } + parts = append(parts, ` + COALESCE(pg_range.rngsubtype, 0) AS rngsubtype, + attnames, atttypids + FROM relationships + INNER JOIN pg_type ON (pg_type.oid = relationships.child) + LEFT OUTER JOIN pg_range ON (pg_type.oid = pg_range.rngtypid)`) + if supportsMultirange { + parts = append(parts, ` + LEFT OUTER JOIN pg_range multirange ON (pg_type.oid = multirange.rngmultitypid)`) + } + + parts = append(parts, ` + LEFT OUTER JOIN composite USING (oid) + LEFT OUTER JOIN pg_namespace ON (pg_type.typnamespace = pg_namespace.oid) + WHERE NOT (typtype = 'b' AND typelem = 0)`) + parts = append(parts, ` + GROUP BY typname, pg_namespace.nspname, typtype, typbasetype, typelem, pg_type.oid, pg_range.rngsubtype,`) + if supportsMultirange { + parts = append(parts, ` + multirange.rngtypid,`) + } + parts = append(parts, ` + attnames, atttypids + ORDER BY MAX(depth) desc, typname;`) + return strings.Join(parts, "") +} + +type derivedTypeInfo struct { + Oid, Typbasetype, Typelem, Rngsubtype, Rngtypid uint32 + TypeName, Typtype, NspName string + Attnames []string + Atttypids []uint32 +} + +// LoadTypes performs a single (complex) query, returning all the required +// information to register the named types, as well as any other types directly +// or indirectly required to complete the registration. +// The result of this call can be passed into RegisterTypes to complete the process. +func (c *Conn) LoadTypes(ctx context.Context, typeNames []string) ([]*pgtype.Type, error) { + m := c.TypeMap() + if typeNames == nil || len(typeNames) == 0 { + return nil, fmt.Errorf("No type names were supplied.") + } + + // Disregard server version errors. This will result in + // the SQL not support recent structures such as multirange + serverVersion, _ := serverVersion(c) + sql := buildLoadDerivedTypesSQL(serverVersion, typeNames) + var rows Rows + var err error + if typeNames == nil { + rows, err = c.Query(ctx, sql, QueryExecModeSimpleProtocol) + } else { + rows, err = c.Query(ctx, sql, QueryExecModeSimpleProtocol, typeNames) + } + if err != nil { + return nil, fmt.Errorf("While generating load types query: %w", err) + } + defer rows.Close() + result := make([]*pgtype.Type, 0, 100) + for rows.Next() { + ti := derivedTypeInfo{} + err = rows.Scan(&ti.TypeName, &ti.NspName, &ti.Typtype, &ti.Typbasetype, &ti.Typelem, &ti.Oid, &ti.Rngtypid, &ti.Rngsubtype, &ti.Attnames, &ti.Atttypids) + if err != nil { + return nil, fmt.Errorf("While scanning type information: %w", err) + } + var type_ *pgtype.Type + switch ti.Typtype { + case "b": // array + dt, ok := m.TypeForOID(ti.Typelem) + if !ok { + return nil, fmt.Errorf("Array element OID %v not registered while loading pgtype %q", ti.Typelem, ti.TypeName) + } + type_ = &pgtype.Type{Name: ti.TypeName, OID: ti.Oid, Codec: &pgtype.ArrayCodec{ElementType: dt}} + case "c": // composite + var fields []pgtype.CompositeCodecField + for i, fieldName := range ti.Attnames { + dt, ok := m.TypeForOID(ti.Atttypids[i]) + if !ok { + return nil, fmt.Errorf("Unknown field for composite type %q: field %q (OID %v) is not already registered.", ti.TypeName, fieldName, ti.Atttypids[i]) + } + fields = append(fields, pgtype.CompositeCodecField{Name: fieldName, Type: dt}) + } + + type_ = &pgtype.Type{Name: ti.TypeName, OID: ti.Oid, Codec: &pgtype.CompositeCodec{Fields: fields}} + case "d": // domain + dt, ok := m.TypeForOID(ti.Typbasetype) + if !ok { + return nil, fmt.Errorf("Domain base type OID %v was not already registered, needed for %q", ti.Typbasetype, ti.TypeName) + } + + type_ = &pgtype.Type{Name: ti.TypeName, OID: ti.Oid, Codec: dt.Codec} + case "e": // enum + type_ = &pgtype.Type{Name: ti.TypeName, OID: ti.Oid, Codec: &pgtype.EnumCodec{}} + case "r": // range + dt, ok := m.TypeForOID(ti.Rngsubtype) + if !ok { + return nil, fmt.Errorf("Range element OID %v was not already registered, needed for %q", ti.Rngsubtype, ti.TypeName) + } + + type_ = &pgtype.Type{Name: ti.TypeName, OID: ti.Oid, Codec: &pgtype.RangeCodec{ElementType: dt}} + case "m": // multirange + dt, ok := m.TypeForOID(ti.Rngtypid) + if !ok { + return nil, fmt.Errorf("Multirange element OID %v was not already registered, needed for %q", ti.Rngtypid, ti.TypeName) + } + + type_ = &pgtype.Type{Name: ti.TypeName, OID: ti.Oid, Codec: &pgtype.MultirangeCodec{ElementType: dt}} + default: + return nil, fmt.Errorf("Unknown typtype %q was found while registering %q", ti.Typtype, ti.TypeName) + } + if type_ != nil { + m.RegisterType(type_) + if ti.NspName != "" { + nspType := &pgtype.Type{Name: ti.NspName + "." + type_.Name, OID: type_.OID, Codec: type_.Codec} + m.RegisterType(nspType) + result = append(result, nspType) + } + result = append(result, type_) + } + } + return result, nil +} + +// serverVersion returns the postgresql server version. +func serverVersion(c *Conn) (int64, error) { + serverVersionStr := c.PgConn().ParameterStatus("server_version") + serverVersionStr = regexp.MustCompile(`^[0-9]+`).FindString(serverVersionStr) + // if not PostgreSQL do nothing + if serverVersionStr == "" { + return 0, fmt.Errorf("Cannot identify server version in %q", serverVersionStr) + } + + version, err := strconv.ParseInt(serverVersionStr, 10, 64) + if err != nil { + return 0, fmt.Errorf("postgres version parsing failed: %w", err) + } + return version, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/doc.go b/vendor/github.com/jackc/pgx/v5/doc.go new file mode 100644 index 0000000..0e91d64 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/doc.go @@ -0,0 +1,194 @@ +// Package pgx is a PostgreSQL database driver. +/* +pgx provides a native PostgreSQL driver and can act as a database/sql driver. The native PostgreSQL interface is similar +to the database/sql interface while providing better speed and access to PostgreSQL specific features. Use +github.com/jackc/pgx/v5/stdlib to use pgx as a database/sql compatible driver. See that package's documentation for +details. + +Establishing a Connection + +The primary way of establishing a connection is with [pgx.Connect]: + + conn, err := pgx.Connect(context.Background(), os.Getenv("DATABASE_URL")) + +The database connection string can be in URL or key/value format. Both PostgreSQL settings and pgx settings can be +specified here. In addition, a config struct can be created by [ParseConfig] and modified before establishing the +connection with [ConnectConfig] to configure settings such as tracing that cannot be configured with a connection +string. + +Connection Pool + +[*pgx.Conn] represents a single connection to the database and is not concurrency safe. Use package +github.com/jackc/pgx/v5/pgxpool for a concurrency safe connection pool. + +Query Interface + +pgx implements Query in the familiar database/sql style. However, pgx provides generic functions such as CollectRows and +ForEachRow that are a simpler and safer way of processing rows than manually calling defer rows.Close(), rows.Next(), +rows.Scan, and rows.Err(). + +CollectRows can be used collect all returned rows into a slice. + + rows, _ := conn.Query(context.Background(), "select generate_series(1,$1)", 5) + numbers, err := pgx.CollectRows(rows, pgx.RowTo[int32]) + if err != nil { + return err + } + // numbers => [1 2 3 4 5] + +ForEachRow can be used to execute a callback function for every row. This is often easier than iterating over rows +directly. + + var sum, n int32 + rows, _ := conn.Query(context.Background(), "select generate_series(1,$1)", 10) + _, err := pgx.ForEachRow(rows, []any{&n}, func() error { + sum += n + return nil + }) + if err != nil { + return err + } + +pgx also implements QueryRow in the same style as database/sql. + + var name string + var weight int64 + err := conn.QueryRow(context.Background(), "select name, weight from widgets where id=$1", 42).Scan(&name, &weight) + if err != nil { + return err + } + +Use Exec to execute a query that does not return a result set. + + commandTag, err := conn.Exec(context.Background(), "delete from widgets where id=$1", 42) + if err != nil { + return err + } + if commandTag.RowsAffected() != 1 { + return errors.New("No row found to delete") + } + +PostgreSQL Data Types + +pgx uses the pgtype package to converting Go values to and from PostgreSQL values. It supports many PostgreSQL types +directly and is customizable and extendable. User defined data types such as enums, domains, and composite types may +require type registration. See that package's documentation for details. + +Transactions + +Transactions are started by calling Begin. + + tx, err := conn.Begin(context.Background()) + if err != nil { + return err + } + // Rollback is safe to call even if the tx is already closed, so if + // the tx commits successfully, this is a no-op + defer tx.Rollback(context.Background()) + + _, err = tx.Exec(context.Background(), "insert into foo(id) values (1)") + if err != nil { + return err + } + + err = tx.Commit(context.Background()) + if err != nil { + return err + } + +The Tx returned from Begin also implements the Begin method. This can be used to implement pseudo nested transactions. +These are internally implemented with savepoints. + +Use BeginTx to control the transaction mode. BeginTx also can be used to ensure a new transaction is created instead of +a pseudo nested transaction. + +BeginFunc and BeginTxFunc are functions that begin a transaction, execute a function, and commit or rollback the +transaction depending on the return value of the function. These can be simpler and less error prone to use. + + err = pgx.BeginFunc(context.Background(), conn, func(tx pgx.Tx) error { + _, err := tx.Exec(context.Background(), "insert into foo(id) values (1)") + return err + }) + if err != nil { + return err + } + +Prepared Statements + +Prepared statements can be manually created with the Prepare method. However, this is rarely necessary because pgx +includes an automatic statement cache by default. Queries run through the normal Query, QueryRow, and Exec functions are +automatically prepared on first execution and the prepared statement is reused on subsequent executions. See ParseConfig +for information on how to customize or disable the statement cache. + +Copy Protocol + +Use CopyFrom to efficiently insert multiple rows at a time using the PostgreSQL copy protocol. CopyFrom accepts a +CopyFromSource interface. If the data is already in a [][]any use CopyFromRows to wrap it in a CopyFromSource interface. +Or implement CopyFromSource to avoid buffering the entire data set in memory. + + rows := [][]any{ + {"John", "Smith", int32(36)}, + {"Jane", "Doe", int32(29)}, + } + + copyCount, err := conn.CopyFrom( + context.Background(), + pgx.Identifier{"people"}, + []string{"first_name", "last_name", "age"}, + pgx.CopyFromRows(rows), + ) + +When you already have a typed array using CopyFromSlice can be more convenient. + + rows := []User{ + {"John", "Smith", 36}, + {"Jane", "Doe", 29}, + } + + copyCount, err := conn.CopyFrom( + context.Background(), + pgx.Identifier{"people"}, + []string{"first_name", "last_name", "age"}, + pgx.CopyFromSlice(len(rows), func(i int) ([]any, error) { + return []any{rows[i].FirstName, rows[i].LastName, rows[i].Age}, nil + }), + ) + +CopyFrom can be faster than an insert with as few as 5 rows. + +Listen and Notify + +pgx can listen to the PostgreSQL notification system with the `Conn.WaitForNotification` method. It blocks until a +notification is received or the context is canceled. + + _, err := conn.Exec(context.Background(), "listen channelname") + if err != nil { + return err + } + + notification, err := conn.WaitForNotification(context.Background()) + if err != nil { + return err + } + // do something with notification + + +Tracing and Logging + +pgx supports tracing by setting ConnConfig.Tracer. To combine several tracers you can use the multitracer.Tracer. + +In addition, the tracelog package provides the TraceLog type which lets a traditional logger act as a Tracer. + +For debug tracing of the actual PostgreSQL wire protocol messages see github.com/jackc/pgx/v5/pgproto3. + +Lower Level PostgreSQL Functionality + +github.com/jackc/pgx/v5/pgconn contains a lower level PostgreSQL driver roughly at the level of libpq. pgx.Conn in +implemented on top of pgconn. The Conn.PgConn() method can be used to access this lower layer. + +PgBouncer + +By default pgx automatically uses prepared statements. Prepared statements are incompatible with PgBouncer. This can be +disabled by setting a different QueryExecMode in ConnConfig.DefaultQueryExecMode. +*/ +package pgx diff --git a/vendor/github.com/jackc/pgx/v5/extended_query_builder.go b/vendor/github.com/jackc/pgx/v5/extended_query_builder.go new file mode 100644 index 0000000..526b0e9 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/extended_query_builder.go @@ -0,0 +1,146 @@ +package pgx + +import ( + "fmt" + + "github.com/jackc/pgx/v5/pgconn" + "github.com/jackc/pgx/v5/pgtype" +) + +// ExtendedQueryBuilder is used to choose the parameter formats, to format the parameters and to choose the result +// formats for an extended query. +type ExtendedQueryBuilder struct { + ParamValues [][]byte + paramValueBytes []byte + ParamFormats []int16 + ResultFormats []int16 +} + +// Build sets ParamValues, ParamFormats, and ResultFormats for use with *PgConn.ExecParams or *PgConn.ExecPrepared. If +// sd is nil then QueryExecModeExec behavior will be used. +func (eqb *ExtendedQueryBuilder) Build(m *pgtype.Map, sd *pgconn.StatementDescription, args []any) error { + eqb.reset() + + if sd == nil { + for i := range args { + err := eqb.appendParam(m, 0, pgtype.TextFormatCode, args[i]) + if err != nil { + err = fmt.Errorf("failed to encode args[%d]: %w", i, err) + return err + } + } + return nil + } + + if len(sd.ParamOIDs) != len(args) { + return fmt.Errorf("mismatched param and argument count") + } + + for i := range args { + err := eqb.appendParam(m, sd.ParamOIDs[i], -1, args[i]) + if err != nil { + err = fmt.Errorf("failed to encode args[%d]: %w", i, err) + return err + } + } + + for i := range sd.Fields { + eqb.appendResultFormat(m.FormatCodeForOID(sd.Fields[i].DataTypeOID)) + } + + return nil +} + +// appendParam appends a parameter to the query. format may be -1 to automatically choose the format. If arg is nil it +// must be an untyped nil. +func (eqb *ExtendedQueryBuilder) appendParam(m *pgtype.Map, oid uint32, format int16, arg any) error { + if format == -1 { + preferredFormat := eqb.chooseParameterFormatCode(m, oid, arg) + preferredErr := eqb.appendParam(m, oid, preferredFormat, arg) + if preferredErr == nil { + return nil + } + + var otherFormat int16 + if preferredFormat == TextFormatCode { + otherFormat = BinaryFormatCode + } else { + otherFormat = TextFormatCode + } + + otherErr := eqb.appendParam(m, oid, otherFormat, arg) + if otherErr == nil { + return nil + } + + return preferredErr // return the error from the preferred format + } + + v, err := eqb.encodeExtendedParamValue(m, oid, format, arg) + if err != nil { + return err + } + + eqb.ParamFormats = append(eqb.ParamFormats, format) + eqb.ParamValues = append(eqb.ParamValues, v) + + return nil +} + +// appendResultFormat appends a result format to the query. +func (eqb *ExtendedQueryBuilder) appendResultFormat(format int16) { + eqb.ResultFormats = append(eqb.ResultFormats, format) +} + +// reset readies eqb to build another query. +func (eqb *ExtendedQueryBuilder) reset() { + eqb.ParamValues = eqb.ParamValues[0:0] + eqb.paramValueBytes = eqb.paramValueBytes[0:0] + eqb.ParamFormats = eqb.ParamFormats[0:0] + eqb.ResultFormats = eqb.ResultFormats[0:0] + + if cap(eqb.ParamValues) > 64 { + eqb.ParamValues = make([][]byte, 0, 64) + } + + if cap(eqb.paramValueBytes) > 256 { + eqb.paramValueBytes = make([]byte, 0, 256) + } + + if cap(eqb.ParamFormats) > 64 { + eqb.ParamFormats = make([]int16, 0, 64) + } + if cap(eqb.ResultFormats) > 64 { + eqb.ResultFormats = make([]int16, 0, 64) + } +} + +func (eqb *ExtendedQueryBuilder) encodeExtendedParamValue(m *pgtype.Map, oid uint32, formatCode int16, arg any) ([]byte, error) { + if eqb.paramValueBytes == nil { + eqb.paramValueBytes = make([]byte, 0, 128) + } + + pos := len(eqb.paramValueBytes) + + buf, err := m.Encode(oid, formatCode, arg, eqb.paramValueBytes) + if err != nil { + return nil, err + } + if buf == nil { + return nil, nil + } + eqb.paramValueBytes = buf + return eqb.paramValueBytes[pos:], nil +} + +// chooseParameterFormatCode determines the correct format code for an +// argument to a prepared statement. It defaults to TextFormatCode if no +// determination can be made. +func (eqb *ExtendedQueryBuilder) chooseParameterFormatCode(m *pgtype.Map, oid uint32, arg any) int16 { + switch arg.(type) { + case string, *string: + return TextFormatCode + } + + return m.FormatCodeForOID(oid) +} diff --git a/vendor/github.com/jackc/pgx/v5/internal/iobufpool/iobufpool.go b/vendor/github.com/jackc/pgx/v5/internal/iobufpool/iobufpool.go new file mode 100644 index 0000000..89e0c22 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/internal/iobufpool/iobufpool.go @@ -0,0 +1,70 @@ +// Package iobufpool implements a global segregated-fit pool of buffers for IO. +// +// It uses *[]byte instead of []byte to avoid the sync.Pool allocation with Put. Unfortunately, using a pointer to avoid +// an allocation is purposely not documented. https://github.com/golang/go/issues/16323 +package iobufpool + +import "sync" + +const minPoolExpOf2 = 8 + +var pools [18]*sync.Pool + +func init() { + for i := range pools { + bufLen := 1 << (minPoolExpOf2 + i) + pools[i] = &sync.Pool{ + New: func() any { + buf := make([]byte, bufLen) + return &buf + }, + } + } +} + +// Get gets a []byte of len size with cap <= size*2. +func Get(size int) *[]byte { + i := getPoolIdx(size) + if i >= len(pools) { + buf := make([]byte, size) + return &buf + } + + ptrBuf := (pools[i].Get().(*[]byte)) + *ptrBuf = (*ptrBuf)[:size] + + return ptrBuf +} + +func getPoolIdx(size int) int { + size-- + size >>= minPoolExpOf2 + i := 0 + for size > 0 { + size >>= 1 + i++ + } + + return i +} + +// Put returns buf to the pool. +func Put(buf *[]byte) { + i := putPoolIdx(cap(*buf)) + if i < 0 { + return + } + + pools[i].Put(buf) +} + +func putPoolIdx(size int) int { + minPoolSize := 1 << minPoolExpOf2 + for i := range pools { + if size == minPoolSize< 0") + } + if argIdx >= len(args) { return "", fmt.Errorf("insufficient arguments") } @@ -61,7 +66,7 @@ func (q *Query) Sanitize(args ...interface{}) (string, error) { // Prevent SQL injection via Line Comment Creation // https://github.com/jackc/pgx/security/advisories/GHSA-m7wr-2xf7-cm9p - str = "(" + str + ")" + str = " " + str + " " default: return "", fmt.Errorf("invalid Part type: %T", part) } @@ -317,7 +322,7 @@ func multilineCommentState(l *sqlLexer) stateFn { // SanitizeSQL replaces placeholder values with args. It quotes and escapes args // as necessary. This function is only safe when standard_conforming_strings is // on. -func SanitizeSQL(sql string, args ...interface{}) (string, error) { +func SanitizeSQL(sql string, args ...any) (string, error) { query, err := NewQuery(sql) if err != nil { return "", err diff --git a/vendor/github.com/jackc/pgx/v5/internal/stmtcache/lru_cache.go b/vendor/github.com/jackc/pgx/v5/internal/stmtcache/lru_cache.go new file mode 100644 index 0000000..dec83f4 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/internal/stmtcache/lru_cache.go @@ -0,0 +1,112 @@ +package stmtcache + +import ( + "container/list" + + "github.com/jackc/pgx/v5/pgconn" +) + +// LRUCache implements Cache with a Least Recently Used (LRU) cache. +type LRUCache struct { + cap int + m map[string]*list.Element + l *list.List + invalidStmts []*pgconn.StatementDescription +} + +// NewLRUCache creates a new LRUCache. cap is the maximum size of the cache. +func NewLRUCache(cap int) *LRUCache { + return &LRUCache{ + cap: cap, + m: make(map[string]*list.Element), + l: list.New(), + } +} + +// Get returns the statement description for sql. Returns nil if not found. +func (c *LRUCache) Get(key string) *pgconn.StatementDescription { + if el, ok := c.m[key]; ok { + c.l.MoveToFront(el) + return el.Value.(*pgconn.StatementDescription) + } + + return nil + +} + +// Put stores sd in the cache. Put panics if sd.SQL is "". Put does nothing if sd.SQL already exists in the cache or +// sd.SQL has been invalidated and HandleInvalidated has not been called yet. +func (c *LRUCache) Put(sd *pgconn.StatementDescription) { + if sd.SQL == "" { + panic("cannot store statement description with empty SQL") + } + + if _, present := c.m[sd.SQL]; present { + return + } + + // The statement may have been invalidated but not yet handled. Do not readd it to the cache. + for _, invalidSD := range c.invalidStmts { + if invalidSD.SQL == sd.SQL { + return + } + } + + if c.l.Len() == c.cap { + c.invalidateOldest() + } + + el := c.l.PushFront(sd) + c.m[sd.SQL] = el +} + +// Invalidate invalidates statement description identified by sql. Does nothing if not found. +func (c *LRUCache) Invalidate(sql string) { + if el, ok := c.m[sql]; ok { + delete(c.m, sql) + c.invalidStmts = append(c.invalidStmts, el.Value.(*pgconn.StatementDescription)) + c.l.Remove(el) + } +} + +// InvalidateAll invalidates all statement descriptions. +func (c *LRUCache) InvalidateAll() { + el := c.l.Front() + for el != nil { + c.invalidStmts = append(c.invalidStmts, el.Value.(*pgconn.StatementDescription)) + el = el.Next() + } + + c.m = make(map[string]*list.Element) + c.l = list.New() +} + +// GetInvalidated returns a slice of all statement descriptions invalidated since the last call to RemoveInvalidated. +func (c *LRUCache) GetInvalidated() []*pgconn.StatementDescription { + return c.invalidStmts +} + +// RemoveInvalidated removes all invalidated statement descriptions. No other calls to Cache must be made between a +// call to GetInvalidated and RemoveInvalidated or RemoveInvalidated may remove statement descriptions that were +// never seen by the call to GetInvalidated. +func (c *LRUCache) RemoveInvalidated() { + c.invalidStmts = nil +} + +// Len returns the number of cached prepared statement descriptions. +func (c *LRUCache) Len() int { + return c.l.Len() +} + +// Cap returns the maximum number of cached prepared statement descriptions. +func (c *LRUCache) Cap() int { + return c.cap +} + +func (c *LRUCache) invalidateOldest() { + oldest := c.l.Back() + sd := oldest.Value.(*pgconn.StatementDescription) + c.invalidStmts = append(c.invalidStmts, sd) + delete(c.m, sd.SQL) + c.l.Remove(oldest) +} diff --git a/vendor/github.com/jackc/pgx/v5/internal/stmtcache/stmtcache.go b/vendor/github.com/jackc/pgx/v5/internal/stmtcache/stmtcache.go new file mode 100644 index 0000000..d57bdd2 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/internal/stmtcache/stmtcache.go @@ -0,0 +1,45 @@ +// Package stmtcache is a cache for statement descriptions. +package stmtcache + +import ( + "crypto/sha256" + "encoding/hex" + + "github.com/jackc/pgx/v5/pgconn" +) + +// StatementName returns a statement name that will be stable for sql across multiple connections and program +// executions. +func StatementName(sql string) string { + digest := sha256.Sum256([]byte(sql)) + return "stmtcache_" + hex.EncodeToString(digest[0:24]) +} + +// Cache caches statement descriptions. +type Cache interface { + // Get returns the statement description for sql. Returns nil if not found. + Get(sql string) *pgconn.StatementDescription + + // Put stores sd in the cache. Put panics if sd.SQL is "". Put does nothing if sd.SQL already exists in the cache. + Put(sd *pgconn.StatementDescription) + + // Invalidate invalidates statement description identified by sql. Does nothing if not found. + Invalidate(sql string) + + // InvalidateAll invalidates all statement descriptions. + InvalidateAll() + + // GetInvalidated returns a slice of all statement descriptions invalidated since the last call to RemoveInvalidated. + GetInvalidated() []*pgconn.StatementDescription + + // RemoveInvalidated removes all invalidated statement descriptions. No other calls to Cache must be made between a + // call to GetInvalidated and RemoveInvalidated or RemoveInvalidated may remove statement descriptions that were + // never seen by the call to GetInvalidated. + RemoveInvalidated() + + // Len returns the number of cached prepared statement descriptions. + Len() int + + // Cap returns the maximum number of cached prepared statement descriptions. + Cap() int +} diff --git a/vendor/github.com/jackc/pgx/v5/internal/stmtcache/unlimited_cache.go b/vendor/github.com/jackc/pgx/v5/internal/stmtcache/unlimited_cache.go new file mode 100644 index 0000000..6964132 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/internal/stmtcache/unlimited_cache.go @@ -0,0 +1,77 @@ +package stmtcache + +import ( + "math" + + "github.com/jackc/pgx/v5/pgconn" +) + +// UnlimitedCache implements Cache with no capacity limit. +type UnlimitedCache struct { + m map[string]*pgconn.StatementDescription + invalidStmts []*pgconn.StatementDescription +} + +// NewUnlimitedCache creates a new UnlimitedCache. +func NewUnlimitedCache() *UnlimitedCache { + return &UnlimitedCache{ + m: make(map[string]*pgconn.StatementDescription), + } +} + +// Get returns the statement description for sql. Returns nil if not found. +func (c *UnlimitedCache) Get(sql string) *pgconn.StatementDescription { + return c.m[sql] +} + +// Put stores sd in the cache. Put panics if sd.SQL is "". Put does nothing if sd.SQL already exists in the cache. +func (c *UnlimitedCache) Put(sd *pgconn.StatementDescription) { + if sd.SQL == "" { + panic("cannot store statement description with empty SQL") + } + + if _, present := c.m[sd.SQL]; present { + return + } + + c.m[sd.SQL] = sd +} + +// Invalidate invalidates statement description identified by sql. Does nothing if not found. +func (c *UnlimitedCache) Invalidate(sql string) { + if sd, ok := c.m[sql]; ok { + delete(c.m, sql) + c.invalidStmts = append(c.invalidStmts, sd) + } +} + +// InvalidateAll invalidates all statement descriptions. +func (c *UnlimitedCache) InvalidateAll() { + for _, sd := range c.m { + c.invalidStmts = append(c.invalidStmts, sd) + } + + c.m = make(map[string]*pgconn.StatementDescription) +} + +// GetInvalidated returns a slice of all statement descriptions invalidated since the last call to RemoveInvalidated. +func (c *UnlimitedCache) GetInvalidated() []*pgconn.StatementDescription { + return c.invalidStmts +} + +// RemoveInvalidated removes all invalidated statement descriptions. No other calls to Cache must be made between a +// call to GetInvalidated and RemoveInvalidated or RemoveInvalidated may remove statement descriptions that were +// never seen by the call to GetInvalidated. +func (c *UnlimitedCache) RemoveInvalidated() { + c.invalidStmts = nil +} + +// Len returns the number of cached prepared statement descriptions. +func (c *UnlimitedCache) Len() int { + return len(c.m) +} + +// Cap returns the maximum number of cached prepared statement descriptions. +func (c *UnlimitedCache) Cap() int { + return math.MaxInt +} diff --git a/vendor/github.com/jackc/pgx/v4/large_objects.go b/vendor/github.com/jackc/pgx/v5/large_objects.go similarity index 63% rename from vendor/github.com/jackc/pgx/v4/large_objects.go rename to vendor/github.com/jackc/pgx/v5/large_objects.go index c238ab9..9d21afd 100644 --- a/vendor/github.com/jackc/pgx/v4/large_objects.go +++ b/vendor/github.com/jackc/pgx/v5/large_objects.go @@ -4,8 +4,15 @@ import ( "context" "errors" "io" + + "github.com/jackc/pgx/v5/pgtype" ) +// The PostgreSQL wire protocol has a limit of 1 GB - 1 per message. See definition of +// PQ_LARGE_MESSAGE_LIMIT in the PostgreSQL source code. To allow for the other data +// in the message,maxLargeObjectMessageLength should be no larger than 1 GB - 1 KB. +var maxLargeObjectMessageLength = 1024*1024*1024 - 1024 + // LargeObjects is a structure used to access the large objects API. It is only valid within the transaction where it // was created. // @@ -68,32 +75,65 @@ type LargeObject struct { // Write writes p to the large object and returns the number of bytes written and an error if not all of p was written. func (o *LargeObject) Write(p []byte) (int, error) { - var n int - err := o.tx.QueryRow(o.ctx, "select lowrite($1, $2)", o.fd, p).Scan(&n) - if err != nil { - return n, err + nTotal := 0 + for { + expected := len(p) - nTotal + if expected == 0 { + break + } else if expected > maxLargeObjectMessageLength { + expected = maxLargeObjectMessageLength + } + + var n int + err := o.tx.QueryRow(o.ctx, "select lowrite($1, $2)", o.fd, p[nTotal:nTotal+expected]).Scan(&n) + if err != nil { + return nTotal, err + } + + if n < 0 { + return nTotal, errors.New("failed to write to large object") + } + + nTotal += n + + if n < expected { + return nTotal, errors.New("short write to large object") + } else if n > expected { + return nTotal, errors.New("invalid write to large object") + } } - if n < 0 { - return 0, errors.New("failed to write to large object") - } - - return n, nil + return nTotal, nil } // Read reads up to len(p) bytes into p returning the number of bytes read. func (o *LargeObject) Read(p []byte) (int, error) { - var res []byte - err := o.tx.QueryRow(o.ctx, "select loread($1, $2)", o.fd, len(p)).Scan(&res) - copy(p, res) - if err != nil { - return len(res), err + nTotal := 0 + for { + expected := len(p) - nTotal + if expected == 0 { + break + } else if expected > maxLargeObjectMessageLength { + expected = maxLargeObjectMessageLength + } + + res := pgtype.PreallocBytes(p[nTotal:]) + err := o.tx.QueryRow(o.ctx, "select loread($1, $2)", o.fd, expected).Scan(&res) + // We compute expected so that it always fits into p, so it should never happen + // that PreallocBytes's ScanBytes had to allocate a new slice. + nTotal += len(res) + if err != nil { + return nTotal, err + } + + if len(res) < expected { + return nTotal, io.EOF + } else if len(res) > expected { + return nTotal, errors.New("invalid read of large object") + } } - if len(res) < len(p) { - err = io.EOF - } - return len(res), err + return nTotal, nil } // Seek moves the current location pointer to the new location specified by offset. diff --git a/vendor/github.com/jackc/pgx/v5/named_args.go b/vendor/github.com/jackc/pgx/v5/named_args.go new file mode 100644 index 0000000..c88991e --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/named_args.go @@ -0,0 +1,295 @@ +package pgx + +import ( + "context" + "fmt" + "strconv" + "strings" + "unicode/utf8" +) + +// NamedArgs can be used as the first argument to a query method. It will replace every '@' named placeholder with a '$' +// ordinal placeholder and construct the appropriate arguments. +// +// For example, the following two queries are equivalent: +// +// conn.Query(ctx, "select * from widgets where foo = @foo and bar = @bar", pgx.NamedArgs{"foo": 1, "bar": 2}) +// conn.Query(ctx, "select * from widgets where foo = $1 and bar = $2", 1, 2) +// +// Named placeholders are case sensitive and must start with a letter or underscore. Subsequent characters can be +// letters, numbers, or underscores. +type NamedArgs map[string]any + +// RewriteQuery implements the QueryRewriter interface. +func (na NamedArgs) RewriteQuery(ctx context.Context, conn *Conn, sql string, args []any) (newSQL string, newArgs []any, err error) { + return rewriteQuery(na, sql, false) +} + +// StrictNamedArgs can be used in the same way as NamedArgs, but provided arguments are also checked to include all +// named arguments that the sql query uses, and no extra arguments. +type StrictNamedArgs map[string]any + +// RewriteQuery implements the QueryRewriter interface. +func (sna StrictNamedArgs) RewriteQuery(ctx context.Context, conn *Conn, sql string, args []any) (newSQL string, newArgs []any, err error) { + return rewriteQuery(sna, sql, true) +} + +type namedArg string + +type sqlLexer struct { + src string + start int + pos int + nested int // multiline comment nesting level. + stateFn stateFn + parts []any + + nameToOrdinal map[namedArg]int +} + +type stateFn func(*sqlLexer) stateFn + +func rewriteQuery(na map[string]any, sql string, isStrict bool) (newSQL string, newArgs []any, err error) { + l := &sqlLexer{ + src: sql, + stateFn: rawState, + nameToOrdinal: make(map[namedArg]int, len(na)), + } + + for l.stateFn != nil { + l.stateFn = l.stateFn(l) + } + + sb := strings.Builder{} + for _, p := range l.parts { + switch p := p.(type) { + case string: + sb.WriteString(p) + case namedArg: + sb.WriteRune('$') + sb.WriteString(strconv.Itoa(l.nameToOrdinal[p])) + } + } + + newArgs = make([]any, len(l.nameToOrdinal)) + for name, ordinal := range l.nameToOrdinal { + var found bool + newArgs[ordinal-1], found = na[string(name)] + if isStrict && !found { + return "", nil, fmt.Errorf("argument %s found in sql query but not present in StrictNamedArgs", name) + } + } + + if isStrict { + for name := range na { + if _, found := l.nameToOrdinal[namedArg(name)]; !found { + return "", nil, fmt.Errorf("argument %s of StrictNamedArgs not found in sql query", name) + } + } + } + + return sb.String(), newArgs, nil +} + +func rawState(l *sqlLexer) stateFn { + for { + r, width := utf8.DecodeRuneInString(l.src[l.pos:]) + l.pos += width + + switch r { + case 'e', 'E': + nextRune, width := utf8.DecodeRuneInString(l.src[l.pos:]) + if nextRune == '\'' { + l.pos += width + return escapeStringState + } + case '\'': + return singleQuoteState + case '"': + return doubleQuoteState + case '@': + nextRune, _ := utf8.DecodeRuneInString(l.src[l.pos:]) + if isLetter(nextRune) || nextRune == '_' { + if l.pos-l.start > 0 { + l.parts = append(l.parts, l.src[l.start:l.pos-width]) + } + l.start = l.pos + return namedArgState + } + case '-': + nextRune, width := utf8.DecodeRuneInString(l.src[l.pos:]) + if nextRune == '-' { + l.pos += width + return oneLineCommentState + } + case '/': + nextRune, width := utf8.DecodeRuneInString(l.src[l.pos:]) + if nextRune == '*' { + l.pos += width + return multilineCommentState + } + case utf8.RuneError: + if l.pos-l.start > 0 { + l.parts = append(l.parts, l.src[l.start:l.pos]) + l.start = l.pos + } + return nil + } + } +} + +func isLetter(r rune) bool { + return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') +} + +func namedArgState(l *sqlLexer) stateFn { + for { + r, width := utf8.DecodeRuneInString(l.src[l.pos:]) + l.pos += width + + if r == utf8.RuneError { + if l.pos-l.start > 0 { + na := namedArg(l.src[l.start:l.pos]) + if _, found := l.nameToOrdinal[na]; !found { + l.nameToOrdinal[na] = len(l.nameToOrdinal) + 1 + } + l.parts = append(l.parts, na) + l.start = l.pos + } + return nil + } else if !(isLetter(r) || (r >= '0' && r <= '9') || r == '_') { + l.pos -= width + na := namedArg(l.src[l.start:l.pos]) + if _, found := l.nameToOrdinal[na]; !found { + l.nameToOrdinal[na] = len(l.nameToOrdinal) + 1 + } + l.parts = append(l.parts, namedArg(na)) + l.start = l.pos + return rawState + } + } +} + +func singleQuoteState(l *sqlLexer) stateFn { + for { + r, width := utf8.DecodeRuneInString(l.src[l.pos:]) + l.pos += width + + switch r { + case '\'': + nextRune, width := utf8.DecodeRuneInString(l.src[l.pos:]) + if nextRune != '\'' { + return rawState + } + l.pos += width + case utf8.RuneError: + if l.pos-l.start > 0 { + l.parts = append(l.parts, l.src[l.start:l.pos]) + l.start = l.pos + } + return nil + } + } +} + +func doubleQuoteState(l *sqlLexer) stateFn { + for { + r, width := utf8.DecodeRuneInString(l.src[l.pos:]) + l.pos += width + + switch r { + case '"': + nextRune, width := utf8.DecodeRuneInString(l.src[l.pos:]) + if nextRune != '"' { + return rawState + } + l.pos += width + case utf8.RuneError: + if l.pos-l.start > 0 { + l.parts = append(l.parts, l.src[l.start:l.pos]) + l.start = l.pos + } + return nil + } + } +} + +func escapeStringState(l *sqlLexer) stateFn { + for { + r, width := utf8.DecodeRuneInString(l.src[l.pos:]) + l.pos += width + + switch r { + case '\\': + _, width = utf8.DecodeRuneInString(l.src[l.pos:]) + l.pos += width + case '\'': + nextRune, width := utf8.DecodeRuneInString(l.src[l.pos:]) + if nextRune != '\'' { + return rawState + } + l.pos += width + case utf8.RuneError: + if l.pos-l.start > 0 { + l.parts = append(l.parts, l.src[l.start:l.pos]) + l.start = l.pos + } + return nil + } + } +} + +func oneLineCommentState(l *sqlLexer) stateFn { + for { + r, width := utf8.DecodeRuneInString(l.src[l.pos:]) + l.pos += width + + switch r { + case '\\': + _, width = utf8.DecodeRuneInString(l.src[l.pos:]) + l.pos += width + case '\n', '\r': + return rawState + case utf8.RuneError: + if l.pos-l.start > 0 { + l.parts = append(l.parts, l.src[l.start:l.pos]) + l.start = l.pos + } + return nil + } + } +} + +func multilineCommentState(l *sqlLexer) stateFn { + for { + r, width := utf8.DecodeRuneInString(l.src[l.pos:]) + l.pos += width + + switch r { + case '/': + nextRune, width := utf8.DecodeRuneInString(l.src[l.pos:]) + if nextRune == '*' { + l.pos += width + l.nested++ + } + case '*': + nextRune, width := utf8.DecodeRuneInString(l.src[l.pos:]) + if nextRune != '/' { + continue + } + + l.pos += width + if l.nested == 0 { + return rawState + } + l.nested-- + + case utf8.RuneError: + if l.pos-l.start > 0 { + l.parts = append(l.parts, l.src[l.start:l.pos]) + l.start = l.pos + } + return nil + } + } +} diff --git a/vendor/github.com/jackc/pgx/v5/pgconn/README.md b/vendor/github.com/jackc/pgx/v5/pgconn/README.md new file mode 100644 index 0000000..1fe15c2 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgconn/README.md @@ -0,0 +1,29 @@ +# pgconn + +Package pgconn is a low-level PostgreSQL database driver. It operates at nearly the same level as the C library libpq. +It is primarily intended to serve as the foundation for higher level libraries such as https://github.com/jackc/pgx. +Applications should handle normal queries with a higher level library and only use pgconn directly when required for +low-level access to PostgreSQL functionality. + +## Example Usage + +```go +pgConn, err := pgconn.Connect(context.Background(), os.Getenv("DATABASE_URL")) +if err != nil { + log.Fatalln("pgconn failed to connect:", err) +} +defer pgConn.Close(context.Background()) + +result := pgConn.ExecParams(context.Background(), "SELECT email FROM users WHERE id=$1", [][]byte{[]byte("123")}, nil, nil, nil) +for result.NextRow() { + fmt.Println("User 123 has email:", string(result.Values()[0])) +} +_, err = result.Close() +if err != nil { + log.Fatalln("failed reading result:", err) +} +``` + +## Testing + +See CONTRIBUTING.md for setup instructions. diff --git a/vendor/github.com/jackc/pgconn/auth_scram.go b/vendor/github.com/jackc/pgx/v5/pgconn/auth_scram.go similarity index 95% rename from vendor/github.com/jackc/pgconn/auth_scram.go rename to vendor/github.com/jackc/pgx/v5/pgconn/auth_scram.go index 1545b7c..0649836 100644 --- a/vendor/github.com/jackc/pgconn/auth_scram.go +++ b/vendor/github.com/jackc/pgx/v5/pgconn/auth_scram.go @@ -22,7 +22,7 @@ import ( "fmt" "strconv" - "github.com/jackc/pgproto3/v2" + "github.com/jackc/pgx/v5/pgproto3" "golang.org/x/crypto/pbkdf2" "golang.org/x/text/secure/precis" ) @@ -41,16 +41,13 @@ func (c *PgConn) scramAuth(serverAuthMechanisms []string) error { AuthMechanism: "SCRAM-SHA-256", Data: sc.clientFirstMessage(), } - buf, err := saslInitialResponse.Encode(nil) - if err != nil { - return err - } - _, err = c.conn.Write(buf) + c.frontend.Send(saslInitialResponse) + err = c.flushWithPotentialWriteReadDeadlock() if err != nil { return err } - // Receive server-first-message payload in a AuthenticationSASLContinue. + // Receive server-first-message payload in an AuthenticationSASLContinue. saslContinue, err := c.rxSASLContinue() if err != nil { return err @@ -64,16 +61,13 @@ func (c *PgConn) scramAuth(serverAuthMechanisms []string) error { saslResponse := &pgproto3.SASLResponse{ Data: []byte(sc.clientFinalMessage()), } - buf, err = saslResponse.Encode(nil) - if err != nil { - return err - } - _, err = c.conn.Write(buf) + c.frontend.Send(saslResponse) + err = c.flushWithPotentialWriteReadDeadlock() if err != nil { return err } - // Receive server-final-message payload in a AuthenticationSASLFinal. + // Receive server-final-message payload in an AuthenticationSASLFinal. saslFinal, err := c.rxSASLFinal() if err != nil { return err diff --git a/vendor/github.com/jackc/pgconn/config.go b/vendor/github.com/jackc/pgx/v5/pgconn/config.go similarity index 82% rename from vendor/github.com/jackc/pgconn/config.go rename to vendor/github.com/jackc/pgx/v5/pgconn/config.go index 36b74c4..6a198e6 100644 --- a/vendor/github.com/jackc/pgconn/config.go +++ b/vendor/github.com/jackc/pgx/v5/pgconn/config.go @@ -8,7 +8,6 @@ import ( "errors" "fmt" "io" - "io/ioutil" "math" "net" "net/url" @@ -18,17 +17,17 @@ import ( "strings" "time" - "github.com/jackc/chunkreader/v2" "github.com/jackc/pgpassfile" - "github.com/jackc/pgproto3/v2" "github.com/jackc/pgservicefile" + "github.com/jackc/pgx/v5/pgconn/ctxwatch" + "github.com/jackc/pgx/v5/pgproto3" ) type AfterConnectFunc func(ctx context.Context, pgconn *PgConn) error type ValidateConnectFunc func(ctx context.Context, pgconn *PgConn) error type GetSSLPasswordFunc func(ctx context.Context) string -// Config is the settings used to establish a connection to a PostgreSQL server. It must be created by ParseConfig. A +// Config is the settings used to establish a connection to a PostgreSQL server. It must be created by [ParseConfig]. A // manually initialized Config will cause ConnectConfig to panic. type Config struct { Host string // host (e.g. localhost) or absolute path to unix domain socket directory (e.g. /private/tmp) @@ -41,7 +40,12 @@ type Config struct { DialFunc DialFunc // e.g. net.Dialer.DialContext LookupFunc LookupFunc // e.g. net.Resolver.LookupHost BuildFrontend BuildFrontendFunc - RuntimeParams map[string]string // Run-time parameters to set on connection as session default values (e.g. search_path or application_name) + + // BuildContextWatcherHandler is called to create a ContextWatcherHandler for a connection. The handler is called + // when a context passed to a PgConn method is canceled. + BuildContextWatcherHandler func(*PgConn) ctxwatch.Handler + + RuntimeParams map[string]string // Run-time parameters to set on connection as session default values (e.g. search_path or application_name) KerberosSrvName string KerberosSpn string @@ -62,12 +66,17 @@ type Config struct { // OnNotification is a callback function called when a notification from the LISTEN/NOTIFY system is received. OnNotification NotificationHandler + // OnPgError is a callback function called when a Postgres error is received by the server. The default handler will close + // the connection on any FATAL errors. If you override this handler you should call the previously set handler or ensure + // that you close on FATAL errors by returning false. + OnPgError PgErrorHandler + createdByParseConfig bool // Used to enforce created by ParseConfig rule. } -// ParseConfigOptions contains options that control how a config is built such as getsslpassword. +// ParseConfigOptions contains options that control how a config is built such as GetSSLPassword. type ParseConfigOptions struct { - // GetSSLPassword gets the password to decrypt a SSL client certificate. This is analogous to the the libpq function + // GetSSLPassword gets the password to decrypt a SSL client certificate. This is analogous to the libpq function // PQsetSSLKeyPassHook_OpenSSL. GetSSLPassword GetSSLPasswordFunc } @@ -109,6 +118,14 @@ type FallbackConfig struct { TLSConfig *tls.Config // nil disables TLS } +// connectOneConfig is the configuration for a single attempt to connect to a single host. +type connectOneConfig struct { + network string + address string + originalHostname string // original hostname before resolving + tlsConfig *tls.Config // nil disables TLS +} + // isAbsolutePath checks if the provided value is an absolute path either // beginning with a forward slash (as on Linux-based systems) or with a capital // letter A-Z followed by a colon and a backslash, e.g., "C:\", (as on Windows). @@ -143,15 +160,15 @@ func NetworkAddress(host string, port uint16) (network, address string) { // ParseConfig builds a *Config from connString with similar behavior to the PostgreSQL standard C library libpq. It // uses the same defaults as libpq (e.g. port=5432) and understands most PG* environment variables. ParseConfig closely -// matches the parsing behavior of libpq. connString may either be in URL format or keyword = value format (DSN style). -// See https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING for details. connString also may be -// empty to only read from the environment. If a password is not supplied it will attempt to read the .pgpass file. +// matches the parsing behavior of libpq. connString may either be in URL format or keyword = value format. See +// https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING for details. connString also may be empty +// to only read from the environment. If a password is not supplied it will attempt to read the .pgpass file. // -// # Example DSN -// user=jack password=secret host=pg.example.com port=5432 dbname=mydb sslmode=verify-ca +// # Example Keyword/Value +// user=jack password=secret host=pg.example.com port=5432 dbname=mydb sslmode=verify-ca // -// # Example URL -// postgres://jack:secret@pg.example.com:5432/mydb?sslmode=verify-ca +// # Example URL +// postgres://jack:secret@pg.example.com:5432/mydb?sslmode=verify-ca // // The returned *Config may be modified. However, it is strongly recommended that any configuration that can be done // through the connection string be done there. In particular the fields Host, Port, TLSConfig, and Fallbacks can be @@ -162,28 +179,28 @@ func NetworkAddress(host string, port uint16) (network, address string) { // values that will be tried in order. This can be used as part of a high availability system. See // https://www.postgresql.org/docs/11/libpq-connect.html#LIBPQ-MULTIPLE-HOSTS for more information. // -// # Example URL -// postgres://jack:secret@foo.example.com:5432,bar.example.com:5432/mydb +// # Example URL +// postgres://jack:secret@foo.example.com:5432,bar.example.com:5432/mydb // // ParseConfig currently recognizes the following environment variable and their parameter key word equivalents passed -// via database URL or DSN: +// via database URL or keyword/value: // -// PGHOST -// PGPORT -// PGDATABASE -// PGUSER -// PGPASSWORD -// PGPASSFILE -// PGSERVICE -// PGSERVICEFILE -// PGSSLMODE -// PGSSLCERT -// PGSSLKEY -// PGSSLROOTCERT -// PGSSLPASSWORD -// PGAPPNAME -// PGCONNECT_TIMEOUT -// PGTARGETSESSIONATTRS +// PGHOST +// PGPORT +// PGDATABASE +// PGUSER +// PGPASSWORD +// PGPASSFILE +// PGSERVICE +// PGSERVICEFILE +// PGSSLMODE +// PGSSLCERT +// PGSSLKEY +// PGSSLROOTCERT +// PGSSLPASSWORD +// PGAPPNAME +// PGCONNECT_TIMEOUT +// PGTARGETSESSIONATTRS // // See http://www.postgresql.org/docs/11/static/libpq-envars.html for details on the meaning of environment variables. // @@ -212,11 +229,9 @@ func NetworkAddress(host string, port uint16) (network, address string) { // // In addition, ParseConfig accepts the following options: // -// min_read_buffer_size -// The minimum size of the internal read buffer. Default 8192. -// servicefile -// libpq only reads servicefile from the PGSERVICEFILE environment variable. ParseConfig accepts servicefile as a -// part of the connection string. +// - servicefile. +// libpq only reads servicefile from the PGSERVICEFILE environment variable. ParseConfig accepts servicefile as a +// part of the connection string. func ParseConfig(connString string) (*Config, error) { var parseConfigOptions ParseConfigOptions return ParseConfigWithOptions(connString, parseConfigOptions) @@ -232,16 +247,16 @@ func ParseConfigWithOptions(connString string, options ParseConfigOptions) (*Con connStringSettings := make(map[string]string) if connString != "" { var err error - // connString may be a database URL or a DSN + // connString may be a database URL or in PostgreSQL keyword/value format if strings.HasPrefix(connString, "postgres://") || strings.HasPrefix(connString, "postgresql://") { connStringSettings, err = parseURLSettings(connString) if err != nil { - return nil, &parseConfigError{connString: connString, msg: "failed to parse as URL", err: err} + return nil, &ParseConfigError{ConnString: connString, msg: "failed to parse as URL", err: err} } } else { - connStringSettings, err = parseDSNSettings(connString) + connStringSettings, err = parseKeywordValueSettings(connString) if err != nil { - return nil, &parseConfigError{connString: connString, msg: "failed to parse as DSN", err: err} + return nil, &ParseConfigError{ConnString: connString, msg: "failed to parse as keyword/value", err: err} } } } @@ -250,30 +265,37 @@ func ParseConfigWithOptions(connString string, options ParseConfigOptions) (*Con if service, present := settings["service"]; present { serviceSettings, err := parseServiceSettings(settings["servicefile"], service) if err != nil { - return nil, &parseConfigError{connString: connString, msg: "failed to read service", err: err} + return nil, &ParseConfigError{ConnString: connString, msg: "failed to read service", err: err} } settings = mergeSettings(defaultSettings, envSettings, serviceSettings, connStringSettings) } - minReadBufferSize, err := strconv.ParseInt(settings["min_read_buffer_size"], 10, 32) - if err != nil { - return nil, &parseConfigError{connString: connString, msg: "cannot parse min_read_buffer_size", err: err} - } - config := &Config{ createdByParseConfig: true, Database: settings["database"], User: settings["user"], Password: settings["password"], RuntimeParams: make(map[string]string), - BuildFrontend: makeDefaultBuildFrontendFunc(int(minReadBufferSize)), + BuildFrontend: func(r io.Reader, w io.Writer) *pgproto3.Frontend { + return pgproto3.NewFrontend(r, w) + }, + BuildContextWatcherHandler: func(pgConn *PgConn) ctxwatch.Handler { + return &DeadlineContextWatcherHandler{Conn: pgConn.conn} + }, + OnPgError: func(_ *PgConn, pgErr *PgError) bool { + // we want to automatically close any fatal errors + if strings.EqualFold(pgErr.Severity, "FATAL") { + return false + } + return true + }, } if connectTimeoutSetting, present := settings["connect_timeout"]; present { connectTimeout, err := parseConnectTimeoutSetting(connectTimeoutSetting) if err != nil { - return nil, &parseConfigError{connString: connString, msg: "invalid connect_timeout", err: err} + return nil, &ParseConfigError{ConnString: connString, msg: "invalid connect_timeout", err: err} } config.ConnectTimeout = connectTimeout config.DialFunc = makeConnectTimeoutDialFunc(connectTimeout) @@ -301,7 +323,6 @@ func ParseConfigWithOptions(connString string, options ParseConfigOptions) (*Con "krbspn": {}, "krbsrvname": {}, "target_session_attrs": {}, - "min_read_buffer_size": {}, "service": {}, "servicefile": {}, } @@ -336,7 +357,7 @@ func ParseConfigWithOptions(connString string, options ParseConfigOptions) (*Con port, err := parsePort(portStr) if err != nil { - return nil, &parseConfigError{connString: connString, msg: "invalid port", err: err} + return nil, &ParseConfigError{ConnString: connString, msg: "invalid port", err: err} } var tlsConfigs []*tls.Config @@ -348,7 +369,7 @@ func ParseConfigWithOptions(connString string, options ParseConfigOptions) (*Con var err error tlsConfigs, err = configTLS(settings, host, options) if err != nil { - return nil, &parseConfigError{connString: connString, msg: "failed to configure TLS", err: err} + return nil, &ParseConfigError{ConnString: connString, msg: "failed to configure TLS", err: err} } } @@ -366,9 +387,9 @@ func ParseConfigWithOptions(connString string, options ParseConfigOptions) (*Con config.TLSConfig = fallbacks[0].TLSConfig config.Fallbacks = fallbacks[1:] - if config.Password == "" { - passfile, err := pgpassfile.ReadPassfile(settings["passfile"]) - if err == nil { + passfile, err := pgpassfile.ReadPassfile(settings["passfile"]) + if err == nil { + if config.Password == "" { host := config.Host if network, _ := NetworkAddress(config.Host, config.Port); network == "unix" { host = "localhost" @@ -392,7 +413,7 @@ func ParseConfigWithOptions(connString string, options ParseConfigOptions) (*Con case "any": // do nothing default: - return nil, &parseConfigError{connString: connString, msg: fmt.Sprintf("unknown target_session_attrs value: %v", tsa)} + return nil, &ParseConfigError{ConnString: connString, msg: fmt.Sprintf("unknown target_session_attrs value: %v", tsa)} } return config, nil @@ -446,14 +467,17 @@ func parseEnvSettings() map[string]string { func parseURLSettings(connString string) (map[string]string, error) { settings := make(map[string]string) - url, err := url.Parse(connString) + parsedURL, err := url.Parse(connString) if err != nil { + if urlErr := new(url.Error); errors.As(err, &urlErr) { + return nil, urlErr.Err + } return nil, err } - if url.User != nil { - settings["user"] = url.User.Username() - if password, present := url.User.Password(); present { + if parsedURL.User != nil { + settings["user"] = parsedURL.User.Username() + if password, present := parsedURL.User.Password(); present { settings["password"] = password } } @@ -461,7 +485,7 @@ func parseURLSettings(connString string) (map[string]string, error) { // Handle multiple host:port's in url.Host by splitting them into host,host,host and port,port,port. var hosts []string var ports []string - for _, host := range strings.Split(url.Host, ",") { + for _, host := range strings.Split(parsedURL.Host, ",") { if host == "" { continue } @@ -487,7 +511,7 @@ func parseURLSettings(connString string) (map[string]string, error) { settings["port"] = strings.Join(ports, ",") } - database := strings.TrimLeft(url.Path, "/") + database := strings.TrimLeft(parsedURL.Path, "/") if database != "" { settings["database"] = database } @@ -496,7 +520,7 @@ func parseURLSettings(connString string) (map[string]string, error) { "dbname": "database", } - for k, v := range url.Query() { + for k, v := range parsedURL.Query() { if k2, present := nameMap[k]; present { k = k2 } @@ -513,7 +537,7 @@ func isIPOnly(host string) bool { var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1} -func parseDSNSettings(s string) (map[string]string, error) { +func parseKeywordValueSettings(s string) (map[string]string, error) { settings := make(map[string]string) nameMap := map[string]string{ @@ -524,7 +548,7 @@ func parseDSNSettings(s string) (map[string]string, error) { var key, val string eqIdx := strings.IndexRune(s, '=') if eqIdx < 0 { - return nil, errors.New("invalid dsn") + return nil, errors.New("invalid keyword/value") } key = strings.Trim(s[:eqIdx], " \t\n\r\v\f") @@ -576,7 +600,7 @@ func parseDSNSettings(s string) (map[string]string, error) { } if key == "" { - return nil, errors.New("invalid dsn") + return nil, errors.New("invalid keyword/value") } settings[key] = val @@ -633,6 +657,36 @@ func configTLS(settings map[string]string, thisHost string, parseConfigOptions P tlsConfig := &tls.Config{} + if sslrootcert != "" { + var caCertPool *x509.CertPool + + if sslrootcert == "system" { + var err error + + caCertPool, err = x509.SystemCertPool() + if err != nil { + return nil, fmt.Errorf("unable to load system certificate pool: %w", err) + } + + sslmode = "verify-full" + } else { + caCertPool = x509.NewCertPool() + + caPath := sslrootcert + caCert, err := os.ReadFile(caPath) + if err != nil { + return nil, fmt.Errorf("unable to read CA file: %w", err) + } + + if !caCertPool.AppendCertsFromPEM(caCert) { + return nil, errors.New("unable to add CA to cert pool") + } + } + + tlsConfig.RootCAs = caCertPool + tlsConfig.ClientCAs = caCertPool + } + switch sslmode { case "disable": return []*tls.Config{nil}, nil @@ -690,33 +744,19 @@ func configTLS(settings map[string]string, thisHost string, parseConfigOptions P return nil, errors.New("sslmode is invalid") } - if sslrootcert != "" { - caCertPool := x509.NewCertPool() - - caPath := sslrootcert - caCert, err := ioutil.ReadFile(caPath) - if err != nil { - return nil, fmt.Errorf("unable to read CA file: %w", err) - } - - if !caCertPool.AppendCertsFromPEM(caCert) { - return nil, errors.New("unable to add CA to cert pool") - } - - tlsConfig.RootCAs = caCertPool - tlsConfig.ClientCAs = caCertPool - } - if (sslcert != "" && sslkey == "") || (sslcert == "" && sslkey != "") { return nil, errors.New(`both "sslcert" and "sslkey" are required`) } if sslcert != "" && sslkey != "" { - buf, err := ioutil.ReadFile(sslkey) + buf, err := os.ReadFile(sslkey) if err != nil { return nil, fmt.Errorf("unable to read sslkey: %w", err) } block, _ := pem.Decode(buf) + if block == nil { + return nil, errors.New("failed to decode sslkey") + } var pemKey []byte var decryptedKey []byte var decryptedError error @@ -751,7 +791,7 @@ func configTLS(settings map[string]string, thisHost string, parseConfigOptions P } else { pemKey = pem.EncodeToMemory(block) } - certfile, err := ioutil.ReadFile(sslcert) + certfile, err := os.ReadFile(sslcert) if err != nil { return nil, fmt.Errorf("unable to read cert: %w", err) } @@ -793,25 +833,14 @@ func parsePort(s string) (uint16, error) { } func makeDefaultDialer() *net.Dialer { - return &net.Dialer{KeepAlive: 5 * time.Minute} + // rely on GOLANG KeepAlive settings + return &net.Dialer{} } func makeDefaultResolver() *net.Resolver { return net.DefaultResolver } -func makeDefaultBuildFrontendFunc(minBufferLen int) BuildFrontendFunc { - return func(r io.Reader, w io.Writer) Frontend { - cr, err := chunkreader.NewConfig(r, chunkreader.Config{MinBufLen: minBufferLen}) - if err != nil { - panic(fmt.Sprintf("BUG: chunkreader.NewConfig failed: %v", err)) - } - frontend := pgproto3.NewFrontend(cr, w) - - return frontend - } -} - func parseConnectTimeoutSetting(s string) (time.Duration, error) { timeout, err := strconv.ParseInt(s, 10, 64) if err != nil { @@ -829,7 +858,7 @@ func makeConnectTimeoutDialFunc(timeout time.Duration) DialFunc { return d.DialContext } -// ValidateConnectTargetSessionAttrsReadWrite is an ValidateConnectFunc that implements libpq compatible +// ValidateConnectTargetSessionAttrsReadWrite is a ValidateConnectFunc that implements libpq compatible // target_session_attrs=read-write. func ValidateConnectTargetSessionAttrsReadWrite(ctx context.Context, pgConn *PgConn) error { result := pgConn.ExecParams(ctx, "show transaction_read_only", nil, nil, nil, nil).Read() @@ -844,7 +873,7 @@ func ValidateConnectTargetSessionAttrsReadWrite(ctx context.Context, pgConn *PgC return nil } -// ValidateConnectTargetSessionAttrsReadOnly is an ValidateConnectFunc that implements libpq compatible +// ValidateConnectTargetSessionAttrsReadOnly is a ValidateConnectFunc that implements libpq compatible // target_session_attrs=read-only. func ValidateConnectTargetSessionAttrsReadOnly(ctx context.Context, pgConn *PgConn) error { result := pgConn.ExecParams(ctx, "show transaction_read_only", nil, nil, nil, nil).Read() @@ -859,7 +888,7 @@ func ValidateConnectTargetSessionAttrsReadOnly(ctx context.Context, pgConn *PgCo return nil } -// ValidateConnectTargetSessionAttrsStandby is an ValidateConnectFunc that implements libpq compatible +// ValidateConnectTargetSessionAttrsStandby is a ValidateConnectFunc that implements libpq compatible // target_session_attrs=standby. func ValidateConnectTargetSessionAttrsStandby(ctx context.Context, pgConn *PgConn) error { result := pgConn.ExecParams(ctx, "select pg_is_in_recovery()", nil, nil, nil, nil).Read() @@ -874,7 +903,7 @@ func ValidateConnectTargetSessionAttrsStandby(ctx context.Context, pgConn *PgCon return nil } -// ValidateConnectTargetSessionAttrsPrimary is an ValidateConnectFunc that implements libpq compatible +// ValidateConnectTargetSessionAttrsPrimary is a ValidateConnectFunc that implements libpq compatible // target_session_attrs=primary. func ValidateConnectTargetSessionAttrsPrimary(ctx context.Context, pgConn *PgConn) error { result := pgConn.ExecParams(ctx, "select pg_is_in_recovery()", nil, nil, nil, nil).Read() @@ -889,7 +918,7 @@ func ValidateConnectTargetSessionAttrsPrimary(ctx context.Context, pgConn *PgCon return nil } -// ValidateConnectTargetSessionAttrsPreferStandby is an ValidateConnectFunc that implements libpq compatible +// ValidateConnectTargetSessionAttrsPreferStandby is a ValidateConnectFunc that implements libpq compatible // target_session_attrs=prefer-standby. func ValidateConnectTargetSessionAttrsPreferStandby(ctx context.Context, pgConn *PgConn) error { result := pgConn.ExecParams(ctx, "select pg_is_in_recovery()", nil, nil, nil, nil).Read() diff --git a/vendor/github.com/jackc/pgconn/internal/ctxwatch/context_watcher.go b/vendor/github.com/jackc/pgx/v5/pgconn/ctxwatch/context_watcher.go similarity index 71% rename from vendor/github.com/jackc/pgconn/internal/ctxwatch/context_watcher.go rename to vendor/github.com/jackc/pgx/v5/pgconn/ctxwatch/context_watcher.go index b39cb3e..db8884e 100644 --- a/vendor/github.com/jackc/pgconn/internal/ctxwatch/context_watcher.go +++ b/vendor/github.com/jackc/pgx/v5/pgconn/ctxwatch/context_watcher.go @@ -8,9 +8,8 @@ import ( // ContextWatcher watches a context and performs an action when the context is canceled. It can watch one context at a // time. type ContextWatcher struct { - onCancel func() - onUnwatchAfterCancel func() - unwatchChan chan struct{} + handler Handler + unwatchChan chan struct{} lock sync.Mutex watchInProgress bool @@ -20,11 +19,10 @@ type ContextWatcher struct { // NewContextWatcher returns a ContextWatcher. onCancel will be called when a watched context is canceled. // OnUnwatchAfterCancel will be called when Unwatch is called and the watched context had already been canceled and // onCancel called. -func NewContextWatcher(onCancel func(), onUnwatchAfterCancel func()) *ContextWatcher { +func NewContextWatcher(handler Handler) *ContextWatcher { cw := &ContextWatcher{ - onCancel: onCancel, - onUnwatchAfterCancel: onUnwatchAfterCancel, - unwatchChan: make(chan struct{}), + handler: handler, + unwatchChan: make(chan struct{}), } return cw @@ -46,7 +44,7 @@ func (cw *ContextWatcher) Watch(ctx context.Context) { go func() { select { case <-ctx.Done(): - cw.onCancel() + cw.handler.HandleCancel(ctx) cw.onCancelWasCalled = true <-cw.unwatchChan case <-cw.unwatchChan: @@ -66,8 +64,17 @@ func (cw *ContextWatcher) Unwatch() { if cw.watchInProgress { cw.unwatchChan <- struct{}{} if cw.onCancelWasCalled { - cw.onUnwatchAfterCancel() + cw.handler.HandleUnwatchAfterCancel() } cw.watchInProgress = false } } + +type Handler interface { + // HandleCancel is called when the context that a ContextWatcher is currently watching is canceled. canceledCtx is the + // context that was canceled. + HandleCancel(canceledCtx context.Context) + + // HandleUnwatchAfterCancel is called when a ContextWatcher that called HandleCancel on this Handler is unwatched. + HandleUnwatchAfterCancel() +} diff --git a/vendor/github.com/jackc/pgconn/defaults.go b/vendor/github.com/jackc/pgx/v5/pgconn/defaults.go similarity index 97% rename from vendor/github.com/jackc/pgconn/defaults.go rename to vendor/github.com/jackc/pgx/v5/pgconn/defaults.go index c7209fd..1dd514f 100644 --- a/vendor/github.com/jackc/pgconn/defaults.go +++ b/vendor/github.com/jackc/pgx/v5/pgconn/defaults.go @@ -40,8 +40,6 @@ func defaultSettings() map[string]string { settings["target_session_attrs"] = "any" - settings["min_read_buffer_size"] = "8192" - return settings } diff --git a/vendor/github.com/jackc/pgconn/defaults_windows.go b/vendor/github.com/jackc/pgx/v5/pgconn/defaults_windows.go similarity index 97% rename from vendor/github.com/jackc/pgconn/defaults_windows.go rename to vendor/github.com/jackc/pgx/v5/pgconn/defaults_windows.go index 71eb77d..33b4a1f 100644 --- a/vendor/github.com/jackc/pgconn/defaults_windows.go +++ b/vendor/github.com/jackc/pgx/v5/pgconn/defaults_windows.go @@ -46,8 +46,6 @@ func defaultSettings() map[string]string { settings["target_session_attrs"] = "any" - settings["min_read_buffer_size"] = "8192" - return settings } diff --git a/vendor/github.com/jackc/pgx/v5/pgconn/doc.go b/vendor/github.com/jackc/pgx/v5/pgconn/doc.go new file mode 100644 index 0000000..7013750 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgconn/doc.go @@ -0,0 +1,38 @@ +// Package pgconn is a low-level PostgreSQL database driver. +/* +pgconn provides lower level access to a PostgreSQL connection than a database/sql or pgx connection. It operates at +nearly the same level is the C library libpq. + +Establishing a Connection + +Use Connect to establish a connection. It accepts a connection string in URL or keyword/value format and will read the +environment for libpq style environment variables. + +Executing a Query + +ExecParams and ExecPrepared execute a single query. They return readers that iterate over each row. The Read method +reads all rows into memory. + +Executing Multiple Queries in a Single Round Trip + +Exec and ExecBatch can execute multiple queries in a single round trip. They return readers that iterate over each query +result. The ReadAll method reads all query results into memory. + +Pipeline Mode + +Pipeline mode allows sending queries without having read the results of previously sent queries. It allows control of +exactly how many and when network round trips occur. + +Context Support + +All potentially blocking operations take a context.Context. The default behavior when a context is canceled is for the +method to immediately return. In most circumstances, this will also close the underlying connection. This behavior can +be customized by using BuildContextWatcherHandler on the Config to create a ctxwatch.Handler with different behavior. +This can be especially useful when queries that are frequently canceled and the overhead of creating new connections is +a problem. DeadlineContextWatcherHandler and CancelRequestContextWatcherHandler can be used to introduce a delay before +interrupting the query in such a way as to close the connection. + +The CancelRequest method may be used to request the PostgreSQL server cancel an in-progress query without forcing the +client to abort. +*/ +package pgconn diff --git a/vendor/github.com/jackc/pgconn/errors.go b/vendor/github.com/jackc/pgx/v5/pgconn/errors.go similarity index 54% rename from vendor/github.com/jackc/pgconn/errors.go rename to vendor/github.com/jackc/pgx/v5/pgconn/errors.go index 66d3558..ec4a6d4 100644 --- a/vendor/github.com/jackc/pgconn/errors.go +++ b/vendor/github.com/jackc/pgx/v5/pgconn/errors.go @@ -12,14 +12,15 @@ import ( // SafeToRetry checks if the err is guaranteed to have occurred before sending any data to the server. func SafeToRetry(err error) bool { - if e, ok := err.(interface{ SafeToRetry() bool }); ok { - return e.SafeToRetry() + var retryableErr interface{ SafeToRetry() bool } + if errors.As(err, &retryableErr) { + return retryableErr.SafeToRetry() } return false } -// Timeout checks if err was was caused by a timeout. To be specific, it is true if err was caused within pgconn by a -// context.Canceled, context.DeadlineExceeded or an implementer of net.Error where Timeout() is true. +// Timeout checks if err was caused by a timeout. To be specific, it is true if err was caused within pgconn by a +// context.DeadlineExceeded or an implementer of net.Error where Timeout() is true. func Timeout(err error) bool { var timeoutErr *errTimeout return errors.As(err, &timeoutErr) @@ -29,23 +30,24 @@ func Timeout(err error) bool { // http://www.postgresql.org/docs/11/static/protocol-error-fields.html for // detailed field description. type PgError struct { - Severity string - Code string - Message string - Detail string - Hint string - Position int32 - InternalPosition int32 - InternalQuery string - Where string - SchemaName string - TableName string - ColumnName string - DataTypeName string - ConstraintName string - File string - Line int32 - Routine string + Severity string + SeverityUnlocalized string + Code string + Message string + Detail string + Hint string + Position int32 + InternalPosition int32 + InternalQuery string + Where string + SchemaName string + TableName string + ColumnName string + DataTypeName string + ConstraintName string + File string + Line int32 + Routine string } func (pe *PgError) Error() string { @@ -57,22 +59,37 @@ func (pe *PgError) SQLState() string { return pe.Code } -type connectError struct { - config *Config - msg string +// ConnectError is the error returned when a connection attempt fails. +type ConnectError struct { + Config *Config // The configuration that was used in the connection attempt. err error } -func (e *connectError) Error() string { - sb := &strings.Builder{} - fmt.Fprintf(sb, "failed to connect to `host=%s user=%s database=%s`: %s", e.config.Host, e.config.User, e.config.Database, e.msg) - if e.err != nil { - fmt.Fprintf(sb, " (%s)", e.err.Error()) +func (e *ConnectError) Error() string { + prefix := fmt.Sprintf("failed to connect to `user=%s database=%s`:", e.Config.User, e.Config.Database) + details := e.err.Error() + if strings.Contains(details, "\n") { + return prefix + "\n\t" + strings.ReplaceAll(details, "\n", "\n\t") + } else { + return prefix + " " + details } - return sb.String() } -func (e *connectError) Unwrap() error { +func (e *ConnectError) Unwrap() error { + return e.err +} + +type perDialConnectError struct { + address string + originalHostname string + err error +} + +func (e *perDialConnectError) Error() string { + return fmt.Sprintf("%s (%s): %s", e.address, e.originalHostname, e.err.Error()) +} + +func (e *perDialConnectError) Unwrap() error { return e.err } @@ -88,29 +105,39 @@ func (e *connLockError) Error() string { return e.status } -type parseConfigError struct { - connString string +// ParseConfigError is the error returned when a connection string cannot be parsed. +type ParseConfigError struct { + ConnString string // The connection string that could not be parsed. msg string err error } -func (e *parseConfigError) Error() string { - connString := redactPW(e.connString) +func (e *ParseConfigError) Error() string { + // Now that ParseConfigError is public and ConnString is available to the developer, perhaps it would be better only + // return a static string. That would ensure that the error message cannot leak a password. The ConnString field would + // allow access to the original string if desired and Unwrap would allow access to the underlying error. + connString := redactPW(e.ConnString) if e.err == nil { return fmt.Sprintf("cannot parse `%s`: %s", connString, e.msg) } return fmt.Sprintf("cannot parse `%s`: %s (%s)", connString, e.msg, e.err.Error()) } -func (e *parseConfigError) Unwrap() error { +func (e *ParseConfigError) Unwrap() error { return e.err } -// preferContextOverNetTimeoutError returns ctx.Err() if ctx.Err() is present and err is a net.Error with Timeout() == -// true. Otherwise returns err. -func preferContextOverNetTimeoutError(ctx context.Context, err error) error { - if err, ok := err.(net.Error); ok && err.Timeout() && ctx.Err() != nil { - return &errTimeout{err: ctx.Err()} +func normalizeTimeoutError(ctx context.Context, err error) error { + var netErr net.Error + if errors.As(err, &netErr) && netErr.Timeout() { + if ctx.Err() == context.Canceled { + // Since the timeout was caused by a context cancellation, the actual error is context.Canceled not the timeout error. + return context.Canceled + } else if ctx.Err() == context.DeadlineExceeded { + return &errTimeout{err: ctx.Err()} + } else { + return &errTimeout{err: netErr} + } } return err } @@ -178,33 +205,16 @@ func newContextAlreadyDoneError(ctx context.Context) (err error) { return &errTimeout{&contextAlreadyDoneError{err: ctx.Err()}} } -type writeError struct { - err error - safeToRetry bool -} - -func (e *writeError) Error() string { - return fmt.Sprintf("write failed: %s", e.err.Error()) -} - -func (e *writeError) SafeToRetry() bool { - return e.safeToRetry -} - -func (e *writeError) Unwrap() error { - return e.err -} - func redactPW(connString string) string { if strings.HasPrefix(connString, "postgres://") || strings.HasPrefix(connString, "postgresql://") { if u, err := url.Parse(connString); err == nil { return redactURL(u) } } - quotedDSN := regexp.MustCompile(`password='[^']*'`) - connString = quotedDSN.ReplaceAllLiteralString(connString, "password=xxxxx") - plainDSN := regexp.MustCompile(`password=[^ ]*`) - connString = plainDSN.ReplaceAllLiteralString(connString, "password=xxxxx") + quotedKV := regexp.MustCompile(`password='[^']*'`) + connString = quotedKV.ReplaceAllLiteralString(connString, "password=xxxxx") + plainKV := regexp.MustCompile(`password=[^ ]*`) + connString = plainKV.ReplaceAllLiteralString(connString, "password=xxxxx") brokenURL := regexp.MustCompile(`:[^:@]+?@`) connString = brokenURL.ReplaceAllLiteralString(connString, ":xxxxxx@") return connString diff --git a/vendor/github.com/jackc/pgx/v5/pgconn/internal/bgreader/bgreader.go b/vendor/github.com/jackc/pgx/v5/pgconn/internal/bgreader/bgreader.go new file mode 100644 index 0000000..e65c2c2 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgconn/internal/bgreader/bgreader.go @@ -0,0 +1,139 @@ +// Package bgreader provides a io.Reader that can optionally buffer reads in the background. +package bgreader + +import ( + "io" + "sync" + + "github.com/jackc/pgx/v5/internal/iobufpool" +) + +const ( + StatusStopped = iota + StatusRunning + StatusStopping +) + +// BGReader is an io.Reader that can optionally buffer reads in the background. It is safe for concurrent use. +type BGReader struct { + r io.Reader + + cond *sync.Cond + status int32 + readResults []readResult +} + +type readResult struct { + buf *[]byte + err error +} + +// Start starts the backgrounder reader. If the background reader is already running this is a no-op. The background +// reader will stop automatically when the underlying reader returns an error. +func (r *BGReader) Start() { + r.cond.L.Lock() + defer r.cond.L.Unlock() + + switch r.status { + case StatusStopped: + r.status = StatusRunning + go r.bgRead() + case StatusRunning: + // no-op + case StatusStopping: + r.status = StatusRunning + } +} + +// Stop tells the background reader to stop after the in progress Read returns. It is safe to call Stop when the +// background reader is not running. +func (r *BGReader) Stop() { + r.cond.L.Lock() + defer r.cond.L.Unlock() + + switch r.status { + case StatusStopped: + // no-op + case StatusRunning: + r.status = StatusStopping + case StatusStopping: + // no-op + } +} + +// Status returns the current status of the background reader. +func (r *BGReader) Status() int32 { + r.cond.L.Lock() + defer r.cond.L.Unlock() + return r.status +} + +func (r *BGReader) bgRead() { + keepReading := true + for keepReading { + buf := iobufpool.Get(8192) + n, err := r.r.Read(*buf) + *buf = (*buf)[:n] + + r.cond.L.Lock() + r.readResults = append(r.readResults, readResult{buf: buf, err: err}) + if r.status == StatusStopping || err != nil { + r.status = StatusStopped + keepReading = false + } + r.cond.L.Unlock() + r.cond.Broadcast() + } +} + +// Read implements the io.Reader interface. +func (r *BGReader) Read(p []byte) (int, error) { + r.cond.L.Lock() + defer r.cond.L.Unlock() + + if len(r.readResults) > 0 { + return r.readFromReadResults(p) + } + + // There are no unread background read results and the background reader is stopped. + if r.status == StatusStopped { + return r.r.Read(p) + } + + // Wait for results from the background reader + for len(r.readResults) == 0 { + r.cond.Wait() + } + return r.readFromReadResults(p) +} + +// readBackgroundResults reads a result previously read by the background reader. r.cond.L must be held. +func (r *BGReader) readFromReadResults(p []byte) (int, error) { + buf := r.readResults[0].buf + var err error + + n := copy(p, *buf) + if n == len(*buf) { + err = r.readResults[0].err + iobufpool.Put(buf) + if len(r.readResults) == 1 { + r.readResults = nil + } else { + r.readResults = r.readResults[1:] + } + } else { + *buf = (*buf)[n:] + r.readResults[0].buf = buf + } + + return n, err +} + +func New(r io.Reader) *BGReader { + return &BGReader{ + r: r, + cond: &sync.Cond{ + L: &sync.Mutex{}, + }, + } +} diff --git a/vendor/github.com/jackc/pgconn/krb5.go b/vendor/github.com/jackc/pgx/v5/pgconn/krb5.go similarity index 94% rename from vendor/github.com/jackc/pgconn/krb5.go rename to vendor/github.com/jackc/pgx/v5/pgconn/krb5.go index 1639b72..3c1af34 100644 --- a/vendor/github.com/jackc/pgconn/krb5.go +++ b/vendor/github.com/jackc/pgx/v5/pgconn/krb5.go @@ -4,7 +4,7 @@ import ( "errors" "fmt" - "github.com/jackc/pgproto3/v2" + "github.com/jackc/pgx/v5/pgproto3" ) // NewGSSFunc creates a GSS authentication provider, for use with @@ -62,11 +62,8 @@ func (c *PgConn) gssAuth() error { gssResponse := &pgproto3.GSSResponse{ Data: nextData, } - buf, err := gssResponse.Encode(nil) - if err != nil { - return err - } - _, err = c.conn.Write(buf) + c.frontend.Send(gssResponse) + err = c.flushWithPotentialWriteReadDeadlock() if err != nil { return err } diff --git a/vendor/github.com/jackc/pgconn/pgconn.go b/vendor/github.com/jackc/pgx/v5/pgconn/pgconn.go similarity index 54% rename from vendor/github.com/jackc/pgconn/pgconn.go rename to vendor/github.com/jackc/pgx/v5/pgconn/pgconn.go index 894baa2..7efb522 100644 --- a/vendor/github.com/jackc/pgconn/pgconn.go +++ b/vendor/github.com/jackc/pgx/v5/pgconn/pgconn.go @@ -16,9 +16,11 @@ import ( "sync" "time" - "github.com/jackc/pgconn/internal/ctxwatch" - "github.com/jackc/pgio" - "github.com/jackc/pgproto3/v2" + "github.com/jackc/pgx/v5/internal/iobufpool" + "github.com/jackc/pgx/v5/internal/pgio" + "github.com/jackc/pgx/v5/pgconn/ctxwatch" + "github.com/jackc/pgx/v5/pgconn/internal/bgreader" + "github.com/jackc/pgx/v5/pgproto3" ) const ( @@ -29,8 +31,6 @@ const ( connStatusBusy ) -const wbufLen = 1024 - // Notice represents a notice response message reported by the PostgreSQL server. Be aware that this is distinct from // LISTEN/NOTIFY notification. type Notice PgError @@ -50,7 +50,13 @@ type DialFunc func(ctx context.Context, network, addr string) (net.Conn, error) type LookupFunc func(ctx context.Context, host string) (addrs []string, err error) // BuildFrontendFunc is a function that can be used to create Frontend implementation for connection. -type BuildFrontendFunc func(r io.Reader, w io.Writer) Frontend +type BuildFrontendFunc func(r io.Reader, w io.Writer) *pgproto3.Frontend + +// PgErrorHandler is a function that handles errors returned from Postgres. This function must return true to keep +// the connection open. Returning false will cause the connection to be closed immediately. You should return +// false on any FATAL-severity errors. This will not receive network errors. The *PgConn is provided so the handler is +// aware of the origin of the error, but it must not invoke any query method. +type PgErrorHandler func(*PgConn, *PgError) bool // NoticeHandler is a function that can handle notices received from the PostgreSQL server. Notices can be received at // any time, usually during handling of a query response. The *PgConn is provided so the handler is aware of the origin @@ -64,19 +70,19 @@ type NoticeHandler func(*PgConn, *Notice) // notice event. type NotificationHandler func(*PgConn, *Notification) -// Frontend used to receive messages from backend. -type Frontend interface { - Receive() (pgproto3.BackendMessage, error) -} - // PgConn is a low-level PostgreSQL connection handle. It is not safe for concurrent usage. type PgConn struct { - conn net.Conn // the underlying TCP or unix domain socket connection + conn net.Conn pid uint32 // backend pid secretKey uint32 // key to use to send a cancel query message to the server parameterStatuses map[string]string // parameters that have been reported by the server txStatus byte - frontend Frontend + frontend *pgproto3.Frontend + bgReader *bgreader.BGReader + slowWriteTimer *time.Timer + bgReaderStarted chan struct{} + + customData map[string]any config *Config @@ -90,16 +96,18 @@ type PgConn struct { peekedMsg pgproto3.BackendMessage // Reusable / preallocated resources - wbuf []byte // write buffer resultReader ResultReader multiResultReader MultiResultReader + pipeline Pipeline contextWatcher *ctxwatch.ContextWatcher + fieldDescriptions [16]FieldDescription cleanupDone chan struct{} } -// Connect establishes a connection to a PostgreSQL server using the environment and connString (in URL or DSN format) -// to provide configuration. See documentation for ParseConfig for details. ctx can be used to cancel a connect attempt. +// Connect establishes a connection to a PostgreSQL server using the environment and connString (in URL or keyword/value +// format) to provide configuration. See documentation for [ParseConfig] for details. ctx can be used to cancel a +// connect attempt. func Connect(ctx context.Context, connString string) (*PgConn, error) { config, err := ParseConfig(connString) if err != nil { @@ -109,9 +117,9 @@ func Connect(ctx context.Context, connString string) (*PgConn, error) { return ConnectConfig(ctx, config) } -// Connect establishes a connection to a PostgreSQL server using the environment and connString (in URL or DSN format) -// and ParseConfigOptions to provide additional configuration. See documentation for ParseConfig for details. ctx can be -// used to cancel a connect attempt. +// Connect establishes a connection to a PostgreSQL server using the environment and connString (in URL or keyword/value +// format) and ParseConfigOptions to provide additional configuration. See documentation for [ParseConfig] for details. +// ctx can be used to cancel a connect attempt. func ConnectWithOptions(ctx context.Context, connString string, parseConfigOptions ParseConfigOptions) (*PgConn, error) { config, err := ParseConfigWithOptions(connString, parseConfigOptions) if err != nil { @@ -122,115 +130,82 @@ func ConnectWithOptions(ctx context.Context, connString string, parseConfigOptio } // Connect establishes a connection to a PostgreSQL server using config. config must have been constructed with -// ParseConfig. ctx can be used to cancel a connect attempt. +// [ParseConfig]. ctx can be used to cancel a connect attempt. // // If config.Fallbacks are present they will sequentially be tried in case of error establishing network connection. An // authentication error will terminate the chain of attempts (like libpq: -// https://www.postgresql.org/docs/11/libpq-connect.html#LIBPQ-MULTIPLE-HOSTS) and be returned as the error. Otherwise, -// if all attempts fail the last error is returned. -func ConnectConfig(octx context.Context, config *Config) (pgConn *PgConn, err error) { +// https://www.postgresql.org/docs/11/libpq-connect.html#LIBPQ-MULTIPLE-HOSTS) and be returned as the error. +func ConnectConfig(ctx context.Context, config *Config) (*PgConn, error) { // Default values are set in ParseConfig. Enforce initial creation by ParseConfig rather than setting defaults from // zero values. if !config.createdByParseConfig { panic("config must be created by ParseConfig") } - // Simplify usage by treating primary config and fallbacks the same. - fallbackConfigs := []*FallbackConfig{ - { - Host: config.Host, - Port: config.Port, - TLSConfig: config.TLSConfig, - }, - } - fallbackConfigs = append(fallbackConfigs, config.Fallbacks...) - ctx := octx - fallbackConfigs, err = expandWithIPs(ctx, config.LookupFunc, fallbackConfigs) - if err != nil { - return nil, &connectError{config: config, msg: "hostname resolving error", err: err} - } - - if len(fallbackConfigs) == 0 { - return nil, &connectError{config: config, msg: "hostname resolving error", err: errors.New("ip addr wasn't found")} - } + var allErrors []error - foundBestServer := false - var fallbackConfig *FallbackConfig - for i, fc := range fallbackConfigs { - // ConnectTimeout restricts the whole connection process. - if config.ConnectTimeout != 0 { - // create new context first time or when previous host was different - if i == 0 || (fallbackConfigs[i].Host != fallbackConfigs[i-1].Host) { - var cancel context.CancelFunc - ctx, cancel = context.WithTimeout(octx, config.ConnectTimeout) - defer cancel() - } - } else { - ctx = octx - } - pgConn, err = connect(ctx, config, fc, false) - if err == nil { - foundBestServer = true - break - } else if pgerr, ok := err.(*PgError); ok { - err = &connectError{config: config, msg: "server error", err: pgerr} - const ERRCODE_INVALID_PASSWORD = "28P01" // wrong password - const ERRCODE_INVALID_AUTHORIZATION_SPECIFICATION = "28000" // wrong password or bad pg_hba.conf settings - const ERRCODE_INVALID_CATALOG_NAME = "3D000" // db does not exist - const ERRCODE_INSUFFICIENT_PRIVILEGE = "42501" // missing connect privilege - if pgerr.Code == ERRCODE_INVALID_PASSWORD || - pgerr.Code == ERRCODE_INVALID_AUTHORIZATION_SPECIFICATION && fc.TLSConfig != nil || - pgerr.Code == ERRCODE_INVALID_CATALOG_NAME || - pgerr.Code == ERRCODE_INSUFFICIENT_PRIVILEGE { - break - } - } else if cerr, ok := err.(*connectError); ok { - if _, ok := cerr.err.(*NotPreferredError); ok { - fallbackConfig = fc - } - } + connectConfigs, errs := buildConnectOneConfigs(ctx, config) + if len(errs) > 0 { + allErrors = append(allErrors, errs...) } - if !foundBestServer && fallbackConfig != nil { - pgConn, err = connect(ctx, config, fallbackConfig, true) - if pgerr, ok := err.(*PgError); ok { - err = &connectError{config: config, msg: "server error", err: pgerr} - } + if len(connectConfigs) == 0 { + return nil, &ConnectError{Config: config, err: fmt.Errorf("hostname resolving error: %w", errors.Join(allErrors...))} } - if err != nil { - return nil, err // no need to wrap in connectError because it will already be wrapped in all cases except PgError + pgConn, errs := connectPreferred(ctx, config, connectConfigs) + if len(errs) > 0 { + allErrors = append(allErrors, errs...) + return nil, &ConnectError{Config: config, err: errors.Join(allErrors...)} } if config.AfterConnect != nil { err := config.AfterConnect(ctx, pgConn) if err != nil { pgConn.conn.Close() - return nil, &connectError{config: config, msg: "AfterConnect error", err: err} + return nil, &ConnectError{Config: config, err: fmt.Errorf("AfterConnect error: %w", err)} } } return pgConn, nil } -func expandWithIPs(ctx context.Context, lookupFn LookupFunc, fallbacks []*FallbackConfig) ([]*FallbackConfig, error) { - var configs []*FallbackConfig +// buildConnectOneConfigs resolves hostnames and builds a list of connectOneConfigs to try connecting to. It returns a +// slice of successfully resolved connectOneConfigs and a slice of errors. It is possible for both slices to contain +// values if some hosts were successfully resolved and others were not. +func buildConnectOneConfigs(ctx context.Context, config *Config) ([]*connectOneConfig, []error) { + // Simplify usage by treating primary config and fallbacks the same. + fallbackConfigs := []*FallbackConfig{ + { + Host: config.Host, + Port: config.Port, + TLSConfig: config.TLSConfig, + }, + } + fallbackConfigs = append(fallbackConfigs, config.Fallbacks...) + + var configs []*connectOneConfig - for _, fb := range fallbacks { + var allErrors []error + + for _, fb := range fallbackConfigs { // skip resolve for unix sockets if isAbsolutePath(fb.Host) { - configs = append(configs, &FallbackConfig{ - Host: fb.Host, - Port: fb.Port, - TLSConfig: fb.TLSConfig, + network, address := NetworkAddress(fb.Host, fb.Port) + configs = append(configs, &connectOneConfig{ + network: network, + address: address, + originalHostname: fb.Host, + tlsConfig: fb.TLSConfig, }) continue } - ips, err := lookupFn(ctx, fb.Host) + ips, err := config.LookupFunc(ctx, fb.Host) if err != nil { - return nil, err + allErrors = append(allErrors, err) + continue } for _, ip := range ips { @@ -238,66 +213,140 @@ func expandWithIPs(ctx context.Context, lookupFn LookupFunc, fallbacks []*Fallba if err == nil { port, err := strconv.ParseUint(splitPort, 10, 16) if err != nil { - return nil, fmt.Errorf("error parsing port (%s) from lookup: %w", splitPort, err) + return nil, []error{fmt.Errorf("error parsing port (%s) from lookup: %w", splitPort, err)} } - configs = append(configs, &FallbackConfig{ - Host: splitIP, - Port: uint16(port), - TLSConfig: fb.TLSConfig, + network, address := NetworkAddress(splitIP, uint16(port)) + configs = append(configs, &connectOneConfig{ + network: network, + address: address, + originalHostname: fb.Host, + tlsConfig: fb.TLSConfig, }) } else { - configs = append(configs, &FallbackConfig{ - Host: ip, - Port: fb.Port, - TLSConfig: fb.TLSConfig, + network, address := NetworkAddress(ip, fb.Port) + configs = append(configs, &connectOneConfig{ + network: network, + address: address, + originalHostname: fb.Host, + tlsConfig: fb.TLSConfig, }) } } } - return configs, nil + return configs, allErrors +} + +// connectPreferred attempts to connect to the preferred host from connectOneConfigs. The connections are attempted in +// order. If a connection is successful it is returned. If no connection is successful then all errors are returned. If +// a connection attempt returns a [NotPreferredError], then that host will be used if no other hosts are successful. +func connectPreferred(ctx context.Context, config *Config, connectOneConfigs []*connectOneConfig) (*PgConn, []error) { + octx := ctx + var allErrors []error + + var fallbackConnectOneConfig *connectOneConfig + for i, c := range connectOneConfigs { + // ConnectTimeout restricts the whole connection process. + if config.ConnectTimeout != 0 { + // create new context first time or when previous host was different + if i == 0 || (connectOneConfigs[i].address != connectOneConfigs[i-1].address) { + var cancel context.CancelFunc + ctx, cancel = context.WithTimeout(octx, config.ConnectTimeout) + defer cancel() + } + } else { + ctx = octx + } + + pgConn, err := connectOne(ctx, config, c, false) + if pgConn != nil { + return pgConn, nil + } + + allErrors = append(allErrors, err) + + var pgErr *PgError + if errors.As(err, &pgErr) { + const ERRCODE_INVALID_PASSWORD = "28P01" // wrong password + const ERRCODE_INVALID_AUTHORIZATION_SPECIFICATION = "28000" // wrong password or bad pg_hba.conf settings + const ERRCODE_INVALID_CATALOG_NAME = "3D000" // db does not exist + const ERRCODE_INSUFFICIENT_PRIVILEGE = "42501" // missing connect privilege + if pgErr.Code == ERRCODE_INVALID_PASSWORD || + pgErr.Code == ERRCODE_INVALID_AUTHORIZATION_SPECIFICATION && c.tlsConfig != nil || + pgErr.Code == ERRCODE_INVALID_CATALOG_NAME || + pgErr.Code == ERRCODE_INSUFFICIENT_PRIVILEGE { + return nil, allErrors + } + } + + var npErr *NotPreferredError + if errors.As(err, &npErr) { + fallbackConnectOneConfig = c + } + } + + if fallbackConnectOneConfig != nil { + pgConn, err := connectOne(ctx, config, fallbackConnectOneConfig, true) + if err == nil { + return pgConn, nil + } + allErrors = append(allErrors, err) + } + + return nil, allErrors } -func connect(ctx context.Context, config *Config, fallbackConfig *FallbackConfig, - ignoreNotPreferredErr bool) (*PgConn, error) { +// connectOne makes one connection attempt to a single host. +func connectOne(ctx context.Context, config *Config, connectConfig *connectOneConfig, + ignoreNotPreferredErr bool, +) (*PgConn, error) { pgConn := new(PgConn) pgConn.config = config - pgConn.wbuf = make([]byte, 0, wbufLen) pgConn.cleanupDone = make(chan struct{}) + pgConn.customData = make(map[string]any) var err error - network, address := NetworkAddress(fallbackConfig.Host, fallbackConfig.Port) - netConn, err := config.DialFunc(ctx, network, address) - if err != nil { - var netErr net.Error - if errors.As(err, &netErr) && netErr.Timeout() { - err = &errTimeout{err: err} - } - return nil, &connectError{config: config, msg: "dial error", err: err} + + newPerDialConnectError := func(msg string, err error) *perDialConnectError { + err = normalizeTimeoutError(ctx, err) + e := &perDialConnectError{address: connectConfig.address, originalHostname: connectConfig.originalHostname, err: fmt.Errorf("%s: %w", msg, err)} + return e } - pgConn.conn = netConn - pgConn.contextWatcher = newContextWatcher(netConn) - pgConn.contextWatcher.Watch(ctx) + pgConn.conn, err = config.DialFunc(ctx, connectConfig.network, connectConfig.address) + if err != nil { + return nil, newPerDialConnectError("dial error", err) + } - if fallbackConfig.TLSConfig != nil { - tlsConn, err := startTLS(netConn, fallbackConfig.TLSConfig) + if connectConfig.tlsConfig != nil { + pgConn.contextWatcher = ctxwatch.NewContextWatcher(&DeadlineContextWatcherHandler{Conn: pgConn.conn}) + pgConn.contextWatcher.Watch(ctx) + tlsConn, err := startTLS(pgConn.conn, connectConfig.tlsConfig) pgConn.contextWatcher.Unwatch() // Always unwatch `netConn` after TLS. if err != nil { - netConn.Close() - return nil, &connectError{config: config, msg: "tls error", err: err} + pgConn.conn.Close() + return nil, newPerDialConnectError("tls error", err) } pgConn.conn = tlsConn - pgConn.contextWatcher = newContextWatcher(tlsConn) - pgConn.contextWatcher.Watch(ctx) } + pgConn.contextWatcher = ctxwatch.NewContextWatcher(config.BuildContextWatcherHandler(pgConn)) + pgConn.contextWatcher.Watch(ctx) defer pgConn.contextWatcher.Unwatch() pgConn.parameterStatuses = make(map[string]string) pgConn.status = connStatusConnecting - pgConn.frontend = config.BuildFrontend(pgConn.conn, pgConn.conn) + pgConn.bgReader = bgreader.New(pgConn.conn) + pgConn.slowWriteTimer = time.AfterFunc(time.Duration(math.MaxInt64), + func() { + pgConn.bgReader.Start() + pgConn.bgReaderStarted <- struct{}{} + }, + ) + pgConn.slowWriteTimer.Stop() + pgConn.bgReaderStarted = make(chan struct{}) + pgConn.frontend = config.BuildFrontend(pgConn.bgReader, pgConn.conn) startupMsg := pgproto3.StartupMessage{ ProtocolVersion: pgproto3.ProtocolVersionNumber, @@ -314,13 +363,10 @@ func connect(ctx context.Context, config *Config, fallbackConfig *FallbackConfig startupMsg.Parameters["database"] = config.Database } - buf, err := startupMsg.Encode(pgConn.wbuf) - if err != nil { - return nil, &connectError{config: config, msg: "failed to write startup message", err: err} - } - if _, err := pgConn.conn.Write(buf); err != nil { + pgConn.frontend.Send(&startupMsg) + if err := pgConn.flushWithPotentialWriteReadDeadlock(); err != nil { pgConn.conn.Close() - return nil, &connectError{config: config, msg: "failed to write startup message", err: err} + return nil, newPerDialConnectError("failed to write startup message", err) } for { @@ -328,9 +374,9 @@ func connect(ctx context.Context, config *Config, fallbackConfig *FallbackConfig if err != nil { pgConn.conn.Close() if err, ok := err.(*PgError); ok { - return nil, err + return nil, newPerDialConnectError("server error", err) } - return nil, &connectError{config: config, msg: "failed to receive message", err: preferContextOverNetTimeoutError(ctx, err)} + return nil, newPerDialConnectError("failed to receive message", err) } switch msg := msg.(type) { @@ -343,26 +389,26 @@ func connect(ctx context.Context, config *Config, fallbackConfig *FallbackConfig err = pgConn.txPasswordMessage(pgConn.config.Password) if err != nil { pgConn.conn.Close() - return nil, &connectError{config: config, msg: "failed to write password message", err: err} + return nil, newPerDialConnectError("failed to write password message", err) } case *pgproto3.AuthenticationMD5Password: digestedPassword := "md5" + hexMD5(hexMD5(pgConn.config.Password+pgConn.config.User)+string(msg.Salt[:])) err = pgConn.txPasswordMessage(digestedPassword) if err != nil { pgConn.conn.Close() - return nil, &connectError{config: config, msg: "failed to write password message", err: err} + return nil, newPerDialConnectError("failed to write password message", err) } case *pgproto3.AuthenticationSASL: err = pgConn.scramAuth(msg.AuthMechanisms) if err != nil { pgConn.conn.Close() - return nil, &connectError{config: config, msg: "failed SASL auth", err: err} + return nil, newPerDialConnectError("failed SASL auth", err) } case *pgproto3.AuthenticationGSS: err = pgConn.gssAuth() if err != nil { pgConn.conn.Close() - return nil, &connectError{config: config, msg: "failed GSS auth", err: err} + return nil, newPerDialConnectError("failed GSS auth", err) } case *pgproto3.ReadyForQuery: pgConn.status = connStatusIdle @@ -380,7 +426,7 @@ func connect(ctx context.Context, config *Config, fallbackConfig *FallbackConfig return pgConn, nil } pgConn.conn.Close() - return nil, &connectError{config: config, msg: "ValidateConnect failed", err: err} + return nil, newPerDialConnectError("ValidateConnect failed", err) } } return pgConn, nil @@ -388,21 +434,14 @@ func connect(ctx context.Context, config *Config, fallbackConfig *FallbackConfig // handled by ReceiveMessage case *pgproto3.ErrorResponse: pgConn.conn.Close() - return nil, ErrorResponseToPgError(msg) + return nil, newPerDialConnectError("server error", ErrorResponseToPgError(msg)) default: pgConn.conn.Close() - return nil, &connectError{config: config, msg: "received unexpected message", err: err} + return nil, newPerDialConnectError("received unexpected message", err) } } } -func newContextWatcher(conn net.Conn) *ctxwatch.ContextWatcher { - return ctxwatch.NewContextWatcher( - func() { conn.SetDeadline(time.Date(1, 1, 1, 1, 1, 1, 1, time.UTC)) }, - func() { conn.SetDeadline(time.Time{}) }, - ) -} - func startTLS(conn net.Conn, tlsConfig *tls.Config) (net.Conn, error) { err := binary.Write(conn, binary.BigEndian, []int32{8, 80877103}) if err != nil { @@ -422,13 +461,8 @@ func startTLS(conn net.Conn, tlsConfig *tls.Config) (net.Conn, error) { } func (pgConn *PgConn) txPasswordMessage(password string) (err error) { - msg := &pgproto3.PasswordMessage{Password: password} - buf, err := msg.Encode(pgConn.wbuf) - if err != nil { - return err - } - _, err = pgConn.conn.Write(buf) - return err + pgConn.frontend.Send(&pgproto3.PasswordMessage{Password: password}) + return pgConn.flushWithPotentialWriteReadDeadlock() } func hexMD5(s string) string { @@ -455,36 +489,6 @@ func (pgConn *PgConn) signalMessage() chan struct{} { return ch } -// SendBytes sends buf to the PostgreSQL server. It must only be used when the connection is not busy. e.g. It is as -// error to call SendBytes while reading the result of a query. -// -// This is a very low level method that requires deep understanding of the PostgreSQL wire protocol to use correctly. -// See https://www.postgresql.org/docs/current/protocol.html. -func (pgConn *PgConn) SendBytes(ctx context.Context, buf []byte) error { - if err := pgConn.lock(); err != nil { - return err - } - defer pgConn.unlock() - - if ctx != context.Background() { - select { - case <-ctx.Done(): - return newContextAlreadyDoneError(ctx) - default: - } - pgConn.contextWatcher.Watch(ctx) - defer pgConn.contextWatcher.Unwatch() - } - - n, err := pgConn.conn.Write(buf) - if err != nil { - pgConn.asyncClose() - return &writeError{err: err, safeToRetry: n == 0} - } - - return nil -} - // ReceiveMessage receives one wire protocol message from the PostgreSQL server. It must only be used when the // connection is not busy. e.g. It is an error to call ReceiveMessage while reading the result of a query. The messages // are still handled by the core pgconn message handling system so receiving a NotificationResponse will still trigger @@ -512,8 +516,9 @@ func (pgConn *PgConn) ReceiveMessage(ctx context.Context) (pgproto3.BackendMessa if err != nil { err = &pgconnError{ msg: "receive message failed", - err: preferContextOverNetTimeoutError(ctx, err), - safeToRetry: true} + err: normalizeTimeoutError(ctx, err), + safeToRetry: true, + } } return msg, err } @@ -561,13 +566,6 @@ func (pgConn *PgConn) peekMessage() (pgproto3.BackendMessage, error) { func (pgConn *PgConn) receiveMessage() (pgproto3.BackendMessage, error) { msg, err := pgConn.peekMessage() if err != nil { - // Close on anything other than timeout error - everything else is fatal - var netErr net.Error - isNetErr := errors.As(err, &netErr) - if !(isNetErr && netErr.Timeout()) { - pgConn.asyncClose() - } - return nil, err } pgConn.peekedMsg = nil @@ -578,11 +576,12 @@ func (pgConn *PgConn) receiveMessage() (pgproto3.BackendMessage, error) { case *pgproto3.ParameterStatus: pgConn.parameterStatuses[msg.Name] = msg.Value case *pgproto3.ErrorResponse: - if msg.Severity == "FATAL" { + err := ErrorResponseToPgError(msg) + if pgConn.config.OnPgError != nil && !pgConn.config.OnPgError(pgConn, err) { pgConn.status = connStatusClosed pgConn.conn.Close() // Ignore error as the connection is already broken and there is already an error to return. close(pgConn.cleanupDone) - return nil, ErrorResponseToPgError(msg) + return nil, err } case *pgproto3.NoticeResponse: if pgConn.config.OnNotice != nil { @@ -597,7 +596,8 @@ func (pgConn *PgConn) receiveMessage() (pgproto3.BackendMessage, error) { return msg, nil } -// Conn returns the underlying net.Conn. +// Conn returns the underlying net.Conn. This rarely necessary. If the connection will be directly used for reading or +// writing then SyncConn should usually be called before Conn. func (pgConn *PgConn) Conn() net.Conn { return pgConn.conn } @@ -625,7 +625,12 @@ func (pgConn *PgConn) SecretKey() uint32 { return pgConn.secretKey } -// Close closes a connection. It is safe to call Close on a already closed connection. Close attempts a clean close by +// Frontend returns the underlying *pgproto3.Frontend. This rarely necessary. +func (pgConn *PgConn) Frontend() *pgproto3.Frontend { + return pgConn.frontend +} + +// Close closes a connection. It is safe to call Close on an already closed connection. Close attempts a clean close by // sending the exit message to PostgreSQL. However, this could block so ctx is available to limit the time to wait. The // underlying net.Conn.Close() will always be called regardless of any other errors. func (pgConn *PgConn) Close(ctx context.Context) error { @@ -654,7 +659,8 @@ func (pgConn *PgConn) Close(ctx context.Context) error { // ignores errors. // // See https://github.com/jackc/pgx/issues/637 - pgConn.conn.Write([]byte{'X', 0, 0, 0, 4}) + pgConn.frontend.Send(&pgproto3.Terminate{}) + pgConn.flushWithPotentialWriteReadDeadlock() return pgConn.conn.Close() } @@ -680,7 +686,8 @@ func (pgConn *PgConn) asyncClose() { pgConn.conn.SetDeadline(deadline) - pgConn.conn.Write([]byte{'X', 0, 0, 0, 4}) + pgConn.frontend.Send(&pgproto3.Terminate{}) + pgConn.flushWithPotentialWriteReadDeadlock() }() } @@ -738,16 +745,23 @@ func (pgConn *PgConn) ParameterStatus(key string) string { return pgConn.parameterStatuses[key] } -// CommandTag is the result of an Exec function -type CommandTag []byte +// CommandTag is the status text returned by PostgreSQL for a query. +type CommandTag struct { + s string +} + +// NewCommandTag makes a CommandTag from s. +func NewCommandTag(s string) CommandTag { + return CommandTag{s: s} +} // RowsAffected returns the number of rows affected. If the CommandTag was not // for a row affecting command (e.g. "CREATE TABLE") then it returns 0. func (ct CommandTag) RowsAffected() int64 { // Find last non-digit idx := -1 - for i := len(ct) - 1; i >= 0; i-- { - if ct[i] >= '0' && ct[i] <= '9' { + for i := len(ct.s) - 1; i >= 0; i-- { + if ct.s[i] >= '0' && ct.s[i] <= '9' { idx = i } else { break @@ -759,7 +773,7 @@ func (ct CommandTag) RowsAffected() int64 { } var n int64 - for _, b := range ct[idx:] { + for _, b := range ct.s[idx:] { n = n*10 + int64(b-'0') } @@ -767,62 +781,71 @@ func (ct CommandTag) RowsAffected() int64 { } func (ct CommandTag) String() string { - return string(ct) + return ct.s } // Insert is true if the command tag starts with "INSERT". func (ct CommandTag) Insert() bool { - return len(ct) >= 6 && - ct[0] == 'I' && - ct[1] == 'N' && - ct[2] == 'S' && - ct[3] == 'E' && - ct[4] == 'R' && - ct[5] == 'T' + return strings.HasPrefix(ct.s, "INSERT") } // Update is true if the command tag starts with "UPDATE". func (ct CommandTag) Update() bool { - return len(ct) >= 6 && - ct[0] == 'U' && - ct[1] == 'P' && - ct[2] == 'D' && - ct[3] == 'A' && - ct[4] == 'T' && - ct[5] == 'E' + return strings.HasPrefix(ct.s, "UPDATE") } // Delete is true if the command tag starts with "DELETE". func (ct CommandTag) Delete() bool { - return len(ct) >= 6 && - ct[0] == 'D' && - ct[1] == 'E' && - ct[2] == 'L' && - ct[3] == 'E' && - ct[4] == 'T' && - ct[5] == 'E' + return strings.HasPrefix(ct.s, "DELETE") } // Select is true if the command tag starts with "SELECT". func (ct CommandTag) Select() bool { - return len(ct) >= 6 && - ct[0] == 'S' && - ct[1] == 'E' && - ct[2] == 'L' && - ct[3] == 'E' && - ct[4] == 'C' && - ct[5] == 'T' + return strings.HasPrefix(ct.s, "SELECT") +} + +type FieldDescription struct { + Name string + TableOID uint32 + TableAttributeNumber uint16 + DataTypeOID uint32 + DataTypeSize int16 + TypeModifier int32 + Format int16 +} + +func (pgConn *PgConn) convertRowDescription(dst []FieldDescription, rd *pgproto3.RowDescription) []FieldDescription { + if cap(dst) >= len(rd.Fields) { + dst = dst[:len(rd.Fields):len(rd.Fields)] + } else { + dst = make([]FieldDescription, len(rd.Fields)) + } + + for i := range rd.Fields { + dst[i].Name = string(rd.Fields[i].Name) + dst[i].TableOID = rd.Fields[i].TableOID + dst[i].TableAttributeNumber = rd.Fields[i].TableAttributeNumber + dst[i].DataTypeOID = rd.Fields[i].DataTypeOID + dst[i].DataTypeSize = rd.Fields[i].DataTypeSize + dst[i].TypeModifier = rd.Fields[i].TypeModifier + dst[i].Format = rd.Fields[i].Format + } + + return dst } type StatementDescription struct { Name string SQL string ParamOIDs []uint32 - Fields []pgproto3.FieldDescription + Fields []FieldDescription } // Prepare creates a prepared statement. If the name is empty, the anonymous prepared statement will be used. This // allows Prepare to also to describe statements without creating a server-side prepared statement. +// +// Prepare does not send a PREPARE statement to the server. It uses the PostgreSQL Parse and Describe protocol messages +// directly. func (pgConn *PgConn) Prepare(ctx context.Context, name, sql string, paramOIDs []uint32) (*StatementDescription, error) { if err := pgConn.lock(); err != nil { return nil, err @@ -839,25 +862,13 @@ func (pgConn *PgConn) Prepare(ctx context.Context, name, sql string, paramOIDs [ defer pgConn.contextWatcher.Unwatch() } - buf := pgConn.wbuf - var err error - buf, err = (&pgproto3.Parse{Name: name, Query: sql, ParameterOIDs: paramOIDs}).Encode(buf) - if err != nil { - return nil, err - } - buf, err = (&pgproto3.Describe{ObjectType: 'S', Name: name}).Encode(buf) - if err != nil { - return nil, err - } - buf, err = (&pgproto3.Sync{}).Encode(buf) - if err != nil { - return nil, err - } - - n, err := pgConn.conn.Write(buf) + pgConn.frontend.SendParse(&pgproto3.Parse{Name: name, Query: sql, ParameterOIDs: paramOIDs}) + pgConn.frontend.SendDescribe(&pgproto3.Describe{ObjectType: 'S', Name: name}) + pgConn.frontend.SendSync(&pgproto3.Sync{}) + err := pgConn.flushWithPotentialWriteReadDeadlock() if err != nil { pgConn.asyncClose() - return nil, &writeError{err: err, safeToRetry: n == 0} + return nil, err } psd := &StatementDescription{Name: name, SQL: sql} @@ -869,7 +880,7 @@ readloop: msg, err := pgConn.receiveMessage() if err != nil { pgConn.asyncClose() - return nil, preferContextOverNetTimeoutError(ctx, err) + return nil, normalizeTimeoutError(ctx, err) } switch msg := msg.(type) { @@ -877,8 +888,7 @@ readloop: psd.ParamOIDs = make([]uint32, len(msg.ParameterOIDs)) copy(psd.ParamOIDs, msg.ParameterOIDs) case *pgproto3.RowDescription: - psd.Fields = make([]pgproto3.FieldDescription, len(msg.Fields)) - copy(psd.Fields, msg.Fields) + psd.Fields = pgConn.convertRowDescription(nil, msg) case *pgproto3.ErrorResponse: parseErr = ErrorResponseToPgError(msg) case *pgproto3.ReadyForQuery: @@ -892,26 +902,73 @@ readloop: return psd, nil } +// Deallocate deallocates a prepared statement. +// +// Deallocate does not send a DEALLOCATE statement to the server. It uses the PostgreSQL Close protocol message +// directly. This has slightly different behavior than executing DEALLOCATE statement. +// - Deallocate can succeed in an aborted transaction. +// - Deallocating a non-existent prepared statement is not an error. +func (pgConn *PgConn) Deallocate(ctx context.Context, name string) error { + if err := pgConn.lock(); err != nil { + return err + } + defer pgConn.unlock() + + if ctx != context.Background() { + select { + case <-ctx.Done(): + return newContextAlreadyDoneError(ctx) + default: + } + pgConn.contextWatcher.Watch(ctx) + defer pgConn.contextWatcher.Unwatch() + } + + pgConn.frontend.SendClose(&pgproto3.Close{ObjectType: 'S', Name: name}) + pgConn.frontend.SendSync(&pgproto3.Sync{}) + err := pgConn.flushWithPotentialWriteReadDeadlock() + if err != nil { + pgConn.asyncClose() + return err + } + + for { + msg, err := pgConn.receiveMessage() + if err != nil { + pgConn.asyncClose() + return normalizeTimeoutError(ctx, err) + } + + switch msg := msg.(type) { + case *pgproto3.ErrorResponse: + return ErrorResponseToPgError(msg) + case *pgproto3.ReadyForQuery: + return nil + } + } +} + // ErrorResponseToPgError converts a wire protocol error message to a *PgError. func ErrorResponseToPgError(msg *pgproto3.ErrorResponse) *PgError { return &PgError{ - Severity: msg.Severity, - Code: string(msg.Code), - Message: string(msg.Message), - Detail: string(msg.Detail), - Hint: msg.Hint, - Position: msg.Position, - InternalPosition: msg.InternalPosition, - InternalQuery: string(msg.InternalQuery), - Where: string(msg.Where), - SchemaName: string(msg.SchemaName), - TableName: string(msg.TableName), - ColumnName: string(msg.ColumnName), - DataTypeName: string(msg.DataTypeName), - ConstraintName: msg.ConstraintName, - File: string(msg.File), - Line: msg.Line, - Routine: string(msg.Routine), + Severity: msg.Severity, + SeverityUnlocalized: msg.SeverityUnlocalized, + Code: string(msg.Code), + Message: string(msg.Message), + Detail: string(msg.Detail), + Hint: msg.Hint, + Position: msg.Position, + InternalPosition: msg.InternalPosition, + InternalQuery: string(msg.InternalQuery), + Where: string(msg.Where), + SchemaName: string(msg.SchemaName), + TableName: string(msg.TableName), + ColumnName: string(msg.ColumnName), + DataTypeName: string(msg.DataTypeName), + ConstraintName: msg.ConstraintName, + File: string(msg.File), + Line: msg.Line, + Routine: string(msg.Routine), } } @@ -928,17 +985,33 @@ func (pgConn *PgConn) CancelRequest(ctx context.Context) error { // the connection config. This is important in high availability configurations where fallback connections may be // specified or DNS may be used to load balance. serverAddr := pgConn.conn.RemoteAddr() - cancelConn, err := pgConn.config.DialFunc(ctx, serverAddr.Network(), serverAddr.String()) + var serverNetwork string + var serverAddress string + if serverAddr.Network() == "unix" { + // for unix sockets, RemoteAddr() calls getpeername() which returns the name the + // server passed to bind(). For Postgres, this is always a relative path "./.s.PGSQL.5432" + // so connecting to it will fail. Fall back to the config's value + serverNetwork, serverAddress = NetworkAddress(pgConn.config.Host, pgConn.config.Port) + } else { + serverNetwork, serverAddress = serverAddr.Network(), serverAddr.String() + } + cancelConn, err := pgConn.config.DialFunc(ctx, serverNetwork, serverAddress) if err != nil { - return err + // In case of unix sockets, RemoteAddr() returns only the file part of the path. If the + // first connect failed, try the config. + if serverAddr.Network() != "unix" { + return err + } + serverNetwork, serverAddr := NetworkAddress(pgConn.config.Host, pgConn.config.Port) + cancelConn, err = pgConn.config.DialFunc(ctx, serverNetwork, serverAddr) + if err != nil { + return err + } } defer cancelConn.Close() if ctx != context.Background() { - contextWatcher := ctxwatch.NewContextWatcher( - func() { cancelConn.SetDeadline(time.Date(1, 1, 1, 1, 1, 1, 1, time.UTC)) }, - func() { cancelConn.SetDeadline(time.Time{}) }, - ) + contextWatcher := ctxwatch.NewContextWatcher(&DeadlineContextWatcherHandler{Conn: cancelConn}) contextWatcher.Watch(ctx) defer contextWatcher.Unwatch() } @@ -946,22 +1019,21 @@ func (pgConn *PgConn) CancelRequest(ctx context.Context) error { buf := make([]byte, 16) binary.BigEndian.PutUint32(buf[0:4], 16) binary.BigEndian.PutUint32(buf[4:8], 80877102) - binary.BigEndian.PutUint32(buf[8:12], uint32(pgConn.pid)) - binary.BigEndian.PutUint32(buf[12:16], uint32(pgConn.secretKey)) - _, err = cancelConn.Write(buf) - if err != nil { - return err - } + binary.BigEndian.PutUint32(buf[8:12], pgConn.pid) + binary.BigEndian.PutUint32(buf[12:16], pgConn.secretKey) - _, err = cancelConn.Read(buf) - if err != io.EOF { - return err + if _, err := cancelConn.Write(buf); err != nil { + return fmt.Errorf("write to connection for cancellation: %w", err) } + // Wait for the cancel request to be acknowledged by the server. + // It copies the behavior of the libpq: https://github.com/postgres/postgres/blob/REL_16_0/src/interfaces/libpq/fe-connect.c#L4946-L4960 + _, _ = cancelConn.Read(buf) + return nil } -// WaitForNotification waits for a LISTON/NOTIFY message to be received. It returns an error if a notification was not +// WaitForNotification waits for a LISTEN/NOTIFY message to be received. It returns an error if a notification was not // received. func (pgConn *PgConn) WaitForNotification(ctx context.Context) error { if err := pgConn.lock(); err != nil { @@ -983,7 +1055,7 @@ func (pgConn *PgConn) WaitForNotification(ctx context.Context) error { for { msg, err := pgConn.receiveMessage() if err != nil { - return preferContextOverNetTimeoutError(ctx, err) + return normalizeTimeoutError(ctx, err) } switch msg.(type) { @@ -1023,22 +1095,13 @@ func (pgConn *PgConn) Exec(ctx context.Context, sql string) *MultiResultReader { pgConn.contextWatcher.Watch(ctx) } - buf := pgConn.wbuf - var err error - buf, err = (&pgproto3.Query{String: sql}).Encode(buf) - if err != nil { - return &MultiResultReader{ - closed: true, - err: err, - } - } - - n, err := pgConn.conn.Write(buf) + pgConn.frontend.SendQuery(&pgproto3.Query{String: sql}) + err := pgConn.flushWithPotentialWriteReadDeadlock() if err != nil { pgConn.asyncClose() pgConn.contextWatcher.Unwatch() multiResult.closed = true - multiResult.err = &writeError{err: err, safeToRetry: n == 0} + multiResult.err = err pgConn.unlock() return multiResult } @@ -1046,39 +1109,6 @@ func (pgConn *PgConn) Exec(ctx context.Context, sql string) *MultiResultReader { return multiResult } -// ReceiveResults reads the result that might be returned by Postgres after a SendBytes -// (e.a. after sending a CopyDone in a copy-both situation). -// -// This is a very low level method that requires deep understanding of the PostgreSQL wire protocol to use correctly. -// See https://www.postgresql.org/docs/current/protocol.html. -func (pgConn *PgConn) ReceiveResults(ctx context.Context) *MultiResultReader { - if err := pgConn.lock(); err != nil { - return &MultiResultReader{ - closed: true, - err: err, - } - } - - pgConn.multiResultReader = MultiResultReader{ - pgConn: pgConn, - ctx: ctx, - } - multiResult := &pgConn.multiResultReader - if ctx != context.Background() { - select { - case <-ctx.Done(): - multiResult.closed = true - multiResult.err = newContextAlreadyDoneError(ctx) - pgConn.unlock() - return multiResult - default: - } - pgConn.contextWatcher.Watch(ctx) - } - - return multiResult -} - // ExecParams executes a command via the PostgreSQL extended query protocol. // // sql is a SQL command string. It may only contain one query. Parameter substitution is positional using $1, $2, $3, @@ -1104,27 +1134,10 @@ func (pgConn *PgConn) ExecParams(ctx context.Context, sql string, paramValues [] return result } - buf := pgConn.wbuf - var err error - buf, err = (&pgproto3.Parse{Query: sql, ParameterOIDs: paramOIDs}).Encode(buf) - if err != nil { - result.concludeCommand(nil, err) - pgConn.contextWatcher.Unwatch() - result.closed = true - pgConn.unlock() - return result - } - - buf, err = (&pgproto3.Bind{ParameterFormatCodes: paramFormats, Parameters: paramValues, ResultFormatCodes: resultFormats}).Encode(buf) - if err != nil { - result.concludeCommand(nil, err) - pgConn.contextWatcher.Unwatch() - result.closed = true - pgConn.unlock() - return result - } + pgConn.frontend.SendParse(&pgproto3.Parse{Query: sql, ParameterOIDs: paramOIDs}) + pgConn.frontend.SendBind(&pgproto3.Bind{ParameterFormatCodes: paramFormats, Parameters: paramValues, ResultFormatCodes: resultFormats}) - pgConn.execExtendedSuffix(buf, result) + pgConn.execExtendedSuffix(result) return result } @@ -1147,18 +1160,9 @@ func (pgConn *PgConn) ExecPrepared(ctx context.Context, stmtName string, paramVa return result } - buf := pgConn.wbuf - var err error - buf, err = (&pgproto3.Bind{PreparedStatement: stmtName, ParameterFormatCodes: paramFormats, Parameters: paramValues, ResultFormatCodes: resultFormats}).Encode(buf) - if err != nil { - result.concludeCommand(nil, err) - pgConn.contextWatcher.Unwatch() - result.closed = true - pgConn.unlock() - return result - } + pgConn.frontend.SendBind(&pgproto3.Bind{PreparedStatement: stmtName, ParameterFormatCodes: paramFormats, Parameters: paramValues, ResultFormatCodes: resultFormats}) - pgConn.execExtendedSuffix(buf, result) + pgConn.execExtendedSuffix(result) return result } @@ -1171,13 +1175,13 @@ func (pgConn *PgConn) execExtendedPrefix(ctx context.Context, paramValues [][]by result := &pgConn.resultReader if err := pgConn.lock(); err != nil { - result.concludeCommand(nil, err) + result.concludeCommand(CommandTag{}, err) result.closed = true return result } if len(paramValues) > math.MaxUint16 { - result.concludeCommand(nil, fmt.Errorf("extended protocol limited to %v parameters", math.MaxUint16)) + result.concludeCommand(CommandTag{}, fmt.Errorf("extended protocol limited to %v parameters", math.MaxUint16)) result.closed = true pgConn.unlock() return result @@ -1186,7 +1190,7 @@ func (pgConn *PgConn) execExtendedPrefix(ctx context.Context, paramValues [][]by if ctx != context.Background() { select { case <-ctx.Done(): - result.concludeCommand(nil, newContextAlreadyDoneError(ctx)) + result.concludeCommand(CommandTag{}, newContextAlreadyDoneError(ctx)) result.closed = true pgConn.unlock() return result @@ -1198,37 +1202,15 @@ func (pgConn *PgConn) execExtendedPrefix(ctx context.Context, paramValues [][]by return result } -func (pgConn *PgConn) execExtendedSuffix(buf []byte, result *ResultReader) { - var err error - buf, err = (&pgproto3.Describe{ObjectType: 'P'}).Encode(buf) - if err != nil { - result.concludeCommand(nil, err) - pgConn.contextWatcher.Unwatch() - result.closed = true - pgConn.unlock() - return - } - buf, err = (&pgproto3.Execute{}).Encode(buf) - if err != nil { - result.concludeCommand(nil, err) - pgConn.contextWatcher.Unwatch() - result.closed = true - pgConn.unlock() - return - } - buf, err = (&pgproto3.Sync{}).Encode(buf) - if err != nil { - result.concludeCommand(nil, err) - pgConn.contextWatcher.Unwatch() - result.closed = true - pgConn.unlock() - return - } +func (pgConn *PgConn) execExtendedSuffix(result *ResultReader) { + pgConn.frontend.SendDescribe(&pgproto3.Describe{ObjectType: 'P'}) + pgConn.frontend.SendExecute(&pgproto3.Execute{}) + pgConn.frontend.SendSync(&pgproto3.Sync{}) - n, err := pgConn.conn.Write(buf) + err := pgConn.flushWithPotentialWriteReadDeadlock() if err != nil { pgConn.asyncClose() - result.concludeCommand(nil, &writeError{err: err, safeToRetry: n == 0}) + result.concludeCommand(CommandTag{}, err) pgConn.contextWatcher.Unwatch() result.closed = true pgConn.unlock() @@ -1241,14 +1223,14 @@ func (pgConn *PgConn) execExtendedSuffix(buf []byte, result *ResultReader) { // CopyTo executes the copy command sql and copies the results to w. func (pgConn *PgConn) CopyTo(ctx context.Context, w io.Writer, sql string) (CommandTag, error) { if err := pgConn.lock(); err != nil { - return nil, err + return CommandTag{}, err } if ctx != context.Background() { select { case <-ctx.Done(): pgConn.unlock() - return nil, newContextAlreadyDoneError(ctx) + return CommandTag{}, newContextAlreadyDoneError(ctx) default: } pgConn.contextWatcher.Watch(ctx) @@ -1256,19 +1238,13 @@ func (pgConn *PgConn) CopyTo(ctx context.Context, w io.Writer, sql string) (Comm } // Send copy to command - buf := pgConn.wbuf - var err error - buf, err = (&pgproto3.Query{String: sql}).Encode(buf) - if err != nil { - pgConn.unlock() - return nil, err - } + pgConn.frontend.SendQuery(&pgproto3.Query{String: sql}) - n, err := pgConn.conn.Write(buf) + err := pgConn.flushWithPotentialWriteReadDeadlock() if err != nil { pgConn.asyncClose() pgConn.unlock() - return nil, &writeError{err: err, safeToRetry: n == 0} + return CommandTag{}, err } // Read results @@ -1278,7 +1254,7 @@ func (pgConn *PgConn) CopyTo(ctx context.Context, w io.Writer, sql string) (Comm msg, err := pgConn.receiveMessage() if err != nil { pgConn.asyncClose() - return nil, preferContextOverNetTimeoutError(ctx, err) + return CommandTag{}, normalizeTimeoutError(ctx, err) } switch msg := msg.(type) { @@ -1287,13 +1263,13 @@ func (pgConn *PgConn) CopyTo(ctx context.Context, w io.Writer, sql string) (Comm _, err := w.Write(msg.Data) if err != nil { pgConn.asyncClose() - return nil, err + return CommandTag{}, err } case *pgproto3.ReadyForQuery: pgConn.unlock() return commandTag, pgErr case *pgproto3.CommandComplete: - commandTag = CommandTag(msg.CommandTag) + commandTag = pgConn.makeCommandTag(msg.CommandTag) case *pgproto3.ErrorResponse: pgErr = ErrorResponseToPgError(msg) } @@ -1306,33 +1282,26 @@ func (pgConn *PgConn) CopyTo(ctx context.Context, w io.Writer, sql string) (Comm // could still block. func (pgConn *PgConn) CopyFrom(ctx context.Context, r io.Reader, sql string) (CommandTag, error) { if err := pgConn.lock(); err != nil { - return nil, err + return CommandTag{}, err } defer pgConn.unlock() if ctx != context.Background() { select { case <-ctx.Done(): - return nil, newContextAlreadyDoneError(ctx) + return CommandTag{}, newContextAlreadyDoneError(ctx) default: } pgConn.contextWatcher.Watch(ctx) defer pgConn.contextWatcher.Unwatch() } - // Send copy to command - buf := pgConn.wbuf - var err error - buf, err = (&pgproto3.Query{String: sql}).Encode(buf) - if err != nil { - pgConn.unlock() - return nil, err - } - - n, err := pgConn.conn.Write(buf) + // Send copy from query + pgConn.frontend.SendQuery(&pgproto3.Query{String: sql}) + err := pgConn.flushWithPotentialWriteReadDeadlock() if err != nil { pgConn.asyncClose() - return nil, &writeError{err: err, safeToRetry: n == 0} + return CommandTag{}, err } // Send copy data @@ -1344,19 +1313,20 @@ func (pgConn *PgConn) CopyFrom(ctx context.Context, r io.Reader, sql string) (Co go func() { defer wg.Done() - buf := make([]byte, 0, 65536) - buf = append(buf, 'd') - sp := len(buf) + buf := iobufpool.Get(65536) + defer iobufpool.Put(buf) + (*buf)[0] = 'd' for { - n, readErr := r.Read(buf[5:cap(buf)]) + n, readErr := r.Read((*buf)[5:cap(*buf)]) if n > 0 { - buf = buf[0 : n+5] - pgio.SetInt32(buf[sp:], int32(n+4)) + *buf = (*buf)[0 : n+5] + pgio.SetInt32((*buf)[1:], int32(n+4)) - _, writeErr := pgConn.conn.Write(buf) + writeErr := pgConn.frontend.SendUnbufferedEncodedCopyData(*buf) if writeErr != nil { - // Write errors are always fatal, but we can't use asyncClose because we are in a different goroutine. + // Write errors are always fatal, but we can't use asyncClose because we are in a different goroutine. Not + // setting pgConn.status or closing pgConn.cleanupDone for the same reason. pgConn.conn.Close() copyErrChan <- writeErr @@ -1382,11 +1352,16 @@ func (pgConn *PgConn) CopyFrom(ctx context.Context, r io.Reader, sql string) (Co select { case copyErr = <-copyErrChan: case <-signalMessageChan: - msg, err := pgConn.receiveMessage() - if err != nil { - pgConn.asyncClose() - return nil, preferContextOverNetTimeoutError(ctx, err) + // If pgConn.receiveMessage encounters an error it will call pgConn.asyncClose. But that is a race condition with + // the goroutine. So instead check pgConn.bufferingReceiveErr which will have been set by the signalMessage. If an + // error is found then forcibly close the connection without sending the Terminate message. + if err := pgConn.bufferingReceiveErr; err != nil { + pgConn.status = connStatusClosed + pgConn.conn.Close() + close(pgConn.cleanupDone) + return CommandTag{}, normalizeTimeoutError(ctx, err) } + msg, _ := pgConn.receiveMessage() switch msg := msg.(type) { case *pgproto3.ErrorResponse: @@ -1400,28 +1375,15 @@ func (pgConn *PgConn) CopyFrom(ctx context.Context, r io.Reader, sql string) (Co // Make sure io goroutine finishes before writing. wg.Wait() - buf = buf[:0] if copyErr == io.EOF || pgErr != nil { - copyDone := &pgproto3.CopyDone{} - var err error - buf, err = copyDone.Encode(buf) - if err != nil { - pgConn.asyncClose() - return nil, err - } + pgConn.frontend.Send(&pgproto3.CopyDone{}) } else { - copyFail := &pgproto3.CopyFail{Message: copyErr.Error()} - var err error - buf, err = copyFail.Encode(buf) - if err != nil { - pgConn.asyncClose() - return nil, err - } + pgConn.frontend.Send(&pgproto3.CopyFail{Message: copyErr.Error()}) } - _, err = pgConn.conn.Write(buf) + err = pgConn.flushWithPotentialWriteReadDeadlock() if err != nil { pgConn.asyncClose() - return nil, err + return CommandTag{}, err } // Read results @@ -1430,14 +1392,14 @@ func (pgConn *PgConn) CopyFrom(ctx context.Context, r io.Reader, sql string) (Co msg, err := pgConn.receiveMessage() if err != nil { pgConn.asyncClose() - return nil, preferContextOverNetTimeoutError(ctx, err) + return CommandTag{}, normalizeTimeoutError(ctx, err) } switch msg := msg.(type) { case *pgproto3.ReadyForQuery: return commandTag, pgErr case *pgproto3.CommandComplete: - commandTag = CommandTag(msg.CommandTag) + commandTag = pgConn.makeCommandTag(msg.CommandTag) case *pgproto3.ErrorResponse: pgErr = ErrorResponseToPgError(msg) } @@ -1446,8 +1408,9 @@ func (pgConn *PgConn) CopyFrom(ctx context.Context, r io.Reader, sql string) (Co // MultiResultReader is a reader for a command that could return multiple results such as Exec or ExecBatch. type MultiResultReader struct { - pgConn *PgConn - ctx context.Context + pgConn *PgConn + ctx context.Context + pipeline *Pipeline rr *ResultReader @@ -1469,10 +1432,9 @@ func (mrr *MultiResultReader) ReadAll() ([]*Result, error) { func (mrr *MultiResultReader) receiveMessage() (pgproto3.BackendMessage, error) { msg, err := mrr.pgConn.receiveMessage() - if err != nil { mrr.pgConn.contextWatcher.Unwatch() - mrr.err = preferContextOverNetTimeoutError(mrr.ctx, err) + mrr.err = normalizeTimeoutError(mrr.ctx, err) mrr.closed = true mrr.pgConn.asyncClose() return nil, mrr.err @@ -1480,9 +1442,13 @@ func (mrr *MultiResultReader) receiveMessage() (pgproto3.BackendMessage, error) switch msg := msg.(type) { case *pgproto3.ReadyForQuery: - mrr.pgConn.contextWatcher.Unwatch() mrr.closed = true - mrr.pgConn.unlock() + if mrr.pipeline != nil { + mrr.pipeline.expectedReadyForQueryCount-- + } else { + mrr.pgConn.contextWatcher.Unwatch() + mrr.pgConn.unlock() + } case *pgproto3.ErrorResponse: mrr.err = ErrorResponseToPgError(msg) } @@ -1504,13 +1470,14 @@ func (mrr *MultiResultReader) NextResult() bool { pgConn: mrr.pgConn, multiResultReader: mrr, ctx: mrr.ctx, - fieldDescriptions: msg.Fields, + fieldDescriptions: mrr.pgConn.convertRowDescription(mrr.pgConn.fieldDescriptions[:], msg), } + mrr.rr = &mrr.pgConn.resultReader return true case *pgproto3.CommandComplete: mrr.pgConn.resultReader = ResultReader{ - commandTag: CommandTag(msg.CommandTag), + commandTag: mrr.pgConn.makeCommandTag(msg.CommandTag), commandConcluded: true, closed: true, } @@ -1545,9 +1512,10 @@ func (mrr *MultiResultReader) Close() error { type ResultReader struct { pgConn *PgConn multiResultReader *MultiResultReader + pipeline *Pipeline ctx context.Context - fieldDescriptions []pgproto3.FieldDescription + fieldDescriptions []FieldDescription rowValues [][]byte commandTag CommandTag commandConcluded bool @@ -1557,7 +1525,7 @@ type ResultReader struct { // Result is the saved query response that is returned by calling Read on a ResultReader. type Result struct { - FieldDescriptions []pgproto3.FieldDescription + FieldDescriptions []FieldDescription Rows [][][]byte CommandTag CommandTag Err error @@ -1569,12 +1537,18 @@ func (rr *ResultReader) Read() *Result { for rr.NextRow() { if br.FieldDescriptions == nil { - br.FieldDescriptions = make([]pgproto3.FieldDescription, len(rr.FieldDescriptions())) + br.FieldDescriptions = make([]FieldDescription, len(rr.FieldDescriptions())) copy(br.FieldDescriptions, rr.FieldDescriptions()) } - row := make([][]byte, len(rr.Values())) - copy(row, rr.Values()) + values := rr.Values() + row := make([][]byte, len(values)) + for i := range row { + if values[i] != nil { + row[i] = make([]byte, len(values[i])) + copy(row[i], values[i]) + } + } br.Rows = append(br.Rows, row) } @@ -1602,14 +1576,14 @@ func (rr *ResultReader) NextRow() bool { } // FieldDescriptions returns the field descriptions for the current result set. The returned slice is only valid until -// the ResultReader is closed. -func (rr *ResultReader) FieldDescriptions() []pgproto3.FieldDescription { +// the ResultReader is closed. It may return nil (for example, if the query did not return a result set or an error was +// encountered.) +func (rr *ResultReader) FieldDescriptions() []FieldDescription { return rr.fieldDescriptions } // Values returns the current row data. NextRow must have been previously been called. The returned [][]byte is only -// valid until the next NextRow call or the ResultReader is closed. However, the underlying byte data is safe to -// retain a reference to and mutate. +// valid until the next NextRow call or the ResultReader is closed. func (rr *ResultReader) Values() [][]byte { return rr.rowValues } @@ -1625,15 +1599,15 @@ func (rr *ResultReader) Close() (CommandTag, error) { for !rr.commandConcluded { _, err := rr.receiveMessage() if err != nil { - return nil, rr.err + return CommandTag{}, rr.err } } - if rr.multiResultReader == nil { + if rr.multiResultReader == nil && rr.pipeline == nil { for { msg, err := rr.receiveMessage() if err != nil { - return nil, rr.err + return CommandTag{}, rr.err } switch msg := msg.(type) { @@ -1679,8 +1653,8 @@ func (rr *ResultReader) receiveMessage() (msg pgproto3.BackendMessage, err error } if err != nil { - err = preferContextOverNetTimeoutError(rr.ctx, err) - rr.concludeCommand(nil, err) + err = normalizeTimeoutError(rr.ctx, err) + rr.concludeCommand(CommandTag{}, err) rr.pgConn.contextWatcher.Unwatch() rr.closed = true if rr.multiResultReader == nil { @@ -1692,13 +1666,13 @@ func (rr *ResultReader) receiveMessage() (msg pgproto3.BackendMessage, err error switch msg := msg.(type) { case *pgproto3.RowDescription: - rr.fieldDescriptions = msg.Fields + rr.fieldDescriptions = rr.pgConn.convertRowDescription(rr.pgConn.fieldDescriptions[:], msg) case *pgproto3.CommandComplete: - rr.concludeCommand(CommandTag(msg.CommandTag), nil) + rr.concludeCommand(rr.pgConn.makeCommandTag(msg.CommandTag), nil) case *pgproto3.EmptyQueryResponse: - rr.concludeCommand(nil, nil) + rr.concludeCommand(CommandTag{}, nil) case *pgproto3.ErrorResponse: - rr.concludeCommand(nil, ErrorResponseToPgError(msg)) + rr.concludeCommand(CommandTag{}, ErrorResponseToPgError(msg)) } return msg, nil @@ -1762,7 +1736,8 @@ func (batch *Batch) ExecPrepared(stmtName string, paramValues [][]byte, paramFor } // ExecBatch executes all the queries in batch in a single round-trip. Execution is implicitly transactional unless a -// transaction is already in progress or SQL contains transaction control statements. +// transaction is already in progress or SQL contains transaction control statements. This is a simpler way of executing +// multiple queries in a single round trip than using pipeline mode. func (pgConn *PgConn) ExecBatch(ctx context.Context, batch *Batch) *MultiResultReader { if batch.err != nil { return &MultiResultReader{ @@ -1804,18 +1779,15 @@ func (pgConn *PgConn) ExecBatch(ctx context.Context, batch *Batch) *MultiResultR return multiResult } - // A large batch can deadlock without concurrent reading and writing. If the Write fails the underlying net.Conn is - // closed. This is all that can be done without introducing a race condition or adding a concurrent safe communication - // channel to relay the error back. The practical effect of this is that the underlying Write error is not reported. - // The error the code reading the batch results receives will be a closed connection error. - // - // See https://github.com/jackc/pgx/issues/374. - go func() { - _, err := pgConn.conn.Write(batch.buf) - if err != nil { - pgConn.conn.Close() - } - }() + pgConn.enterPotentialWriteReadDeadlock() + defer pgConn.exitPotentialWriteReadDeadlock() + _, err := pgConn.conn.Write(batch.buf) + if err != nil { + multiResult.closed = true + multiResult.err = err + pgConn.unlock() + return multiResult + } return multiResult } @@ -1837,23 +1809,122 @@ func (pgConn *PgConn) EscapeString(s string) (string, error) { return strings.Replace(s, "'", "''", -1), nil } +// CheckConn checks the underlying connection without writing any bytes. This is currently implemented by doing a read +// with a very short deadline. This can be useful because a TCP connection can be broken such that a write will appear +// to succeed even though it will never actually reach the server. Reading immediately before a write will detect this +// condition. If this is done immediately before sending a query it reduces the chances a query will be sent that fails +// without the client knowing whether the server received it or not. +// +// Deprecated: CheckConn is deprecated in favor of Ping. CheckConn cannot detect all types of broken connections where +// the write would still appear to succeed. Prefer Ping unless on a high latency connection. +func (pgConn *PgConn) CheckConn() error { + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Millisecond) + defer cancel() + + _, err := pgConn.ReceiveMessage(ctx) + if err != nil { + if !Timeout(err) { + return err + } + } + + return nil +} + +// Ping pings the server. This can be useful because a TCP connection can be broken such that a write will appear to +// succeed even though it will never actually reach the server. Pinging immediately before sending a query reduces the +// chances a query will be sent that fails without the client knowing whether the server received it or not. +func (pgConn *PgConn) Ping(ctx context.Context) error { + return pgConn.Exec(ctx, "-- ping").Close() +} + +// makeCommandTag makes a CommandTag. It does not retain a reference to buf or buf's underlying memory. +func (pgConn *PgConn) makeCommandTag(buf []byte) CommandTag { + return CommandTag{s: string(buf)} +} + +// enterPotentialWriteReadDeadlock must be called before a write that could deadlock if the server is simultaneously +// blocked writing to us. +func (pgConn *PgConn) enterPotentialWriteReadDeadlock() { + // The time to wait is somewhat arbitrary. A Write should only take as long as the syscall and memcpy to the OS + // outbound network buffer unless the buffer is full (which potentially is a block). It needs to be long enough for + // the normal case, but short enough not to kill performance if a block occurs. + // + // In addition, on Windows the default timer resolution is 15.6ms. So setting the timer to less than that is + // ineffective. + if pgConn.slowWriteTimer.Reset(15 * time.Millisecond) { + panic("BUG: slow write timer already active") + } +} + +// exitPotentialWriteReadDeadlock must be called after a call to enterPotentialWriteReadDeadlock. +func (pgConn *PgConn) exitPotentialWriteReadDeadlock() { + if !pgConn.slowWriteTimer.Stop() { + // The timer starts its function in a separate goroutine. It is necessary to ensure the background reader has + // started before calling Stop. Otherwise, the background reader may not be stopped. That on its own is not a + // serious problem. But what is a serious problem is that the background reader may start at an inopportune time in + // a subsequent query. For example, if a subsequent query was canceled then a deadline may be set on the net.Conn to + // interrupt an in-progress read. After the read is interrupted, but before the deadline is cleared, the background + // reader could start and read a deadline error. Then the next query would receive the an unexpected deadline error. + <-pgConn.bgReaderStarted + pgConn.bgReader.Stop() + } +} + +func (pgConn *PgConn) flushWithPotentialWriteReadDeadlock() error { + pgConn.enterPotentialWriteReadDeadlock() + defer pgConn.exitPotentialWriteReadDeadlock() + err := pgConn.frontend.Flush() + return err +} + +// SyncConn prepares the underlying net.Conn for direct use. PgConn may internally buffer reads or use goroutines for +// background IO. This means that any direct use of the underlying net.Conn may be corrupted if a read is already +// buffered or a read is in progress. SyncConn drains read buffers and stops background IO. In some cases this may +// require sending a ping to the server. ctx can be used to cancel this operation. This should be called before any +// operation that will use the underlying net.Conn directly. e.g. Before Conn() or Hijack(). +// +// This should not be confused with the PostgreSQL protocol Sync message. +func (pgConn *PgConn) SyncConn(ctx context.Context) error { + for i := 0; i < 10; i++ { + if pgConn.bgReader.Status() == bgreader.StatusStopped && pgConn.frontend.ReadBufferLen() == 0 { + return nil + } + + err := pgConn.Ping(ctx) + if err != nil { + return fmt.Errorf("SyncConn: Ping failed while syncing conn: %w", err) + } + } + + // This should never happen. Only way I can imagine this occurring is if the server is constantly sending data such as + // LISTEN/NOTIFY or log notifications such that we never can get an empty buffer. + return errors.New("SyncConn: conn never synchronized") +} + +// CustomData returns a map that can be used to associate custom data with the connection. +func (pgConn *PgConn) CustomData() map[string]any { + return pgConn.customData +} + // HijackedConn is the result of hijacking a connection. // // Due to the necessary exposure of internal implementation details, it is not covered by the semantic versioning // compatibility. type HijackedConn struct { - Conn net.Conn // the underlying TCP or unix domain socket connection + Conn net.Conn PID uint32 // backend pid SecretKey uint32 // key to use to send a cancel query message to the server ParameterStatuses map[string]string // parameters that have been reported by the server TxStatus byte - Frontend Frontend + Frontend *pgproto3.Frontend Config *Config + CustomData map[string]any } -// Hijack extracts the internal connection data. pgConn must be in an idle state. pgConn is unusable after hijacking. -// Hijacking is typically only useful when using pgconn to establish a connection, but taking complete control of the -// raw connection after that (e.g. a load balancer or proxy). +// Hijack extracts the internal connection data. pgConn must be in an idle state. SyncConn should be called immediately +// before Hijack. pgConn is unusable after hijacking. Hijacking is typically only useful when using pgconn to establish +// a connection, but taking complete control of the raw connection after that (e.g. a load balancer or proxy). // // Due to the necessary exposure of internal implementation details, it is not covered by the semantic versioning // compatibility. @@ -1871,12 +1942,15 @@ func (pgConn *PgConn) Hijack() (*HijackedConn, error) { TxStatus: pgConn.txStatus, Frontend: pgConn.frontend, Config: pgConn.config, + CustomData: pgConn.customData, }, nil } // Construct created a PgConn from an already established connection to a PostgreSQL server. This is the inverse of // PgConn.Hijack. The connection must be in an idle state. // +// hc.Frontend is replaced by a new pgproto3.Frontend built by hc.Config.BuildFrontend. +// // Due to the necessary exposure of internal implementation details, it is not covered by the semantic versioning // compatibility. func Construct(hc *HijackedConn) (*PgConn, error) { @@ -1888,14 +1962,385 @@ func Construct(hc *HijackedConn) (*PgConn, error) { txStatus: hc.TxStatus, frontend: hc.Frontend, config: hc.Config, + customData: hc.CustomData, status: connStatusIdle, - wbuf: make([]byte, 0, wbufLen), cleanupDone: make(chan struct{}), } - pgConn.contextWatcher = newContextWatcher(pgConn.conn) + pgConn.contextWatcher = ctxwatch.NewContextWatcher(hc.Config.BuildContextWatcherHandler(pgConn)) + pgConn.bgReader = bgreader.New(pgConn.conn) + pgConn.slowWriteTimer = time.AfterFunc(time.Duration(math.MaxInt64), + func() { + pgConn.bgReader.Start() + pgConn.bgReaderStarted <- struct{}{} + }, + ) + pgConn.slowWriteTimer.Stop() + pgConn.bgReaderStarted = make(chan struct{}) + pgConn.frontend = hc.Config.BuildFrontend(pgConn.bgReader, pgConn.conn) return pgConn, nil } + +// Pipeline represents a connection in pipeline mode. +// +// SendPrepare, SendQueryParams, and SendQueryPrepared queue requests to the server. These requests are not written until +// pipeline is flushed by Flush or Sync. Sync must be called after the last request is queued. Requests between +// synchronization points are implicitly transactional unless explicit transaction control statements have been issued. +// +// The context the pipeline was started with is in effect for the entire life of the Pipeline. +// +// For a deeper understanding of pipeline mode see the PostgreSQL documentation for the extended query protocol +// (https://www.postgresql.org/docs/current/protocol-flow.html#PROTOCOL-FLOW-EXT-QUERY) and the libpq pipeline mode +// (https://www.postgresql.org/docs/current/libpq-pipeline-mode.html). +type Pipeline struct { + conn *PgConn + ctx context.Context + + expectedReadyForQueryCount int + pendingSync bool + + err error + closed bool +} + +// PipelineSync is returned by GetResults when a ReadyForQuery message is received. +type PipelineSync struct{} + +// CloseComplete is returned by GetResults when a CloseComplete message is received. +type CloseComplete struct{} + +// StartPipeline switches the connection to pipeline mode and returns a *Pipeline. In pipeline mode requests can be sent +// to the server without waiting for a response. Close must be called on the returned *Pipeline to return the connection +// to normal mode. While in pipeline mode, no methods that communicate with the server may be called except +// CancelRequest and Close. ctx is in effect for entire life of the *Pipeline. +// +// Prefer ExecBatch when only sending one group of queries at once. +func (pgConn *PgConn) StartPipeline(ctx context.Context) *Pipeline { + if err := pgConn.lock(); err != nil { + return &Pipeline{ + closed: true, + err: err, + } + } + + pgConn.pipeline = Pipeline{ + conn: pgConn, + ctx: ctx, + } + pipeline := &pgConn.pipeline + + if ctx != context.Background() { + select { + case <-ctx.Done(): + pipeline.closed = true + pipeline.err = newContextAlreadyDoneError(ctx) + pgConn.unlock() + return pipeline + default: + } + pgConn.contextWatcher.Watch(ctx) + } + + return pipeline +} + +// SendPrepare is the pipeline version of *PgConn.Prepare. +func (p *Pipeline) SendPrepare(name, sql string, paramOIDs []uint32) { + if p.closed { + return + } + p.pendingSync = true + + p.conn.frontend.SendParse(&pgproto3.Parse{Name: name, Query: sql, ParameterOIDs: paramOIDs}) + p.conn.frontend.SendDescribe(&pgproto3.Describe{ObjectType: 'S', Name: name}) +} + +// SendDeallocate deallocates a prepared statement. +func (p *Pipeline) SendDeallocate(name string) { + if p.closed { + return + } + p.pendingSync = true + + p.conn.frontend.SendClose(&pgproto3.Close{ObjectType: 'S', Name: name}) +} + +// SendQueryParams is the pipeline version of *PgConn.QueryParams. +func (p *Pipeline) SendQueryParams(sql string, paramValues [][]byte, paramOIDs []uint32, paramFormats []int16, resultFormats []int16) { + if p.closed { + return + } + p.pendingSync = true + + p.conn.frontend.SendParse(&pgproto3.Parse{Query: sql, ParameterOIDs: paramOIDs}) + p.conn.frontend.SendBind(&pgproto3.Bind{ParameterFormatCodes: paramFormats, Parameters: paramValues, ResultFormatCodes: resultFormats}) + p.conn.frontend.SendDescribe(&pgproto3.Describe{ObjectType: 'P'}) + p.conn.frontend.SendExecute(&pgproto3.Execute{}) +} + +// SendQueryPrepared is the pipeline version of *PgConn.QueryPrepared. +func (p *Pipeline) SendQueryPrepared(stmtName string, paramValues [][]byte, paramFormats []int16, resultFormats []int16) { + if p.closed { + return + } + p.pendingSync = true + + p.conn.frontend.SendBind(&pgproto3.Bind{PreparedStatement: stmtName, ParameterFormatCodes: paramFormats, Parameters: paramValues, ResultFormatCodes: resultFormats}) + p.conn.frontend.SendDescribe(&pgproto3.Describe{ObjectType: 'P'}) + p.conn.frontend.SendExecute(&pgproto3.Execute{}) +} + +// Flush flushes the queued requests without establishing a synchronization point. +func (p *Pipeline) Flush() error { + if p.closed { + if p.err != nil { + return p.err + } + return errors.New("pipeline closed") + } + + err := p.conn.flushWithPotentialWriteReadDeadlock() + if err != nil { + err = normalizeTimeoutError(p.ctx, err) + + p.conn.asyncClose() + + p.conn.contextWatcher.Unwatch() + p.conn.unlock() + p.closed = true + p.err = err + return err + } + + return nil +} + +// Sync establishes a synchronization point and flushes the queued requests. +func (p *Pipeline) Sync() error { + if p.closed { + if p.err != nil { + return p.err + } + return errors.New("pipeline closed") + } + + p.conn.frontend.SendSync(&pgproto3.Sync{}) + err := p.Flush() + if err != nil { + return err + } + + p.pendingSync = false + p.expectedReadyForQueryCount++ + + return nil +} + +// GetResults gets the next results. If results are present, results may be a *ResultReader, *StatementDescription, or +// *PipelineSync. If an ErrorResponse is received from the server, results will be nil and err will be a *PgError. If no +// results are available, results and err will both be nil. +func (p *Pipeline) GetResults() (results any, err error) { + if p.closed { + if p.err != nil { + return nil, p.err + } + return nil, errors.New("pipeline closed") + } + + if p.expectedReadyForQueryCount == 0 { + return nil, nil + } + + return p.getResults() +} + +func (p *Pipeline) getResults() (results any, err error) { + for { + msg, err := p.conn.receiveMessage() + if err != nil { + p.closed = true + p.err = err + p.conn.asyncClose() + return nil, normalizeTimeoutError(p.ctx, err) + } + + switch msg := msg.(type) { + case *pgproto3.RowDescription: + p.conn.resultReader = ResultReader{ + pgConn: p.conn, + pipeline: p, + ctx: p.ctx, + fieldDescriptions: p.conn.convertRowDescription(p.conn.fieldDescriptions[:], msg), + } + return &p.conn.resultReader, nil + case *pgproto3.CommandComplete: + p.conn.resultReader = ResultReader{ + commandTag: p.conn.makeCommandTag(msg.CommandTag), + commandConcluded: true, + closed: true, + } + return &p.conn.resultReader, nil + case *pgproto3.ParseComplete: + peekedMsg, err := p.conn.peekMessage() + if err != nil { + p.conn.asyncClose() + return nil, normalizeTimeoutError(p.ctx, err) + } + if _, ok := peekedMsg.(*pgproto3.ParameterDescription); ok { + return p.getResultsPrepare() + } + case *pgproto3.CloseComplete: + return &CloseComplete{}, nil + case *pgproto3.ReadyForQuery: + p.expectedReadyForQueryCount-- + return &PipelineSync{}, nil + case *pgproto3.ErrorResponse: + pgErr := ErrorResponseToPgError(msg) + return nil, pgErr + } + + } +} + +func (p *Pipeline) getResultsPrepare() (*StatementDescription, error) { + psd := &StatementDescription{} + + for { + msg, err := p.conn.receiveMessage() + if err != nil { + p.conn.asyncClose() + return nil, normalizeTimeoutError(p.ctx, err) + } + + switch msg := msg.(type) { + case *pgproto3.ParameterDescription: + psd.ParamOIDs = make([]uint32, len(msg.ParameterOIDs)) + copy(psd.ParamOIDs, msg.ParameterOIDs) + case *pgproto3.RowDescription: + psd.Fields = p.conn.convertRowDescription(nil, msg) + return psd, nil + + // NoData is returned instead of RowDescription when there is no expected result. e.g. An INSERT without a RETURNING + // clause. + case *pgproto3.NoData: + return psd, nil + + // These should never happen here. But don't take chances that could lead to a deadlock. + case *pgproto3.ErrorResponse: + pgErr := ErrorResponseToPgError(msg) + return nil, pgErr + case *pgproto3.CommandComplete: + p.conn.asyncClose() + return nil, errors.New("BUG: received CommandComplete while handling Describe") + case *pgproto3.ReadyForQuery: + p.conn.asyncClose() + return nil, errors.New("BUG: received ReadyForQuery while handling Describe") + } + } +} + +// Close closes the pipeline and returns the connection to normal mode. +func (p *Pipeline) Close() error { + if p.closed { + return p.err + } + + p.closed = true + + if p.pendingSync { + p.conn.asyncClose() + p.err = errors.New("pipeline has unsynced requests") + p.conn.contextWatcher.Unwatch() + p.conn.unlock() + + return p.err + } + + for p.expectedReadyForQueryCount > 0 { + _, err := p.getResults() + if err != nil { + p.err = err + var pgErr *PgError + if !errors.As(err, &pgErr) { + p.conn.asyncClose() + break + } + } + } + + p.conn.contextWatcher.Unwatch() + p.conn.unlock() + + return p.err +} + +// DeadlineContextWatcherHandler handles canceled contexts by setting a deadline on a net.Conn. +type DeadlineContextWatcherHandler struct { + Conn net.Conn + + // DeadlineDelay is the delay to set on the deadline set on net.Conn when the context is canceled. + DeadlineDelay time.Duration +} + +func (h *DeadlineContextWatcherHandler) HandleCancel(ctx context.Context) { + h.Conn.SetDeadline(time.Now().Add(h.DeadlineDelay)) +} + +func (h *DeadlineContextWatcherHandler) HandleUnwatchAfterCancel() { + h.Conn.SetDeadline(time.Time{}) +} + +// CancelRequestContextWatcherHandler handles canceled contexts by sending a cancel request to the server. It also sets +// a deadline on a net.Conn as a fallback. +type CancelRequestContextWatcherHandler struct { + Conn *PgConn + + // CancelRequestDelay is the delay before sending the cancel request to the server. + CancelRequestDelay time.Duration + + // DeadlineDelay is the delay to set on the deadline set on net.Conn when the context is canceled. + DeadlineDelay time.Duration + + cancelFinishedChan chan struct{} + handleUnwatchAfterCancelCalled func() +} + +func (h *CancelRequestContextWatcherHandler) HandleCancel(context.Context) { + h.cancelFinishedChan = make(chan struct{}) + var handleUnwatchedAfterCancelCalledCtx context.Context + handleUnwatchedAfterCancelCalledCtx, h.handleUnwatchAfterCancelCalled = context.WithCancel(context.Background()) + + deadline := time.Now().Add(h.DeadlineDelay) + h.Conn.conn.SetDeadline(deadline) + + go func() { + defer close(h.cancelFinishedChan) + + select { + case <-handleUnwatchedAfterCancelCalledCtx.Done(): + return + case <-time.After(h.CancelRequestDelay): + } + + cancelRequestCtx, cancel := context.WithDeadline(handleUnwatchedAfterCancelCalledCtx, deadline) + defer cancel() + h.Conn.CancelRequest(cancelRequestCtx) + + // CancelRequest is inherently racy. Even though the cancel request has been received by the server at this point, + // it hasn't necessarily been delivered to the other connection. If we immediately return and the connection is + // immediately used then it is possible the CancelRequest will actually cancel our next query. The + // TestCancelRequestContextWatcherHandler Stress test can produce this error without the sleep below. The sleep time + // is arbitrary, but should be sufficient to prevent this error case. + time.Sleep(100 * time.Millisecond) + }() +} + +func (h *CancelRequestContextWatcherHandler) HandleUnwatchAfterCancel() { + h.handleUnwatchAfterCancelCalled() + <-h.cancelFinishedChan + + h.Conn.conn.SetDeadline(time.Time{}) +} diff --git a/vendor/github.com/jackc/pgx/v5/pgproto3/README.md b/vendor/github.com/jackc/pgx/v5/pgproto3/README.md new file mode 100644 index 0000000..7a26f1c --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/README.md @@ -0,0 +1,7 @@ +# pgproto3 + +Package pgproto3 is an encoder and decoder of the PostgreSQL wire protocol version 3. + +pgproto3 can be used as a foundation for PostgreSQL drivers, proxies, mock servers, load balancers and more. + +See example/pgfortune for a playful example of a fake PostgreSQL server. diff --git a/vendor/github.com/jackc/pgproto3/v2/authentication_cleartext_password.go b/vendor/github.com/jackc/pgx/v5/pgproto3/authentication_cleartext_password.go similarity index 97% rename from vendor/github.com/jackc/pgproto3/v2/authentication_cleartext_password.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/authentication_cleartext_password.go index 1ec219b..ac2962e 100644 --- a/vendor/github.com/jackc/pgproto3/v2/authentication_cleartext_password.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/authentication_cleartext_password.go @@ -5,7 +5,7 @@ import ( "encoding/json" "errors" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) // AuthenticationCleartextPassword is a message sent from the backend indicating that a clear-text password is required. diff --git a/vendor/github.com/jackc/pgproto3/v2/authentication_gss.go b/vendor/github.com/jackc/pgx/v5/pgproto3/authentication_gss.go similarity index 96% rename from vendor/github.com/jackc/pgproto3/v2/authentication_gss.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/authentication_gss.go index 425be6e..178ef31 100644 --- a/vendor/github.com/jackc/pgproto3/v2/authentication_gss.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/authentication_gss.go @@ -5,7 +5,7 @@ import ( "encoding/json" "errors" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) type AuthenticationGSS struct{} diff --git a/vendor/github.com/jackc/pgproto3/v2/authentication_gss_continue.go b/vendor/github.com/jackc/pgx/v5/pgproto3/authentication_gss_continue.go similarity index 96% rename from vendor/github.com/jackc/pgproto3/v2/authentication_gss_continue.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/authentication_gss_continue.go index 42a70da..2ba3f3b 100644 --- a/vendor/github.com/jackc/pgproto3/v2/authentication_gss_continue.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/authentication_gss_continue.go @@ -5,7 +5,7 @@ import ( "encoding/json" "errors" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) type AuthenticationGSSContinue struct { diff --git a/vendor/github.com/jackc/pgproto3/v2/authentication_md5_password.go b/vendor/github.com/jackc/pgx/v5/pgproto3/authentication_md5_password.go similarity index 97% rename from vendor/github.com/jackc/pgproto3/v2/authentication_md5_password.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/authentication_md5_password.go index 9c0f5ee..854c640 100644 --- a/vendor/github.com/jackc/pgproto3/v2/authentication_md5_password.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/authentication_md5_password.go @@ -5,7 +5,7 @@ import ( "encoding/json" "errors" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) // AuthenticationMD5Password is a message sent from the backend indicating that an MD5 hashed password is required. diff --git a/vendor/github.com/jackc/pgproto3/v2/authentication_ok.go b/vendor/github.com/jackc/pgx/v5/pgproto3/authentication_ok.go similarity index 97% rename from vendor/github.com/jackc/pgproto3/v2/authentication_ok.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/authentication_ok.go index 021f820..ec11d39 100644 --- a/vendor/github.com/jackc/pgproto3/v2/authentication_ok.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/authentication_ok.go @@ -5,7 +5,7 @@ import ( "encoding/json" "errors" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) // AuthenticationOk is a message sent from the backend indicating that authentication was successful. diff --git a/vendor/github.com/jackc/pgproto3/v2/authentication_sasl.go b/vendor/github.com/jackc/pgx/v5/pgproto3/authentication_sasl.go similarity index 86% rename from vendor/github.com/jackc/pgproto3/v2/authentication_sasl.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/authentication_sasl.go index b56461c..e66580f 100644 --- a/vendor/github.com/jackc/pgproto3/v2/authentication_sasl.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/authentication_sasl.go @@ -6,7 +6,7 @@ import ( "encoding/json" "errors" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) // AuthenticationSASL is a message sent from the backend indicating that SASL authentication is required. @@ -36,10 +36,11 @@ func (dst *AuthenticationSASL) Decode(src []byte) error { authMechanisms := src[4:] for len(authMechanisms) > 1 { idx := bytes.IndexByte(authMechanisms, 0) - if idx > 0 { - dst.AuthMechanisms = append(dst.AuthMechanisms, string(authMechanisms[:idx])) - authMechanisms = authMechanisms[idx+1:] + if idx == -1 { + return &invalidMessageFormatErr{messageType: "AuthenticationSASL", details: "unterminated string"} } + dst.AuthMechanisms = append(dst.AuthMechanisms, string(authMechanisms[:idx])) + authMechanisms = authMechanisms[idx+1:] } return nil diff --git a/vendor/github.com/jackc/pgproto3/v2/authentication_sasl_continue.go b/vendor/github.com/jackc/pgx/v5/pgproto3/authentication_sasl_continue.go similarity index 97% rename from vendor/github.com/jackc/pgproto3/v2/authentication_sasl_continue.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/authentication_sasl_continue.go index d405b12..70fba4a 100644 --- a/vendor/github.com/jackc/pgproto3/v2/authentication_sasl_continue.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/authentication_sasl_continue.go @@ -5,7 +5,7 @@ import ( "encoding/json" "errors" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) // AuthenticationSASLContinue is a message sent from the backend containing a SASL challenge. diff --git a/vendor/github.com/jackc/pgproto3/v2/authentication_sasl_final.go b/vendor/github.com/jackc/pgx/v5/pgproto3/authentication_sasl_final.go similarity index 97% rename from vendor/github.com/jackc/pgproto3/v2/authentication_sasl_final.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/authentication_sasl_final.go index c34ac4e..84976c2 100644 --- a/vendor/github.com/jackc/pgproto3/v2/authentication_sasl_final.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/authentication_sasl_final.go @@ -5,7 +5,7 @@ import ( "encoding/json" "errors" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) // AuthenticationSASLFinal is a message sent from the backend indicating a SASL authentication has completed. diff --git a/vendor/github.com/jackc/pgproto3/v2/backend.go b/vendor/github.com/jackc/pgx/v5/pgproto3/backend.go similarity index 66% rename from vendor/github.com/jackc/pgproto3/v2/backend.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/backend.go index 6eabcd8..d146c33 100644 --- a/vendor/github.com/jackc/pgproto3/v2/backend.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/backend.go @@ -1,17 +1,24 @@ package pgproto3 import ( + "bytes" "encoding/binary" - "errors" "fmt" "io" ) // Backend acts as a server for the PostgreSQL wire protocol version 3. type Backend struct { - cr ChunkReader + cr *chunkReader w io.Writer + // tracer is used to trace messages when Send or Receive is called. This means an outbound message is traced + // before it is actually transmitted (i.e. before Flush). + tracer *tracer + + wbuf []byte + encodeError error + // Frontend message flyweights bind Bind cancelRequest CancelRequest @@ -32,6 +39,7 @@ type Backend struct { terminate Terminate bodyLen int + maxBodyLen int // maxBodyLen is the maximum length of a message body in octets. If a message body exceeds this length, Receive will return an error. msgType byte partialMsg bool authType uint32 @@ -43,19 +51,68 @@ const ( ) // NewBackend creates a new Backend. -func NewBackend(cr ChunkReader, w io.Writer) *Backend { +func NewBackend(r io.Reader, w io.Writer) *Backend { + cr := newChunkReader(r, 0) return &Backend{cr: cr, w: w} } -// Send sends a message to the frontend. -func (b *Backend) Send(msg BackendMessage) error { - buf, err := msg.Encode(nil) +// Send sends a message to the frontend (i.e. the client). The message is buffered until Flush is called. Any error +// encountered will be returned from Flush. +func (b *Backend) Send(msg BackendMessage) { + if b.encodeError != nil { + return + } + + prevLen := len(b.wbuf) + newBuf, err := msg.Encode(b.wbuf) + if err != nil { + b.encodeError = err + return + } + b.wbuf = newBuf + + if b.tracer != nil { + b.tracer.traceMessage('B', int32(len(b.wbuf)-prevLen), msg) + } +} + +// Flush writes any pending messages to the frontend (i.e. the client). +func (b *Backend) Flush() error { + if err := b.encodeError; err != nil { + b.encodeError = nil + b.wbuf = b.wbuf[:0] + return &writeError{err: err, safeToRetry: true} + } + + n, err := b.w.Write(b.wbuf) + + const maxLen = 1024 + if len(b.wbuf) > maxLen { + b.wbuf = make([]byte, 0, maxLen) + } else { + b.wbuf = b.wbuf[:0] + } + if err != nil { - return err + return &writeError{err: err, safeToRetry: n == 0} } - _, err = b.w.Write(buf) - return err + return nil +} + +// Trace starts tracing the message traffic to w. It writes in a similar format to that produced by the libpq function +// PQtrace. +func (b *Backend) Trace(w io.Writer, options TracerOptions) { + b.tracer = &tracer{ + w: w, + buf: &bytes.Buffer{}, + TracerOptions: options, + } +} + +// Untrace stops tracing. +func (b *Backend) Untrace() { + b.tracer = nil } // ReceiveStartupMessage receives the initial connection message. This method is used of the normal Receive method @@ -119,10 +176,10 @@ func (b *Backend) Receive() (FrontendMessage, error) { b.msgType = header[0] b.bodyLen = int(binary.BigEndian.Uint32(header[1:])) - 4 - b.partialMsg = true - if b.bodyLen < 0 { - return nil, errors.New("invalid message with negative body length received") + if b.maxBodyLen > 0 && b.bodyLen > b.maxBodyLen { + return nil, &ExceededMaxBodyLenErr{b.maxBodyLen, b.bodyLen} } + b.partialMsg = true } var msg FrontendMessage @@ -160,7 +217,7 @@ func (b *Backend) Receive() (FrontendMessage, error) { case AuthTypeCleartextPassword, AuthTypeMD5Password: fallthrough default: - // to maintain backwards compatability + // to maintain backwards compatibility msg = &PasswordMessage{} } case 'Q': @@ -181,7 +238,15 @@ func (b *Backend) Receive() (FrontendMessage, error) { b.partialMsg = false err = msg.Decode(msgBody) - return msg, err + if err != nil { + return nil, err + } + + if b.tracer != nil { + b.tracer.traceMessage('F', int32(5+len(msgBody)), msg) + } + + return msg, nil } // SetAuthType sets the authentication type in the backend. @@ -216,3 +281,12 @@ func (b *Backend) SetAuthType(authType uint32) error { return nil } + +// SetMaxBodyLen sets the maximum length of a message body in octets. If a message body exceeds this length, Receive will return +// an error. This is useful for protecting against malicious clients that send large messages with the intent of +// causing memory exhaustion. +// The default value is 0. +// If maxBodyLen is 0, then no maximum is enforced. +func (b *Backend) SetMaxBodyLen(maxBodyLen int) { + b.maxBodyLen = maxBodyLen +} diff --git a/vendor/github.com/jackc/pgproto3/v2/backend_key_data.go b/vendor/github.com/jackc/pgx/v5/pgproto3/backend_key_data.go similarity index 97% rename from vendor/github.com/jackc/pgproto3/v2/backend_key_data.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/backend_key_data.go index 0a3d5e5..23f5da6 100644 --- a/vendor/github.com/jackc/pgproto3/v2/backend_key_data.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/backend_key_data.go @@ -4,7 +4,7 @@ import ( "encoding/binary" "encoding/json" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) type BackendKeyData struct { diff --git a/vendor/github.com/jackc/pgproto3/v2/big_endian.go b/vendor/github.com/jackc/pgx/v5/pgproto3/big_endian.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/big_endian.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/big_endian.go diff --git a/vendor/github.com/jackc/pgproto3/v2/bind.go b/vendor/github.com/jackc/pgx/v5/pgproto3/bind.go similarity index 99% rename from vendor/github.com/jackc/pgproto3/v2/bind.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/bind.go index dd5503b..ad6ac48 100644 --- a/vendor/github.com/jackc/pgproto3/v2/bind.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/bind.go @@ -9,7 +9,7 @@ import ( "fmt" "math" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) type Bind struct { diff --git a/vendor/github.com/jackc/pgproto3/v2/bind_complete.go b/vendor/github.com/jackc/pgx/v5/pgproto3/bind_complete.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/bind_complete.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/bind_complete.go diff --git a/vendor/github.com/jackc/pgproto3/v2/cancel_request.go b/vendor/github.com/jackc/pgx/v5/pgproto3/cancel_request.go similarity index 96% rename from vendor/github.com/jackc/pgproto3/v2/cancel_request.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/cancel_request.go index 76acb3f..6b52dd9 100644 --- a/vendor/github.com/jackc/pgproto3/v2/cancel_request.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/cancel_request.go @@ -5,7 +5,7 @@ import ( "encoding/json" "errors" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) const cancelRequestCode = 80877102 diff --git a/vendor/github.com/jackc/pgx/v5/pgproto3/chunkreader.go b/vendor/github.com/jackc/pgx/v5/pgproto3/chunkreader.go new file mode 100644 index 0000000..fc0fa61 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/chunkreader.go @@ -0,0 +1,90 @@ +package pgproto3 + +import ( + "io" + + "github.com/jackc/pgx/v5/internal/iobufpool" +) + +// chunkReader is a io.Reader wrapper that minimizes IO reads and memory allocations. It allocates memory in chunks and +// will read as much as will fit in the current buffer in a single call regardless of how large a read is actually +// requested. The memory returned via Next is only valid until the next call to Next. +// +// This is roughly equivalent to a bufio.Reader that only uses Peek and Discard to never copy bytes. +type chunkReader struct { + r io.Reader + + buf *[]byte + rp, wp int // buf read position and write position + + minBufSize int +} + +// newChunkReader creates and returns a new chunkReader for r with default configuration. If minBufSize is <= 0 it uses +// a default value. +func newChunkReader(r io.Reader, minBufSize int) *chunkReader { + if minBufSize <= 0 { + // By historical reasons Postgres currently has 8KB send buffer inside, + // so here we want to have at least the same size buffer. + // @see https://github.com/postgres/postgres/blob/249d64999615802752940e017ee5166e726bc7cd/src/backend/libpq/pqcomm.c#L134 + // @see https://www.postgresql.org/message-id/0cdc5485-cb3c-5e16-4a46-e3b2f7a41322%40ya.ru + // + // In addition, testing has found no benefit of any larger buffer. + minBufSize = 8192 + } + + return &chunkReader{ + r: r, + minBufSize: minBufSize, + buf: iobufpool.Get(minBufSize), + } +} + +// Next returns buf filled with the next n bytes. buf is only valid until next call of Next. If an error occurs, buf +// will be nil. +func (r *chunkReader) Next(n int) (buf []byte, err error) { + // Reset the buffer if it is empty + if r.rp == r.wp { + if len(*r.buf) != r.minBufSize { + iobufpool.Put(r.buf) + r.buf = iobufpool.Get(r.minBufSize) + } + r.rp = 0 + r.wp = 0 + } + + // n bytes already in buf + if (r.wp - r.rp) >= n { + buf = (*r.buf)[r.rp : r.rp+n : r.rp+n] + r.rp += n + return buf, err + } + + // buf is smaller than requested number of bytes + if len(*r.buf) < n { + bigBuf := iobufpool.Get(n) + r.wp = copy((*bigBuf), (*r.buf)[r.rp:r.wp]) + r.rp = 0 + iobufpool.Put(r.buf) + r.buf = bigBuf + } + + // buf is large enough, but need to shift filled area to start to make enough contiguous space + minReadCount := n - (r.wp - r.rp) + if (len(*r.buf) - r.wp) < minReadCount { + r.wp = copy((*r.buf), (*r.buf)[r.rp:r.wp]) + r.rp = 0 + } + + // Read at least the required number of bytes from the underlying io.Reader + readBytesCount, err := io.ReadAtLeast(r.r, (*r.buf)[r.wp:], minReadCount) + r.wp += readBytesCount + // fmt.Println("read", n) + if err != nil { + return nil, err + } + + buf = (*r.buf)[r.rp : r.rp+n : r.rp+n] + r.rp += n + return buf, nil +} diff --git a/vendor/github.com/jackc/pgproto3/v2/close.go b/vendor/github.com/jackc/pgx/v5/pgproto3/close.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/close.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/close.go diff --git a/vendor/github.com/jackc/pgproto3/v2/close_complete.go b/vendor/github.com/jackc/pgx/v5/pgproto3/close_complete.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/close_complete.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/close_complete.go diff --git a/vendor/github.com/jackc/pgproto3/v2/command_complete.go b/vendor/github.com/jackc/pgx/v5/pgproto3/command_complete.go similarity index 90% rename from vendor/github.com/jackc/pgproto3/v2/command_complete.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/command_complete.go index 9d82206..eba7094 100644 --- a/vendor/github.com/jackc/pgproto3/v2/command_complete.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/command_complete.go @@ -16,8 +16,11 @@ func (*CommandComplete) Backend() {} // type identifier and 4 byte message length. func (dst *CommandComplete) Decode(src []byte) error { idx := bytes.IndexByte(src, 0) + if idx == -1 { + return &invalidMessageFormatErr{messageType: "CommandComplete", details: "unterminated string"} + } if idx != len(src)-1 { - return &invalidMessageFormatErr{messageType: "CommandComplete"} + return &invalidMessageFormatErr{messageType: "CommandComplete", details: "string terminated too early"} } dst.CommandTag = src[:idx] diff --git a/vendor/github.com/jackc/pgproto3/v2/copy_both_response.go b/vendor/github.com/jackc/pgx/v5/pgproto3/copy_both_response.go similarity index 98% rename from vendor/github.com/jackc/pgproto3/v2/copy_both_response.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/copy_both_response.go index 4bf3ef3..99e1afe 100644 --- a/vendor/github.com/jackc/pgproto3/v2/copy_both_response.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/copy_both_response.go @@ -7,7 +7,7 @@ import ( "errors" "math" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) type CopyBothResponse struct { diff --git a/vendor/github.com/jackc/pgproto3/v2/copy_data.go b/vendor/github.com/jackc/pgx/v5/pgproto3/copy_data.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/copy_data.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/copy_data.go diff --git a/vendor/github.com/jackc/pgproto3/v2/copy_done.go b/vendor/github.com/jackc/pgx/v5/pgproto3/copy_done.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/copy_done.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/copy_done.go diff --git a/vendor/github.com/jackc/pgproto3/v2/copy_fail.go b/vendor/github.com/jackc/pgx/v5/pgproto3/copy_fail.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/copy_fail.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/copy_fail.go diff --git a/vendor/github.com/jackc/pgproto3/v2/copy_in_response.go b/vendor/github.com/jackc/pgx/v5/pgproto3/copy_in_response.go similarity index 98% rename from vendor/github.com/jackc/pgproto3/v2/copy_in_response.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/copy_in_response.go index bfc3ee0..06cf99c 100644 --- a/vendor/github.com/jackc/pgproto3/v2/copy_in_response.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/copy_in_response.go @@ -7,7 +7,7 @@ import ( "errors" "math" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) type CopyInResponse struct { diff --git a/vendor/github.com/jackc/pgproto3/v2/copy_out_response.go b/vendor/github.com/jackc/pgx/v5/pgproto3/copy_out_response.go similarity index 98% rename from vendor/github.com/jackc/pgproto3/v2/copy_out_response.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/copy_out_response.go index 265e35f..549e916 100644 --- a/vendor/github.com/jackc/pgproto3/v2/copy_out_response.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/copy_out_response.go @@ -7,7 +7,7 @@ import ( "errors" "math" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) type CopyOutResponse struct { diff --git a/vendor/github.com/jackc/pgproto3/v2/data_row.go b/vendor/github.com/jackc/pgx/v5/pgproto3/data_row.go similarity index 92% rename from vendor/github.com/jackc/pgproto3/v2/data_row.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/data_row.go index d755515..fdfb0f7 100644 --- a/vendor/github.com/jackc/pgproto3/v2/data_row.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/data_row.go @@ -7,7 +7,7 @@ import ( "errors" "math" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) type DataRow struct { @@ -45,19 +45,19 @@ func (dst *DataRow) Decode(src []byte) error { return &invalidMessageFormatErr{messageType: "DataRow"} } - msgSize := int(int32(binary.BigEndian.Uint32(src[rp:]))) + valueLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) rp += 4 // null - if msgSize == -1 { + if valueLen == -1 { dst.Values[i] = nil } else { - if len(src[rp:]) < msgSize { + if len(src[rp:]) < valueLen || valueLen < 0 { return &invalidMessageFormatErr{messageType: "DataRow"} } - dst.Values[i] = src[rp : rp+msgSize : rp+msgSize] - rp += msgSize + dst.Values[i] = src[rp : rp+valueLen : rp+valueLen] + rp += valueLen } } diff --git a/vendor/github.com/jackc/pgproto3/v2/describe.go b/vendor/github.com/jackc/pgx/v5/pgproto3/describe.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/describe.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/describe.go diff --git a/vendor/github.com/jackc/pgx/v5/pgproto3/doc.go b/vendor/github.com/jackc/pgx/v5/pgproto3/doc.go new file mode 100644 index 0000000..0afd18e --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/doc.go @@ -0,0 +1,11 @@ +// Package pgproto3 is an encoder and decoder of the PostgreSQL wire protocol version 3. +// +// The primary interfaces are Frontend and Backend. They correspond to a client and server respectively. Messages are +// sent with Send (or a specialized Send variant). Messages are automatically buffered to minimize small writes. Call +// Flush to ensure a message has actually been sent. +// +// The Trace method of Frontend and Backend can be used to examine the wire-level message traffic. It outputs in a +// similar format to the PQtrace function in libpq. +// +// See https://www.postgresql.org/docs/current/protocol-message-formats.html for meanings of the different messages. +package pgproto3 diff --git a/vendor/github.com/jackc/pgproto3/v2/empty_query_response.go b/vendor/github.com/jackc/pgx/v5/pgproto3/empty_query_response.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/empty_query_response.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/empty_query_response.go diff --git a/vendor/github.com/jackc/pgproto3/v2/error_response.go b/vendor/github.com/jackc/pgx/v5/pgproto3/error_response.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/error_response.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/error_response.go diff --git a/vendor/github.com/jackc/pgproto3/v2/execute.go b/vendor/github.com/jackc/pgx/v5/pgproto3/execute.go similarity index 97% rename from vendor/github.com/jackc/pgproto3/v2/execute.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/execute.go index efb9e1e..31bc714 100644 --- a/vendor/github.com/jackc/pgproto3/v2/execute.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/execute.go @@ -5,7 +5,7 @@ import ( "encoding/binary" "encoding/json" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) type Execute struct { diff --git a/vendor/github.com/jackc/pgproto3/v2/flush.go b/vendor/github.com/jackc/pgx/v5/pgproto3/flush.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/flush.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/flush.go diff --git a/vendor/github.com/jackc/pgx/v5/pgproto3/frontend.go b/vendor/github.com/jackc/pgx/v5/pgproto3/frontend.go new file mode 100644 index 0000000..b41abbe --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/frontend.go @@ -0,0 +1,454 @@ +package pgproto3 + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "io" +) + +// Frontend acts as a client for the PostgreSQL wire protocol version 3. +type Frontend struct { + cr *chunkReader + w io.Writer + + // tracer is used to trace messages when Send or Receive is called. This means an outbound message is traced + // before it is actually transmitted (i.e. before Flush). It is safe to change this variable when the Frontend is + // idle. Setting and unsetting tracer provides equivalent functionality to PQtrace and PQuntrace in libpq. + tracer *tracer + + wbuf []byte + encodeError error + + // Backend message flyweights + authenticationOk AuthenticationOk + authenticationCleartextPassword AuthenticationCleartextPassword + authenticationMD5Password AuthenticationMD5Password + authenticationGSS AuthenticationGSS + authenticationGSSContinue AuthenticationGSSContinue + authenticationSASL AuthenticationSASL + authenticationSASLContinue AuthenticationSASLContinue + authenticationSASLFinal AuthenticationSASLFinal + backendKeyData BackendKeyData + bindComplete BindComplete + closeComplete CloseComplete + commandComplete CommandComplete + copyBothResponse CopyBothResponse + copyData CopyData + copyInResponse CopyInResponse + copyOutResponse CopyOutResponse + copyDone CopyDone + dataRow DataRow + emptyQueryResponse EmptyQueryResponse + errorResponse ErrorResponse + functionCallResponse FunctionCallResponse + noData NoData + noticeResponse NoticeResponse + notificationResponse NotificationResponse + parameterDescription ParameterDescription + parameterStatus ParameterStatus + parseComplete ParseComplete + readyForQuery ReadyForQuery + rowDescription RowDescription + portalSuspended PortalSuspended + + bodyLen int + msgType byte + partialMsg bool + authType uint32 +} + +// NewFrontend creates a new Frontend. +func NewFrontend(r io.Reader, w io.Writer) *Frontend { + cr := newChunkReader(r, 0) + return &Frontend{cr: cr, w: w} +} + +// Send sends a message to the backend (i.e. the server). The message is buffered until Flush is called. Any error +// encountered will be returned from Flush. +// +// Send can work with any FrontendMessage. Some commonly used message types such as Bind have specialized send methods +// such as SendBind. These methods should be preferred when the type of message is known up front (e.g. when building an +// extended query protocol query) as they may be faster due to knowing the type of msg rather than it being hidden +// behind an interface. +func (f *Frontend) Send(msg FrontendMessage) { + if f.encodeError != nil { + return + } + + prevLen := len(f.wbuf) + newBuf, err := msg.Encode(f.wbuf) + if err != nil { + f.encodeError = err + return + } + f.wbuf = newBuf + + if f.tracer != nil { + f.tracer.traceMessage('F', int32(len(f.wbuf)-prevLen), msg) + } +} + +// Flush writes any pending messages to the backend (i.e. the server). +func (f *Frontend) Flush() error { + if err := f.encodeError; err != nil { + f.encodeError = nil + f.wbuf = f.wbuf[:0] + return &writeError{err: err, safeToRetry: true} + } + + if len(f.wbuf) == 0 { + return nil + } + + n, err := f.w.Write(f.wbuf) + + const maxLen = 1024 + if len(f.wbuf) > maxLen { + f.wbuf = make([]byte, 0, maxLen) + } else { + f.wbuf = f.wbuf[:0] + } + + if err != nil { + return &writeError{err: err, safeToRetry: n == 0} + } + + return nil +} + +// Trace starts tracing the message traffic to w. It writes in a similar format to that produced by the libpq function +// PQtrace. +func (f *Frontend) Trace(w io.Writer, options TracerOptions) { + f.tracer = &tracer{ + w: w, + buf: &bytes.Buffer{}, + TracerOptions: options, + } +} + +// Untrace stops tracing. +func (f *Frontend) Untrace() { + f.tracer = nil +} + +// SendBind sends a Bind message to the backend (i.e. the server). The message is buffered until Flush is called. Any +// error encountered will be returned from Flush. +func (f *Frontend) SendBind(msg *Bind) { + if f.encodeError != nil { + return + } + + prevLen := len(f.wbuf) + newBuf, err := msg.Encode(f.wbuf) + if err != nil { + f.encodeError = err + return + } + f.wbuf = newBuf + + if f.tracer != nil { + f.tracer.traceBind('F', int32(len(f.wbuf)-prevLen), msg) + } +} + +// SendParse sends a Parse message to the backend (i.e. the server). The message is buffered until Flush is called. Any +// error encountered will be returned from Flush. +func (f *Frontend) SendParse(msg *Parse) { + if f.encodeError != nil { + return + } + + prevLen := len(f.wbuf) + newBuf, err := msg.Encode(f.wbuf) + if err != nil { + f.encodeError = err + return + } + f.wbuf = newBuf + + if f.tracer != nil { + f.tracer.traceParse('F', int32(len(f.wbuf)-prevLen), msg) + } +} + +// SendClose sends a Close message to the backend (i.e. the server). The message is buffered until Flush is called. Any +// error encountered will be returned from Flush. +func (f *Frontend) SendClose(msg *Close) { + if f.encodeError != nil { + return + } + + prevLen := len(f.wbuf) + newBuf, err := msg.Encode(f.wbuf) + if err != nil { + f.encodeError = err + return + } + f.wbuf = newBuf + + if f.tracer != nil { + f.tracer.traceClose('F', int32(len(f.wbuf)-prevLen), msg) + } +} + +// SendDescribe sends a Describe message to the backend (i.e. the server). The message is buffered until Flush is +// called. Any error encountered will be returned from Flush. +func (f *Frontend) SendDescribe(msg *Describe) { + if f.encodeError != nil { + return + } + + prevLen := len(f.wbuf) + newBuf, err := msg.Encode(f.wbuf) + if err != nil { + f.encodeError = err + return + } + f.wbuf = newBuf + + if f.tracer != nil { + f.tracer.traceDescribe('F', int32(len(f.wbuf)-prevLen), msg) + } +} + +// SendExecute sends an Execute message to the backend (i.e. the server). The message is buffered until Flush is called. +// Any error encountered will be returned from Flush. +func (f *Frontend) SendExecute(msg *Execute) { + if f.encodeError != nil { + return + } + + prevLen := len(f.wbuf) + newBuf, err := msg.Encode(f.wbuf) + if err != nil { + f.encodeError = err + return + } + f.wbuf = newBuf + + if f.tracer != nil { + f.tracer.TraceQueryute('F', int32(len(f.wbuf)-prevLen), msg) + } +} + +// SendSync sends a Sync message to the backend (i.e. the server). The message is buffered until Flush is called. Any +// error encountered will be returned from Flush. +func (f *Frontend) SendSync(msg *Sync) { + if f.encodeError != nil { + return + } + + prevLen := len(f.wbuf) + newBuf, err := msg.Encode(f.wbuf) + if err != nil { + f.encodeError = err + return + } + f.wbuf = newBuf + + if f.tracer != nil { + f.tracer.traceSync('F', int32(len(f.wbuf)-prevLen), msg) + } +} + +// SendQuery sends a Query message to the backend (i.e. the server). The message is buffered until Flush is called. Any +// error encountered will be returned from Flush. +func (f *Frontend) SendQuery(msg *Query) { + if f.encodeError != nil { + return + } + + prevLen := len(f.wbuf) + newBuf, err := msg.Encode(f.wbuf) + if err != nil { + f.encodeError = err + return + } + f.wbuf = newBuf + + if f.tracer != nil { + f.tracer.traceQuery('F', int32(len(f.wbuf)-prevLen), msg) + } +} + +// SendUnbufferedEncodedCopyData immediately sends an encoded CopyData message to the backend (i.e. the server). This method +// is more efficient than sending a CopyData message with Send as the message data is not copied to the internal buffer +// before being written out. The internal buffer is flushed before the message is sent. +func (f *Frontend) SendUnbufferedEncodedCopyData(msg []byte) error { + err := f.Flush() + if err != nil { + return err + } + + n, err := f.w.Write(msg) + if err != nil { + return &writeError{err: err, safeToRetry: n == 0} + } + + if f.tracer != nil { + f.tracer.traceCopyData('F', int32(len(msg)-1), &CopyData{}) + } + + return nil +} + +func translateEOFtoErrUnexpectedEOF(err error) error { + if err == io.EOF { + return io.ErrUnexpectedEOF + } + return err +} + +// Receive receives a message from the backend. The returned message is only valid until the next call to Receive. +func (f *Frontend) Receive() (BackendMessage, error) { + if !f.partialMsg { + header, err := f.cr.Next(5) + if err != nil { + return nil, translateEOFtoErrUnexpectedEOF(err) + } + + f.msgType = header[0] + + msgLength := int(binary.BigEndian.Uint32(header[1:])) + if msgLength < 4 { + return nil, fmt.Errorf("invalid message length: %d", msgLength) + } + + f.bodyLen = msgLength - 4 + f.partialMsg = true + } + + msgBody, err := f.cr.Next(f.bodyLen) + if err != nil { + return nil, translateEOFtoErrUnexpectedEOF(err) + } + + f.partialMsg = false + + var msg BackendMessage + switch f.msgType { + case '1': + msg = &f.parseComplete + case '2': + msg = &f.bindComplete + case '3': + msg = &f.closeComplete + case 'A': + msg = &f.notificationResponse + case 'c': + msg = &f.copyDone + case 'C': + msg = &f.commandComplete + case 'd': + msg = &f.copyData + case 'D': + msg = &f.dataRow + case 'E': + msg = &f.errorResponse + case 'G': + msg = &f.copyInResponse + case 'H': + msg = &f.copyOutResponse + case 'I': + msg = &f.emptyQueryResponse + case 'K': + msg = &f.backendKeyData + case 'n': + msg = &f.noData + case 'N': + msg = &f.noticeResponse + case 'R': + var err error + msg, err = f.findAuthenticationMessageType(msgBody) + if err != nil { + return nil, err + } + case 's': + msg = &f.portalSuspended + case 'S': + msg = &f.parameterStatus + case 't': + msg = &f.parameterDescription + case 'T': + msg = &f.rowDescription + case 'V': + msg = &f.functionCallResponse + case 'W': + msg = &f.copyBothResponse + case 'Z': + msg = &f.readyForQuery + default: + return nil, fmt.Errorf("unknown message type: %c", f.msgType) + } + + err = msg.Decode(msgBody) + if err != nil { + return nil, err + } + + if f.tracer != nil { + f.tracer.traceMessage('B', int32(5+len(msgBody)), msg) + } + + return msg, nil +} + +// Authentication message type constants. +// See src/include/libpq/pqcomm.h for all +// constants. +const ( + AuthTypeOk = 0 + AuthTypeCleartextPassword = 3 + AuthTypeMD5Password = 5 + AuthTypeSCMCreds = 6 + AuthTypeGSS = 7 + AuthTypeGSSCont = 8 + AuthTypeSSPI = 9 + AuthTypeSASL = 10 + AuthTypeSASLContinue = 11 + AuthTypeSASLFinal = 12 +) + +func (f *Frontend) findAuthenticationMessageType(src []byte) (BackendMessage, error) { + if len(src) < 4 { + return nil, errors.New("authentication message too short") + } + f.authType = binary.BigEndian.Uint32(src[:4]) + + switch f.authType { + case AuthTypeOk: + return &f.authenticationOk, nil + case AuthTypeCleartextPassword: + return &f.authenticationCleartextPassword, nil + case AuthTypeMD5Password: + return &f.authenticationMD5Password, nil + case AuthTypeSCMCreds: + return nil, errors.New("AuthTypeSCMCreds is unimplemented") + case AuthTypeGSS: + return &f.authenticationGSS, nil + case AuthTypeGSSCont: + return &f.authenticationGSSContinue, nil + case AuthTypeSSPI: + return nil, errors.New("AuthTypeSSPI is unimplemented") + case AuthTypeSASL: + return &f.authenticationSASL, nil + case AuthTypeSASLContinue: + return &f.authenticationSASLContinue, nil + case AuthTypeSASLFinal: + return &f.authenticationSASLFinal, nil + default: + return nil, fmt.Errorf("unknown authentication type: %d", f.authType) + } +} + +// GetAuthType returns the authType used in the current state of the frontend. +// See SetAuthType for more information. +func (f *Frontend) GetAuthType() uint32 { + return f.authType +} + +func (f *Frontend) ReadBufferLen() int { + return f.cr.wp - f.cr.rp +} diff --git a/vendor/github.com/jackc/pgproto3/v2/function_call.go b/vendor/github.com/jackc/pgx/v5/pgproto3/function_call.go similarity index 98% rename from vendor/github.com/jackc/pgproto3/v2/function_call.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/function_call.go index 5d799c4..7d83579 100644 --- a/vendor/github.com/jackc/pgproto3/v2/function_call.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/function_call.go @@ -5,7 +5,7 @@ import ( "errors" "math" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) type FunctionCall struct { diff --git a/vendor/github.com/jackc/pgproto3/v2/function_call_response.go b/vendor/github.com/jackc/pgx/v5/pgproto3/function_call_response.go similarity index 98% rename from vendor/github.com/jackc/pgproto3/v2/function_call_response.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/function_call_response.go index abc14f0..1f27349 100644 --- a/vendor/github.com/jackc/pgproto3/v2/function_call_response.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/function_call_response.go @@ -5,7 +5,7 @@ import ( "encoding/hex" "encoding/json" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) type FunctionCallResponse struct { diff --git a/vendor/github.com/jackc/pgproto3/v2/gss_enc_request.go b/vendor/github.com/jackc/pgx/v5/pgproto3/gss_enc_request.go similarity index 96% rename from vendor/github.com/jackc/pgproto3/v2/gss_enc_request.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/gss_enc_request.go index f6e4f66..70cb20c 100644 --- a/vendor/github.com/jackc/pgproto3/v2/gss_enc_request.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/gss_enc_request.go @@ -5,7 +5,7 @@ import ( "encoding/json" "errors" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) const gssEncReqNumber = 80877104 diff --git a/vendor/github.com/jackc/pgproto3/v2/gss_response.go b/vendor/github.com/jackc/pgx/v5/pgproto3/gss_response.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/gss_response.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/gss_response.go diff --git a/vendor/github.com/jackc/pgproto3/v2/no_data.go b/vendor/github.com/jackc/pgx/v5/pgproto3/no_data.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/no_data.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/no_data.go diff --git a/vendor/github.com/jackc/pgproto3/v2/notice_response.go b/vendor/github.com/jackc/pgx/v5/pgproto3/notice_response.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/notice_response.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/notice_response.go diff --git a/vendor/github.com/jackc/pgproto3/v2/notification_response.go b/vendor/github.com/jackc/pgx/v5/pgproto3/notification_response.go similarity index 90% rename from vendor/github.com/jackc/pgproto3/v2/notification_response.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/notification_response.go index 5be3edd..243b6bf 100644 --- a/vendor/github.com/jackc/pgproto3/v2/notification_response.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/notification_response.go @@ -5,7 +5,7 @@ import ( "encoding/binary" "encoding/json" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) type NotificationResponse struct { @@ -22,6 +22,10 @@ func (*NotificationResponse) Backend() {} func (dst *NotificationResponse) Decode(src []byte) error { buf := bytes.NewBuffer(src) + if buf.Len() < 4 { + return &invalidMessageFormatErr{messageType: "NotificationResponse", details: "too short"} + } + pid := binary.BigEndian.Uint32(buf.Next(4)) b, err := buf.ReadBytes(0) diff --git a/vendor/github.com/jackc/pgproto3/v2/parameter_description.go b/vendor/github.com/jackc/pgx/v5/pgproto3/parameter_description.go similarity index 97% rename from vendor/github.com/jackc/pgproto3/v2/parameter_description.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/parameter_description.go index fec0fce..1ef27b7 100644 --- a/vendor/github.com/jackc/pgproto3/v2/parameter_description.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/parameter_description.go @@ -7,7 +7,7 @@ import ( "errors" "math" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) type ParameterDescription struct { diff --git a/vendor/github.com/jackc/pgproto3/v2/parameter_status.go b/vendor/github.com/jackc/pgx/v5/pgproto3/parameter_status.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/parameter_status.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/parameter_status.go diff --git a/vendor/github.com/jackc/pgproto3/v2/parse.go b/vendor/github.com/jackc/pgx/v5/pgproto3/parse.go similarity index 98% rename from vendor/github.com/jackc/pgproto3/v2/parse.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/parse.go index 7dd0699..6ba3486 100644 --- a/vendor/github.com/jackc/pgproto3/v2/parse.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/parse.go @@ -7,7 +7,7 @@ import ( "errors" "math" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) type Parse struct { diff --git a/vendor/github.com/jackc/pgproto3/v2/parse_complete.go b/vendor/github.com/jackc/pgx/v5/pgproto3/parse_complete.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/parse_complete.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/parse_complete.go diff --git a/vendor/github.com/jackc/pgproto3/v2/password_message.go b/vendor/github.com/jackc/pgx/v5/pgproto3/password_message.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/password_message.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/password_message.go diff --git a/vendor/github.com/jackc/pgproto3/v2/pgproto3.go b/vendor/github.com/jackc/pgx/v5/pgproto3/pgproto3.go similarity index 73% rename from vendor/github.com/jackc/pgproto3/v2/pgproto3.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/pgproto3.go index aa4167c..128f97f 100644 --- a/vendor/github.com/jackc/pgproto3/v2/pgproto3.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/pgproto3.go @@ -5,7 +5,7 @@ import ( "errors" "fmt" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) // maxMessageBodyLen is the maximum length of a message body in bytes. See PG_LARGE_MESSAGE_LIMIT in the PostgreSQL @@ -23,11 +23,13 @@ type Message interface { Encode(dst []byte) ([]byte, error) } +// FrontendMessage is a message sent by the frontend (i.e. the client). type FrontendMessage interface { Message Frontend() // no-op method to distinguish frontend from backend methods } +// BackendMessage is a message sent by the backend (i.e. the server). type BackendMessage interface { Message Backend() // no-op method to distinguish frontend from backend methods @@ -50,10 +52,37 @@ func (e *invalidMessageLenErr) Error() string { type invalidMessageFormatErr struct { messageType string + details string } func (e *invalidMessageFormatErr) Error() string { - return fmt.Sprintf("%s body is invalid", e.messageType) + return fmt.Sprintf("%s body is invalid %s", e.messageType, e.details) +} + +type writeError struct { + err error + safeToRetry bool +} + +func (e *writeError) Error() string { + return fmt.Sprintf("write failed: %s", e.err.Error()) +} + +func (e *writeError) SafeToRetry() bool { + return e.safeToRetry +} + +func (e *writeError) Unwrap() error { + return e.err +} + +type ExceededMaxBodyLenErr struct { + MaxExpectedBodyLen int + ActualBodyLen int +} + +func (e *ExceededMaxBodyLenErr) Error() string { + return fmt.Sprintf("invalid body length: expected at most %d, but got %d", e.MaxExpectedBodyLen, e.ActualBodyLen) } // getValueFromJSON gets the value from a protocol message representation in JSON. @@ -70,7 +99,7 @@ func getValueFromJSON(v map[string]string) ([]byte, error) { return nil, errors.New("unknown protocol representation") } -// beginMessage begines a new message of type t. It appends the message type and a placeholder for the message length to +// beginMessage begins a new message of type t. It appends the message type and a placeholder for the message length to // dst. It returns the new buffer and the position of the message length placeholder. func beginMessage(dst []byte, t byte) ([]byte, int) { dst = append(dst, t) diff --git a/vendor/github.com/jackc/pgproto3/v2/portal_suspended.go b/vendor/github.com/jackc/pgx/v5/pgproto3/portal_suspended.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/portal_suspended.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/portal_suspended.go diff --git a/vendor/github.com/jackc/pgproto3/v2/query.go b/vendor/github.com/jackc/pgx/v5/pgproto3/query.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/query.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/query.go diff --git a/vendor/github.com/jackc/pgproto3/v2/ready_for_query.go b/vendor/github.com/jackc/pgx/v5/pgproto3/ready_for_query.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/ready_for_query.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/ready_for_query.go diff --git a/vendor/github.com/jackc/pgproto3/v2/row_description.go b/vendor/github.com/jackc/pgx/v5/pgproto3/row_description.go similarity index 99% rename from vendor/github.com/jackc/pgproto3/v2/row_description.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/row_description.go index 3f6b2c6..dc2a4dd 100644 --- a/vendor/github.com/jackc/pgproto3/v2/row_description.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/row_description.go @@ -7,7 +7,7 @@ import ( "errors" "math" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) const ( diff --git a/vendor/github.com/jackc/pgproto3/v2/sasl_initial_response.go b/vendor/github.com/jackc/pgx/v5/pgproto3/sasl_initial_response.go similarity index 91% rename from vendor/github.com/jackc/pgproto3/v2/sasl_initial_response.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/sasl_initial_response.go index 1938f65..9eb1b6a 100644 --- a/vendor/github.com/jackc/pgproto3/v2/sasl_initial_response.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/sasl_initial_response.go @@ -2,10 +2,11 @@ package pgproto3 import ( "bytes" + "encoding/hex" "encoding/json" "errors" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) type SASLInitialResponse struct { @@ -78,6 +79,12 @@ func (dst *SASLInitialResponse) UnmarshalJSON(data []byte) error { return err } dst.AuthMechanism = msg.AuthMechanism - dst.Data = []byte(msg.Data) + if msg.Data != "" { + decoded, err := hex.DecodeString(msg.Data) + if err != nil { + return err + } + dst.Data = decoded + } return nil } diff --git a/vendor/github.com/jackc/pgproto3/v2/sasl_response.go b/vendor/github.com/jackc/pgx/v5/pgproto3/sasl_response.go similarity index 89% rename from vendor/github.com/jackc/pgproto3/v2/sasl_response.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/sasl_response.go index f4a1318..1b604c2 100644 --- a/vendor/github.com/jackc/pgproto3/v2/sasl_response.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/sasl_response.go @@ -1,6 +1,7 @@ package pgproto3 import ( + "encoding/hex" "encoding/json" ) @@ -44,6 +45,12 @@ func (dst *SASLResponse) UnmarshalJSON(data []byte) error { if err := json.Unmarshal(data, &msg); err != nil { return err } - dst.Data = []byte(msg.Data) + if msg.Data != "" { + decoded, err := hex.DecodeString(msg.Data) + if err != nil { + return err + } + dst.Data = decoded + } return nil } diff --git a/vendor/github.com/jackc/pgproto3/v2/ssl_request.go b/vendor/github.com/jackc/pgx/v5/pgproto3/ssl_request.go similarity index 96% rename from vendor/github.com/jackc/pgproto3/v2/ssl_request.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/ssl_request.go index 8feff1a..b0fc284 100644 --- a/vendor/github.com/jackc/pgproto3/v2/ssl_request.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/ssl_request.go @@ -5,7 +5,7 @@ import ( "encoding/json" "errors" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) const sslRequestNumber = 80877103 diff --git a/vendor/github.com/jackc/pgproto3/v2/startup_message.go b/vendor/github.com/jackc/pgx/v5/pgproto3/startup_message.go similarity index 92% rename from vendor/github.com/jackc/pgproto3/v2/startup_message.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/startup_message.go index 255ea22..3af4587 100644 --- a/vendor/github.com/jackc/pgproto3/v2/startup_message.go +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/startup_message.go @@ -7,7 +7,7 @@ import ( "errors" "fmt" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) const ProtocolVersionNumber = 196608 // 3.0 @@ -38,14 +38,14 @@ func (dst *StartupMessage) Decode(src []byte) error { for { idx := bytes.IndexByte(src[rp:], 0) if idx < 0 { - return &invalidMessageFormatErr{messageType: "StartupMesage"} + return &invalidMessageFormatErr{messageType: "StartupMessage"} } key := string(src[rp : rp+idx]) rp += idx + 1 idx = bytes.IndexByte(src[rp:], 0) if idx < 0 { - return &invalidMessageFormatErr{messageType: "StartupMesage"} + return &invalidMessageFormatErr{messageType: "StartupMessage"} } value := string(src[rp : rp+idx]) rp += idx + 1 diff --git a/vendor/github.com/jackc/pgproto3/v2/sync.go b/vendor/github.com/jackc/pgx/v5/pgproto3/sync.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/sync.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/sync.go diff --git a/vendor/github.com/jackc/pgproto3/v2/terminate.go b/vendor/github.com/jackc/pgx/v5/pgproto3/terminate.go similarity index 100% rename from vendor/github.com/jackc/pgproto3/v2/terminate.go rename to vendor/github.com/jackc/pgx/v5/pgproto3/terminate.go diff --git a/vendor/github.com/jackc/pgx/v5/pgproto3/trace.go b/vendor/github.com/jackc/pgx/v5/pgproto3/trace.go new file mode 100644 index 0000000..6cc7d3e --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgproto3/trace.go @@ -0,0 +1,416 @@ +package pgproto3 + +import ( + "bytes" + "fmt" + "io" + "strconv" + "strings" + "sync" + "time" +) + +// tracer traces the messages send to and from a Backend or Frontend. The format it produces roughly mimics the +// format produced by the libpq C function PQtrace. +type tracer struct { + TracerOptions + + mux sync.Mutex + w io.Writer + buf *bytes.Buffer +} + +// TracerOptions controls tracing behavior. It is roughly equivalent to the libpq function PQsetTraceFlags. +type TracerOptions struct { + // SuppressTimestamps prevents printing of timestamps. + SuppressTimestamps bool + + // RegressMode redacts fields that may be vary between executions. + RegressMode bool +} + +func (t *tracer) traceMessage(sender byte, encodedLen int32, msg Message) { + switch msg := msg.(type) { + case *AuthenticationCleartextPassword: + t.traceAuthenticationCleartextPassword(sender, encodedLen, msg) + case *AuthenticationGSS: + t.traceAuthenticationGSS(sender, encodedLen, msg) + case *AuthenticationGSSContinue: + t.traceAuthenticationGSSContinue(sender, encodedLen, msg) + case *AuthenticationMD5Password: + t.traceAuthenticationMD5Password(sender, encodedLen, msg) + case *AuthenticationOk: + t.traceAuthenticationOk(sender, encodedLen, msg) + case *AuthenticationSASL: + t.traceAuthenticationSASL(sender, encodedLen, msg) + case *AuthenticationSASLContinue: + t.traceAuthenticationSASLContinue(sender, encodedLen, msg) + case *AuthenticationSASLFinal: + t.traceAuthenticationSASLFinal(sender, encodedLen, msg) + case *BackendKeyData: + t.traceBackendKeyData(sender, encodedLen, msg) + case *Bind: + t.traceBind(sender, encodedLen, msg) + case *BindComplete: + t.traceBindComplete(sender, encodedLen, msg) + case *CancelRequest: + t.traceCancelRequest(sender, encodedLen, msg) + case *Close: + t.traceClose(sender, encodedLen, msg) + case *CloseComplete: + t.traceCloseComplete(sender, encodedLen, msg) + case *CommandComplete: + t.traceCommandComplete(sender, encodedLen, msg) + case *CopyBothResponse: + t.traceCopyBothResponse(sender, encodedLen, msg) + case *CopyData: + t.traceCopyData(sender, encodedLen, msg) + case *CopyDone: + t.traceCopyDone(sender, encodedLen, msg) + case *CopyFail: + t.traceCopyFail(sender, encodedLen, msg) + case *CopyInResponse: + t.traceCopyInResponse(sender, encodedLen, msg) + case *CopyOutResponse: + t.traceCopyOutResponse(sender, encodedLen, msg) + case *DataRow: + t.traceDataRow(sender, encodedLen, msg) + case *Describe: + t.traceDescribe(sender, encodedLen, msg) + case *EmptyQueryResponse: + t.traceEmptyQueryResponse(sender, encodedLen, msg) + case *ErrorResponse: + t.traceErrorResponse(sender, encodedLen, msg) + case *Execute: + t.TraceQueryute(sender, encodedLen, msg) + case *Flush: + t.traceFlush(sender, encodedLen, msg) + case *FunctionCall: + t.traceFunctionCall(sender, encodedLen, msg) + case *FunctionCallResponse: + t.traceFunctionCallResponse(sender, encodedLen, msg) + case *GSSEncRequest: + t.traceGSSEncRequest(sender, encodedLen, msg) + case *NoData: + t.traceNoData(sender, encodedLen, msg) + case *NoticeResponse: + t.traceNoticeResponse(sender, encodedLen, msg) + case *NotificationResponse: + t.traceNotificationResponse(sender, encodedLen, msg) + case *ParameterDescription: + t.traceParameterDescription(sender, encodedLen, msg) + case *ParameterStatus: + t.traceParameterStatus(sender, encodedLen, msg) + case *Parse: + t.traceParse(sender, encodedLen, msg) + case *ParseComplete: + t.traceParseComplete(sender, encodedLen, msg) + case *PortalSuspended: + t.tracePortalSuspended(sender, encodedLen, msg) + case *Query: + t.traceQuery(sender, encodedLen, msg) + case *ReadyForQuery: + t.traceReadyForQuery(sender, encodedLen, msg) + case *RowDescription: + t.traceRowDescription(sender, encodedLen, msg) + case *SSLRequest: + t.traceSSLRequest(sender, encodedLen, msg) + case *StartupMessage: + t.traceStartupMessage(sender, encodedLen, msg) + case *Sync: + t.traceSync(sender, encodedLen, msg) + case *Terminate: + t.traceTerminate(sender, encodedLen, msg) + default: + t.writeTrace(sender, encodedLen, "Unknown", nil) + } +} + +func (t *tracer) traceAuthenticationCleartextPassword(sender byte, encodedLen int32, msg *AuthenticationCleartextPassword) { + t.writeTrace(sender, encodedLen, "AuthenticationCleartextPassword", nil) +} + +func (t *tracer) traceAuthenticationGSS(sender byte, encodedLen int32, msg *AuthenticationGSS) { + t.writeTrace(sender, encodedLen, "AuthenticationGSS", nil) +} + +func (t *tracer) traceAuthenticationGSSContinue(sender byte, encodedLen int32, msg *AuthenticationGSSContinue) { + t.writeTrace(sender, encodedLen, "AuthenticationGSSContinue", nil) +} + +func (t *tracer) traceAuthenticationMD5Password(sender byte, encodedLen int32, msg *AuthenticationMD5Password) { + t.writeTrace(sender, encodedLen, "AuthenticationMD5Password", nil) +} + +func (t *tracer) traceAuthenticationOk(sender byte, encodedLen int32, msg *AuthenticationOk) { + t.writeTrace(sender, encodedLen, "AuthenticationOk", nil) +} + +func (t *tracer) traceAuthenticationSASL(sender byte, encodedLen int32, msg *AuthenticationSASL) { + t.writeTrace(sender, encodedLen, "AuthenticationSASL", nil) +} + +func (t *tracer) traceAuthenticationSASLContinue(sender byte, encodedLen int32, msg *AuthenticationSASLContinue) { + t.writeTrace(sender, encodedLen, "AuthenticationSASLContinue", nil) +} + +func (t *tracer) traceAuthenticationSASLFinal(sender byte, encodedLen int32, msg *AuthenticationSASLFinal) { + t.writeTrace(sender, encodedLen, "AuthenticationSASLFinal", nil) +} + +func (t *tracer) traceBackendKeyData(sender byte, encodedLen int32, msg *BackendKeyData) { + t.writeTrace(sender, encodedLen, "BackendKeyData", func() { + if t.RegressMode { + t.buf.WriteString("\t NNNN NNNN") + } else { + fmt.Fprintf(t.buf, "\t %d %d", msg.ProcessID, msg.SecretKey) + } + }) +} + +func (t *tracer) traceBind(sender byte, encodedLen int32, msg *Bind) { + t.writeTrace(sender, encodedLen, "Bind", func() { + fmt.Fprintf(t.buf, "\t %s %s %d", traceDoubleQuotedString([]byte(msg.DestinationPortal)), traceDoubleQuotedString([]byte(msg.PreparedStatement)), len(msg.ParameterFormatCodes)) + for _, fc := range msg.ParameterFormatCodes { + fmt.Fprintf(t.buf, " %d", fc) + } + fmt.Fprintf(t.buf, " %d", len(msg.Parameters)) + for _, p := range msg.Parameters { + fmt.Fprintf(t.buf, " %s", traceSingleQuotedString(p)) + } + fmt.Fprintf(t.buf, " %d", len(msg.ResultFormatCodes)) + for _, fc := range msg.ResultFormatCodes { + fmt.Fprintf(t.buf, " %d", fc) + } + }) +} + +func (t *tracer) traceBindComplete(sender byte, encodedLen int32, msg *BindComplete) { + t.writeTrace(sender, encodedLen, "BindComplete", nil) +} + +func (t *tracer) traceCancelRequest(sender byte, encodedLen int32, msg *CancelRequest) { + t.writeTrace(sender, encodedLen, "CancelRequest", nil) +} + +func (t *tracer) traceClose(sender byte, encodedLen int32, msg *Close) { + t.writeTrace(sender, encodedLen, "Close", nil) +} + +func (t *tracer) traceCloseComplete(sender byte, encodedLen int32, msg *CloseComplete) { + t.writeTrace(sender, encodedLen, "CloseComplete", nil) +} + +func (t *tracer) traceCommandComplete(sender byte, encodedLen int32, msg *CommandComplete) { + t.writeTrace(sender, encodedLen, "CommandComplete", func() { + fmt.Fprintf(t.buf, "\t %s", traceDoubleQuotedString(msg.CommandTag)) + }) +} + +func (t *tracer) traceCopyBothResponse(sender byte, encodedLen int32, msg *CopyBothResponse) { + t.writeTrace(sender, encodedLen, "CopyBothResponse", nil) +} + +func (t *tracer) traceCopyData(sender byte, encodedLen int32, msg *CopyData) { + t.writeTrace(sender, encodedLen, "CopyData", nil) +} + +func (t *tracer) traceCopyDone(sender byte, encodedLen int32, msg *CopyDone) { + t.writeTrace(sender, encodedLen, "CopyDone", nil) +} + +func (t *tracer) traceCopyFail(sender byte, encodedLen int32, msg *CopyFail) { + t.writeTrace(sender, encodedLen, "CopyFail", func() { + fmt.Fprintf(t.buf, "\t %s", traceDoubleQuotedString([]byte(msg.Message))) + }) +} + +func (t *tracer) traceCopyInResponse(sender byte, encodedLen int32, msg *CopyInResponse) { + t.writeTrace(sender, encodedLen, "CopyInResponse", nil) +} + +func (t *tracer) traceCopyOutResponse(sender byte, encodedLen int32, msg *CopyOutResponse) { + t.writeTrace(sender, encodedLen, "CopyOutResponse", nil) +} + +func (t *tracer) traceDataRow(sender byte, encodedLen int32, msg *DataRow) { + t.writeTrace(sender, encodedLen, "DataRow", func() { + fmt.Fprintf(t.buf, "\t %d", len(msg.Values)) + for _, v := range msg.Values { + if v == nil { + t.buf.WriteString(" -1") + } else { + fmt.Fprintf(t.buf, " %d %s", len(v), traceSingleQuotedString(v)) + } + } + }) +} + +func (t *tracer) traceDescribe(sender byte, encodedLen int32, msg *Describe) { + t.writeTrace(sender, encodedLen, "Describe", func() { + fmt.Fprintf(t.buf, "\t %c %s", msg.ObjectType, traceDoubleQuotedString([]byte(msg.Name))) + }) +} + +func (t *tracer) traceEmptyQueryResponse(sender byte, encodedLen int32, msg *EmptyQueryResponse) { + t.writeTrace(sender, encodedLen, "EmptyQueryResponse", nil) +} + +func (t *tracer) traceErrorResponse(sender byte, encodedLen int32, msg *ErrorResponse) { + t.writeTrace(sender, encodedLen, "ErrorResponse", nil) +} + +func (t *tracer) TraceQueryute(sender byte, encodedLen int32, msg *Execute) { + t.writeTrace(sender, encodedLen, "Execute", func() { + fmt.Fprintf(t.buf, "\t %s %d", traceDoubleQuotedString([]byte(msg.Portal)), msg.MaxRows) + }) +} + +func (t *tracer) traceFlush(sender byte, encodedLen int32, msg *Flush) { + t.writeTrace(sender, encodedLen, "Flush", nil) +} + +func (t *tracer) traceFunctionCall(sender byte, encodedLen int32, msg *FunctionCall) { + t.writeTrace(sender, encodedLen, "FunctionCall", nil) +} + +func (t *tracer) traceFunctionCallResponse(sender byte, encodedLen int32, msg *FunctionCallResponse) { + t.writeTrace(sender, encodedLen, "FunctionCallResponse", nil) +} + +func (t *tracer) traceGSSEncRequest(sender byte, encodedLen int32, msg *GSSEncRequest) { + t.writeTrace(sender, encodedLen, "GSSEncRequest", nil) +} + +func (t *tracer) traceNoData(sender byte, encodedLen int32, msg *NoData) { + t.writeTrace(sender, encodedLen, "NoData", nil) +} + +func (t *tracer) traceNoticeResponse(sender byte, encodedLen int32, msg *NoticeResponse) { + t.writeTrace(sender, encodedLen, "NoticeResponse", nil) +} + +func (t *tracer) traceNotificationResponse(sender byte, encodedLen int32, msg *NotificationResponse) { + t.writeTrace(sender, encodedLen, "NotificationResponse", func() { + fmt.Fprintf(t.buf, "\t %d %s %s", msg.PID, traceDoubleQuotedString([]byte(msg.Channel)), traceDoubleQuotedString([]byte(msg.Payload))) + }) +} + +func (t *tracer) traceParameterDescription(sender byte, encodedLen int32, msg *ParameterDescription) { + t.writeTrace(sender, encodedLen, "ParameterDescription", nil) +} + +func (t *tracer) traceParameterStatus(sender byte, encodedLen int32, msg *ParameterStatus) { + t.writeTrace(sender, encodedLen, "ParameterStatus", func() { + fmt.Fprintf(t.buf, "\t %s %s", traceDoubleQuotedString([]byte(msg.Name)), traceDoubleQuotedString([]byte(msg.Value))) + }) +} + +func (t *tracer) traceParse(sender byte, encodedLen int32, msg *Parse) { + t.writeTrace(sender, encodedLen, "Parse", func() { + fmt.Fprintf(t.buf, "\t %s %s %d", traceDoubleQuotedString([]byte(msg.Name)), traceDoubleQuotedString([]byte(msg.Query)), len(msg.ParameterOIDs)) + for _, oid := range msg.ParameterOIDs { + fmt.Fprintf(t.buf, " %d", oid) + } + }) +} + +func (t *tracer) traceParseComplete(sender byte, encodedLen int32, msg *ParseComplete) { + t.writeTrace(sender, encodedLen, "ParseComplete", nil) +} + +func (t *tracer) tracePortalSuspended(sender byte, encodedLen int32, msg *PortalSuspended) { + t.writeTrace(sender, encodedLen, "PortalSuspended", nil) +} + +func (t *tracer) traceQuery(sender byte, encodedLen int32, msg *Query) { + t.writeTrace(sender, encodedLen, "Query", func() { + fmt.Fprintf(t.buf, "\t %s", traceDoubleQuotedString([]byte(msg.String))) + }) +} + +func (t *tracer) traceReadyForQuery(sender byte, encodedLen int32, msg *ReadyForQuery) { + t.writeTrace(sender, encodedLen, "ReadyForQuery", func() { + fmt.Fprintf(t.buf, "\t %c", msg.TxStatus) + }) +} + +func (t *tracer) traceRowDescription(sender byte, encodedLen int32, msg *RowDescription) { + t.writeTrace(sender, encodedLen, "RowDescription", func() { + fmt.Fprintf(t.buf, "\t %d", len(msg.Fields)) + for _, fd := range msg.Fields { + fmt.Fprintf(t.buf, ` %s %d %d %d %d %d %d`, traceDoubleQuotedString(fd.Name), fd.TableOID, fd.TableAttributeNumber, fd.DataTypeOID, fd.DataTypeSize, fd.TypeModifier, fd.Format) + } + }) +} + +func (t *tracer) traceSSLRequest(sender byte, encodedLen int32, msg *SSLRequest) { + t.writeTrace(sender, encodedLen, "SSLRequest", nil) +} + +func (t *tracer) traceStartupMessage(sender byte, encodedLen int32, msg *StartupMessage) { + t.writeTrace(sender, encodedLen, "StartupMessage", nil) +} + +func (t *tracer) traceSync(sender byte, encodedLen int32, msg *Sync) { + t.writeTrace(sender, encodedLen, "Sync", nil) +} + +func (t *tracer) traceTerminate(sender byte, encodedLen int32, msg *Terminate) { + t.writeTrace(sender, encodedLen, "Terminate", nil) +} + +func (t *tracer) writeTrace(sender byte, encodedLen int32, msgType string, writeDetails func()) { + t.mux.Lock() + defer t.mux.Unlock() + defer func() { + if t.buf.Cap() > 1024 { + t.buf = &bytes.Buffer{} + } else { + t.buf.Reset() + } + }() + + if !t.SuppressTimestamps { + now := time.Now() + t.buf.WriteString(now.Format("2006-01-02 15:04:05.000000")) + t.buf.WriteByte('\t') + } + + t.buf.WriteByte(sender) + t.buf.WriteByte('\t') + t.buf.WriteString(msgType) + t.buf.WriteByte('\t') + t.buf.WriteString(strconv.FormatInt(int64(encodedLen), 10)) + + if writeDetails != nil { + writeDetails() + } + + t.buf.WriteByte('\n') + t.buf.WriteTo(t.w) +} + +// traceDoubleQuotedString returns t.buf as a double-quoted string without any escaping. It is roughly equivalent to +// pqTraceOutputString in libpq. +func traceDoubleQuotedString(buf []byte) string { + return `"` + string(buf) + `"` +} + +// traceSingleQuotedString returns buf as a single-quoted string with non-printable characters hex-escaped. It is +// roughly equivalent to pqTraceOutputNchar in libpq. +func traceSingleQuotedString(buf []byte) string { + sb := &strings.Builder{} + + sb.WriteByte('\'') + for _, b := range buf { + if b < 32 || b > 126 { + fmt.Fprintf(sb, `\x%x`, b) + } else { + sb.WriteByte(b) + } + } + sb.WriteByte('\'') + + return sb.String() +} diff --git a/vendor/github.com/jackc/pgtype/array.go b/vendor/github.com/jackc/pgx/v5/pgtype/array.go similarity index 63% rename from vendor/github.com/jackc/pgtype/array.go rename to vendor/github.com/jackc/pgx/v5/pgtype/array.go index 174007c..06b824a 100644 --- a/vendor/github.com/jackc/pgtype/array.go +++ b/vendor/github.com/jackc/pgx/v5/pgtype/array.go @@ -5,21 +5,20 @@ import ( "encoding/binary" "fmt" "io" - "reflect" "strconv" "strings" "unicode" - "github.com/jackc/pgio" + "github.com/jackc/pgx/v5/internal/pgio" ) // Information on the internals of PostgreSQL arrays can be found in // src/include/utils/array.h and src/backend/utils/adt/arrayfuncs.c. Of // particular interest is the array_send function. -type ArrayHeader struct { +type arrayHeader struct { ContainsNull bool - ElementOID int32 + ElementOID uint32 Dimensions []ArrayDimension } @@ -28,7 +27,21 @@ type ArrayDimension struct { LowerBound int32 } -func (dst *ArrayHeader) DecodeBinary(ci *ConnInfo, src []byte) (int, error) { +// cardinality returns the number of elements in an array of dimensions size. +func cardinality(dimensions []ArrayDimension) int { + if len(dimensions) == 0 { + return 0 + } + + elementCount := int(dimensions[0].Length) + for _, d := range dimensions[1:] { + elementCount *= int(d.Length) + } + + return elementCount +} + +func (dst *arrayHeader) DecodeBinary(m *Map, src []byte) (int, error) { if len(src) < 12 { return 0, fmt.Errorf("array header too short: %d", len(src)) } @@ -41,12 +54,10 @@ func (dst *ArrayHeader) DecodeBinary(ci *ConnInfo, src []byte) (int, error) { dst.ContainsNull = binary.BigEndian.Uint32(src[rp:]) == 1 rp += 4 - dst.ElementOID = int32(binary.BigEndian.Uint32(src[rp:])) + dst.ElementOID = binary.BigEndian.Uint32(src[rp:]) rp += 4 - if numDims > 0 { - dst.Dimensions = make([]ArrayDimension, numDims) - } + dst.Dimensions = make([]ArrayDimension, numDims) if len(src) < 12+numDims*8 { return 0, fmt.Errorf("array header too short for %d dimensions: %d", numDims, len(src)) } @@ -61,7 +72,7 @@ func (dst *ArrayHeader) DecodeBinary(ci *ConnInfo, src []byte) (int, error) { return rp, nil } -func (src ArrayHeader) EncodeBinary(ci *ConnInfo, buf []byte) []byte { +func (src arrayHeader) EncodeBinary(buf []byte) []byte { buf = pgio.AppendInt32(buf, int32(len(src.Dimensions))) var containsNull int32 @@ -70,7 +81,7 @@ func (src ArrayHeader) EncodeBinary(ci *ConnInfo, buf []byte) []byte { } buf = pgio.AppendInt32(buf, containsNull) - buf = pgio.AppendInt32(buf, src.ElementOID) + buf = pgio.AppendUint32(buf, src.ElementOID) for i := range src.Dimensions { buf = pgio.AppendInt32(buf, src.Dimensions[i].Length) @@ -80,14 +91,18 @@ func (src ArrayHeader) EncodeBinary(ci *ConnInfo, buf []byte) []byte { return buf } -type UntypedTextArray struct { +type untypedTextArray struct { Elements []string Quoted []bool Dimensions []ArrayDimension } -func ParseUntypedTextArray(src string) (*UntypedTextArray, error) { - dst := &UntypedTextArray{} +func parseUntypedTextArray(src string) (*untypedTextArray, error) { + dst := &untypedTextArray{ + Elements: []string{}, + Quoted: []bool{}, + Dimensions: []ArrayDimension{}, + } buf := bytes.NewBufferString(src) @@ -95,7 +110,7 @@ func ParseUntypedTextArray(src string) (*UntypedTextArray, error) { r, _, err := buf.ReadRune() if err != nil { - return nil, fmt.Errorf("invalid array: %v", err) + return nil, fmt.Errorf("invalid array: %w", err) } var explicitDimensions []ArrayDimension @@ -107,7 +122,7 @@ func ParseUntypedTextArray(src string) (*UntypedTextArray, error) { for { r, _, err = buf.ReadRune() if err != nil { - return nil, fmt.Errorf("invalid array: %v", err) + return nil, fmt.Errorf("invalid array: %w", err) } if r == '=' { @@ -118,12 +133,12 @@ func ParseUntypedTextArray(src string) (*UntypedTextArray, error) { lower, err := arrayParseInteger(buf) if err != nil { - return nil, fmt.Errorf("invalid array: %v", err) + return nil, fmt.Errorf("invalid array: %w", err) } r, _, err = buf.ReadRune() if err != nil { - return nil, fmt.Errorf("invalid array: %v", err) + return nil, fmt.Errorf("invalid array: %w", err) } if r != ':' { @@ -132,12 +147,12 @@ func ParseUntypedTextArray(src string) (*UntypedTextArray, error) { upper, err := arrayParseInteger(buf) if err != nil { - return nil, fmt.Errorf("invalid array: %v", err) + return nil, fmt.Errorf("invalid array: %w", err) } r, _, err = buf.ReadRune() if err != nil { - return nil, fmt.Errorf("invalid array: %v", err) + return nil, fmt.Errorf("invalid array: %w", err) } if r != ']' { @@ -149,12 +164,12 @@ func ParseUntypedTextArray(src string) (*UntypedTextArray, error) { r, _, err = buf.ReadRune() if err != nil { - return nil, fmt.Errorf("invalid array: %v", err) + return nil, fmt.Errorf("invalid array: %w", err) } } if r != '{' { - return nil, fmt.Errorf("invalid array, expected '{': %v", err) + return nil, fmt.Errorf("invalid array, expected '{' got %v", r) } implicitDimensions := []ArrayDimension{{LowerBound: 1, Length: 0}} @@ -163,7 +178,7 @@ func ParseUntypedTextArray(src string) (*UntypedTextArray, error) { for { r, _, err = buf.ReadRune() if err != nil { - return nil, fmt.Errorf("invalid array: %v", err) + return nil, fmt.Errorf("invalid array: %w", err) } if r == '{' { @@ -180,7 +195,7 @@ func ParseUntypedTextArray(src string) (*UntypedTextArray, error) { for { r, _, err = buf.ReadRune() if err != nil { - return nil, fmt.Errorf("invalid array: %v", err) + return nil, fmt.Errorf("invalid array: %w", err) } switch r { @@ -199,7 +214,7 @@ func ParseUntypedTextArray(src string) (*UntypedTextArray, error) { buf.UnreadRune() value, quoted, err := arrayParseValue(buf) if err != nil { - return nil, fmt.Errorf("invalid array value: %v", err) + return nil, fmt.Errorf("invalid array value: %w", err) } if currentDim == counterDim { implicitDimensions[currentDim].Length++ @@ -220,7 +235,6 @@ func ParseUntypedTextArray(src string) (*UntypedTextArray, error) { } if len(dst.Elements) == 0 { - dst.Dimensions = nil } else if len(explicitDimensions) > 0 { dst.Dimensions = explicitDimensions } else { @@ -318,7 +332,7 @@ func arrayParseInteger(buf *bytes.Buffer) (int32, error) { } } -func EncodeTextArrayDimensions(buf []byte, dimensions []ArrayDimension) []byte { +func encodeTextArrayDimensions(buf []byte, dimensions []ArrayDimension) []byte { var customDimensions bool for _, dim := range dimensions { if dim.LowerBound != 1 { @@ -348,34 +362,99 @@ func quoteArrayElement(src string) string { } func isSpace(ch byte) bool { - // see https://github.com/postgres/postgres/blob/REL_12_STABLE/src/backend/parser/scansup.c#L224 - return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == '\f' + // see array_isspace: + // https://github.com/postgres/postgres/blob/master/src/backend/utils/adt/arrayfuncs.c + return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == '\v' || ch == '\f' } -func QuoteArrayElementIfNeeded(src string) string { - if src == "" || (len(src) == 4 && strings.ToLower(src) == "null") || isSpace(src[0]) || isSpace(src[len(src)-1]) || strings.ContainsAny(src, `{},"\`) { +func quoteArrayElementIfNeeded(src string) string { + if src == "" || (len(src) == 4 && strings.EqualFold(src, "null")) || isSpace(src[0]) || isSpace(src[len(src)-1]) || strings.ContainsAny(src, `{},"\`) { return quoteArrayElement(src) } return src } -func findDimensionsFromValue(value reflect.Value, dimensions []ArrayDimension, elementsLength int) ([]ArrayDimension, int, bool) { - switch value.Kind() { - case reflect.Array: - fallthrough - case reflect.Slice: - length := value.Len() - if 0 == elementsLength { - elementsLength = length - } else { - elementsLength *= length - } - dimensions = append(dimensions, ArrayDimension{Length: int32(length), LowerBound: 1}) - for i := 0; i < length; i++ { - if d, l, ok := findDimensionsFromValue(value.Index(i), dimensions, elementsLength); ok { - return d, l, true - } - } +// Array represents a PostgreSQL array for T. It implements the ArrayGetter and ArraySetter interfaces. It preserves +// PostgreSQL dimensions and custom lower bounds. Use FlatArray if these are not needed. +type Array[T any] struct { + Elements []T + Dims []ArrayDimension + Valid bool +} + +func (a Array[T]) Dimensions() []ArrayDimension { + return a.Dims +} + +func (a Array[T]) Index(i int) any { + return a.Elements[i] +} + +func (a Array[T]) IndexType() any { + var el T + return el +} + +func (a *Array[T]) SetDimensions(dimensions []ArrayDimension) error { + if dimensions == nil { + *a = Array[T]{} + return nil + } + + elementCount := cardinality(dimensions) + *a = Array[T]{ + Elements: make([]T, elementCount), + Dims: dimensions, + Valid: true, } - return dimensions, elementsLength, true + + return nil +} + +func (a Array[T]) ScanIndex(i int) any { + return &a.Elements[i] +} + +func (a Array[T]) ScanIndexType() any { + return new(T) +} + +// FlatArray implements the ArrayGetter and ArraySetter interfaces for any slice of T. It ignores PostgreSQL dimensions +// and custom lower bounds. Use Array to preserve these. +type FlatArray[T any] []T + +func (a FlatArray[T]) Dimensions() []ArrayDimension { + if a == nil { + return nil + } + + return []ArrayDimension{{Length: int32(len(a)), LowerBound: 1}} +} + +func (a FlatArray[T]) Index(i int) any { + return a[i] +} + +func (a FlatArray[T]) IndexType() any { + var el T + return el +} + +func (a *FlatArray[T]) SetDimensions(dimensions []ArrayDimension) error { + if dimensions == nil { + *a = nil + return nil + } + + elementCount := cardinality(dimensions) + *a = make(FlatArray[T], elementCount) + return nil +} + +func (a FlatArray[T]) ScanIndex(i int) any { + return &a[i] +} + +func (a FlatArray[T]) ScanIndexType() any { + return new(T) } diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/array_codec.go b/vendor/github.com/jackc/pgx/v5/pgtype/array_codec.go new file mode 100644 index 0000000..bf5f698 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/array_codec.go @@ -0,0 +1,405 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "fmt" + "reflect" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +// ArrayGetter is a type that can be converted into a PostgreSQL array. +type ArrayGetter interface { + // Dimensions returns the array dimensions. If array is nil then nil is returned. + Dimensions() []ArrayDimension + + // Index returns the element at i. + Index(i int) any + + // IndexType returns a non-nil scan target of the type Index will return. This is used by ArrayCodec.PlanEncode. + IndexType() any +} + +// ArraySetter is a type can be set from a PostgreSQL array. +type ArraySetter interface { + // SetDimensions prepares the value such that ScanIndex can be called for each element. This will remove any existing + // elements. dimensions may be nil to indicate a NULL array. If unable to exactly preserve dimensions SetDimensions + // may return an error or silently flatten the array dimensions. + SetDimensions(dimensions []ArrayDimension) error + + // ScanIndex returns a value usable as a scan target for i. SetDimensions must be called before ScanIndex. + ScanIndex(i int) any + + // ScanIndexType returns a non-nil scan target of the type ScanIndex will return. This is used by + // ArrayCodec.PlanScan. + ScanIndexType() any +} + +// ArrayCodec is a codec for any array type. +type ArrayCodec struct { + ElementType *Type +} + +func (c *ArrayCodec) FormatSupported(format int16) bool { + return c.ElementType.Codec.FormatSupported(format) +} + +func (c *ArrayCodec) PreferredFormat() int16 { + // The binary format should always be preferred for arrays if it is supported. Usually, this will happen automatically + // because most types that support binary prefer it. However, text, json, and jsonb support binary but prefer the text + // format. This is because it is simpler for jsonb and PostgreSQL can be significantly faster using the text format + // for text-like data types than binary. However, arrays appear to always be faster in binary. + // + // https://www.postgresql.org/message-id/CAMovtNoHFod2jMAKQjjxv209PCTJx5Kc66anwWvX0mEiaXwgmA%40mail.gmail.com + if c.ElementType.Codec.FormatSupported(BinaryFormatCode) { + return BinaryFormatCode + } + return TextFormatCode +} + +func (c *ArrayCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + arrayValuer, ok := value.(ArrayGetter) + if !ok { + return nil + } + + elementType := arrayValuer.IndexType() + + elementEncodePlan := m.PlanEncode(c.ElementType.OID, format, elementType) + if elementEncodePlan == nil { + if reflect.TypeOf(elementType) != nil { + return nil + } + } + + switch format { + case BinaryFormatCode: + return &encodePlanArrayCodecBinary{ac: c, m: m, oid: oid} + case TextFormatCode: + return &encodePlanArrayCodecText{ac: c, m: m, oid: oid} + } + + return nil +} + +type encodePlanArrayCodecText struct { + ac *ArrayCodec + m *Map + oid uint32 +} + +func (p *encodePlanArrayCodecText) Encode(value any, buf []byte) (newBuf []byte, err error) { + array := value.(ArrayGetter) + + dimensions := array.Dimensions() + if dimensions == nil { + return nil, nil + } + + elementCount := cardinality(dimensions) + if elementCount == 0 { + return append(buf, '{', '}'), nil + } + + buf = encodeTextArrayDimensions(buf, dimensions) + + // dimElemCounts is the multiples of elements that each array lies on. For + // example, a single dimension array of length 4 would have a dimElemCounts of + // [4]. A multi-dimensional array of lengths [3,5,2] would have a + // dimElemCounts of [30,10,2]. This is used to simplify when to render a '{' + // or '}'. + dimElemCounts := make([]int, len(dimensions)) + dimElemCounts[len(dimensions)-1] = int(dimensions[len(dimensions)-1].Length) + for i := len(dimensions) - 2; i > -1; i-- { + dimElemCounts[i] = int(dimensions[i].Length) * dimElemCounts[i+1] + } + + var encodePlan EncodePlan + var lastElemType reflect.Type + inElemBuf := make([]byte, 0, 32) + for i := 0; i < elementCount; i++ { + if i > 0 { + buf = append(buf, ',') + } + + for _, dec := range dimElemCounts { + if i%dec == 0 { + buf = append(buf, '{') + } + } + + elem := array.Index(i) + var elemBuf []byte + if elem != nil { + elemType := reflect.TypeOf(elem) + if lastElemType != elemType { + lastElemType = elemType + encodePlan = p.m.PlanEncode(p.ac.ElementType.OID, TextFormatCode, elem) + if encodePlan == nil { + return nil, fmt.Errorf("unable to encode %v", array.Index(i)) + } + } + elemBuf, err = encodePlan.Encode(elem, inElemBuf) + if err != nil { + return nil, err + } + } + + if elemBuf == nil { + buf = append(buf, `NULL`...) + } else { + buf = append(buf, quoteArrayElementIfNeeded(string(elemBuf))...) + } + + for _, dec := range dimElemCounts { + if (i+1)%dec == 0 { + buf = append(buf, '}') + } + } + } + + return buf, nil +} + +type encodePlanArrayCodecBinary struct { + ac *ArrayCodec + m *Map + oid uint32 +} + +func (p *encodePlanArrayCodecBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + array := value.(ArrayGetter) + + dimensions := array.Dimensions() + if dimensions == nil { + return nil, nil + } + + arrayHeader := arrayHeader{ + Dimensions: dimensions, + ElementOID: p.ac.ElementType.OID, + } + + containsNullIndex := len(buf) + 4 + + buf = arrayHeader.EncodeBinary(buf) + + elementCount := cardinality(dimensions) + + var encodePlan EncodePlan + var lastElemType reflect.Type + for i := 0; i < elementCount; i++ { + sp := len(buf) + buf = pgio.AppendInt32(buf, -1) + + elem := array.Index(i) + var elemBuf []byte + if elem != nil { + elemType := reflect.TypeOf(elem) + if lastElemType != elemType { + lastElemType = elemType + encodePlan = p.m.PlanEncode(p.ac.ElementType.OID, BinaryFormatCode, elem) + if encodePlan == nil { + return nil, fmt.Errorf("unable to encode %v", array.Index(i)) + } + } + elemBuf, err = encodePlan.Encode(elem, buf) + if err != nil { + return nil, err + } + } + + if elemBuf == nil { + pgio.SetInt32(buf[containsNullIndex:], 1) + } else { + buf = elemBuf + pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) + } + } + + return buf, nil +} + +func (c *ArrayCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + arrayScanner, ok := target.(ArraySetter) + if !ok { + return nil + } + + // target / arrayScanner might be a pointer to a nil. If it is create one so we can call ScanIndexType to plan the + // scan of the elements. + if isNil, _ := isNilDriverValuer(target); isNil { + arrayScanner = reflect.New(reflect.TypeOf(target).Elem()).Interface().(ArraySetter) + } + + elementType := arrayScanner.ScanIndexType() + + elementScanPlan := m.PlanScan(c.ElementType.OID, format, elementType) + if _, ok := elementScanPlan.(*scanPlanFail); ok { + return nil + } + + return &scanPlanArrayCodec{ + arrayCodec: c, + m: m, + oid: oid, + formatCode: format, + } +} + +func (c *ArrayCodec) decodeBinary(m *Map, arrayOID uint32, src []byte, array ArraySetter) error { + var arrayHeader arrayHeader + rp, err := arrayHeader.DecodeBinary(m, src) + if err != nil { + return err + } + + err = array.SetDimensions(arrayHeader.Dimensions) + if err != nil { + return err + } + + elementCount := cardinality(arrayHeader.Dimensions) + if elementCount == 0 { + return nil + } + + elementScanPlan := c.ElementType.Codec.PlanScan(m, c.ElementType.OID, BinaryFormatCode, array.ScanIndex(0)) + if elementScanPlan == nil { + elementScanPlan = m.PlanScan(c.ElementType.OID, BinaryFormatCode, array.ScanIndex(0)) + } + + for i := 0; i < elementCount; i++ { + elem := array.ScanIndex(i) + elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) + rp += 4 + var elemSrc []byte + if elemLen >= 0 { + elemSrc = src[rp : rp+elemLen] + rp += elemLen + } + err = elementScanPlan.Scan(elemSrc, elem) + if err != nil { + return fmt.Errorf("failed to scan array element %d: %w", i, err) + } + } + + return nil +} + +func (c *ArrayCodec) decodeText(m *Map, arrayOID uint32, src []byte, array ArraySetter) error { + uta, err := parseUntypedTextArray(string(src)) + if err != nil { + return err + } + + err = array.SetDimensions(uta.Dimensions) + if err != nil { + return err + } + + if len(uta.Elements) == 0 { + return nil + } + + elementScanPlan := c.ElementType.Codec.PlanScan(m, c.ElementType.OID, TextFormatCode, array.ScanIndex(0)) + if elementScanPlan == nil { + elementScanPlan = m.PlanScan(c.ElementType.OID, TextFormatCode, array.ScanIndex(0)) + } + + for i, s := range uta.Elements { + elem := array.ScanIndex(i) + var elemSrc []byte + if s != "NULL" || uta.Quoted[i] { + elemSrc = []byte(s) + } + + err = elementScanPlan.Scan(elemSrc, elem) + if err != nil { + return err + } + } + + return nil +} + +type scanPlanArrayCodec struct { + arrayCodec *ArrayCodec + m *Map + oid uint32 + formatCode int16 + elementScanPlan ScanPlan +} + +func (spac *scanPlanArrayCodec) Scan(src []byte, dst any) error { + c := spac.arrayCodec + m := spac.m + oid := spac.oid + formatCode := spac.formatCode + + array := dst.(ArraySetter) + + if src == nil { + return array.SetDimensions(nil) + } + + switch formatCode { + case BinaryFormatCode: + return c.decodeBinary(m, oid, src, array) + case TextFormatCode: + return c.decodeText(m, oid, src, array) + default: + return fmt.Errorf("unknown format code %d", formatCode) + } +} + +func (c *ArrayCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + switch format { + case TextFormatCode: + return string(src), nil + case BinaryFormatCode: + buf := make([]byte, len(src)) + copy(buf, src) + return buf, nil + default: + return nil, fmt.Errorf("unknown format code %d", format) + } +} + +func (c *ArrayCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var slice []any + err := m.PlanScan(oid, format, &slice).Scan(src, &slice) + return slice, err +} + +func isRagged(slice reflect.Value) bool { + if slice.Type().Elem().Kind() != reflect.Slice { + return false + } + + sliceLen := slice.Len() + innerLen := 0 + for i := 0; i < sliceLen; i++ { + if i == 0 { + innerLen = slice.Index(i).Len() + } else { + if slice.Index(i).Len() != innerLen { + return true + } + } + if isRagged(slice.Index(i)) { + return true + } + } + + return false +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/bits.go b/vendor/github.com/jackc/pgx/v5/pgtype/bits.go new file mode 100644 index 0000000..e7a1d01 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/bits.go @@ -0,0 +1,210 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "fmt" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +type BitsScanner interface { + ScanBits(v Bits) error +} + +type BitsValuer interface { + BitsValue() (Bits, error) +} + +// Bits represents the PostgreSQL bit and varbit types. +type Bits struct { + Bytes []byte + Len int32 // Number of bits + Valid bool +} + +func (b *Bits) ScanBits(v Bits) error { + *b = v + return nil +} + +func (b Bits) BitsValue() (Bits, error) { + return b, nil +} + +// Scan implements the database/sql Scanner interface. +func (dst *Bits) Scan(src any) error { + if src == nil { + *dst = Bits{} + return nil + } + + switch src := src.(type) { + case string: + return scanPlanTextAnyToBitsScanner{}.Scan([]byte(src), dst) + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (src Bits) Value() (driver.Value, error) { + if !src.Valid { + return nil, nil + } + + buf, err := BitsCodec{}.PlanEncode(nil, 0, TextFormatCode, src).Encode(src, nil) + if err != nil { + return nil, err + } + return string(buf), err +} + +type BitsCodec struct{} + +func (BitsCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (BitsCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (BitsCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + if _, ok := value.(BitsValuer); !ok { + return nil + } + + switch format { + case BinaryFormatCode: + return encodePlanBitsCodecBinary{} + case TextFormatCode: + return encodePlanBitsCodecText{} + } + + return nil +} + +type encodePlanBitsCodecBinary struct{} + +func (encodePlanBitsCodecBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + bits, err := value.(BitsValuer).BitsValue() + if err != nil { + return nil, err + } + + if !bits.Valid { + return nil, nil + } + + buf = pgio.AppendInt32(buf, bits.Len) + return append(buf, bits.Bytes...), nil +} + +type encodePlanBitsCodecText struct{} + +func (encodePlanBitsCodecText) Encode(value any, buf []byte) (newBuf []byte, err error) { + bits, err := value.(BitsValuer).BitsValue() + if err != nil { + return nil, err + } + + if !bits.Valid { + return nil, nil + } + + for i := int32(0); i < bits.Len; i++ { + byteIdx := i / 8 + bitMask := byte(128 >> byte(i%8)) + char := byte('0') + if bits.Bytes[byteIdx]&bitMask > 0 { + char = '1' + } + buf = append(buf, char) + } + + return buf, nil +} + +func (BitsCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case BitsScanner: + return scanPlanBinaryBitsToBitsScanner{} + } + case TextFormatCode: + switch target.(type) { + case BitsScanner: + return scanPlanTextAnyToBitsScanner{} + } + } + + return nil +} + +func (c BitsCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return codecDecodeToTextFormat(c, m, oid, format, src) +} + +func (c BitsCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var box Bits + err := codecScan(c, m, oid, format, src, &box) + if err != nil { + return nil, err + } + return box, nil +} + +type scanPlanBinaryBitsToBitsScanner struct{} + +func (scanPlanBinaryBitsToBitsScanner) Scan(src []byte, dst any) error { + scanner := (dst).(BitsScanner) + + if src == nil { + return scanner.ScanBits(Bits{}) + } + + if len(src) < 4 { + return fmt.Errorf("invalid length for bit/varbit: %v", len(src)) + } + + bitLen := int32(binary.BigEndian.Uint32(src)) + rp := 4 + buf := make([]byte, len(src[rp:])) + copy(buf, src[rp:]) + + return scanner.ScanBits(Bits{Bytes: buf, Len: bitLen, Valid: true}) +} + +type scanPlanTextAnyToBitsScanner struct{} + +func (scanPlanTextAnyToBitsScanner) Scan(src []byte, dst any) error { + scanner := (dst).(BitsScanner) + + if src == nil { + return scanner.ScanBits(Bits{}) + } + + bitLen := len(src) + byteLen := bitLen / 8 + if bitLen%8 > 0 { + byteLen++ + } + buf := make([]byte, byteLen) + + for i, b := range src { + if b == '1' { + byteIdx := i / 8 + bitIdx := uint(i % 8) + buf[byteIdx] = buf[byteIdx] | (128 >> bitIdx) + } + } + + return scanner.ScanBits(Bits{Bytes: buf, Len: int32(bitLen), Valid: true}) +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/bool.go b/vendor/github.com/jackc/pgx/v5/pgtype/bool.go new file mode 100644 index 0000000..71caffa --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/bool.go @@ -0,0 +1,343 @@ +package pgtype + +import ( + "bytes" + "database/sql/driver" + "encoding/json" + "fmt" + "strconv" + "strings" +) + +type BoolScanner interface { + ScanBool(v Bool) error +} + +type BoolValuer interface { + BoolValue() (Bool, error) +} + +type Bool struct { + Bool bool + Valid bool +} + +func (b *Bool) ScanBool(v Bool) error { + *b = v + return nil +} + +func (b Bool) BoolValue() (Bool, error) { + return b, nil +} + +// Scan implements the database/sql Scanner interface. +func (dst *Bool) Scan(src any) error { + if src == nil { + *dst = Bool{} + return nil + } + + switch src := src.(type) { + case bool: + *dst = Bool{Bool: src, Valid: true} + return nil + case string: + b, err := strconv.ParseBool(src) + if err != nil { + return err + } + *dst = Bool{Bool: b, Valid: true} + return nil + case []byte: + b, err := strconv.ParseBool(string(src)) + if err != nil { + return err + } + *dst = Bool{Bool: b, Valid: true} + return nil + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (src Bool) Value() (driver.Value, error) { + if !src.Valid { + return nil, nil + } + + return src.Bool, nil +} + +func (src Bool) MarshalJSON() ([]byte, error) { + if !src.Valid { + return []byte("null"), nil + } + + if src.Bool { + return []byte("true"), nil + } else { + return []byte("false"), nil + } +} + +func (dst *Bool) UnmarshalJSON(b []byte) error { + var v *bool + err := json.Unmarshal(b, &v) + if err != nil { + return err + } + + if v == nil { + *dst = Bool{} + } else { + *dst = Bool{Bool: *v, Valid: true} + } + + return nil +} + +type BoolCodec struct{} + +func (BoolCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (BoolCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (BoolCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + switch format { + case BinaryFormatCode: + switch value.(type) { + case bool: + return encodePlanBoolCodecBinaryBool{} + case BoolValuer: + return encodePlanBoolCodecBinaryBoolValuer{} + } + case TextFormatCode: + switch value.(type) { + case bool: + return encodePlanBoolCodecTextBool{} + case BoolValuer: + return encodePlanBoolCodecTextBoolValuer{} + } + } + + return nil +} + +type encodePlanBoolCodecBinaryBool struct{} + +func (encodePlanBoolCodecBinaryBool) Encode(value any, buf []byte) (newBuf []byte, err error) { + v := value.(bool) + + if v { + buf = append(buf, 1) + } else { + buf = append(buf, 0) + } + + return buf, nil +} + +type encodePlanBoolCodecTextBoolValuer struct{} + +func (encodePlanBoolCodecTextBoolValuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + b, err := value.(BoolValuer).BoolValue() + if err != nil { + return nil, err + } + + if !b.Valid { + return nil, nil + } + + if b.Bool { + buf = append(buf, 't') + } else { + buf = append(buf, 'f') + } + + return buf, nil +} + +type encodePlanBoolCodecBinaryBoolValuer struct{} + +func (encodePlanBoolCodecBinaryBoolValuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + b, err := value.(BoolValuer).BoolValue() + if err != nil { + return nil, err + } + + if !b.Valid { + return nil, nil + } + + if b.Bool { + buf = append(buf, 1) + } else { + buf = append(buf, 0) + } + + return buf, nil +} + +type encodePlanBoolCodecTextBool struct{} + +func (encodePlanBoolCodecTextBool) Encode(value any, buf []byte) (newBuf []byte, err error) { + v := value.(bool) + + if v { + buf = append(buf, 't') + } else { + buf = append(buf, 'f') + } + + return buf, nil +} + +func (BoolCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case *bool: + return scanPlanBinaryBoolToBool{} + case BoolScanner: + return scanPlanBinaryBoolToBoolScanner{} + } + case TextFormatCode: + switch target.(type) { + case *bool: + return scanPlanTextAnyToBool{} + case BoolScanner: + return scanPlanTextAnyToBoolScanner{} + } + } + + return nil +} + +func (c BoolCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return c.DecodeValue(m, oid, format, src) +} + +func (c BoolCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var b bool + err := codecScan(c, m, oid, format, src, &b) + if err != nil { + return nil, err + } + return b, nil +} + +type scanPlanBinaryBoolToBool struct{} + +func (scanPlanBinaryBoolToBool) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 1 { + return fmt.Errorf("invalid length for bool: %v", len(src)) + } + + p, ok := (dst).(*bool) + if !ok { + return ErrScanTargetTypeChanged + } + + *p = src[0] == 1 + + return nil +} + +type scanPlanTextAnyToBool struct{} + +func (scanPlanTextAnyToBool) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) == 0 { + return fmt.Errorf("cannot scan empty string into %T", dst) + } + + p, ok := (dst).(*bool) + if !ok { + return ErrScanTargetTypeChanged + } + + v, err := planTextToBool(src) + if err != nil { + return err + } + + *p = v + + return nil +} + +type scanPlanBinaryBoolToBoolScanner struct{} + +func (scanPlanBinaryBoolToBoolScanner) Scan(src []byte, dst any) error { + s, ok := (dst).(BoolScanner) + if !ok { + return ErrScanTargetTypeChanged + } + + if src == nil { + return s.ScanBool(Bool{}) + } + + if len(src) != 1 { + return fmt.Errorf("invalid length for bool: %v", len(src)) + } + + return s.ScanBool(Bool{Bool: src[0] == 1, Valid: true}) +} + +type scanPlanTextAnyToBoolScanner struct{} + +func (scanPlanTextAnyToBoolScanner) Scan(src []byte, dst any) error { + s, ok := (dst).(BoolScanner) + if !ok { + return ErrScanTargetTypeChanged + } + + if src == nil { + return s.ScanBool(Bool{}) + } + + if len(src) == 0 { + return fmt.Errorf("cannot scan empty string into %T", dst) + } + + v, err := planTextToBool(src) + if err != nil { + return err + } + + return s.ScanBool(Bool{Bool: v, Valid: true}) +} + +// https://www.postgresql.org/docs/11/datatype-boolean.html +func planTextToBool(src []byte) (bool, error) { + s := string(bytes.ToLower(bytes.TrimSpace(src))) + + switch { + case strings.HasPrefix("true", s), strings.HasPrefix("yes", s), s == "on", s == "1": + return true, nil + case strings.HasPrefix("false", s), strings.HasPrefix("no", s), strings.HasPrefix("off", s), s == "0": + return false, nil + default: + return false, fmt.Errorf("unknown boolean string representation %q", src) + } +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/box.go b/vendor/github.com/jackc/pgx/v5/pgtype/box.go new file mode 100644 index 0000000..887d268 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/box.go @@ -0,0 +1,238 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "fmt" + "math" + "strconv" + "strings" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +type BoxScanner interface { + ScanBox(v Box) error +} + +type BoxValuer interface { + BoxValue() (Box, error) +} + +type Box struct { + P [2]Vec2 + Valid bool +} + +func (b *Box) ScanBox(v Box) error { + *b = v + return nil +} + +func (b Box) BoxValue() (Box, error) { + return b, nil +} + +// Scan implements the database/sql Scanner interface. +func (dst *Box) Scan(src any) error { + if src == nil { + *dst = Box{} + return nil + } + + switch src := src.(type) { + case string: + return scanPlanTextAnyToBoxScanner{}.Scan([]byte(src), dst) + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (src Box) Value() (driver.Value, error) { + if !src.Valid { + return nil, nil + } + + buf, err := BoxCodec{}.PlanEncode(nil, 0, TextFormatCode, src).Encode(src, nil) + if err != nil { + return nil, err + } + return string(buf), err +} + +type BoxCodec struct{} + +func (BoxCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (BoxCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (BoxCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + if _, ok := value.(BoxValuer); !ok { + return nil + } + + switch format { + case BinaryFormatCode: + return encodePlanBoxCodecBinary{} + case TextFormatCode: + return encodePlanBoxCodecText{} + } + + return nil +} + +type encodePlanBoxCodecBinary struct{} + +func (encodePlanBoxCodecBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + box, err := value.(BoxValuer).BoxValue() + if err != nil { + return nil, err + } + + if !box.Valid { + return nil, nil + } + + buf = pgio.AppendUint64(buf, math.Float64bits(box.P[0].X)) + buf = pgio.AppendUint64(buf, math.Float64bits(box.P[0].Y)) + buf = pgio.AppendUint64(buf, math.Float64bits(box.P[1].X)) + buf = pgio.AppendUint64(buf, math.Float64bits(box.P[1].Y)) + return buf, nil +} + +type encodePlanBoxCodecText struct{} + +func (encodePlanBoxCodecText) Encode(value any, buf []byte) (newBuf []byte, err error) { + box, err := value.(BoxValuer).BoxValue() + if err != nil { + return nil, err + } + + if !box.Valid { + return nil, nil + } + + buf = append(buf, fmt.Sprintf(`(%s,%s),(%s,%s)`, + strconv.FormatFloat(box.P[0].X, 'f', -1, 64), + strconv.FormatFloat(box.P[0].Y, 'f', -1, 64), + strconv.FormatFloat(box.P[1].X, 'f', -1, 64), + strconv.FormatFloat(box.P[1].Y, 'f', -1, 64), + )...) + return buf, nil +} + +func (BoxCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case BoxScanner: + return scanPlanBinaryBoxToBoxScanner{} + } + case TextFormatCode: + switch target.(type) { + case BoxScanner: + return scanPlanTextAnyToBoxScanner{} + } + } + + return nil +} + +type scanPlanBinaryBoxToBoxScanner struct{} + +func (scanPlanBinaryBoxToBoxScanner) Scan(src []byte, dst any) error { + scanner := (dst).(BoxScanner) + + if src == nil { + return scanner.ScanBox(Box{}) + } + + if len(src) != 32 { + return fmt.Errorf("invalid length for Box: %v", len(src)) + } + + x1 := binary.BigEndian.Uint64(src) + y1 := binary.BigEndian.Uint64(src[8:]) + x2 := binary.BigEndian.Uint64(src[16:]) + y2 := binary.BigEndian.Uint64(src[24:]) + + return scanner.ScanBox(Box{ + P: [2]Vec2{ + {math.Float64frombits(x1), math.Float64frombits(y1)}, + {math.Float64frombits(x2), math.Float64frombits(y2)}, + }, + Valid: true, + }) +} + +type scanPlanTextAnyToBoxScanner struct{} + +func (scanPlanTextAnyToBoxScanner) Scan(src []byte, dst any) error { + scanner := (dst).(BoxScanner) + + if src == nil { + return scanner.ScanBox(Box{}) + } + + if len(src) < 11 { + return fmt.Errorf("invalid length for Box: %v", len(src)) + } + + str := string(src[1:]) + + var end int + end = strings.IndexByte(str, ',') + + x1, err := strconv.ParseFloat(str[:end], 64) + if err != nil { + return err + } + + str = str[end+1:] + end = strings.IndexByte(str, ')') + + y1, err := strconv.ParseFloat(str[:end], 64) + if err != nil { + return err + } + + str = str[end+3:] + end = strings.IndexByte(str, ',') + + x2, err := strconv.ParseFloat(str[:end], 64) + if err != nil { + return err + } + + str = str[end+1 : len(str)-1] + + y2, err := strconv.ParseFloat(str, 64) + if err != nil { + return err + } + + return scanner.ScanBox(Box{P: [2]Vec2{{x1, y1}, {x2, y2}}, Valid: true}) +} + +func (c BoxCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return codecDecodeToTextFormat(c, m, oid, format, src) +} + +func (c BoxCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var box Box + err := codecScan(c, m, oid, format, src, &box) + if err != nil { + return nil, err + } + return box, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/builtin_wrappers.go b/vendor/github.com/jackc/pgx/v5/pgtype/builtin_wrappers.go new file mode 100644 index 0000000..b39d3fa --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/builtin_wrappers.go @@ -0,0 +1,952 @@ +package pgtype + +import ( + "errors" + "fmt" + "math" + "math/big" + "net" + "net/netip" + "reflect" + "time" +) + +type int8Wrapper int8 + +func (w int8Wrapper) SkipUnderlyingTypePlan() {} + +func (w *int8Wrapper) ScanInt64(v Int8) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *int8") + } + + if v.Int64 < math.MinInt8 { + return fmt.Errorf("%d is less than minimum value for int8", v.Int64) + } + if v.Int64 > math.MaxInt8 { + return fmt.Errorf("%d is greater than maximum value for int8", v.Int64) + } + *w = int8Wrapper(v.Int64) + + return nil +} + +func (w int8Wrapper) Int64Value() (Int8, error) { + return Int8{Int64: int64(w), Valid: true}, nil +} + +type int16Wrapper int16 + +func (w int16Wrapper) SkipUnderlyingTypePlan() {} + +func (w *int16Wrapper) ScanInt64(v Int8) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *int16") + } + + if v.Int64 < math.MinInt16 { + return fmt.Errorf("%d is less than minimum value for int16", v.Int64) + } + if v.Int64 > math.MaxInt16 { + return fmt.Errorf("%d is greater than maximum value for int16", v.Int64) + } + *w = int16Wrapper(v.Int64) + + return nil +} + +func (w int16Wrapper) Int64Value() (Int8, error) { + return Int8{Int64: int64(w), Valid: true}, nil +} + +type int32Wrapper int32 + +func (w int32Wrapper) SkipUnderlyingTypePlan() {} + +func (w *int32Wrapper) ScanInt64(v Int8) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *int32") + } + + if v.Int64 < math.MinInt32 { + return fmt.Errorf("%d is less than minimum value for int32", v.Int64) + } + if v.Int64 > math.MaxInt32 { + return fmt.Errorf("%d is greater than maximum value for int32", v.Int64) + } + *w = int32Wrapper(v.Int64) + + return nil +} + +func (w int32Wrapper) Int64Value() (Int8, error) { + return Int8{Int64: int64(w), Valid: true}, nil +} + +type int64Wrapper int64 + +func (w int64Wrapper) SkipUnderlyingTypePlan() {} + +func (w *int64Wrapper) ScanInt64(v Int8) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *int64") + } + + *w = int64Wrapper(v.Int64) + + return nil +} + +func (w int64Wrapper) Int64Value() (Int8, error) { + return Int8{Int64: int64(w), Valid: true}, nil +} + +type intWrapper int + +func (w intWrapper) SkipUnderlyingTypePlan() {} + +func (w *intWrapper) ScanInt64(v Int8) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *int") + } + + if v.Int64 < math.MinInt { + return fmt.Errorf("%d is less than minimum value for int", v.Int64) + } + if v.Int64 > math.MaxInt { + return fmt.Errorf("%d is greater than maximum value for int", v.Int64) + } + + *w = intWrapper(v.Int64) + + return nil +} + +func (w intWrapper) Int64Value() (Int8, error) { + return Int8{Int64: int64(w), Valid: true}, nil +} + +type uint8Wrapper uint8 + +func (w uint8Wrapper) SkipUnderlyingTypePlan() {} + +func (w *uint8Wrapper) ScanInt64(v Int8) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *uint8") + } + + if v.Int64 < 0 { + return fmt.Errorf("%d is less than minimum value for uint8", v.Int64) + } + if v.Int64 > math.MaxUint8 { + return fmt.Errorf("%d is greater than maximum value for uint8", v.Int64) + } + *w = uint8Wrapper(v.Int64) + + return nil +} + +func (w uint8Wrapper) Int64Value() (Int8, error) { + return Int8{Int64: int64(w), Valid: true}, nil +} + +type uint16Wrapper uint16 + +func (w uint16Wrapper) SkipUnderlyingTypePlan() {} + +func (w *uint16Wrapper) ScanInt64(v Int8) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *uint16") + } + + if v.Int64 < 0 { + return fmt.Errorf("%d is less than minimum value for uint16", v.Int64) + } + if v.Int64 > math.MaxUint16 { + return fmt.Errorf("%d is greater than maximum value for uint16", v.Int64) + } + *w = uint16Wrapper(v.Int64) + + return nil +} + +func (w uint16Wrapper) Int64Value() (Int8, error) { + return Int8{Int64: int64(w), Valid: true}, nil +} + +type uint32Wrapper uint32 + +func (w uint32Wrapper) SkipUnderlyingTypePlan() {} + +func (w *uint32Wrapper) ScanInt64(v Int8) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *uint32") + } + + if v.Int64 < 0 { + return fmt.Errorf("%d is less than minimum value for uint32", v.Int64) + } + if v.Int64 > math.MaxUint32 { + return fmt.Errorf("%d is greater than maximum value for uint32", v.Int64) + } + *w = uint32Wrapper(v.Int64) + + return nil +} + +func (w uint32Wrapper) Int64Value() (Int8, error) { + return Int8{Int64: int64(w), Valid: true}, nil +} + +type uint64Wrapper uint64 + +func (w uint64Wrapper) SkipUnderlyingTypePlan() {} + +func (w *uint64Wrapper) ScanInt64(v Int8) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *uint64") + } + + if v.Int64 < 0 { + return fmt.Errorf("%d is less than minimum value for uint64", v.Int64) + } + + *w = uint64Wrapper(v.Int64) + + return nil +} + +func (w uint64Wrapper) Int64Value() (Int8, error) { + if uint64(w) > uint64(math.MaxInt64) { + return Int8{}, fmt.Errorf("%d is greater than maximum value for int64", w) + } + + return Int8{Int64: int64(w), Valid: true}, nil +} + +func (w *uint64Wrapper) ScanNumeric(v Numeric) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *uint64") + } + + bi, err := v.toBigInt() + if err != nil { + return fmt.Errorf("cannot scan into *uint64: %w", err) + } + + if !bi.IsUint64() { + return fmt.Errorf("cannot scan %v into *uint64", bi.String()) + } + + *w = uint64Wrapper(bi.Uint64()) + + return nil +} + +func (w uint64Wrapper) NumericValue() (Numeric, error) { + return Numeric{Int: new(big.Int).SetUint64(uint64(w)), Valid: true}, nil +} + +type uintWrapper uint + +func (w uintWrapper) SkipUnderlyingTypePlan() {} + +func (w *uintWrapper) ScanInt64(v Int8) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *uint64") + } + + if v.Int64 < 0 { + return fmt.Errorf("%d is less than minimum value for uint64", v.Int64) + } + + if uint64(v.Int64) > math.MaxUint { + return fmt.Errorf("%d is greater than maximum value for uint", v.Int64) + } + + *w = uintWrapper(v.Int64) + + return nil +} + +func (w uintWrapper) Int64Value() (Int8, error) { + if uint64(w) > uint64(math.MaxInt64) { + return Int8{}, fmt.Errorf("%d is greater than maximum value for int64", w) + } + + return Int8{Int64: int64(w), Valid: true}, nil +} + +func (w *uintWrapper) ScanNumeric(v Numeric) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *uint") + } + + bi, err := v.toBigInt() + if err != nil { + return fmt.Errorf("cannot scan into *uint: %w", err) + } + + if !bi.IsUint64() { + return fmt.Errorf("cannot scan %v into *uint", bi.String()) + } + + ui := bi.Uint64() + + if math.MaxUint < ui { + return fmt.Errorf("cannot scan %v into *uint", ui) + } + + *w = uintWrapper(ui) + + return nil +} + +func (w uintWrapper) NumericValue() (Numeric, error) { + return Numeric{Int: new(big.Int).SetUint64(uint64(w)), Valid: true}, nil +} + +type float32Wrapper float32 + +func (w float32Wrapper) SkipUnderlyingTypePlan() {} + +func (w *float32Wrapper) ScanInt64(v Int8) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *float32") + } + + *w = float32Wrapper(v.Int64) + + return nil +} + +func (w float32Wrapper) Int64Value() (Int8, error) { + if w > math.MaxInt64 { + return Int8{}, fmt.Errorf("%f is greater than maximum value for int64", w) + } + + return Int8{Int64: int64(w), Valid: true}, nil +} + +func (w *float32Wrapper) ScanFloat64(v Float8) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *float32") + } + + *w = float32Wrapper(v.Float64) + + return nil +} + +func (w float32Wrapper) Float64Value() (Float8, error) { + return Float8{Float64: float64(w), Valid: true}, nil +} + +type float64Wrapper float64 + +func (w float64Wrapper) SkipUnderlyingTypePlan() {} + +func (w *float64Wrapper) ScanInt64(v Int8) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *float64") + } + + *w = float64Wrapper(v.Int64) + + return nil +} + +func (w float64Wrapper) Int64Value() (Int8, error) { + if w > math.MaxInt64 { + return Int8{}, fmt.Errorf("%f is greater than maximum value for int64", w) + } + + return Int8{Int64: int64(w), Valid: true}, nil +} + +func (w *float64Wrapper) ScanFloat64(v Float8) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *float64") + } + + *w = float64Wrapper(v.Float64) + + return nil +} + +func (w float64Wrapper) Float64Value() (Float8, error) { + return Float8{Float64: float64(w), Valid: true}, nil +} + +type stringWrapper string + +func (w stringWrapper) SkipUnderlyingTypePlan() {} + +func (w *stringWrapper) ScanText(v Text) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *string") + } + + *w = stringWrapper(v.String) + return nil +} + +func (w stringWrapper) TextValue() (Text, error) { + return Text{String: string(w), Valid: true}, nil +} + +type timeWrapper time.Time + +func (w *timeWrapper) ScanDate(v Date) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *time.Time") + } + + switch v.InfinityModifier { + case Finite: + *w = timeWrapper(v.Time) + return nil + case Infinity: + return fmt.Errorf("cannot scan Infinity into *time.Time") + case NegativeInfinity: + return fmt.Errorf("cannot scan -Infinity into *time.Time") + default: + return fmt.Errorf("invalid InfinityModifier: %v", v.InfinityModifier) + } +} + +func (w timeWrapper) DateValue() (Date, error) { + return Date{Time: time.Time(w), Valid: true}, nil +} + +func (w *timeWrapper) ScanTimestamp(v Timestamp) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *time.Time") + } + + switch v.InfinityModifier { + case Finite: + *w = timeWrapper(v.Time) + return nil + case Infinity: + return fmt.Errorf("cannot scan Infinity into *time.Time") + case NegativeInfinity: + return fmt.Errorf("cannot scan -Infinity into *time.Time") + default: + return fmt.Errorf("invalid InfinityModifier: %v", v.InfinityModifier) + } +} + +func (w timeWrapper) TimestampValue() (Timestamp, error) { + return Timestamp{Time: time.Time(w), Valid: true}, nil +} + +func (w *timeWrapper) ScanTimestamptz(v Timestamptz) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *time.Time") + } + + switch v.InfinityModifier { + case Finite: + *w = timeWrapper(v.Time) + return nil + case Infinity: + return fmt.Errorf("cannot scan Infinity into *time.Time") + case NegativeInfinity: + return fmt.Errorf("cannot scan -Infinity into *time.Time") + default: + return fmt.Errorf("invalid InfinityModifier: %v", v.InfinityModifier) + } +} + +func (w timeWrapper) TimestamptzValue() (Timestamptz, error) { + return Timestamptz{Time: time.Time(w), Valid: true}, nil +} + +func (w *timeWrapper) ScanTime(v Time) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *time.Time") + } + + // 24:00:00 is max allowed time in PostgreSQL, but time.Time will normalize that to 00:00:00 the next day. + var maxRepresentableByTime int64 = 24*60*60*1000000 - 1 + if v.Microseconds > maxRepresentableByTime { + return fmt.Errorf("%d microseconds cannot be represented as time.Time", v.Microseconds) + } + + usec := v.Microseconds + hours := usec / microsecondsPerHour + usec -= hours * microsecondsPerHour + minutes := usec / microsecondsPerMinute + usec -= minutes * microsecondsPerMinute + seconds := usec / microsecondsPerSecond + usec -= seconds * microsecondsPerSecond + ns := usec * 1000 + *w = timeWrapper(time.Date(2000, 1, 1, int(hours), int(minutes), int(seconds), int(ns), time.UTC)) + return nil +} + +func (w timeWrapper) TimeValue() (Time, error) { + t := time.Time(w) + usec := int64(t.Hour())*microsecondsPerHour + + int64(t.Minute())*microsecondsPerMinute + + int64(t.Second())*microsecondsPerSecond + + int64(t.Nanosecond())/1000 + return Time{Microseconds: usec, Valid: true}, nil +} + +type durationWrapper time.Duration + +func (w durationWrapper) SkipUnderlyingTypePlan() {} + +func (w *durationWrapper) ScanInterval(v Interval) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *time.Interval") + } + + us := int64(v.Months)*microsecondsPerMonth + int64(v.Days)*microsecondsPerDay + v.Microseconds + *w = durationWrapper(time.Duration(us) * time.Microsecond) + return nil +} + +func (w durationWrapper) IntervalValue() (Interval, error) { + return Interval{Microseconds: int64(w) / 1000, Valid: true}, nil +} + +type netIPNetWrapper net.IPNet + +func (w *netIPNetWrapper) ScanNetipPrefix(v netip.Prefix) error { + if !v.IsValid() { + return fmt.Errorf("cannot scan NULL into *net.IPNet") + } + + *w = netIPNetWrapper{ + IP: v.Addr().AsSlice(), + Mask: net.CIDRMask(v.Bits(), v.Addr().BitLen()), + } + + return nil +} +func (w netIPNetWrapper) NetipPrefixValue() (netip.Prefix, error) { + ip, ok := netip.AddrFromSlice(w.IP) + if !ok { + return netip.Prefix{}, errors.New("invalid net.IPNet") + } + + ones, _ := w.Mask.Size() + + return netip.PrefixFrom(ip, ones), nil +} + +type netIPWrapper net.IP + +func (w netIPWrapper) SkipUnderlyingTypePlan() {} + +func (w *netIPWrapper) ScanNetipPrefix(v netip.Prefix) error { + if !v.IsValid() { + *w = nil + return nil + } + + if v.Addr().BitLen() != v.Bits() { + return fmt.Errorf("cannot scan %v to *net.IP", v) + } + + *w = netIPWrapper(v.Addr().AsSlice()) + return nil +} + +func (w netIPWrapper) NetipPrefixValue() (netip.Prefix, error) { + if w == nil { + return netip.Prefix{}, nil + } + + addr, ok := netip.AddrFromSlice([]byte(w)) + if !ok { + return netip.Prefix{}, errors.New("invalid net.IP") + } + + return netip.PrefixFrom(addr, addr.BitLen()), nil +} + +type netipPrefixWrapper netip.Prefix + +func (w *netipPrefixWrapper) ScanNetipPrefix(v netip.Prefix) error { + *w = netipPrefixWrapper(v) + return nil +} + +func (w netipPrefixWrapper) NetipPrefixValue() (netip.Prefix, error) { + return netip.Prefix(w), nil +} + +type netipAddrWrapper netip.Addr + +func (w *netipAddrWrapper) ScanNetipPrefix(v netip.Prefix) error { + if !v.IsValid() { + *w = netipAddrWrapper(netip.Addr{}) + return nil + } + + if v.Addr().BitLen() != v.Bits() { + return fmt.Errorf("cannot scan %v to netip.Addr", v) + } + + *w = netipAddrWrapper(v.Addr()) + + return nil +} + +func (w netipAddrWrapper) NetipPrefixValue() (netip.Prefix, error) { + addr := (netip.Addr)(w) + if !addr.IsValid() { + return netip.Prefix{}, nil + } + + return netip.PrefixFrom(addr, addr.BitLen()), nil +} + +type mapStringToPointerStringWrapper map[string]*string + +func (w *mapStringToPointerStringWrapper) ScanHstore(v Hstore) error { + *w = mapStringToPointerStringWrapper(v) + return nil +} + +func (w mapStringToPointerStringWrapper) HstoreValue() (Hstore, error) { + return Hstore(w), nil +} + +type mapStringToStringWrapper map[string]string + +func (w *mapStringToStringWrapper) ScanHstore(v Hstore) error { + *w = make(mapStringToStringWrapper, len(v)) + for k, v := range v { + if v == nil { + return fmt.Errorf("cannot scan NULL to string") + } + (*w)[k] = *v + } + return nil +} + +func (w mapStringToStringWrapper) HstoreValue() (Hstore, error) { + if w == nil { + return nil, nil + } + + hstore := make(Hstore, len(w)) + for k, v := range w { + s := v + hstore[k] = &s + } + return hstore, nil +} + +type fmtStringerWrapper struct { + s fmt.Stringer +} + +func (w fmtStringerWrapper) TextValue() (Text, error) { + return Text{String: w.s.String(), Valid: true}, nil +} + +type byte16Wrapper [16]byte + +func (w *byte16Wrapper) ScanUUID(v UUID) error { + if !v.Valid { + return fmt.Errorf("cannot scan NULL into *[16]byte") + } + *w = byte16Wrapper(v.Bytes) + return nil +} + +func (w byte16Wrapper) UUIDValue() (UUID, error) { + return UUID{Bytes: [16]byte(w), Valid: true}, nil +} + +type byteSliceWrapper []byte + +func (w byteSliceWrapper) SkipUnderlyingTypePlan() {} + +func (w *byteSliceWrapper) ScanText(v Text) error { + if !v.Valid { + *w = nil + return nil + } + + *w = byteSliceWrapper(v.String) + return nil +} + +func (w byteSliceWrapper) TextValue() (Text, error) { + if w == nil { + return Text{}, nil + } + + return Text{String: string(w), Valid: true}, nil +} + +func (w *byteSliceWrapper) ScanUUID(v UUID) error { + if !v.Valid { + *w = nil + return nil + } + *w = make(byteSliceWrapper, 16) + copy(*w, v.Bytes[:]) + return nil +} + +func (w byteSliceWrapper) UUIDValue() (UUID, error) { + if w == nil { + return UUID{}, nil + } + + uuid := UUID{Valid: true} + copy(uuid.Bytes[:], w) + return uuid, nil +} + +// structWrapper implements CompositeIndexGetter for a struct. +type structWrapper struct { + s any + exportedFields []reflect.Value +} + +func (w structWrapper) IsNull() bool { + return w.s == nil +} + +func (w structWrapper) Index(i int) any { + if i >= len(w.exportedFields) { + return fmt.Errorf("%#v only has %d public fields - %d is out of bounds", w.s, len(w.exportedFields), i) + } + + return w.exportedFields[i].Interface() +} + +// ptrStructWrapper implements CompositeIndexScanner for a pointer to a struct. +type ptrStructWrapper struct { + s any + exportedFields []reflect.Value +} + +func (w *ptrStructWrapper) ScanNull() error { + return fmt.Errorf("cannot scan NULL into %#v", w.s) +} + +func (w *ptrStructWrapper) ScanIndex(i int) any { + if i >= len(w.exportedFields) { + return fmt.Errorf("%#v only has %d public fields - %d is out of bounds", w.s, len(w.exportedFields), i) + } + + return w.exportedFields[i].Addr().Interface() +} + +type anySliceArrayReflect struct { + slice reflect.Value +} + +func (a anySliceArrayReflect) Dimensions() []ArrayDimension { + if a.slice.IsNil() { + return nil + } + + return []ArrayDimension{{Length: int32(a.slice.Len()), LowerBound: 1}} +} + +func (a anySliceArrayReflect) Index(i int) any { + return a.slice.Index(i).Interface() +} + +func (a anySliceArrayReflect) IndexType() any { + return reflect.New(a.slice.Type().Elem()).Elem().Interface() +} + +func (a *anySliceArrayReflect) SetDimensions(dimensions []ArrayDimension) error { + sliceType := a.slice.Type() + + if dimensions == nil { + a.slice.Set(reflect.Zero(sliceType)) + return nil + } + + elementCount := cardinality(dimensions) + slice := reflect.MakeSlice(sliceType, elementCount, elementCount) + a.slice.Set(slice) + return nil +} + +func (a *anySliceArrayReflect) ScanIndex(i int) any { + return a.slice.Index(i).Addr().Interface() +} + +func (a *anySliceArrayReflect) ScanIndexType() any { + return reflect.New(a.slice.Type().Elem()).Interface() +} + +type anyMultiDimSliceArray struct { + slice reflect.Value + dims []ArrayDimension +} + +func (a *anyMultiDimSliceArray) Dimensions() []ArrayDimension { + if a.slice.IsNil() { + return nil + } + + s := a.slice + for { + a.dims = append(a.dims, ArrayDimension{Length: int32(s.Len()), LowerBound: 1}) + if s.Len() > 0 { + s = s.Index(0) + } else { + break + } + if s.Type().Kind() == reflect.Slice { + } else { + break + } + } + + return a.dims +} + +func (a *anyMultiDimSliceArray) Index(i int) any { + if len(a.dims) == 1 { + return a.slice.Index(i).Interface() + } + + indexes := make([]int, len(a.dims)) + for j := len(a.dims) - 1; j >= 0; j-- { + dimLen := int(a.dims[j].Length) + indexes[j] = i % dimLen + i = i / dimLen + } + + v := a.slice + for _, si := range indexes { + v = v.Index(si) + } + + return v.Interface() +} + +func (a *anyMultiDimSliceArray) IndexType() any { + lowestSliceType := a.slice.Type() + for ; lowestSliceType.Elem().Kind() == reflect.Slice; lowestSliceType = lowestSliceType.Elem() { + } + return reflect.New(lowestSliceType.Elem()).Elem().Interface() +} + +func (a *anyMultiDimSliceArray) SetDimensions(dimensions []ArrayDimension) error { + sliceType := a.slice.Type() + + if dimensions == nil { + a.slice.Set(reflect.Zero(sliceType)) + return nil + } + + switch len(dimensions) { + case 0: + // Empty, but non-nil array + slice := reflect.MakeSlice(sliceType, 0, 0) + a.slice.Set(slice) + return nil + case 1: + elementCount := cardinality(dimensions) + slice := reflect.MakeSlice(sliceType, elementCount, elementCount) + a.slice.Set(slice) + return nil + default: + sliceDimensionCount := 1 + lowestSliceType := sliceType + for ; lowestSliceType.Elem().Kind() == reflect.Slice; lowestSliceType = lowestSliceType.Elem() { + sliceDimensionCount++ + } + + if sliceDimensionCount != len(dimensions) { + return fmt.Errorf("PostgreSQL array has %d dimensions but slice has %d dimensions", len(dimensions), sliceDimensionCount) + } + + elementCount := cardinality(dimensions) + flatSlice := reflect.MakeSlice(lowestSliceType, elementCount, elementCount) + + multiDimSlice := a.makeMultidimensionalSlice(sliceType, dimensions, flatSlice, 0) + a.slice.Set(multiDimSlice) + + // Now that a.slice is a multi-dimensional slice with the underlying data pointed at flatSlice change a.slice to + // flatSlice so ScanIndex only has to handle simple one dimensional slices. + a.slice = flatSlice + + return nil + } + +} + +func (a *anyMultiDimSliceArray) makeMultidimensionalSlice(sliceType reflect.Type, dimensions []ArrayDimension, flatSlice reflect.Value, flatSliceIdx int) reflect.Value { + if len(dimensions) == 1 { + endIdx := flatSliceIdx + int(dimensions[0].Length) + return flatSlice.Slice3(flatSliceIdx, endIdx, endIdx) + } + + sliceLen := int(dimensions[0].Length) + slice := reflect.MakeSlice(sliceType, sliceLen, sliceLen) + for i := 0; i < sliceLen; i++ { + subSlice := a.makeMultidimensionalSlice(sliceType.Elem(), dimensions[1:], flatSlice, flatSliceIdx+(i*int(dimensions[1].Length))) + slice.Index(i).Set(subSlice) + } + + return slice +} + +func (a *anyMultiDimSliceArray) ScanIndex(i int) any { + return a.slice.Index(i).Addr().Interface() +} + +func (a *anyMultiDimSliceArray) ScanIndexType() any { + lowestSliceType := a.slice.Type() + for ; lowestSliceType.Elem().Kind() == reflect.Slice; lowestSliceType = lowestSliceType.Elem() { + } + return reflect.New(lowestSliceType.Elem()).Interface() +} + +type anyArrayArrayReflect struct { + array reflect.Value +} + +func (a anyArrayArrayReflect) Dimensions() []ArrayDimension { + return []ArrayDimension{{Length: int32(a.array.Len()), LowerBound: 1}} +} + +func (a anyArrayArrayReflect) Index(i int) any { + return a.array.Index(i).Interface() +} + +func (a anyArrayArrayReflect) IndexType() any { + return reflect.New(a.array.Type().Elem()).Elem().Interface() +} + +func (a *anyArrayArrayReflect) SetDimensions(dimensions []ArrayDimension) error { + if dimensions == nil { + return fmt.Errorf("anyArrayArrayReflect: cannot scan NULL into %v", a.array.Type().String()) + } + + if len(dimensions) != 1 { + return fmt.Errorf("anyArrayArrayReflect: cannot scan multi-dimensional array into %v", a.array.Type().String()) + } + + if int(dimensions[0].Length) != a.array.Len() { + return fmt.Errorf("anyArrayArrayReflect: cannot scan array with length %v into %v", dimensions[0].Length, a.array.Type().String()) + } + + return nil +} + +func (a *anyArrayArrayReflect) ScanIndex(i int) any { + return a.array.Index(i).Addr().Interface() +} + +func (a *anyArrayArrayReflect) ScanIndexType() any { + return reflect.New(a.array.Type().Elem()).Interface() +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/bytea.go b/vendor/github.com/jackc/pgx/v5/pgtype/bytea.go new file mode 100644 index 0000000..a247705 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/bytea.go @@ -0,0 +1,255 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/hex" + "fmt" +) + +type BytesScanner interface { + // ScanBytes receives a byte slice of driver memory that is only valid until the next database method call. + ScanBytes(v []byte) error +} + +type BytesValuer interface { + // BytesValue returns a byte slice of the byte data. The caller must not change the returned slice. + BytesValue() ([]byte, error) +} + +// DriverBytes is a byte slice that holds a reference to memory owned by the driver. It is only valid from the time it +// is scanned until Rows.Next or Rows.Close is called. It is never safe to use DriverBytes with QueryRow as Row.Scan +// internally calls Rows.Close before returning. +type DriverBytes []byte + +func (b *DriverBytes) ScanBytes(v []byte) error { + *b = v + return nil +} + +// PreallocBytes is a byte slice of preallocated memory that scanned bytes will be copied to. If it is too small a new +// slice will be allocated. +type PreallocBytes []byte + +func (b *PreallocBytes) ScanBytes(v []byte) error { + if v == nil { + *b = nil + return nil + } + + if len(v) <= len(*b) { + *b = (*b)[:len(v)] + } else { + *b = make(PreallocBytes, len(v)) + } + copy(*b, v) + return nil +} + +// UndecodedBytes can be used as a scan target to get the raw bytes from PostgreSQL without any decoding. +type UndecodedBytes []byte + +type scanPlanAnyToUndecodedBytes struct{} + +func (scanPlanAnyToUndecodedBytes) Scan(src []byte, dst any) error { + dstBuf := dst.(*UndecodedBytes) + if src == nil { + *dstBuf = nil + return nil + } + + *dstBuf = make([]byte, len(src)) + copy(*dstBuf, src) + return nil +} + +type ByteaCodec struct{} + +func (ByteaCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (ByteaCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (ByteaCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + switch format { + case BinaryFormatCode: + switch value.(type) { + case []byte: + return encodePlanBytesCodecBinaryBytes{} + case BytesValuer: + return encodePlanBytesCodecBinaryBytesValuer{} + } + case TextFormatCode: + switch value.(type) { + case []byte: + return encodePlanBytesCodecTextBytes{} + case BytesValuer: + return encodePlanBytesCodecTextBytesValuer{} + } + } + + return nil +} + +type encodePlanBytesCodecBinaryBytes struct{} + +func (encodePlanBytesCodecBinaryBytes) Encode(value any, buf []byte) (newBuf []byte, err error) { + b := value.([]byte) + if b == nil { + return nil, nil + } + + return append(buf, b...), nil +} + +type encodePlanBytesCodecBinaryBytesValuer struct{} + +func (encodePlanBytesCodecBinaryBytesValuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + b, err := value.(BytesValuer).BytesValue() + if err != nil { + return nil, err + } + if b == nil { + return nil, nil + } + + return append(buf, b...), nil +} + +type encodePlanBytesCodecTextBytes struct{} + +func (encodePlanBytesCodecTextBytes) Encode(value any, buf []byte) (newBuf []byte, err error) { + b := value.([]byte) + if b == nil { + return nil, nil + } + + buf = append(buf, `\x`...) + buf = append(buf, hex.EncodeToString(b)...) + return buf, nil +} + +type encodePlanBytesCodecTextBytesValuer struct{} + +func (encodePlanBytesCodecTextBytesValuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + b, err := value.(BytesValuer).BytesValue() + if err != nil { + return nil, err + } + if b == nil { + return nil, nil + } + + buf = append(buf, `\x`...) + buf = append(buf, hex.EncodeToString(b)...) + return buf, nil +} + +func (ByteaCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case *[]byte: + return scanPlanBinaryBytesToBytes{} + case BytesScanner: + return scanPlanBinaryBytesToBytesScanner{} + } + case TextFormatCode: + switch target.(type) { + case *[]byte: + return scanPlanTextByteaToBytes{} + case BytesScanner: + return scanPlanTextByteaToBytesScanner{} + } + } + + return nil +} + +type scanPlanBinaryBytesToBytes struct{} + +func (scanPlanBinaryBytesToBytes) Scan(src []byte, dst any) error { + dstBuf := dst.(*[]byte) + if src == nil { + *dstBuf = nil + return nil + } + + *dstBuf = make([]byte, len(src)) + copy(*dstBuf, src) + return nil +} + +type scanPlanBinaryBytesToBytesScanner struct{} + +func (scanPlanBinaryBytesToBytesScanner) Scan(src []byte, dst any) error { + scanner := (dst).(BytesScanner) + return scanner.ScanBytes(src) +} + +type scanPlanTextByteaToBytes struct{} + +func (scanPlanTextByteaToBytes) Scan(src []byte, dst any) error { + dstBuf := dst.(*[]byte) + if src == nil { + *dstBuf = nil + return nil + } + + buf, err := decodeHexBytea(src) + if err != nil { + return err + } + *dstBuf = buf + + return nil +} + +type scanPlanTextByteaToBytesScanner struct{} + +func (scanPlanTextByteaToBytesScanner) Scan(src []byte, dst any) error { + scanner := (dst).(BytesScanner) + buf, err := decodeHexBytea(src) + if err != nil { + return err + } + return scanner.ScanBytes(buf) +} + +func decodeHexBytea(src []byte) ([]byte, error) { + if src == nil { + return nil, nil + } + + if len(src) < 2 || src[0] != '\\' || src[1] != 'x' { + return nil, fmt.Errorf("invalid hex format") + } + + buf := make([]byte, (len(src)-2)/2) + _, err := hex.Decode(buf, src[2:]) + if err != nil { + return nil, err + } + + return buf, nil +} + +func (c ByteaCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return c.DecodeValue(m, oid, format, src) +} + +func (c ByteaCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var buf []byte + err := codecScan(c, m, oid, format, src, &buf) + if err != nil { + return nil, err + } + return buf, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/circle.go b/vendor/github.com/jackc/pgx/v5/pgtype/circle.go new file mode 100644 index 0000000..e8f118c --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/circle.go @@ -0,0 +1,222 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "fmt" + "math" + "strconv" + "strings" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +type CircleScanner interface { + ScanCircle(v Circle) error +} + +type CircleValuer interface { + CircleValue() (Circle, error) +} + +type Circle struct { + P Vec2 + R float64 + Valid bool +} + +func (c *Circle) ScanCircle(v Circle) error { + *c = v + return nil +} + +func (c Circle) CircleValue() (Circle, error) { + return c, nil +} + +// Scan implements the database/sql Scanner interface. +func (dst *Circle) Scan(src any) error { + if src == nil { + *dst = Circle{} + return nil + } + + switch src := src.(type) { + case string: + return scanPlanTextAnyToCircleScanner{}.Scan([]byte(src), dst) + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (src Circle) Value() (driver.Value, error) { + if !src.Valid { + return nil, nil + } + + buf, err := CircleCodec{}.PlanEncode(nil, 0, TextFormatCode, src).Encode(src, nil) + if err != nil { + return nil, err + } + return string(buf), err +} + +type CircleCodec struct{} + +func (CircleCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (CircleCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (CircleCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + if _, ok := value.(CircleValuer); !ok { + return nil + } + + switch format { + case BinaryFormatCode: + return encodePlanCircleCodecBinary{} + case TextFormatCode: + return encodePlanCircleCodecText{} + } + + return nil +} + +type encodePlanCircleCodecBinary struct{} + +func (encodePlanCircleCodecBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + circle, err := value.(CircleValuer).CircleValue() + if err != nil { + return nil, err + } + + if !circle.Valid { + return nil, nil + } + + buf = pgio.AppendUint64(buf, math.Float64bits(circle.P.X)) + buf = pgio.AppendUint64(buf, math.Float64bits(circle.P.Y)) + buf = pgio.AppendUint64(buf, math.Float64bits(circle.R)) + return buf, nil +} + +type encodePlanCircleCodecText struct{} + +func (encodePlanCircleCodecText) Encode(value any, buf []byte) (newBuf []byte, err error) { + circle, err := value.(CircleValuer).CircleValue() + if err != nil { + return nil, err + } + + if !circle.Valid { + return nil, nil + } + + buf = append(buf, fmt.Sprintf(`<(%s,%s),%s>`, + strconv.FormatFloat(circle.P.X, 'f', -1, 64), + strconv.FormatFloat(circle.P.Y, 'f', -1, 64), + strconv.FormatFloat(circle.R, 'f', -1, 64), + )...) + return buf, nil +} + +func (CircleCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + switch format { + case BinaryFormatCode: + switch target.(type) { + case CircleScanner: + return scanPlanBinaryCircleToCircleScanner{} + } + case TextFormatCode: + switch target.(type) { + case CircleScanner: + return scanPlanTextAnyToCircleScanner{} + } + } + + return nil +} + +func (c CircleCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return codecDecodeToTextFormat(c, m, oid, format, src) +} + +func (c CircleCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var circle Circle + err := codecScan(c, m, oid, format, src, &circle) + if err != nil { + return nil, err + } + return circle, nil +} + +type scanPlanBinaryCircleToCircleScanner struct{} + +func (scanPlanBinaryCircleToCircleScanner) Scan(src []byte, dst any) error { + scanner := (dst).(CircleScanner) + + if src == nil { + return scanner.ScanCircle(Circle{}) + } + + if len(src) != 24 { + return fmt.Errorf("invalid length for Circle: %v", len(src)) + } + + x := binary.BigEndian.Uint64(src) + y := binary.BigEndian.Uint64(src[8:]) + r := binary.BigEndian.Uint64(src[16:]) + + return scanner.ScanCircle(Circle{ + P: Vec2{math.Float64frombits(x), math.Float64frombits(y)}, + R: math.Float64frombits(r), + Valid: true, + }) +} + +type scanPlanTextAnyToCircleScanner struct{} + +func (scanPlanTextAnyToCircleScanner) Scan(src []byte, dst any) error { + scanner := (dst).(CircleScanner) + + if src == nil { + return scanner.ScanCircle(Circle{}) + } + + if len(src) < 9 { + return fmt.Errorf("invalid length for Circle: %v", len(src)) + } + + str := string(src[2:]) + end := strings.IndexByte(str, ',') + x, err := strconv.ParseFloat(str[:end], 64) + if err != nil { + return err + } + + str = str[end+1:] + end = strings.IndexByte(str, ')') + + y, err := strconv.ParseFloat(str[:end], 64) + if err != nil { + return err + } + + str = str[end+2 : len(str)-1] + + r, err := strconv.ParseFloat(str, 64) + if err != nil { + return err + } + + return scanner.ScanCircle(Circle{P: Vec2{x, y}, R: r, Valid: true}) +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/composite.go b/vendor/github.com/jackc/pgx/v5/pgtype/composite.go new file mode 100644 index 0000000..fb37232 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/composite.go @@ -0,0 +1,602 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "errors" + "fmt" + "strings" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +// CompositeIndexGetter is a type accessed by index that can be converted into a PostgreSQL composite. +type CompositeIndexGetter interface { + // IsNull returns true if the value is SQL NULL. + IsNull() bool + + // Index returns the element at i. + Index(i int) any +} + +// CompositeIndexScanner is a type accessed by index that can be scanned from a PostgreSQL composite. +type CompositeIndexScanner interface { + // ScanNull sets the value to SQL NULL. + ScanNull() error + + // ScanIndex returns a value usable as a scan target for i. + ScanIndex(i int) any +} + +type CompositeCodecField struct { + Name string + Type *Type +} + +type CompositeCodec struct { + Fields []CompositeCodecField +} + +func (c *CompositeCodec) FormatSupported(format int16) bool { + for _, f := range c.Fields { + if !f.Type.Codec.FormatSupported(format) { + return false + } + } + + return true +} + +func (c *CompositeCodec) PreferredFormat() int16 { + if c.FormatSupported(BinaryFormatCode) { + return BinaryFormatCode + } + return TextFormatCode +} + +func (c *CompositeCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + if _, ok := value.(CompositeIndexGetter); !ok { + return nil + } + + switch format { + case BinaryFormatCode: + return &encodePlanCompositeCodecCompositeIndexGetterToBinary{cc: c, m: m} + case TextFormatCode: + return &encodePlanCompositeCodecCompositeIndexGetterToText{cc: c, m: m} + } + + return nil +} + +type encodePlanCompositeCodecCompositeIndexGetterToBinary struct { + cc *CompositeCodec + m *Map +} + +func (plan *encodePlanCompositeCodecCompositeIndexGetterToBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + getter := value.(CompositeIndexGetter) + + if getter.IsNull() { + return nil, nil + } + + builder := NewCompositeBinaryBuilder(plan.m, buf) + for i, field := range plan.cc.Fields { + builder.AppendValue(field.Type.OID, getter.Index(i)) + } + + return builder.Finish() +} + +type encodePlanCompositeCodecCompositeIndexGetterToText struct { + cc *CompositeCodec + m *Map +} + +func (plan *encodePlanCompositeCodecCompositeIndexGetterToText) Encode(value any, buf []byte) (newBuf []byte, err error) { + getter := value.(CompositeIndexGetter) + + if getter.IsNull() { + return nil, nil + } + + b := NewCompositeTextBuilder(plan.m, buf) + for i, field := range plan.cc.Fields { + b.AppendValue(field.Type.OID, getter.Index(i)) + } + + return b.Finish() +} + +func (c *CompositeCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + switch format { + case BinaryFormatCode: + switch target.(type) { + case CompositeIndexScanner: + return &scanPlanBinaryCompositeToCompositeIndexScanner{cc: c, m: m} + } + case TextFormatCode: + switch target.(type) { + case CompositeIndexScanner: + return &scanPlanTextCompositeToCompositeIndexScanner{cc: c, m: m} + } + } + + return nil +} + +type scanPlanBinaryCompositeToCompositeIndexScanner struct { + cc *CompositeCodec + m *Map +} + +func (plan *scanPlanBinaryCompositeToCompositeIndexScanner) Scan(src []byte, target any) error { + targetScanner := (target).(CompositeIndexScanner) + + if src == nil { + return targetScanner.ScanNull() + } + + scanner := NewCompositeBinaryScanner(plan.m, src) + for i, field := range plan.cc.Fields { + if scanner.Next() { + fieldTarget := targetScanner.ScanIndex(i) + if fieldTarget != nil { + fieldPlan := plan.m.PlanScan(field.Type.OID, BinaryFormatCode, fieldTarget) + if fieldPlan == nil { + return fmt.Errorf("unable to encode %v into OID %d in binary format", field, field.Type.OID) + } + + err := fieldPlan.Scan(scanner.Bytes(), fieldTarget) + if err != nil { + return err + } + } + } else { + return errors.New("read past end of composite") + } + } + + if err := scanner.Err(); err != nil { + return err + } + + return nil +} + +type scanPlanTextCompositeToCompositeIndexScanner struct { + cc *CompositeCodec + m *Map +} + +func (plan *scanPlanTextCompositeToCompositeIndexScanner) Scan(src []byte, target any) error { + targetScanner := (target).(CompositeIndexScanner) + + if src == nil { + return targetScanner.ScanNull() + } + + scanner := NewCompositeTextScanner(plan.m, src) + for i, field := range plan.cc.Fields { + if scanner.Next() { + fieldTarget := targetScanner.ScanIndex(i) + if fieldTarget != nil { + fieldPlan := plan.m.PlanScan(field.Type.OID, TextFormatCode, fieldTarget) + if fieldPlan == nil { + return fmt.Errorf("unable to encode %v into OID %d in text format", field, field.Type.OID) + } + + err := fieldPlan.Scan(scanner.Bytes(), fieldTarget) + if err != nil { + return err + } + } + } else { + return errors.New("read past end of composite") + } + } + + if err := scanner.Err(); err != nil { + return err + } + + return nil +} + +func (c *CompositeCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + switch format { + case TextFormatCode: + return string(src), nil + case BinaryFormatCode: + buf := make([]byte, len(src)) + copy(buf, src) + return buf, nil + default: + return nil, fmt.Errorf("unknown format code %d", format) + } +} + +func (c *CompositeCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + switch format { + case TextFormatCode: + scanner := NewCompositeTextScanner(m, src) + values := make(map[string]any, len(c.Fields)) + for i := 0; scanner.Next() && i < len(c.Fields); i++ { + var v any + fieldPlan := m.PlanScan(c.Fields[i].Type.OID, TextFormatCode, &v) + if fieldPlan == nil { + return nil, fmt.Errorf("unable to scan OID %d in text format into %v", c.Fields[i].Type.OID, v) + } + + err := fieldPlan.Scan(scanner.Bytes(), &v) + if err != nil { + return nil, err + } + + values[c.Fields[i].Name] = v + } + + if err := scanner.Err(); err != nil { + return nil, err + } + + return values, nil + case BinaryFormatCode: + scanner := NewCompositeBinaryScanner(m, src) + values := make(map[string]any, len(c.Fields)) + for i := 0; scanner.Next() && i < len(c.Fields); i++ { + var v any + fieldPlan := m.PlanScan(scanner.OID(), BinaryFormatCode, &v) + if fieldPlan == nil { + return nil, fmt.Errorf("unable to scan OID %d in binary format into %v", scanner.OID(), v) + } + + err := fieldPlan.Scan(scanner.Bytes(), &v) + if err != nil { + return nil, err + } + + values[c.Fields[i].Name] = v + } + + if err := scanner.Err(); err != nil { + return nil, err + } + + return values, nil + default: + return nil, fmt.Errorf("unknown format code %d", format) + } + +} + +type CompositeBinaryScanner struct { + m *Map + rp int + src []byte + + fieldCount int32 + fieldBytes []byte + fieldOID uint32 + err error +} + +// NewCompositeBinaryScanner a scanner over a binary encoded composite balue. +func NewCompositeBinaryScanner(m *Map, src []byte) *CompositeBinaryScanner { + rp := 0 + if len(src[rp:]) < 4 { + return &CompositeBinaryScanner{err: fmt.Errorf("Record incomplete %v", src)} + } + + fieldCount := int32(binary.BigEndian.Uint32(src[rp:])) + rp += 4 + + return &CompositeBinaryScanner{ + m: m, + rp: rp, + src: src, + fieldCount: fieldCount, + } +} + +// Next advances the scanner to the next field. It returns false after the last field is read or an error occurs. After +// Next returns false, the Err method can be called to check if any errors occurred. +func (cfs *CompositeBinaryScanner) Next() bool { + if cfs.err != nil { + return false + } + + if cfs.rp == len(cfs.src) { + return false + } + + if len(cfs.src[cfs.rp:]) < 8 { + cfs.err = fmt.Errorf("Record incomplete %v", cfs.src) + return false + } + cfs.fieldOID = binary.BigEndian.Uint32(cfs.src[cfs.rp:]) + cfs.rp += 4 + + fieldLen := int(int32(binary.BigEndian.Uint32(cfs.src[cfs.rp:]))) + cfs.rp += 4 + + if fieldLen >= 0 { + if len(cfs.src[cfs.rp:]) < fieldLen { + cfs.err = fmt.Errorf("Record incomplete rp=%d src=%v", cfs.rp, cfs.src) + return false + } + cfs.fieldBytes = cfs.src[cfs.rp : cfs.rp+fieldLen] + cfs.rp += fieldLen + } else { + cfs.fieldBytes = nil + } + + return true +} + +func (cfs *CompositeBinaryScanner) FieldCount() int { + return int(cfs.fieldCount) +} + +// Bytes returns the bytes of the field most recently read by Scan(). +func (cfs *CompositeBinaryScanner) Bytes() []byte { + return cfs.fieldBytes +} + +// OID returns the OID of the field most recently read by Scan(). +func (cfs *CompositeBinaryScanner) OID() uint32 { + return cfs.fieldOID +} + +// Err returns any error encountered by the scanner. +func (cfs *CompositeBinaryScanner) Err() error { + return cfs.err +} + +type CompositeTextScanner struct { + m *Map + rp int + src []byte + + fieldBytes []byte + err error +} + +// NewCompositeTextScanner a scanner over a text encoded composite value. +func NewCompositeTextScanner(m *Map, src []byte) *CompositeTextScanner { + if len(src) < 2 { + return &CompositeTextScanner{err: fmt.Errorf("Record incomplete %v", src)} + } + + if src[0] != '(' { + return &CompositeTextScanner{err: fmt.Errorf("composite text format must start with '('")} + } + + if src[len(src)-1] != ')' { + return &CompositeTextScanner{err: fmt.Errorf("composite text format must end with ')'")} + } + + return &CompositeTextScanner{ + m: m, + rp: 1, + src: src, + } +} + +// Next advances the scanner to the next field. It returns false after the last field is read or an error occurs. After +// Next returns false, the Err method can be called to check if any errors occurred. +func (cfs *CompositeTextScanner) Next() bool { + if cfs.err != nil { + return false + } + + if cfs.rp == len(cfs.src) { + return false + } + + switch cfs.src[cfs.rp] { + case ',', ')': // null + cfs.rp++ + cfs.fieldBytes = nil + return true + case '"': // quoted value + cfs.rp++ + cfs.fieldBytes = make([]byte, 0, 16) + for { + ch := cfs.src[cfs.rp] + + if ch == '"' { + cfs.rp++ + if cfs.src[cfs.rp] == '"' { + cfs.fieldBytes = append(cfs.fieldBytes, '"') + cfs.rp++ + } else { + break + } + } else if ch == '\\' { + cfs.rp++ + cfs.fieldBytes = append(cfs.fieldBytes, cfs.src[cfs.rp]) + cfs.rp++ + } else { + cfs.fieldBytes = append(cfs.fieldBytes, ch) + cfs.rp++ + } + } + cfs.rp++ + return true + default: // unquoted value + start := cfs.rp + for { + ch := cfs.src[cfs.rp] + if ch == ',' || ch == ')' { + break + } + cfs.rp++ + } + cfs.fieldBytes = cfs.src[start:cfs.rp] + cfs.rp++ + return true + } +} + +// Bytes returns the bytes of the field most recently read by Scan(). +func (cfs *CompositeTextScanner) Bytes() []byte { + return cfs.fieldBytes +} + +// Err returns any error encountered by the scanner. +func (cfs *CompositeTextScanner) Err() error { + return cfs.err +} + +type CompositeBinaryBuilder struct { + m *Map + buf []byte + startIdx int + fieldCount uint32 + err error +} + +func NewCompositeBinaryBuilder(m *Map, buf []byte) *CompositeBinaryBuilder { + startIdx := len(buf) + buf = append(buf, 0, 0, 0, 0) // allocate room for number of fields + return &CompositeBinaryBuilder{m: m, buf: buf, startIdx: startIdx} +} + +func (b *CompositeBinaryBuilder) AppendValue(oid uint32, field any) { + if b.err != nil { + return + } + + if field == nil { + b.buf = pgio.AppendUint32(b.buf, oid) + b.buf = pgio.AppendInt32(b.buf, -1) + b.fieldCount++ + return + } + + plan := b.m.PlanEncode(oid, BinaryFormatCode, field) + if plan == nil { + b.err = fmt.Errorf("unable to encode %v into OID %d in binary format", field, oid) + return + } + + b.buf = pgio.AppendUint32(b.buf, oid) + lengthPos := len(b.buf) + b.buf = pgio.AppendInt32(b.buf, -1) + fieldBuf, err := plan.Encode(field, b.buf) + if err != nil { + b.err = err + return + } + if fieldBuf != nil { + binary.BigEndian.PutUint32(fieldBuf[lengthPos:], uint32(len(fieldBuf)-len(b.buf))) + b.buf = fieldBuf + } + + b.fieldCount++ +} + +func (b *CompositeBinaryBuilder) Finish() ([]byte, error) { + if b.err != nil { + return nil, b.err + } + + binary.BigEndian.PutUint32(b.buf[b.startIdx:], b.fieldCount) + return b.buf, nil +} + +type CompositeTextBuilder struct { + m *Map + buf []byte + startIdx int + fieldCount uint32 + err error + fieldBuf [32]byte +} + +func NewCompositeTextBuilder(m *Map, buf []byte) *CompositeTextBuilder { + buf = append(buf, '(') // allocate room for number of fields + return &CompositeTextBuilder{m: m, buf: buf} +} + +func (b *CompositeTextBuilder) AppendValue(oid uint32, field any) { + if b.err != nil { + return + } + + if field == nil { + b.buf = append(b.buf, ',') + return + } + + plan := b.m.PlanEncode(oid, TextFormatCode, field) + if plan == nil { + b.err = fmt.Errorf("unable to encode %v into OID %d in text format", field, oid) + return + } + + fieldBuf, err := plan.Encode(field, b.fieldBuf[0:0]) + if err != nil { + b.err = err + return + } + if fieldBuf != nil { + b.buf = append(b.buf, quoteCompositeFieldIfNeeded(string(fieldBuf))...) + } + + b.buf = append(b.buf, ',') +} + +func (b *CompositeTextBuilder) Finish() ([]byte, error) { + if b.err != nil { + return nil, b.err + } + + b.buf[len(b.buf)-1] = ')' + return b.buf, nil +} + +var quoteCompositeReplacer = strings.NewReplacer(`\`, `\\`, `"`, `\"`) + +func quoteCompositeField(src string) string { + return `"` + quoteCompositeReplacer.Replace(src) + `"` +} + +func quoteCompositeFieldIfNeeded(src string) string { + if src == "" || src[0] == ' ' || src[len(src)-1] == ' ' || strings.ContainsAny(src, `(),"\`) { + return quoteCompositeField(src) + } + return src +} + +// CompositeFields represents the values of a composite value. It can be used as an encoding source or as a scan target. +// It cannot scan a NULL, but the composite fields can be NULL. +type CompositeFields []any + +func (cf CompositeFields) SkipUnderlyingTypePlan() {} + +func (cf CompositeFields) IsNull() bool { + return cf == nil +} + +func (cf CompositeFields) Index(i int) any { + return cf[i] +} + +func (cf CompositeFields) ScanNull() error { + return fmt.Errorf("cannot scan NULL into CompositeFields") +} + +func (cf CompositeFields) ScanIndex(i int) any { + return cf[i] +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/convert.go b/vendor/github.com/jackc/pgx/v5/pgtype/convert.go new file mode 100644 index 0000000..8a9cee9 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/convert.go @@ -0,0 +1,108 @@ +package pgtype + +import ( + "reflect" +) + +func NullAssignTo(dst any) error { + dstPtr := reflect.ValueOf(dst) + + // AssignTo dst must always be a pointer + if dstPtr.Kind() != reflect.Ptr { + return &nullAssignmentError{dst: dst} + } + + dstVal := dstPtr.Elem() + + switch dstVal.Kind() { + case reflect.Ptr, reflect.Slice, reflect.Map: + dstVal.Set(reflect.Zero(dstVal.Type())) + return nil + } + + return &nullAssignmentError{dst: dst} +} + +var kindTypes map[reflect.Kind]reflect.Type + +func toInterface(dst reflect.Value, t reflect.Type) (any, bool) { + nextDst := dst.Convert(t) + return nextDst.Interface(), dst.Type() != nextDst.Type() +} + +// GetAssignToDstType attempts to convert dst to something AssignTo can assign +// to. If dst is a pointer to pointer it allocates a value and returns the +// dereferences pointer. If dst is a named type such as *Foo where Foo is type +// Foo int16, it converts dst to *int16. +// +// GetAssignToDstType returns the converted dst and a bool representing if any +// change was made. +func GetAssignToDstType(dst any) (any, bool) { + dstPtr := reflect.ValueOf(dst) + + // AssignTo dst must always be a pointer + if dstPtr.Kind() != reflect.Ptr { + return nil, false + } + + dstVal := dstPtr.Elem() + + // if dst is a pointer to pointer, allocate space try again with the dereferenced pointer + if dstVal.Kind() == reflect.Ptr { + dstVal.Set(reflect.New(dstVal.Type().Elem())) + return dstVal.Interface(), true + } + + // if dst is pointer to a base type that has been renamed + if baseValType, ok := kindTypes[dstVal.Kind()]; ok { + return toInterface(dstPtr, reflect.PtrTo(baseValType)) + } + + if dstVal.Kind() == reflect.Slice { + if baseElemType, ok := kindTypes[dstVal.Type().Elem().Kind()]; ok { + return toInterface(dstPtr, reflect.PtrTo(reflect.SliceOf(baseElemType))) + } + } + + if dstVal.Kind() == reflect.Array { + if baseElemType, ok := kindTypes[dstVal.Type().Elem().Kind()]; ok { + return toInterface(dstPtr, reflect.PtrTo(reflect.ArrayOf(dstVal.Len(), baseElemType))) + } + } + + if dstVal.Kind() == reflect.Struct { + if dstVal.Type().NumField() == 1 && dstVal.Type().Field(0).Anonymous { + dstPtr = dstVal.Field(0).Addr() + nested := dstVal.Type().Field(0).Type + if nested.Kind() == reflect.Array { + if baseElemType, ok := kindTypes[nested.Elem().Kind()]; ok { + return toInterface(dstPtr, reflect.PtrTo(reflect.ArrayOf(nested.Len(), baseElemType))) + } + } + if _, ok := kindTypes[nested.Kind()]; ok && dstPtr.CanInterface() { + return dstPtr.Interface(), true + } + } + } + + return nil, false +} + +func init() { + kindTypes = map[reflect.Kind]reflect.Type{ + reflect.Bool: reflect.TypeOf(false), + reflect.Float32: reflect.TypeOf(float32(0)), + reflect.Float64: reflect.TypeOf(float64(0)), + reflect.Int: reflect.TypeOf(int(0)), + reflect.Int8: reflect.TypeOf(int8(0)), + reflect.Int16: reflect.TypeOf(int16(0)), + reflect.Int32: reflect.TypeOf(int32(0)), + reflect.Int64: reflect.TypeOf(int64(0)), + reflect.Uint: reflect.TypeOf(uint(0)), + reflect.Uint8: reflect.TypeOf(uint8(0)), + reflect.Uint16: reflect.TypeOf(uint16(0)), + reflect.Uint32: reflect.TypeOf(uint32(0)), + reflect.Uint64: reflect.TypeOf(uint64(0)), + reflect.String: reflect.TypeOf(""), + } +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/date.go b/vendor/github.com/jackc/pgx/v5/pgtype/date.go new file mode 100644 index 0000000..784b16d --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/date.go @@ -0,0 +1,351 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "encoding/json" + "fmt" + "regexp" + "strconv" + "time" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +type DateScanner interface { + ScanDate(v Date) error +} + +type DateValuer interface { + DateValue() (Date, error) +} + +type Date struct { + Time time.Time + InfinityModifier InfinityModifier + Valid bool +} + +func (d *Date) ScanDate(v Date) error { + *d = v + return nil +} + +func (d Date) DateValue() (Date, error) { + return d, nil +} + +const ( + negativeInfinityDayOffset = -2147483648 + infinityDayOffset = 2147483647 +) + +// Scan implements the database/sql Scanner interface. +func (dst *Date) Scan(src any) error { + if src == nil { + *dst = Date{} + return nil + } + + switch src := src.(type) { + case string: + return scanPlanTextAnyToDateScanner{}.Scan([]byte(src), dst) + case time.Time: + *dst = Date{Time: src, Valid: true} + return nil + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (src Date) Value() (driver.Value, error) { + if !src.Valid { + return nil, nil + } + + if src.InfinityModifier != Finite { + return src.InfinityModifier.String(), nil + } + return src.Time, nil +} + +func (src Date) MarshalJSON() ([]byte, error) { + if !src.Valid { + return []byte("null"), nil + } + + var s string + + switch src.InfinityModifier { + case Finite: + s = src.Time.Format("2006-01-02") + case Infinity: + s = "infinity" + case NegativeInfinity: + s = "-infinity" + } + + return json.Marshal(s) +} + +func (dst *Date) UnmarshalJSON(b []byte) error { + var s *string + err := json.Unmarshal(b, &s) + if err != nil { + return err + } + + if s == nil { + *dst = Date{} + return nil + } + + switch *s { + case "infinity": + *dst = Date{Valid: true, InfinityModifier: Infinity} + case "-infinity": + *dst = Date{Valid: true, InfinityModifier: -Infinity} + default: + t, err := time.ParseInLocation("2006-01-02", *s, time.UTC) + if err != nil { + return err + } + + *dst = Date{Time: t, Valid: true} + } + + return nil +} + +type DateCodec struct{} + +func (DateCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (DateCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (DateCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + if _, ok := value.(DateValuer); !ok { + return nil + } + + switch format { + case BinaryFormatCode: + return encodePlanDateCodecBinary{} + case TextFormatCode: + return encodePlanDateCodecText{} + } + + return nil +} + +type encodePlanDateCodecBinary struct{} + +func (encodePlanDateCodecBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + date, err := value.(DateValuer).DateValue() + if err != nil { + return nil, err + } + + if !date.Valid { + return nil, nil + } + + var daysSinceDateEpoch int32 + switch date.InfinityModifier { + case Finite: + tUnix := time.Date(date.Time.Year(), date.Time.Month(), date.Time.Day(), 0, 0, 0, 0, time.UTC).Unix() + dateEpoch := time.Date(2000, 1, 1, 0, 0, 0, 0, time.UTC).Unix() + + secSinceDateEpoch := tUnix - dateEpoch + daysSinceDateEpoch = int32(secSinceDateEpoch / 86400) + case Infinity: + daysSinceDateEpoch = infinityDayOffset + case NegativeInfinity: + daysSinceDateEpoch = negativeInfinityDayOffset + } + + return pgio.AppendInt32(buf, daysSinceDateEpoch), nil +} + +type encodePlanDateCodecText struct{} + +func (encodePlanDateCodecText) Encode(value any, buf []byte) (newBuf []byte, err error) { + date, err := value.(DateValuer).DateValue() + if err != nil { + return nil, err + } + + if !date.Valid { + return nil, nil + } + + switch date.InfinityModifier { + case Finite: + // Year 0000 is 1 BC + bc := false + year := date.Time.Year() + if year <= 0 { + year = -year + 1 + bc = true + } + + yearBytes := strconv.AppendInt(make([]byte, 0, 6), int64(year), 10) + for i := len(yearBytes); i < 4; i++ { + buf = append(buf, '0') + } + buf = append(buf, yearBytes...) + buf = append(buf, '-') + if date.Time.Month() < 10 { + buf = append(buf, '0') + } + buf = strconv.AppendInt(buf, int64(date.Time.Month()), 10) + buf = append(buf, '-') + if date.Time.Day() < 10 { + buf = append(buf, '0') + } + buf = strconv.AppendInt(buf, int64(date.Time.Day()), 10) + + if bc { + buf = append(buf, " BC"...) + } + case Infinity: + buf = append(buf, "infinity"...) + case NegativeInfinity: + buf = append(buf, "-infinity"...) + } + + return buf, nil +} + +func (DateCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case DateScanner: + return scanPlanBinaryDateToDateScanner{} + } + case TextFormatCode: + switch target.(type) { + case DateScanner: + return scanPlanTextAnyToDateScanner{} + } + } + + return nil +} + +type scanPlanBinaryDateToDateScanner struct{} + +func (scanPlanBinaryDateToDateScanner) Scan(src []byte, dst any) error { + scanner := (dst).(DateScanner) + + if src == nil { + return scanner.ScanDate(Date{}) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for date: %v", len(src)) + } + + dayOffset := int32(binary.BigEndian.Uint32(src)) + + switch dayOffset { + case infinityDayOffset: + return scanner.ScanDate(Date{InfinityModifier: Infinity, Valid: true}) + case negativeInfinityDayOffset: + return scanner.ScanDate(Date{InfinityModifier: -Infinity, Valid: true}) + default: + t := time.Date(2000, 1, int(1+dayOffset), 0, 0, 0, 0, time.UTC) + return scanner.ScanDate(Date{Time: t, Valid: true}) + } +} + +type scanPlanTextAnyToDateScanner struct{} + +var dateRegexp = regexp.MustCompile(`^(\d{4,})-(\d\d)-(\d\d)( BC)?$`) + +func (scanPlanTextAnyToDateScanner) Scan(src []byte, dst any) error { + scanner := (dst).(DateScanner) + + if src == nil { + return scanner.ScanDate(Date{}) + } + + sbuf := string(src) + match := dateRegexp.FindStringSubmatch(sbuf) + if match != nil { + year, err := strconv.ParseInt(match[1], 10, 32) + if err != nil { + return fmt.Errorf("BUG: cannot parse date that regexp matched (year): %w", err) + } + + month, err := strconv.ParseInt(match[2], 10, 32) + if err != nil { + return fmt.Errorf("BUG: cannot parse date that regexp matched (month): %w", err) + } + + day, err := strconv.ParseInt(match[3], 10, 32) + if err != nil { + return fmt.Errorf("BUG: cannot parse date that regexp matched (month): %w", err) + } + + // BC matched + if len(match[4]) > 0 { + year = -year + 1 + } + + t := time.Date(int(year), time.Month(month), int(day), 0, 0, 0, 0, time.UTC) + return scanner.ScanDate(Date{Time: t, Valid: true}) + } + + switch sbuf { + case "infinity": + return scanner.ScanDate(Date{InfinityModifier: Infinity, Valid: true}) + case "-infinity": + return scanner.ScanDate(Date{InfinityModifier: -Infinity, Valid: true}) + default: + return fmt.Errorf("invalid date format") + } +} + +func (c DateCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + var date Date + err := codecScan(c, m, oid, format, src, &date) + if err != nil { + return nil, err + } + + if date.InfinityModifier != Finite { + return date.InfinityModifier.String(), nil + } + + return date.Time, nil +} + +func (c DateCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var date Date + err := codecScan(c, m, oid, format, src, &date) + if err != nil { + return nil, err + } + + if date.InfinityModifier != Finite { + return date.InfinityModifier, nil + } + + return date.Time, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/doc.go b/vendor/github.com/jackc/pgx/v5/pgtype/doc.go new file mode 100644 index 0000000..7687ea8 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/doc.go @@ -0,0 +1,191 @@ +// Package pgtype converts between Go and PostgreSQL values. +/* +The primary type is the Map type. It is a map of PostgreSQL types identified by OID (object ID) to a Codec. A Codec is +responsible for converting between Go and PostgreSQL values. NewMap creates a Map with all supported standard PostgreSQL +types already registered. Additional types can be registered with Map.RegisterType. + +Use Map.Scan and Map.Encode to decode PostgreSQL values to Go and encode Go values to PostgreSQL respectively. + +Base Type Mapping + +pgtype maps between all common base types directly between Go and PostgreSQL. In particular: + + Go PostgreSQL + ----------------------- + string varchar + text + + // Integers are automatically be converted to any other integer type if + // it can be done without overflow or underflow. + int8 + int16 smallint + int32 int + int64 bigint + int + uint8 + uint16 + uint32 + uint64 + uint + + // Floats are strict and do not automatically convert like integers. + float32 float4 + float64 float8 + + time.Time date + timestamp + timestamptz + + netip.Addr inet + netip.Prefix cidr + + []byte bytea + +Null Values + +pgtype can map NULLs in two ways. The first is types that can directly represent NULL such as Int4. They work in a +similar fashion to database/sql. The second is to use a pointer to a pointer. + + var foo pgtype.Text + var bar *string + err := conn.QueryRow("select foo, bar from widgets where id=$1", 42).Scan(&foo, &bar) + if err != nil { + return err + } + +When using nullable pgtype types as parameters for queries, one has to remember +to explicitly set their Valid field to true, otherwise the parameter's value will be NULL. + +JSON Support + +pgtype automatically marshals and unmarshals data from json and jsonb PostgreSQL types. + +Extending Existing PostgreSQL Type Support + +Generally, all Codecs will support interfaces that can be implemented to enable scanning and encoding. For example, +PointCodec can use any Go type that implements the PointScanner and PointValuer interfaces. So rather than use +pgtype.Point and application can directly use its own point type with pgtype as long as it implements those interfaces. + +See example_custom_type_test.go for an example of a custom type for the PostgreSQL point type. + +Sometimes pgx supports a PostgreSQL type such as numeric but the Go type is in an external package that does not have +pgx support such as github.com/shopspring/decimal. These types can be registered with pgtype with custom conversion +logic. See https://github.com/jackc/pgx-shopspring-decimal and https://github.com/jackc/pgx-gofrs-uuid for example +integrations. + +New PostgreSQL Type Support + +pgtype uses the PostgreSQL OID to determine how to encode or decode a value. pgtype supports array, composite, domain, +and enum types. However, any type created in PostgreSQL with CREATE TYPE will receive a new OID. This means that the OID +of each new PostgreSQL type must be registered for pgtype to handle values of that type with the correct Codec. + +The pgx.Conn LoadType method can return a *Type for array, composite, domain, and enum types by inspecting the database +metadata. This *Type can then be registered with Map.RegisterType. + +For example, the following function could be called after a connection is established: + + func RegisterDataTypes(ctx context.Context, conn *pgx.Conn) error { + dataTypeNames := []string{ + "foo", + "_foo", + "bar", + "_bar", + } + + for _, typeName := range dataTypeNames { + dataType, err := conn.LoadType(ctx, typeName) + if err != nil { + return err + } + conn.TypeMap().RegisterType(dataType) + } + + return nil + } + +A type cannot be registered unless all types it depends on are already registered. e.g. An array type cannot be +registered until its element type is registered. + +ArrayCodec implements support for arrays. If pgtype supports type T then it can easily support []T by registering an +ArrayCodec for the appropriate PostgreSQL OID. In addition, Array[T] type can support multi-dimensional arrays. + +CompositeCodec implements support for PostgreSQL composite types. Go structs can be scanned into if the public fields of +the struct are in the exact order and type of the PostgreSQL type or by implementing CompositeIndexScanner and +CompositeIndexGetter. + +Domain types are treated as their underlying type if the underlying type and the domain type are registered. + +PostgreSQL enums can usually be treated as text. However, EnumCodec implements support for interning strings which can +reduce memory usage. + +While pgtype will often still work with unregistered types it is highly recommended that all types be registered due to +an improvement in performance and the elimination of certain edge cases. + +If an entirely new PostgreSQL type (e.g. PostGIS types) is used then the application or a library can create a new +Codec. Then the OID / Codec mapping can be registered with Map.RegisterType. There is no difference between a Codec +defined and registered by the application and a Codec built in to pgtype. See any of the Codecs in pgtype for Codec +examples and for examples of type registration. + +Encoding Unknown Types + +pgtype works best when the OID of the PostgreSQL type is known. But in some cases such as using the simple protocol the +OID is unknown. In this case Map.RegisterDefaultPgType can be used to register an assumed OID for a particular Go type. + +Renamed Types + +If pgtype does not recognize a type and that type is a renamed simple type simple (e.g. type MyInt32 int32) pgtype acts +as if it is the underlying type. It currently cannot automatically detect the underlying type of renamed structs (eg.g. +type MyTime time.Time). + +Compatibility with database/sql + +pgtype also includes support for custom types implementing the database/sql.Scanner and database/sql/driver.Valuer +interfaces. + +Encoding Typed Nils + +pgtype encodes untyped and typed nils (e.g. nil and []byte(nil)) to the SQL NULL value without going through the Codec +system. This means that Codecs and other encoding logic do not have to handle nil or *T(nil). + +However, database/sql compatibility requires Value to be called on T(nil) when T implements driver.Valuer. Therefore, +driver.Valuer values are only considered NULL when *T(nil) where driver.Valuer is implemented on T not on *T. See +https://github.com/golang/go/issues/8415 and +https://github.com/golang/go/commit/0ce1d79a6a771f7449ec493b993ed2a720917870. + +Child Records + +pgtype's support for arrays and composite records can be used to load records and their children in a single query. See +example_child_records_test.go for an example. + +Overview of Scanning Implementation + +The first step is to use the OID to lookup the correct Codec. If the OID is unavailable, Map will try to find the OID +from previous calls of Map.RegisterDefaultPgType. The Map will call the Codec's PlanScan method to get a plan for +scanning into the Go value. A Codec will support scanning into one or more Go types. Oftentime these Go types are +interfaces rather than explicit types. For example, PointCodec can use any Go type that implements the PointScanner and +PointValuer interfaces. + +If a Go value is not supported directly by a Codec then Map will try wrapping it with additional logic and try again. +For example, Int8Codec does not support scanning into a renamed type (e.g. type myInt64 int64). But Map will detect that +myInt64 is a renamed type and create a plan that converts the value to the underlying int64 type and then passes that to +the Codec (see TryFindUnderlyingTypeScanPlan). + +These plan wrappers are contained in Map.TryWrapScanPlanFuncs. By default these contain shared logic to handle renamed +types, pointers to pointers, slices, composite types, etc. Additional plan wrappers can be added to seamlessly integrate +types that do not support pgx directly. For example, the before mentioned +https://github.com/jackc/pgx-shopspring-decimal package detects decimal.Decimal values, wraps them in something +implementing NumericScanner and passes that to the Codec. + +Map.Scan and Map.Encode are convenience methods that wrap Map.PlanScan and Map.PlanEncode. Determining how to scan or +encode a particular type may be a time consuming operation. Hence the planning and execution steps of a conversion are +internally separated. + +Reducing Compiled Binary Size + +pgx.QueryExecModeExec and pgx.QueryExecModeSimpleProtocol require the default PostgreSQL type to be registered for each +Go type used as a query parameter. By default pgx does this for all supported types and their array variants. If an +application does not use those query execution modes or manually registers the default PostgreSQL type for the types it +uses as query parameters it can use the build tag nopgxregisterdefaulttypes. This omits the default type registration +and reduces the compiled binary size by ~2MB. +*/ +package pgtype diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/enum_codec.go b/vendor/github.com/jackc/pgx/v5/pgtype/enum_codec.go new file mode 100644 index 0000000..5e787c1 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/enum_codec.go @@ -0,0 +1,109 @@ +package pgtype + +import ( + "database/sql/driver" + "fmt" +) + +// EnumCodec is a codec that caches the strings it decodes. If the same string is read multiple times only one copy is +// allocated. These strings are only garbage collected when the EnumCodec is garbage collected. EnumCodec can be used +// for any text type not only enums, but it should only be used when there are a small number of possible values. +type EnumCodec struct { + membersMap map[string]string // map to quickly lookup member and reuse string instead of allocating +} + +func (EnumCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (EnumCodec) PreferredFormat() int16 { + return TextFormatCode +} + +func (EnumCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + switch format { + case TextFormatCode, BinaryFormatCode: + switch value.(type) { + case string: + return encodePlanTextCodecString{} + case []byte: + return encodePlanTextCodecByteSlice{} + case TextValuer: + return encodePlanTextCodecTextValuer{} + } + } + + return nil +} + +func (c *EnumCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + switch format { + case TextFormatCode, BinaryFormatCode: + switch target.(type) { + case *string: + return &scanPlanTextAnyToEnumString{codec: c} + case *[]byte: + return scanPlanAnyToNewByteSlice{} + case TextScanner: + return &scanPlanTextAnyToEnumTextScanner{codec: c} + } + } + + return nil +} + +func (c *EnumCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return c.DecodeValue(m, oid, format, src) +} + +func (c *EnumCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + return c.lookupAndCacheString(src), nil +} + +// lookupAndCacheString looks for src in the members map. If it is not found it is added to the map. +func (c *EnumCodec) lookupAndCacheString(src []byte) string { + if c.membersMap == nil { + c.membersMap = make(map[string]string) + } + + if s, found := c.membersMap[string(src)]; found { + return s + } + + s := string(src) + c.membersMap[s] = s + return s +} + +type scanPlanTextAnyToEnumString struct { + codec *EnumCodec +} + +func (plan *scanPlanTextAnyToEnumString) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + p := (dst).(*string) + *p = plan.codec.lookupAndCacheString(src) + + return nil +} + +type scanPlanTextAnyToEnumTextScanner struct { + codec *EnumCodec +} + +func (plan *scanPlanTextAnyToEnumTextScanner) Scan(src []byte, dst any) error { + scanner := (dst).(TextScanner) + + if src == nil { + return scanner.ScanText(Text{}) + } + + return scanner.ScanText(Text{String: plan.codec.lookupAndCacheString(src), Valid: true}) +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/float4.go b/vendor/github.com/jackc/pgx/v5/pgtype/float4.go new file mode 100644 index 0000000..8646d9d --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/float4.go @@ -0,0 +1,319 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "encoding/json" + "fmt" + "math" + "strconv" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +type Float4 struct { + Float32 float32 + Valid bool +} + +// ScanFloat64 implements the Float64Scanner interface. +func (f *Float4) ScanFloat64(n Float8) error { + *f = Float4{Float32: float32(n.Float64), Valid: n.Valid} + return nil +} + +func (f Float4) Float64Value() (Float8, error) { + return Float8{Float64: float64(f.Float32), Valid: f.Valid}, nil +} + +func (f *Float4) ScanInt64(n Int8) error { + *f = Float4{Float32: float32(n.Int64), Valid: n.Valid} + return nil +} + +func (f Float4) Int64Value() (Int8, error) { + return Int8{Int64: int64(f.Float32), Valid: f.Valid}, nil +} + +// Scan implements the database/sql Scanner interface. +func (f *Float4) Scan(src any) error { + if src == nil { + *f = Float4{} + return nil + } + + switch src := src.(type) { + case float64: + *f = Float4{Float32: float32(src), Valid: true} + return nil + case string: + n, err := strconv.ParseFloat(string(src), 32) + if err != nil { + return err + } + *f = Float4{Float32: float32(n), Valid: true} + return nil + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (f Float4) Value() (driver.Value, error) { + if !f.Valid { + return nil, nil + } + return float64(f.Float32), nil +} + +func (f Float4) MarshalJSON() ([]byte, error) { + if !f.Valid { + return []byte("null"), nil + } + return json.Marshal(f.Float32) +} + +func (f *Float4) UnmarshalJSON(b []byte) error { + var n *float32 + err := json.Unmarshal(b, &n) + if err != nil { + return err + } + + if n == nil { + *f = Float4{} + } else { + *f = Float4{Float32: *n, Valid: true} + } + + return nil +} + +type Float4Codec struct{} + +func (Float4Codec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (Float4Codec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (Float4Codec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + switch format { + case BinaryFormatCode: + switch value.(type) { + case float32: + return encodePlanFloat4CodecBinaryFloat32{} + case Float64Valuer: + return encodePlanFloat4CodecBinaryFloat64Valuer{} + case Int64Valuer: + return encodePlanFloat4CodecBinaryInt64Valuer{} + } + case TextFormatCode: + switch value.(type) { + case float32: + return encodePlanTextFloat32{} + case Float64Valuer: + return encodePlanTextFloat64Valuer{} + case Int64Valuer: + return encodePlanTextInt64Valuer{} + } + } + + return nil +} + +type encodePlanFloat4CodecBinaryFloat32 struct{} + +func (encodePlanFloat4CodecBinaryFloat32) Encode(value any, buf []byte) (newBuf []byte, err error) { + n := value.(float32) + return pgio.AppendUint32(buf, math.Float32bits(n)), nil +} + +type encodePlanTextFloat32 struct{} + +func (encodePlanTextFloat32) Encode(value any, buf []byte) (newBuf []byte, err error) { + n := value.(float32) + return append(buf, strconv.FormatFloat(float64(n), 'f', -1, 32)...), nil +} + +type encodePlanFloat4CodecBinaryFloat64Valuer struct{} + +func (encodePlanFloat4CodecBinaryFloat64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(Float64Valuer).Float64Value() + if err != nil { + return nil, err + } + + if !n.Valid { + return nil, nil + } + + return pgio.AppendUint32(buf, math.Float32bits(float32(n.Float64))), nil +} + +type encodePlanFloat4CodecBinaryInt64Valuer struct{} + +func (encodePlanFloat4CodecBinaryInt64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(Int64Valuer).Int64Value() + if err != nil { + return nil, err + } + + if !n.Valid { + return nil, nil + } + + f := float32(n.Int64) + return pgio.AppendUint32(buf, math.Float32bits(f)), nil +} + +func (Float4Codec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case *float32: + return scanPlanBinaryFloat4ToFloat32{} + case Float64Scanner: + return scanPlanBinaryFloat4ToFloat64Scanner{} + case Int64Scanner: + return scanPlanBinaryFloat4ToInt64Scanner{} + case TextScanner: + return scanPlanBinaryFloat4ToTextScanner{} + } + case TextFormatCode: + switch target.(type) { + case *float32: + return scanPlanTextAnyToFloat32{} + case Float64Scanner: + return scanPlanTextAnyToFloat64Scanner{} + case Int64Scanner: + return scanPlanTextAnyToInt64Scanner{} + } + } + + return nil +} + +type scanPlanBinaryFloat4ToFloat32 struct{} + +func (scanPlanBinaryFloat4ToFloat32) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for float4: %v", len(src)) + } + + n := int32(binary.BigEndian.Uint32(src)) + f := (dst).(*float32) + *f = math.Float32frombits(uint32(n)) + + return nil +} + +type scanPlanBinaryFloat4ToFloat64Scanner struct{} + +func (scanPlanBinaryFloat4ToFloat64Scanner) Scan(src []byte, dst any) error { + s := (dst).(Float64Scanner) + + if src == nil { + return s.ScanFloat64(Float8{}) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for float4: %v", len(src)) + } + + n := int32(binary.BigEndian.Uint32(src)) + return s.ScanFloat64(Float8{Float64: float64(math.Float32frombits(uint32(n))), Valid: true}) +} + +type scanPlanBinaryFloat4ToInt64Scanner struct{} + +func (scanPlanBinaryFloat4ToInt64Scanner) Scan(src []byte, dst any) error { + s := (dst).(Int64Scanner) + + if src == nil { + return s.ScanInt64(Int8{}) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for float4: %v", len(src)) + } + + ui32 := int32(binary.BigEndian.Uint32(src)) + f32 := math.Float32frombits(uint32(ui32)) + i64 := int64(f32) + if f32 != float32(i64) { + return fmt.Errorf("cannot losslessly convert %v to int64", f32) + } + + return s.ScanInt64(Int8{Int64: i64, Valid: true}) +} + +type scanPlanBinaryFloat4ToTextScanner struct{} + +func (scanPlanBinaryFloat4ToTextScanner) Scan(src []byte, dst any) error { + s := (dst).(TextScanner) + + if src == nil { + return s.ScanText(Text{}) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for float4: %v", len(src)) + } + + ui32 := int32(binary.BigEndian.Uint32(src)) + f32 := math.Float32frombits(uint32(ui32)) + + return s.ScanText(Text{String: strconv.FormatFloat(float64(f32), 'f', -1, 32), Valid: true}) +} + +type scanPlanTextAnyToFloat32 struct{} + +func (scanPlanTextAnyToFloat32) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + n, err := strconv.ParseFloat(string(src), 32) + if err != nil { + return err + } + + f := (dst).(*float32) + *f = float32(n) + + return nil +} + +func (c Float4Codec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + var n float32 + err := codecScan(c, m, oid, format, src, &n) + if err != nil { + return nil, err + } + return float64(n), nil +} + +func (c Float4Codec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var n float32 + err := codecScan(c, m, oid, format, src, &n) + if err != nil { + return nil, err + } + return n, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/float8.go b/vendor/github.com/jackc/pgx/v5/pgtype/float8.go new file mode 100644 index 0000000..9c923c9 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/float8.go @@ -0,0 +1,365 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "encoding/json" + "fmt" + "math" + "strconv" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +type Float64Scanner interface { + ScanFloat64(Float8) error +} + +type Float64Valuer interface { + Float64Value() (Float8, error) +} + +type Float8 struct { + Float64 float64 + Valid bool +} + +// ScanFloat64 implements the Float64Scanner interface. +func (f *Float8) ScanFloat64(n Float8) error { + *f = n + return nil +} + +func (f Float8) Float64Value() (Float8, error) { + return f, nil +} + +func (f *Float8) ScanInt64(n Int8) error { + *f = Float8{Float64: float64(n.Int64), Valid: n.Valid} + return nil +} + +func (f Float8) Int64Value() (Int8, error) { + return Int8{Int64: int64(f.Float64), Valid: f.Valid}, nil +} + +// Scan implements the database/sql Scanner interface. +func (f *Float8) Scan(src any) error { + if src == nil { + *f = Float8{} + return nil + } + + switch src := src.(type) { + case float64: + *f = Float8{Float64: src, Valid: true} + return nil + case string: + n, err := strconv.ParseFloat(string(src), 64) + if err != nil { + return err + } + *f = Float8{Float64: n, Valid: true} + return nil + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (f Float8) Value() (driver.Value, error) { + if !f.Valid { + return nil, nil + } + return f.Float64, nil +} + +func (f Float8) MarshalJSON() ([]byte, error) { + if !f.Valid { + return []byte("null"), nil + } + return json.Marshal(f.Float64) +} + +func (f *Float8) UnmarshalJSON(b []byte) error { + var n *float64 + err := json.Unmarshal(b, &n) + if err != nil { + return err + } + + if n == nil { + *f = Float8{} + } else { + *f = Float8{Float64: *n, Valid: true} + } + + return nil +} + +type Float8Codec struct{} + +func (Float8Codec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (Float8Codec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (Float8Codec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + switch format { + case BinaryFormatCode: + switch value.(type) { + case float64: + return encodePlanFloat8CodecBinaryFloat64{} + case Float64Valuer: + return encodePlanFloat8CodecBinaryFloat64Valuer{} + case Int64Valuer: + return encodePlanFloat8CodecBinaryInt64Valuer{} + } + case TextFormatCode: + switch value.(type) { + case float64: + return encodePlanTextFloat64{} + case Float64Valuer: + return encodePlanTextFloat64Valuer{} + case Int64Valuer: + return encodePlanTextInt64Valuer{} + } + } + + return nil +} + +type encodePlanFloat8CodecBinaryFloat64 struct{} + +func (encodePlanFloat8CodecBinaryFloat64) Encode(value any, buf []byte) (newBuf []byte, err error) { + n := value.(float64) + return pgio.AppendUint64(buf, math.Float64bits(n)), nil +} + +type encodePlanTextFloat64 struct{} + +func (encodePlanTextFloat64) Encode(value any, buf []byte) (newBuf []byte, err error) { + n := value.(float64) + return append(buf, strconv.FormatFloat(n, 'f', -1, 64)...), nil +} + +type encodePlanFloat8CodecBinaryFloat64Valuer struct{} + +func (encodePlanFloat8CodecBinaryFloat64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(Float64Valuer).Float64Value() + if err != nil { + return nil, err + } + + if !n.Valid { + return nil, nil + } + + return pgio.AppendUint64(buf, math.Float64bits(n.Float64)), nil +} + +type encodePlanTextFloat64Valuer struct{} + +func (encodePlanTextFloat64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(Float64Valuer).Float64Value() + if err != nil { + return nil, err + } + + if !n.Valid { + return nil, nil + } + + return append(buf, strconv.FormatFloat(n.Float64, 'f', -1, 64)...), nil +} + +type encodePlanFloat8CodecBinaryInt64Valuer struct{} + +func (encodePlanFloat8CodecBinaryInt64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(Int64Valuer).Int64Value() + if err != nil { + return nil, err + } + + if !n.Valid { + return nil, nil + } + + f := float64(n.Int64) + return pgio.AppendUint64(buf, math.Float64bits(f)), nil +} + +type encodePlanTextInt64Valuer struct{} + +func (encodePlanTextInt64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(Int64Valuer).Int64Value() + if err != nil { + return nil, err + } + + if !n.Valid { + return nil, nil + } + + return append(buf, strconv.FormatInt(n.Int64, 10)...), nil +} + +func (Float8Codec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case *float64: + return scanPlanBinaryFloat8ToFloat64{} + case Float64Scanner: + return scanPlanBinaryFloat8ToFloat64Scanner{} + case Int64Scanner: + return scanPlanBinaryFloat8ToInt64Scanner{} + case TextScanner: + return scanPlanBinaryFloat8ToTextScanner{} + } + case TextFormatCode: + switch target.(type) { + case *float64: + return scanPlanTextAnyToFloat64{} + case Float64Scanner: + return scanPlanTextAnyToFloat64Scanner{} + case Int64Scanner: + return scanPlanTextAnyToInt64Scanner{} + } + } + + return nil +} + +type scanPlanBinaryFloat8ToFloat64 struct{} + +func (scanPlanBinaryFloat8ToFloat64) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for float8: %v", len(src)) + } + + n := int64(binary.BigEndian.Uint64(src)) + f := (dst).(*float64) + *f = math.Float64frombits(uint64(n)) + + return nil +} + +type scanPlanBinaryFloat8ToFloat64Scanner struct{} + +func (scanPlanBinaryFloat8ToFloat64Scanner) Scan(src []byte, dst any) error { + s := (dst).(Float64Scanner) + + if src == nil { + return s.ScanFloat64(Float8{}) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for float8: %v", len(src)) + } + + n := int64(binary.BigEndian.Uint64(src)) + return s.ScanFloat64(Float8{Float64: math.Float64frombits(uint64(n)), Valid: true}) +} + +type scanPlanBinaryFloat8ToInt64Scanner struct{} + +func (scanPlanBinaryFloat8ToInt64Scanner) Scan(src []byte, dst any) error { + s := (dst).(Int64Scanner) + + if src == nil { + return s.ScanInt64(Int8{}) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for float8: %v", len(src)) + } + + ui64 := int64(binary.BigEndian.Uint64(src)) + f64 := math.Float64frombits(uint64(ui64)) + i64 := int64(f64) + if f64 != float64(i64) { + return fmt.Errorf("cannot losslessly convert %v to int64", f64) + } + + return s.ScanInt64(Int8{Int64: i64, Valid: true}) +} + +type scanPlanBinaryFloat8ToTextScanner struct{} + +func (scanPlanBinaryFloat8ToTextScanner) Scan(src []byte, dst any) error { + s := (dst).(TextScanner) + + if src == nil { + return s.ScanText(Text{}) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for float8: %v", len(src)) + } + + ui64 := int64(binary.BigEndian.Uint64(src)) + f64 := math.Float64frombits(uint64(ui64)) + + return s.ScanText(Text{String: strconv.FormatFloat(f64, 'f', -1, 64), Valid: true}) +} + +type scanPlanTextAnyToFloat64 struct{} + +func (scanPlanTextAnyToFloat64) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + n, err := strconv.ParseFloat(string(src), 64) + if err != nil { + return err + } + + f := (dst).(*float64) + *f = n + + return nil +} + +type scanPlanTextAnyToFloat64Scanner struct{} + +func (scanPlanTextAnyToFloat64Scanner) Scan(src []byte, dst any) error { + s := (dst).(Float64Scanner) + + if src == nil { + return s.ScanFloat64(Float8{}) + } + + n, err := strconv.ParseFloat(string(src), 64) + if err != nil { + return err + } + + return s.ScanFloat64(Float8{Float64: n, Valid: true}) +} + +func (c Float8Codec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return c.DecodeValue(m, oid, format, src) +} + +func (c Float8Codec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var n float64 + err := codecScan(c, m, oid, format, src, &n) + if err != nil { + return nil, err + } + return n, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/hstore.go b/vendor/github.com/jackc/pgx/v5/pgtype/hstore.go new file mode 100644 index 0000000..2f34f4c --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/hstore.go @@ -0,0 +1,486 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "errors" + "fmt" + "strings" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +type HstoreScanner interface { + ScanHstore(v Hstore) error +} + +type HstoreValuer interface { + HstoreValue() (Hstore, error) +} + +// Hstore represents an hstore column that can be null or have null values +// associated with its keys. +type Hstore map[string]*string + +func (h *Hstore) ScanHstore(v Hstore) error { + *h = v + return nil +} + +func (h Hstore) HstoreValue() (Hstore, error) { + return h, nil +} + +// Scan implements the database/sql Scanner interface. +func (h *Hstore) Scan(src any) error { + if src == nil { + *h = nil + return nil + } + + switch src := src.(type) { + case string: + return scanPlanTextAnyToHstoreScanner{}.scanString(src, h) + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (h Hstore) Value() (driver.Value, error) { + if h == nil { + return nil, nil + } + + buf, err := HstoreCodec{}.PlanEncode(nil, 0, TextFormatCode, h).Encode(h, nil) + if err != nil { + return nil, err + } + return string(buf), err +} + +type HstoreCodec struct{} + +func (HstoreCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (HstoreCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (HstoreCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + if _, ok := value.(HstoreValuer); !ok { + return nil + } + + switch format { + case BinaryFormatCode: + return encodePlanHstoreCodecBinary{} + case TextFormatCode: + return encodePlanHstoreCodecText{} + } + + return nil +} + +type encodePlanHstoreCodecBinary struct{} + +func (encodePlanHstoreCodecBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + hstore, err := value.(HstoreValuer).HstoreValue() + if err != nil { + return nil, err + } + + if hstore == nil { + return nil, nil + } + + buf = pgio.AppendInt32(buf, int32(len(hstore))) + + for k, v := range hstore { + buf = pgio.AppendInt32(buf, int32(len(k))) + buf = append(buf, k...) + + if v == nil { + buf = pgio.AppendInt32(buf, -1) + } else { + buf = pgio.AppendInt32(buf, int32(len(*v))) + buf = append(buf, (*v)...) + } + } + + return buf, nil +} + +type encodePlanHstoreCodecText struct{} + +func (encodePlanHstoreCodecText) Encode(value any, buf []byte) (newBuf []byte, err error) { + hstore, err := value.(HstoreValuer).HstoreValue() + if err != nil { + return nil, err + } + + if len(hstore) == 0 { + // distinguish between empty and nil: Not strictly required by Postgres, since its protocol + // explicitly marks NULL column values separately. However, the Binary codec does this, and + // this means we can "round trip" Encode and Scan without data loss. + // nil: []byte(nil); empty: []byte{} + if hstore == nil { + return nil, nil + } + return []byte{}, nil + } + + firstPair := true + + for k, v := range hstore { + if firstPair { + firstPair = false + } else { + buf = append(buf, ',', ' ') + } + + // unconditionally quote hstore keys/values like Postgres does + // this avoids a Mac OS X Postgres hstore parsing bug: + // https://www.postgresql.org/message-id/CA%2BHWA9awUW0%2BRV_gO9r1ABZwGoZxPztcJxPy8vMFSTbTfi4jig%40mail.gmail.com + buf = append(buf, '"') + buf = append(buf, quoteArrayReplacer.Replace(k)...) + buf = append(buf, '"') + buf = append(buf, "=>"...) + + if v == nil { + buf = append(buf, "NULL"...) + } else { + buf = append(buf, '"') + buf = append(buf, quoteArrayReplacer.Replace(*v)...) + buf = append(buf, '"') + } + } + + return buf, nil +} + +func (HstoreCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case HstoreScanner: + return scanPlanBinaryHstoreToHstoreScanner{} + } + case TextFormatCode: + switch target.(type) { + case HstoreScanner: + return scanPlanTextAnyToHstoreScanner{} + } + } + + return nil +} + +type scanPlanBinaryHstoreToHstoreScanner struct{} + +func (scanPlanBinaryHstoreToHstoreScanner) Scan(src []byte, dst any) error { + scanner := (dst).(HstoreScanner) + + if src == nil { + return scanner.ScanHstore(Hstore(nil)) + } + + rp := 0 + + const uint32Len = 4 + if len(src[rp:]) < uint32Len { + return fmt.Errorf("hstore incomplete %v", src) + } + pairCount := int(int32(binary.BigEndian.Uint32(src[rp:]))) + rp += uint32Len + + hstore := make(Hstore, pairCount) + // one allocation for all *string, rather than one per string, just like text parsing + valueStrings := make([]string, pairCount) + + for i := 0; i < pairCount; i++ { + if len(src[rp:]) < uint32Len { + return fmt.Errorf("hstore incomplete %v", src) + } + keyLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) + rp += uint32Len + + if len(src[rp:]) < keyLen { + return fmt.Errorf("hstore incomplete %v", src) + } + key := string(src[rp : rp+keyLen]) + rp += keyLen + + if len(src[rp:]) < uint32Len { + return fmt.Errorf("hstore incomplete %v", src) + } + valueLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) + rp += 4 + + if valueLen >= 0 { + valueStrings[i] = string(src[rp : rp+valueLen]) + rp += valueLen + + hstore[key] = &valueStrings[i] + } else { + hstore[key] = nil + } + } + + return scanner.ScanHstore(hstore) +} + +type scanPlanTextAnyToHstoreScanner struct{} + +func (s scanPlanTextAnyToHstoreScanner) Scan(src []byte, dst any) error { + scanner := (dst).(HstoreScanner) + + if src == nil { + return scanner.ScanHstore(Hstore(nil)) + } + return s.scanString(string(src), scanner) +} + +// scanString does not return nil hstore values because string cannot be nil. +func (scanPlanTextAnyToHstoreScanner) scanString(src string, scanner HstoreScanner) error { + hstore, err := parseHstore(src) + if err != nil { + return err + } + return scanner.ScanHstore(hstore) +} + +func (c HstoreCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return codecDecodeToTextFormat(c, m, oid, format, src) +} + +func (c HstoreCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var hstore Hstore + err := codecScan(c, m, oid, format, src, &hstore) + if err != nil { + return nil, err + } + return hstore, nil +} + +type hstoreParser struct { + str string + pos int + nextBackslash int +} + +func newHSP(in string) *hstoreParser { + return &hstoreParser{ + pos: 0, + str: in, + nextBackslash: strings.IndexByte(in, '\\'), + } +} + +func (p *hstoreParser) atEnd() bool { + return p.pos >= len(p.str) +} + +// consume returns the next byte of the string, or end if the string is done. +func (p *hstoreParser) consume() (b byte, end bool) { + if p.pos >= len(p.str) { + return 0, true + } + b = p.str[p.pos] + p.pos++ + return b, false +} + +func unexpectedByteErr(actualB byte, expectedB byte) error { + return fmt.Errorf("expected '%c' ('%#v'); found '%c' ('%#v')", expectedB, expectedB, actualB, actualB) +} + +// consumeExpectedByte consumes expectedB from the string, or returns an error. +func (p *hstoreParser) consumeExpectedByte(expectedB byte) error { + nextB, end := p.consume() + if end { + return fmt.Errorf("expected '%c' ('%#v'); found end", expectedB, expectedB) + } + if nextB != expectedB { + return unexpectedByteErr(nextB, expectedB) + } + return nil +} + +// consumeExpected2 consumes two expected bytes or returns an error. +// This was a bit faster than using a string argument (better inlining? Not sure). +func (p *hstoreParser) consumeExpected2(one byte, two byte) error { + if p.pos+2 > len(p.str) { + return errors.New("unexpected end of string") + } + if p.str[p.pos] != one { + return unexpectedByteErr(p.str[p.pos], one) + } + if p.str[p.pos+1] != two { + return unexpectedByteErr(p.str[p.pos+1], two) + } + p.pos += 2 + return nil +} + +var errEOSInQuoted = errors.New(`found end before closing double-quote ('"')`) + +// consumeDoubleQuoted consumes a double-quoted string from p. The double quote must have been +// parsed already. This copies the string from the backing string so it can be garbage collected. +func (p *hstoreParser) consumeDoubleQuoted() (string, error) { + // fast path: assume most keys/values do not contain escapes + nextDoubleQuote := strings.IndexByte(p.str[p.pos:], '"') + if nextDoubleQuote == -1 { + return "", errEOSInQuoted + } + nextDoubleQuote += p.pos + if p.nextBackslash == -1 || p.nextBackslash > nextDoubleQuote { + // clone the string from the source string to ensure it can be garbage collected separately + // TODO: use strings.Clone on Go 1.20; this could get optimized away + s := strings.Clone(p.str[p.pos:nextDoubleQuote]) + p.pos = nextDoubleQuote + 1 + return s, nil + } + + // slow path: string contains escapes + s, err := p.consumeDoubleQuotedWithEscapes(p.nextBackslash) + p.nextBackslash = strings.IndexByte(p.str[p.pos:], '\\') + if p.nextBackslash != -1 { + p.nextBackslash += p.pos + } + return s, err +} + +// consumeDoubleQuotedWithEscapes consumes a double-quoted string containing escapes, starting +// at p.pos, and with the first backslash at firstBackslash. This copies the string so it can be +// garbage collected separately. +func (p *hstoreParser) consumeDoubleQuotedWithEscapes(firstBackslash int) (string, error) { + // copy the prefix that does not contain backslashes + var builder strings.Builder + builder.WriteString(p.str[p.pos:firstBackslash]) + + // skip to the backslash + p.pos = firstBackslash + + // copy bytes until the end, unescaping backslashes + for { + nextB, end := p.consume() + if end { + return "", errEOSInQuoted + } else if nextB == '"' { + break + } else if nextB == '\\' { + // escape: skip the backslash and copy the char + nextB, end = p.consume() + if end { + return "", errEOSInQuoted + } + if !(nextB == '\\' || nextB == '"') { + return "", fmt.Errorf("unexpected escape in quoted string: found '%#v'", nextB) + } + builder.WriteByte(nextB) + } else { + // normal byte: copy it + builder.WriteByte(nextB) + } + } + return builder.String(), nil +} + +// consumePairSeparator consumes the Hstore pair separator ", " or returns an error. +func (p *hstoreParser) consumePairSeparator() error { + return p.consumeExpected2(',', ' ') +} + +// consumeKVSeparator consumes the Hstore key/value separator "=>" or returns an error. +func (p *hstoreParser) consumeKVSeparator() error { + return p.consumeExpected2('=', '>') +} + +// consumeDoubleQuotedOrNull consumes the Hstore key/value separator "=>" or returns an error. +func (p *hstoreParser) consumeDoubleQuotedOrNull() (Text, error) { + // peek at the next byte + if p.atEnd() { + return Text{}, errors.New("found end instead of value") + } + next := p.str[p.pos] + if next == 'N' { + // must be the exact string NULL: use consumeExpected2 twice + err := p.consumeExpected2('N', 'U') + if err != nil { + return Text{}, err + } + err = p.consumeExpected2('L', 'L') + if err != nil { + return Text{}, err + } + return Text{String: "", Valid: false}, nil + } else if next != '"' { + return Text{}, unexpectedByteErr(next, '"') + } + + // skip the double quote + p.pos += 1 + s, err := p.consumeDoubleQuoted() + if err != nil { + return Text{}, err + } + return Text{String: s, Valid: true}, nil +} + +func parseHstore(s string) (Hstore, error) { + p := newHSP(s) + + // This is an over-estimate of the number of key/value pairs. Use '>' because I am guessing it + // is less likely to occur in keys/values than '=' or ','. + numPairsEstimate := strings.Count(s, ">") + // makes one allocation of strings for the entire Hstore, rather than one allocation per value. + valueStrings := make([]string, 0, numPairsEstimate) + result := make(Hstore, numPairsEstimate) + first := true + for !p.atEnd() { + if !first { + err := p.consumePairSeparator() + if err != nil { + return nil, err + } + } else { + first = false + } + + err := p.consumeExpectedByte('"') + if err != nil { + return nil, err + } + + key, err := p.consumeDoubleQuoted() + if err != nil { + return nil, err + } + + err = p.consumeKVSeparator() + if err != nil { + return nil, err + } + + value, err := p.consumeDoubleQuotedOrNull() + if err != nil { + return nil, err + } + if value.Valid { + valueStrings = append(valueStrings, value.String) + result[key] = &valueStrings[len(valueStrings)-1] + } else { + result[key] = nil + } + } + + return result, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/inet.go b/vendor/github.com/jackc/pgx/v5/pgtype/inet.go new file mode 100644 index 0000000..6ca10ea --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/inet.go @@ -0,0 +1,200 @@ +package pgtype + +import ( + "bytes" + "database/sql/driver" + "errors" + "fmt" + "net/netip" +) + +// Network address family is dependent on server socket.h value for AF_INET. +// In practice, all platforms appear to have the same value. See +// src/include/utils/inet.h for more information. +const ( + defaultAFInet = 2 + defaultAFInet6 = 3 +) + +type NetipPrefixScanner interface { + ScanNetipPrefix(v netip.Prefix) error +} + +type NetipPrefixValuer interface { + NetipPrefixValue() (netip.Prefix, error) +} + +// InetCodec handles both inet and cidr PostgreSQL types. The preferred Go types are netip.Prefix and netip.Addr. If +// IsValid() is false then they are treated as SQL NULL. +type InetCodec struct{} + +func (InetCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (InetCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (InetCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + if _, ok := value.(NetipPrefixValuer); !ok { + return nil + } + + switch format { + case BinaryFormatCode: + return encodePlanInetCodecBinary{} + case TextFormatCode: + return encodePlanInetCodecText{} + } + + return nil +} + +type encodePlanInetCodecBinary struct{} + +func (encodePlanInetCodecBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + prefix, err := value.(NetipPrefixValuer).NetipPrefixValue() + if err != nil { + return nil, err + } + + if !prefix.IsValid() { + return nil, nil + } + + var family byte + if prefix.Addr().Is4() { + family = defaultAFInet + } else { + family = defaultAFInet6 + } + + buf = append(buf, family) + + ones := prefix.Bits() + buf = append(buf, byte(ones)) + + // is_cidr is ignored on server + buf = append(buf, 0) + + if family == defaultAFInet { + buf = append(buf, byte(4)) + b := prefix.Addr().As4() + buf = append(buf, b[:]...) + } else { + buf = append(buf, byte(16)) + b := prefix.Addr().As16() + buf = append(buf, b[:]...) + } + + return buf, nil +} + +type encodePlanInetCodecText struct{} + +func (encodePlanInetCodecText) Encode(value any, buf []byte) (newBuf []byte, err error) { + prefix, err := value.(NetipPrefixValuer).NetipPrefixValue() + if err != nil { + return nil, err + } + + if !prefix.IsValid() { + return nil, nil + } + + return append(buf, prefix.String()...), nil +} + +func (InetCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case NetipPrefixScanner: + return scanPlanBinaryInetToNetipPrefixScanner{} + } + case TextFormatCode: + switch target.(type) { + case NetipPrefixScanner: + return scanPlanTextAnyToNetipPrefixScanner{} + } + } + + return nil +} + +func (c InetCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return codecDecodeToTextFormat(c, m, oid, format, src) +} + +func (c InetCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var prefix netip.Prefix + err := codecScan(c, m, oid, format, src, (*netipPrefixWrapper)(&prefix)) + if err != nil { + return nil, err + } + + if !prefix.IsValid() { + return nil, nil + } + + return prefix, nil +} + +type scanPlanBinaryInetToNetipPrefixScanner struct{} + +func (scanPlanBinaryInetToNetipPrefixScanner) Scan(src []byte, dst any) error { + scanner := (dst).(NetipPrefixScanner) + + if src == nil { + return scanner.ScanNetipPrefix(netip.Prefix{}) + } + + if len(src) != 8 && len(src) != 20 { + return fmt.Errorf("Received an invalid size for an inet: %d", len(src)) + } + + // ignore family + bits := src[1] + // ignore is_cidr + // ignore addressLength - implicit in length of message + + addr, ok := netip.AddrFromSlice(src[4:]) + if !ok { + return errors.New("netip.AddrFromSlice failed") + } + + return scanner.ScanNetipPrefix(netip.PrefixFrom(addr, int(bits))) +} + +type scanPlanTextAnyToNetipPrefixScanner struct{} + +func (scanPlanTextAnyToNetipPrefixScanner) Scan(src []byte, dst any) error { + scanner := (dst).(NetipPrefixScanner) + + if src == nil { + return scanner.ScanNetipPrefix(netip.Prefix{}) + } + + var prefix netip.Prefix + if bytes.IndexByte(src, '/') == -1 { + addr, err := netip.ParseAddr(string(src)) + if err != nil { + return err + } + prefix = netip.PrefixFrom(addr, addr.BitLen()) + } else { + var err error + prefix, err = netip.ParsePrefix(string(src)) + if err != nil { + return err + } + } + + return scanner.ScanNetipPrefix(prefix) +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/int.go b/vendor/github.com/jackc/pgx/v5/pgtype/int.go new file mode 100644 index 0000000..90a20a2 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/int.go @@ -0,0 +1,1980 @@ +// Do not edit. Generated from pgtype/int.go.erb +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "encoding/json" + "fmt" + "math" + "strconv" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +type Int64Scanner interface { + ScanInt64(Int8) error +} + +type Int64Valuer interface { + Int64Value() (Int8, error) +} + +type Int2 struct { + Int16 int16 + Valid bool +} + +// ScanInt64 implements the Int64Scanner interface. +func (dst *Int2) ScanInt64(n Int8) error { + if !n.Valid { + *dst = Int2{} + return nil + } + + if n.Int64 < math.MinInt16 { + return fmt.Errorf("%d is less than minimum value for Int2", n.Int64) + } + if n.Int64 > math.MaxInt16 { + return fmt.Errorf("%d is greater than maximum value for Int2", n.Int64) + } + *dst = Int2{Int16: int16(n.Int64), Valid: true} + + return nil +} + +func (n Int2) Int64Value() (Int8, error) { + return Int8{Int64: int64(n.Int16), Valid: n.Valid}, nil +} + +// Scan implements the database/sql Scanner interface. +func (dst *Int2) Scan(src any) error { + if src == nil { + *dst = Int2{} + return nil + } + + var n int64 + + switch src := src.(type) { + case int64: + n = src + case string: + var err error + n, err = strconv.ParseInt(src, 10, 16) + if err != nil { + return err + } + case []byte: + var err error + n, err = strconv.ParseInt(string(src), 10, 16) + if err != nil { + return err + } + default: + return fmt.Errorf("cannot scan %T", src) + } + + if n < math.MinInt16 { + return fmt.Errorf("%d is greater than maximum value for Int2", n) + } + if n > math.MaxInt16 { + return fmt.Errorf("%d is greater than maximum value for Int2", n) + } + *dst = Int2{Int16: int16(n), Valid: true} + + return nil +} + +// Value implements the database/sql/driver Valuer interface. +func (src Int2) Value() (driver.Value, error) { + if !src.Valid { + return nil, nil + } + return int64(src.Int16), nil +} + +func (src Int2) MarshalJSON() ([]byte, error) { + if !src.Valid { + return []byte("null"), nil + } + return []byte(strconv.FormatInt(int64(src.Int16), 10)), nil +} + +func (dst *Int2) UnmarshalJSON(b []byte) error { + var n *int16 + err := json.Unmarshal(b, &n) + if err != nil { + return err + } + + if n == nil { + *dst = Int2{} + } else { + *dst = Int2{Int16: *n, Valid: true} + } + + return nil +} + +type Int2Codec struct{} + +func (Int2Codec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (Int2Codec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (Int2Codec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + switch format { + case BinaryFormatCode: + switch value.(type) { + case int16: + return encodePlanInt2CodecBinaryInt16{} + case Int64Valuer: + return encodePlanInt2CodecBinaryInt64Valuer{} + } + case TextFormatCode: + switch value.(type) { + case int16: + return encodePlanInt2CodecTextInt16{} + case Int64Valuer: + return encodePlanInt2CodecTextInt64Valuer{} + } + } + + return nil +} + +type encodePlanInt2CodecBinaryInt16 struct{} + +func (encodePlanInt2CodecBinaryInt16) Encode(value any, buf []byte) (newBuf []byte, err error) { + n := value.(int16) + return pgio.AppendInt16(buf, int16(n)), nil +} + +type encodePlanInt2CodecTextInt16 struct{} + +func (encodePlanInt2CodecTextInt16) Encode(value any, buf []byte) (newBuf []byte, err error) { + n := value.(int16) + return append(buf, strconv.FormatInt(int64(n), 10)...), nil +} + +type encodePlanInt2CodecBinaryInt64Valuer struct{} + +func (encodePlanInt2CodecBinaryInt64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(Int64Valuer).Int64Value() + if err != nil { + return nil, err + } + + if !n.Valid { + return nil, nil + } + + if n.Int64 > math.MaxInt16 { + return nil, fmt.Errorf("%d is greater than maximum value for int2", n.Int64) + } + if n.Int64 < math.MinInt16 { + return nil, fmt.Errorf("%d is less than minimum value for int2", n.Int64) + } + + return pgio.AppendInt16(buf, int16(n.Int64)), nil +} + +type encodePlanInt2CodecTextInt64Valuer struct{} + +func (encodePlanInt2CodecTextInt64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(Int64Valuer).Int64Value() + if err != nil { + return nil, err + } + + if !n.Valid { + return nil, nil + } + + if n.Int64 > math.MaxInt16 { + return nil, fmt.Errorf("%d is greater than maximum value for int2", n.Int64) + } + if n.Int64 < math.MinInt16 { + return nil, fmt.Errorf("%d is less than minimum value for int2", n.Int64) + } + + return append(buf, strconv.FormatInt(n.Int64, 10)...), nil +} + +func (Int2Codec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case *int8: + return scanPlanBinaryInt2ToInt8{} + case *int16: + return scanPlanBinaryInt2ToInt16{} + case *int32: + return scanPlanBinaryInt2ToInt32{} + case *int64: + return scanPlanBinaryInt2ToInt64{} + case *int: + return scanPlanBinaryInt2ToInt{} + case *uint8: + return scanPlanBinaryInt2ToUint8{} + case *uint16: + return scanPlanBinaryInt2ToUint16{} + case *uint32: + return scanPlanBinaryInt2ToUint32{} + case *uint64: + return scanPlanBinaryInt2ToUint64{} + case *uint: + return scanPlanBinaryInt2ToUint{} + case Int64Scanner: + return scanPlanBinaryInt2ToInt64Scanner{} + case TextScanner: + return scanPlanBinaryInt2ToTextScanner{} + } + case TextFormatCode: + switch target.(type) { + case *int8: + return scanPlanTextAnyToInt8{} + case *int16: + return scanPlanTextAnyToInt16{} + case *int32: + return scanPlanTextAnyToInt32{} + case *int64: + return scanPlanTextAnyToInt64{} + case *int: + return scanPlanTextAnyToInt{} + case *uint8: + return scanPlanTextAnyToUint8{} + case *uint16: + return scanPlanTextAnyToUint16{} + case *uint32: + return scanPlanTextAnyToUint32{} + case *uint64: + return scanPlanTextAnyToUint64{} + case *uint: + return scanPlanTextAnyToUint{} + case Int64Scanner: + return scanPlanTextAnyToInt64Scanner{} + } + } + + return nil +} + +func (c Int2Codec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + var n int64 + err := codecScan(c, m, oid, format, src, &n) + if err != nil { + return nil, err + } + return n, nil +} + +func (c Int2Codec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var n int16 + err := codecScan(c, m, oid, format, src, &n) + if err != nil { + return nil, err + } + return n, nil +} + +type scanPlanBinaryInt2ToInt8 struct{} + +func (scanPlanBinaryInt2ToInt8) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 2 { + return fmt.Errorf("invalid length for int2: %v", len(src)) + } + + p, ok := (dst).(*int8) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int16(binary.BigEndian.Uint16(src)) + if n < math.MinInt8 { + return fmt.Errorf("%d is less than minimum value for int8", n) + } else if n > math.MaxInt8 { + return fmt.Errorf("%d is greater than maximum value for int8", n) + } + + *p = int8(n) + + return nil +} + +type scanPlanBinaryInt2ToUint8 struct{} + +func (scanPlanBinaryInt2ToUint8) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 2 { + return fmt.Errorf("invalid length for uint2: %v", len(src)) + } + + p, ok := (dst).(*uint8) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int16(binary.BigEndian.Uint16(src)) + if n < 0 { + return fmt.Errorf("%d is less than minimum value for uint8", n) + } + + if n > math.MaxUint8 { + return fmt.Errorf("%d is greater than maximum value for uint8", n) + } + + *p = uint8(n) + + return nil +} + +type scanPlanBinaryInt2ToInt16 struct{} + +func (scanPlanBinaryInt2ToInt16) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 2 { + return fmt.Errorf("invalid length for int2: %v", len(src)) + } + + p, ok := (dst).(*int16) + if !ok { + return ErrScanTargetTypeChanged + } + + *p = int16(binary.BigEndian.Uint16(src)) + + return nil +} + +type scanPlanBinaryInt2ToUint16 struct{} + +func (scanPlanBinaryInt2ToUint16) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 2 { + return fmt.Errorf("invalid length for uint2: %v", len(src)) + } + + p, ok := (dst).(*uint16) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int16(binary.BigEndian.Uint16(src)) + if n < 0 { + return fmt.Errorf("%d is less than minimum value for uint16", n) + } + + *p = uint16(n) + + return nil +} + +type scanPlanBinaryInt2ToInt32 struct{} + +func (scanPlanBinaryInt2ToInt32) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 2 { + return fmt.Errorf("invalid length for int2: %v", len(src)) + } + + p, ok := (dst).(*int32) + if !ok { + return ErrScanTargetTypeChanged + } + + *p = int32(int16(binary.BigEndian.Uint16(src))) + + return nil +} + +type scanPlanBinaryInt2ToUint32 struct{} + +func (scanPlanBinaryInt2ToUint32) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 2 { + return fmt.Errorf("invalid length for uint2: %v", len(src)) + } + + p, ok := (dst).(*uint32) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int16(binary.BigEndian.Uint16(src)) + if n < 0 { + return fmt.Errorf("%d is less than minimum value for uint32", n) + } + + *p = uint32(n) + + return nil +} + +type scanPlanBinaryInt2ToInt64 struct{} + +func (scanPlanBinaryInt2ToInt64) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 2 { + return fmt.Errorf("invalid length for int2: %v", len(src)) + } + + p, ok := (dst).(*int64) + if !ok { + return ErrScanTargetTypeChanged + } + + *p = int64(int16(binary.BigEndian.Uint16(src))) + + return nil +} + +type scanPlanBinaryInt2ToUint64 struct{} + +func (scanPlanBinaryInt2ToUint64) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 2 { + return fmt.Errorf("invalid length for uint2: %v", len(src)) + } + + p, ok := (dst).(*uint64) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int16(binary.BigEndian.Uint16(src)) + if n < 0 { + return fmt.Errorf("%d is less than minimum value for uint64", n) + } + + *p = uint64(n) + + return nil +} + +type scanPlanBinaryInt2ToInt struct{} + +func (scanPlanBinaryInt2ToInt) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 2 { + return fmt.Errorf("invalid length for int2: %v", len(src)) + } + + p, ok := (dst).(*int) + if !ok { + return ErrScanTargetTypeChanged + } + + *p = int(int16(binary.BigEndian.Uint16(src))) + + return nil +} + +type scanPlanBinaryInt2ToUint struct{} + +func (scanPlanBinaryInt2ToUint) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 2 { + return fmt.Errorf("invalid length for uint2: %v", len(src)) + } + + p, ok := (dst).(*uint) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int64(int16(binary.BigEndian.Uint16(src))) + if n < 0 { + return fmt.Errorf("%d is less than minimum value for uint", n) + } + + *p = uint(n) + + return nil +} + +type scanPlanBinaryInt2ToInt64Scanner struct{} + +func (scanPlanBinaryInt2ToInt64Scanner) Scan(src []byte, dst any) error { + s, ok := (dst).(Int64Scanner) + if !ok { + return ErrScanTargetTypeChanged + } + + if src == nil { + return s.ScanInt64(Int8{}) + } + + if len(src) != 2 { + return fmt.Errorf("invalid length for int2: %v", len(src)) + } + + n := int64(int16(binary.BigEndian.Uint16(src))) + + return s.ScanInt64(Int8{Int64: n, Valid: true}) +} + +type scanPlanBinaryInt2ToTextScanner struct{} + +func (scanPlanBinaryInt2ToTextScanner) Scan(src []byte, dst any) error { + s, ok := (dst).(TextScanner) + if !ok { + return ErrScanTargetTypeChanged + } + + if src == nil { + return s.ScanText(Text{}) + } + + if len(src) != 2 { + return fmt.Errorf("invalid length for int2: %v", len(src)) + } + + n := int64(int16(binary.BigEndian.Uint16(src))) + + return s.ScanText(Text{String: strconv.FormatInt(n, 10), Valid: true}) +} + +type Int4 struct { + Int32 int32 + Valid bool +} + +// ScanInt64 implements the Int64Scanner interface. +func (dst *Int4) ScanInt64(n Int8) error { + if !n.Valid { + *dst = Int4{} + return nil + } + + if n.Int64 < math.MinInt32 { + return fmt.Errorf("%d is less than minimum value for Int4", n.Int64) + } + if n.Int64 > math.MaxInt32 { + return fmt.Errorf("%d is greater than maximum value for Int4", n.Int64) + } + *dst = Int4{Int32: int32(n.Int64), Valid: true} + + return nil +} + +func (n Int4) Int64Value() (Int8, error) { + return Int8{Int64: int64(n.Int32), Valid: n.Valid}, nil +} + +// Scan implements the database/sql Scanner interface. +func (dst *Int4) Scan(src any) error { + if src == nil { + *dst = Int4{} + return nil + } + + var n int64 + + switch src := src.(type) { + case int64: + n = src + case string: + var err error + n, err = strconv.ParseInt(src, 10, 32) + if err != nil { + return err + } + case []byte: + var err error + n, err = strconv.ParseInt(string(src), 10, 32) + if err != nil { + return err + } + default: + return fmt.Errorf("cannot scan %T", src) + } + + if n < math.MinInt32 { + return fmt.Errorf("%d is greater than maximum value for Int4", n) + } + if n > math.MaxInt32 { + return fmt.Errorf("%d is greater than maximum value for Int4", n) + } + *dst = Int4{Int32: int32(n), Valid: true} + + return nil +} + +// Value implements the database/sql/driver Valuer interface. +func (src Int4) Value() (driver.Value, error) { + if !src.Valid { + return nil, nil + } + return int64(src.Int32), nil +} + +func (src Int4) MarshalJSON() ([]byte, error) { + if !src.Valid { + return []byte("null"), nil + } + return []byte(strconv.FormatInt(int64(src.Int32), 10)), nil +} + +func (dst *Int4) UnmarshalJSON(b []byte) error { + var n *int32 + err := json.Unmarshal(b, &n) + if err != nil { + return err + } + + if n == nil { + *dst = Int4{} + } else { + *dst = Int4{Int32: *n, Valid: true} + } + + return nil +} + +type Int4Codec struct{} + +func (Int4Codec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (Int4Codec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (Int4Codec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + switch format { + case BinaryFormatCode: + switch value.(type) { + case int32: + return encodePlanInt4CodecBinaryInt32{} + case Int64Valuer: + return encodePlanInt4CodecBinaryInt64Valuer{} + } + case TextFormatCode: + switch value.(type) { + case int32: + return encodePlanInt4CodecTextInt32{} + case Int64Valuer: + return encodePlanInt4CodecTextInt64Valuer{} + } + } + + return nil +} + +type encodePlanInt4CodecBinaryInt32 struct{} + +func (encodePlanInt4CodecBinaryInt32) Encode(value any, buf []byte) (newBuf []byte, err error) { + n := value.(int32) + return pgio.AppendInt32(buf, int32(n)), nil +} + +type encodePlanInt4CodecTextInt32 struct{} + +func (encodePlanInt4CodecTextInt32) Encode(value any, buf []byte) (newBuf []byte, err error) { + n := value.(int32) + return append(buf, strconv.FormatInt(int64(n), 10)...), nil +} + +type encodePlanInt4CodecBinaryInt64Valuer struct{} + +func (encodePlanInt4CodecBinaryInt64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(Int64Valuer).Int64Value() + if err != nil { + return nil, err + } + + if !n.Valid { + return nil, nil + } + + if n.Int64 > math.MaxInt32 { + return nil, fmt.Errorf("%d is greater than maximum value for int4", n.Int64) + } + if n.Int64 < math.MinInt32 { + return nil, fmt.Errorf("%d is less than minimum value for int4", n.Int64) + } + + return pgio.AppendInt32(buf, int32(n.Int64)), nil +} + +type encodePlanInt4CodecTextInt64Valuer struct{} + +func (encodePlanInt4CodecTextInt64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(Int64Valuer).Int64Value() + if err != nil { + return nil, err + } + + if !n.Valid { + return nil, nil + } + + if n.Int64 > math.MaxInt32 { + return nil, fmt.Errorf("%d is greater than maximum value for int4", n.Int64) + } + if n.Int64 < math.MinInt32 { + return nil, fmt.Errorf("%d is less than minimum value for int4", n.Int64) + } + + return append(buf, strconv.FormatInt(n.Int64, 10)...), nil +} + +func (Int4Codec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case *int8: + return scanPlanBinaryInt4ToInt8{} + case *int16: + return scanPlanBinaryInt4ToInt16{} + case *int32: + return scanPlanBinaryInt4ToInt32{} + case *int64: + return scanPlanBinaryInt4ToInt64{} + case *int: + return scanPlanBinaryInt4ToInt{} + case *uint8: + return scanPlanBinaryInt4ToUint8{} + case *uint16: + return scanPlanBinaryInt4ToUint16{} + case *uint32: + return scanPlanBinaryInt4ToUint32{} + case *uint64: + return scanPlanBinaryInt4ToUint64{} + case *uint: + return scanPlanBinaryInt4ToUint{} + case Int64Scanner: + return scanPlanBinaryInt4ToInt64Scanner{} + case TextScanner: + return scanPlanBinaryInt4ToTextScanner{} + } + case TextFormatCode: + switch target.(type) { + case *int8: + return scanPlanTextAnyToInt8{} + case *int16: + return scanPlanTextAnyToInt16{} + case *int32: + return scanPlanTextAnyToInt32{} + case *int64: + return scanPlanTextAnyToInt64{} + case *int: + return scanPlanTextAnyToInt{} + case *uint8: + return scanPlanTextAnyToUint8{} + case *uint16: + return scanPlanTextAnyToUint16{} + case *uint32: + return scanPlanTextAnyToUint32{} + case *uint64: + return scanPlanTextAnyToUint64{} + case *uint: + return scanPlanTextAnyToUint{} + case Int64Scanner: + return scanPlanTextAnyToInt64Scanner{} + } + } + + return nil +} + +func (c Int4Codec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + var n int64 + err := codecScan(c, m, oid, format, src, &n) + if err != nil { + return nil, err + } + return n, nil +} + +func (c Int4Codec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var n int32 + err := codecScan(c, m, oid, format, src, &n) + if err != nil { + return nil, err + } + return n, nil +} + +type scanPlanBinaryInt4ToInt8 struct{} + +func (scanPlanBinaryInt4ToInt8) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for int4: %v", len(src)) + } + + p, ok := (dst).(*int8) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int32(binary.BigEndian.Uint32(src)) + if n < math.MinInt8 { + return fmt.Errorf("%d is less than minimum value for int8", n) + } else if n > math.MaxInt8 { + return fmt.Errorf("%d is greater than maximum value for int8", n) + } + + *p = int8(n) + + return nil +} + +type scanPlanBinaryInt4ToUint8 struct{} + +func (scanPlanBinaryInt4ToUint8) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for uint4: %v", len(src)) + } + + p, ok := (dst).(*uint8) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int32(binary.BigEndian.Uint32(src)) + if n < 0 { + return fmt.Errorf("%d is less than minimum value for uint8", n) + } + + if n > math.MaxUint8 { + return fmt.Errorf("%d is greater than maximum value for uint8", n) + } + + *p = uint8(n) + + return nil +} + +type scanPlanBinaryInt4ToInt16 struct{} + +func (scanPlanBinaryInt4ToInt16) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for int4: %v", len(src)) + } + + p, ok := (dst).(*int16) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int32(binary.BigEndian.Uint32(src)) + if n < math.MinInt16 { + return fmt.Errorf("%d is less than minimum value for int16", n) + } else if n > math.MaxInt16 { + return fmt.Errorf("%d is greater than maximum value for int16", n) + } + + *p = int16(n) + + return nil +} + +type scanPlanBinaryInt4ToUint16 struct{} + +func (scanPlanBinaryInt4ToUint16) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for uint4: %v", len(src)) + } + + p, ok := (dst).(*uint16) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int32(binary.BigEndian.Uint32(src)) + if n < 0 { + return fmt.Errorf("%d is less than minimum value for uint16", n) + } + + if n > math.MaxUint16 { + return fmt.Errorf("%d is greater than maximum value for uint16", n) + } + + *p = uint16(n) + + return nil +} + +type scanPlanBinaryInt4ToInt32 struct{} + +func (scanPlanBinaryInt4ToInt32) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for int4: %v", len(src)) + } + + p, ok := (dst).(*int32) + if !ok { + return ErrScanTargetTypeChanged + } + + *p = int32(binary.BigEndian.Uint32(src)) + + return nil +} + +type scanPlanBinaryInt4ToUint32 struct{} + +func (scanPlanBinaryInt4ToUint32) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for uint4: %v", len(src)) + } + + p, ok := (dst).(*uint32) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int32(binary.BigEndian.Uint32(src)) + if n < 0 { + return fmt.Errorf("%d is less than minimum value for uint32", n) + } + + *p = uint32(n) + + return nil +} + +type scanPlanBinaryInt4ToInt64 struct{} + +func (scanPlanBinaryInt4ToInt64) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for int4: %v", len(src)) + } + + p, ok := (dst).(*int64) + if !ok { + return ErrScanTargetTypeChanged + } + + *p = int64(int32(binary.BigEndian.Uint32(src))) + + return nil +} + +type scanPlanBinaryInt4ToUint64 struct{} + +func (scanPlanBinaryInt4ToUint64) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for uint4: %v", len(src)) + } + + p, ok := (dst).(*uint64) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int32(binary.BigEndian.Uint32(src)) + if n < 0 { + return fmt.Errorf("%d is less than minimum value for uint64", n) + } + + *p = uint64(n) + + return nil +} + +type scanPlanBinaryInt4ToInt struct{} + +func (scanPlanBinaryInt4ToInt) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for int4: %v", len(src)) + } + + p, ok := (dst).(*int) + if !ok { + return ErrScanTargetTypeChanged + } + + *p = int(int32(binary.BigEndian.Uint32(src))) + + return nil +} + +type scanPlanBinaryInt4ToUint struct{} + +func (scanPlanBinaryInt4ToUint) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for uint4: %v", len(src)) + } + + p, ok := (dst).(*uint) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int64(int32(binary.BigEndian.Uint32(src))) + if n < 0 { + return fmt.Errorf("%d is less than minimum value for uint", n) + } + + *p = uint(n) + + return nil +} + +type scanPlanBinaryInt4ToInt64Scanner struct{} + +func (scanPlanBinaryInt4ToInt64Scanner) Scan(src []byte, dst any) error { + s, ok := (dst).(Int64Scanner) + if !ok { + return ErrScanTargetTypeChanged + } + + if src == nil { + return s.ScanInt64(Int8{}) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for int4: %v", len(src)) + } + + n := int64(int32(binary.BigEndian.Uint32(src))) + + return s.ScanInt64(Int8{Int64: n, Valid: true}) +} + +type scanPlanBinaryInt4ToTextScanner struct{} + +func (scanPlanBinaryInt4ToTextScanner) Scan(src []byte, dst any) error { + s, ok := (dst).(TextScanner) + if !ok { + return ErrScanTargetTypeChanged + } + + if src == nil { + return s.ScanText(Text{}) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for int4: %v", len(src)) + } + + n := int64(int32(binary.BigEndian.Uint32(src))) + + return s.ScanText(Text{String: strconv.FormatInt(n, 10), Valid: true}) +} + +type Int8 struct { + Int64 int64 + Valid bool +} + +// ScanInt64 implements the Int64Scanner interface. +func (dst *Int8) ScanInt64(n Int8) error { + if !n.Valid { + *dst = Int8{} + return nil + } + + if n.Int64 < math.MinInt64 { + return fmt.Errorf("%d is less than minimum value for Int8", n.Int64) + } + if n.Int64 > math.MaxInt64 { + return fmt.Errorf("%d is greater than maximum value for Int8", n.Int64) + } + *dst = Int8{Int64: int64(n.Int64), Valid: true} + + return nil +} + +func (n Int8) Int64Value() (Int8, error) { + return Int8{Int64: int64(n.Int64), Valid: n.Valid}, nil +} + +// Scan implements the database/sql Scanner interface. +func (dst *Int8) Scan(src any) error { + if src == nil { + *dst = Int8{} + return nil + } + + var n int64 + + switch src := src.(type) { + case int64: + n = src + case string: + var err error + n, err = strconv.ParseInt(src, 10, 64) + if err != nil { + return err + } + case []byte: + var err error + n, err = strconv.ParseInt(string(src), 10, 64) + if err != nil { + return err + } + default: + return fmt.Errorf("cannot scan %T", src) + } + + if n < math.MinInt64 { + return fmt.Errorf("%d is greater than maximum value for Int8", n) + } + if n > math.MaxInt64 { + return fmt.Errorf("%d is greater than maximum value for Int8", n) + } + *dst = Int8{Int64: int64(n), Valid: true} + + return nil +} + +// Value implements the database/sql/driver Valuer interface. +func (src Int8) Value() (driver.Value, error) { + if !src.Valid { + return nil, nil + } + return int64(src.Int64), nil +} + +func (src Int8) MarshalJSON() ([]byte, error) { + if !src.Valid { + return []byte("null"), nil + } + return []byte(strconv.FormatInt(int64(src.Int64), 10)), nil +} + +func (dst *Int8) UnmarshalJSON(b []byte) error { + var n *int64 + err := json.Unmarshal(b, &n) + if err != nil { + return err + } + + if n == nil { + *dst = Int8{} + } else { + *dst = Int8{Int64: *n, Valid: true} + } + + return nil +} + +type Int8Codec struct{} + +func (Int8Codec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (Int8Codec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (Int8Codec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + switch format { + case BinaryFormatCode: + switch value.(type) { + case int64: + return encodePlanInt8CodecBinaryInt64{} + case Int64Valuer: + return encodePlanInt8CodecBinaryInt64Valuer{} + } + case TextFormatCode: + switch value.(type) { + case int64: + return encodePlanInt8CodecTextInt64{} + case Int64Valuer: + return encodePlanInt8CodecTextInt64Valuer{} + } + } + + return nil +} + +type encodePlanInt8CodecBinaryInt64 struct{} + +func (encodePlanInt8CodecBinaryInt64) Encode(value any, buf []byte) (newBuf []byte, err error) { + n := value.(int64) + return pgio.AppendInt64(buf, int64(n)), nil +} + +type encodePlanInt8CodecTextInt64 struct{} + +func (encodePlanInt8CodecTextInt64) Encode(value any, buf []byte) (newBuf []byte, err error) { + n := value.(int64) + return append(buf, strconv.FormatInt(int64(n), 10)...), nil +} + +type encodePlanInt8CodecBinaryInt64Valuer struct{} + +func (encodePlanInt8CodecBinaryInt64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(Int64Valuer).Int64Value() + if err != nil { + return nil, err + } + + if !n.Valid { + return nil, nil + } + + if n.Int64 > math.MaxInt64 { + return nil, fmt.Errorf("%d is greater than maximum value for int8", n.Int64) + } + if n.Int64 < math.MinInt64 { + return nil, fmt.Errorf("%d is less than minimum value for int8", n.Int64) + } + + return pgio.AppendInt64(buf, int64(n.Int64)), nil +} + +type encodePlanInt8CodecTextInt64Valuer struct{} + +func (encodePlanInt8CodecTextInt64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(Int64Valuer).Int64Value() + if err != nil { + return nil, err + } + + if !n.Valid { + return nil, nil + } + + if n.Int64 > math.MaxInt64 { + return nil, fmt.Errorf("%d is greater than maximum value for int8", n.Int64) + } + if n.Int64 < math.MinInt64 { + return nil, fmt.Errorf("%d is less than minimum value for int8", n.Int64) + } + + return append(buf, strconv.FormatInt(n.Int64, 10)...), nil +} + +func (Int8Codec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case *int8: + return scanPlanBinaryInt8ToInt8{} + case *int16: + return scanPlanBinaryInt8ToInt16{} + case *int32: + return scanPlanBinaryInt8ToInt32{} + case *int64: + return scanPlanBinaryInt8ToInt64{} + case *int: + return scanPlanBinaryInt8ToInt{} + case *uint8: + return scanPlanBinaryInt8ToUint8{} + case *uint16: + return scanPlanBinaryInt8ToUint16{} + case *uint32: + return scanPlanBinaryInt8ToUint32{} + case *uint64: + return scanPlanBinaryInt8ToUint64{} + case *uint: + return scanPlanBinaryInt8ToUint{} + case Int64Scanner: + return scanPlanBinaryInt8ToInt64Scanner{} + case TextScanner: + return scanPlanBinaryInt8ToTextScanner{} + } + case TextFormatCode: + switch target.(type) { + case *int8: + return scanPlanTextAnyToInt8{} + case *int16: + return scanPlanTextAnyToInt16{} + case *int32: + return scanPlanTextAnyToInt32{} + case *int64: + return scanPlanTextAnyToInt64{} + case *int: + return scanPlanTextAnyToInt{} + case *uint8: + return scanPlanTextAnyToUint8{} + case *uint16: + return scanPlanTextAnyToUint16{} + case *uint32: + return scanPlanTextAnyToUint32{} + case *uint64: + return scanPlanTextAnyToUint64{} + case *uint: + return scanPlanTextAnyToUint{} + case Int64Scanner: + return scanPlanTextAnyToInt64Scanner{} + } + } + + return nil +} + +func (c Int8Codec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + var n int64 + err := codecScan(c, m, oid, format, src, &n) + if err != nil { + return nil, err + } + return n, nil +} + +func (c Int8Codec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var n int64 + err := codecScan(c, m, oid, format, src, &n) + if err != nil { + return nil, err + } + return n, nil +} + +type scanPlanBinaryInt8ToInt8 struct{} + +func (scanPlanBinaryInt8ToInt8) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for int8: %v", len(src)) + } + + p, ok := (dst).(*int8) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int64(binary.BigEndian.Uint64(src)) + if n < math.MinInt8 { + return fmt.Errorf("%d is less than minimum value for int8", n) + } else if n > math.MaxInt8 { + return fmt.Errorf("%d is greater than maximum value for int8", n) + } + + *p = int8(n) + + return nil +} + +type scanPlanBinaryInt8ToUint8 struct{} + +func (scanPlanBinaryInt8ToUint8) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for uint8: %v", len(src)) + } + + p, ok := (dst).(*uint8) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int64(binary.BigEndian.Uint64(src)) + if n < 0 { + return fmt.Errorf("%d is less than minimum value for uint8", n) + } + + if n > math.MaxUint8 { + return fmt.Errorf("%d is greater than maximum value for uint8", n) + } + + *p = uint8(n) + + return nil +} + +type scanPlanBinaryInt8ToInt16 struct{} + +func (scanPlanBinaryInt8ToInt16) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for int8: %v", len(src)) + } + + p, ok := (dst).(*int16) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int64(binary.BigEndian.Uint64(src)) + if n < math.MinInt16 { + return fmt.Errorf("%d is less than minimum value for int16", n) + } else if n > math.MaxInt16 { + return fmt.Errorf("%d is greater than maximum value for int16", n) + } + + *p = int16(n) + + return nil +} + +type scanPlanBinaryInt8ToUint16 struct{} + +func (scanPlanBinaryInt8ToUint16) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for uint8: %v", len(src)) + } + + p, ok := (dst).(*uint16) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int64(binary.BigEndian.Uint64(src)) + if n < 0 { + return fmt.Errorf("%d is less than minimum value for uint16", n) + } + + if n > math.MaxUint16 { + return fmt.Errorf("%d is greater than maximum value for uint16", n) + } + + *p = uint16(n) + + return nil +} + +type scanPlanBinaryInt8ToInt32 struct{} + +func (scanPlanBinaryInt8ToInt32) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for int8: %v", len(src)) + } + + p, ok := (dst).(*int32) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int64(binary.BigEndian.Uint64(src)) + if n < math.MinInt32 { + return fmt.Errorf("%d is less than minimum value for int32", n) + } else if n > math.MaxInt32 { + return fmt.Errorf("%d is greater than maximum value for int32", n) + } + + *p = int32(n) + + return nil +} + +type scanPlanBinaryInt8ToUint32 struct{} + +func (scanPlanBinaryInt8ToUint32) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for uint8: %v", len(src)) + } + + p, ok := (dst).(*uint32) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int64(binary.BigEndian.Uint64(src)) + if n < 0 { + return fmt.Errorf("%d is less than minimum value for uint32", n) + } + + if n > math.MaxUint32 { + return fmt.Errorf("%d is greater than maximum value for uint32", n) + } + + *p = uint32(n) + + return nil +} + +type scanPlanBinaryInt8ToInt64 struct{} + +func (scanPlanBinaryInt8ToInt64) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for int8: %v", len(src)) + } + + p, ok := (dst).(*int64) + if !ok { + return ErrScanTargetTypeChanged + } + + *p = int64(binary.BigEndian.Uint64(src)) + + return nil +} + +type scanPlanBinaryInt8ToUint64 struct{} + +func (scanPlanBinaryInt8ToUint64) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for uint8: %v", len(src)) + } + + p, ok := (dst).(*uint64) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int64(binary.BigEndian.Uint64(src)) + if n < 0 { + return fmt.Errorf("%d is less than minimum value for uint64", n) + } + + *p = uint64(n) + + return nil +} + +type scanPlanBinaryInt8ToInt struct{} + +func (scanPlanBinaryInt8ToInt) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for int8: %v", len(src)) + } + + p, ok := (dst).(*int) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int64(binary.BigEndian.Uint64(src)) + if n < math.MinInt { + return fmt.Errorf("%d is less than minimum value for int", n) + } else if n > math.MaxInt { + return fmt.Errorf("%d is greater than maximum value for int", n) + } + + *p = int(n) + + return nil +} + +type scanPlanBinaryInt8ToUint struct{} + +func (scanPlanBinaryInt8ToUint) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for uint8: %v", len(src)) + } + + p, ok := (dst).(*uint) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int64(int64(binary.BigEndian.Uint64(src))) + if n < 0 { + return fmt.Errorf("%d is less than minimum value for uint", n) + } + + if uint64(n) > math.MaxUint { + return fmt.Errorf("%d is greater than maximum value for uint", n) + } + + *p = uint(n) + + return nil +} + +type scanPlanBinaryInt8ToInt64Scanner struct{} + +func (scanPlanBinaryInt8ToInt64Scanner) Scan(src []byte, dst any) error { + s, ok := (dst).(Int64Scanner) + if !ok { + return ErrScanTargetTypeChanged + } + + if src == nil { + return s.ScanInt64(Int8{}) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for int8: %v", len(src)) + } + + n := int64(int64(binary.BigEndian.Uint64(src))) + + return s.ScanInt64(Int8{Int64: n, Valid: true}) +} + +type scanPlanBinaryInt8ToTextScanner struct{} + +func (scanPlanBinaryInt8ToTextScanner) Scan(src []byte, dst any) error { + s, ok := (dst).(TextScanner) + if !ok { + return ErrScanTargetTypeChanged + } + + if src == nil { + return s.ScanText(Text{}) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for int8: %v", len(src)) + } + + n := int64(int64(binary.BigEndian.Uint64(src))) + + return s.ScanText(Text{String: strconv.FormatInt(n, 10), Valid: true}) +} + +type scanPlanTextAnyToInt8 struct{} + +func (scanPlanTextAnyToInt8) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + p, ok := (dst).(*int8) + if !ok { + return ErrScanTargetTypeChanged + } + + n, err := strconv.ParseInt(string(src), 10, 8) + if err != nil { + return err + } + + *p = int8(n) + return nil +} + +type scanPlanTextAnyToUint8 struct{} + +func (scanPlanTextAnyToUint8) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + p, ok := (dst).(*uint8) + if !ok { + return ErrScanTargetTypeChanged + } + + n, err := strconv.ParseUint(string(src), 10, 8) + if err != nil { + return err + } + + *p = uint8(n) + return nil +} + +type scanPlanTextAnyToInt16 struct{} + +func (scanPlanTextAnyToInt16) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + p, ok := (dst).(*int16) + if !ok { + return ErrScanTargetTypeChanged + } + + n, err := strconv.ParseInt(string(src), 10, 16) + if err != nil { + return err + } + + *p = int16(n) + return nil +} + +type scanPlanTextAnyToUint16 struct{} + +func (scanPlanTextAnyToUint16) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + p, ok := (dst).(*uint16) + if !ok { + return ErrScanTargetTypeChanged + } + + n, err := strconv.ParseUint(string(src), 10, 16) + if err != nil { + return err + } + + *p = uint16(n) + return nil +} + +type scanPlanTextAnyToInt32 struct{} + +func (scanPlanTextAnyToInt32) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + p, ok := (dst).(*int32) + if !ok { + return ErrScanTargetTypeChanged + } + + n, err := strconv.ParseInt(string(src), 10, 32) + if err != nil { + return err + } + + *p = int32(n) + return nil +} + +type scanPlanTextAnyToUint32 struct{} + +func (scanPlanTextAnyToUint32) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + p, ok := (dst).(*uint32) + if !ok { + return ErrScanTargetTypeChanged + } + + n, err := strconv.ParseUint(string(src), 10, 32) + if err != nil { + return err + } + + *p = uint32(n) + return nil +} + +type scanPlanTextAnyToInt64 struct{} + +func (scanPlanTextAnyToInt64) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + p, ok := (dst).(*int64) + if !ok { + return ErrScanTargetTypeChanged + } + + n, err := strconv.ParseInt(string(src), 10, 64) + if err != nil { + return err + } + + *p = int64(n) + return nil +} + +type scanPlanTextAnyToUint64 struct{} + +func (scanPlanTextAnyToUint64) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + p, ok := (dst).(*uint64) + if !ok { + return ErrScanTargetTypeChanged + } + + n, err := strconv.ParseUint(string(src), 10, 64) + if err != nil { + return err + } + + *p = uint64(n) + return nil +} + +type scanPlanTextAnyToInt struct{} + +func (scanPlanTextAnyToInt) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + p, ok := (dst).(*int) + if !ok { + return ErrScanTargetTypeChanged + } + + n, err := strconv.ParseInt(string(src), 10, 0) + if err != nil { + return err + } + + *p = int(n) + return nil +} + +type scanPlanTextAnyToUint struct{} + +func (scanPlanTextAnyToUint) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + p, ok := (dst).(*uint) + if !ok { + return ErrScanTargetTypeChanged + } + + n, err := strconv.ParseUint(string(src), 10, 0) + if err != nil { + return err + } + + *p = uint(n) + return nil +} + +type scanPlanTextAnyToInt64Scanner struct{} + +func (scanPlanTextAnyToInt64Scanner) Scan(src []byte, dst any) error { + s, ok := (dst).(Int64Scanner) + if !ok { + return ErrScanTargetTypeChanged + } + + if src == nil { + return s.ScanInt64(Int8{}) + } + + n, err := strconv.ParseInt(string(src), 10, 64) + if err != nil { + return err + } + + err = s.ScanInt64(Int8{Int64: n, Valid: true}) + if err != nil { + return err + } + + return nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/int.go.erb b/vendor/github.com/jackc/pgx/v5/pgtype/int.go.erb new file mode 100644 index 0000000..e0c8b7a --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/int.go.erb @@ -0,0 +1,548 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "encoding/json" + "fmt" + "math" + "strconv" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +type Int64Scanner interface { + ScanInt64(Int8) error +} + +type Int64Valuer interface { + Int64Value() (Int8, error) +} + + +<% [2, 4, 8].each do |pg_byte_size| %> +<% pg_bit_size = pg_byte_size * 8 %> +type Int<%= pg_byte_size %> struct { + Int<%= pg_bit_size %> int<%= pg_bit_size %> + Valid bool +} + +// ScanInt64 implements the Int64Scanner interface. +func (dst *Int<%= pg_byte_size %>) ScanInt64(n Int8) error { + if !n.Valid { + *dst = Int<%= pg_byte_size %>{} + return nil + } + + if n.Int64 < math.MinInt<%= pg_bit_size %> { + return fmt.Errorf("%d is less than minimum value for Int<%= pg_byte_size %>", n.Int64) + } + if n.Int64 > math.MaxInt<%= pg_bit_size %> { + return fmt.Errorf("%d is greater than maximum value for Int<%= pg_byte_size %>", n.Int64) + } + *dst = Int<%= pg_byte_size %>{Int<%= pg_bit_size %>: int<%= pg_bit_size %>(n.Int64), Valid: true} + + return nil +} + +func (n Int<%= pg_byte_size %>) Int64Value() (Int8, error) { + return Int8{Int64: int64(n.Int<%= pg_bit_size %>), Valid: n.Valid}, nil +} + +// Scan implements the database/sql Scanner interface. +func (dst *Int<%= pg_byte_size %>) Scan(src any) error { + if src == nil { + *dst = Int<%= pg_byte_size %>{} + return nil + } + + var n int64 + + switch src := src.(type) { + case int64: + n = src + case string: + var err error + n, err = strconv.ParseInt(src, 10, <%= pg_bit_size %>) + if err != nil { + return err + } + case []byte: + var err error + n, err = strconv.ParseInt(string(src), 10, <%= pg_bit_size %>) + if err != nil { + return err + } + default: + return fmt.Errorf("cannot scan %T", src) + } + + if n < math.MinInt<%= pg_bit_size %> { + return fmt.Errorf("%d is greater than maximum value for Int<%= pg_byte_size %>", n) + } + if n > math.MaxInt<%= pg_bit_size %> { + return fmt.Errorf("%d is greater than maximum value for Int<%= pg_byte_size %>", n) + } + *dst = Int<%= pg_byte_size %>{Int<%= pg_bit_size %>: int<%= pg_bit_size %>(n), Valid: true} + + return nil +} + +// Value implements the database/sql/driver Valuer interface. +func (src Int<%= pg_byte_size %>) Value() (driver.Value, error) { + if !src.Valid { + return nil, nil + } + return int64(src.Int<%= pg_bit_size %>), nil +} + +func (src Int<%= pg_byte_size %>) MarshalJSON() ([]byte, error) { + if !src.Valid { + return []byte("null"), nil + } + return []byte(strconv.FormatInt(int64(src.Int<%= pg_bit_size %>), 10)), nil +} + +func (dst *Int<%= pg_byte_size %>) UnmarshalJSON(b []byte) error { + var n *int<%= pg_bit_size %> + err := json.Unmarshal(b, &n) + if err != nil { + return err + } + + if n == nil { + *dst = Int<%= pg_byte_size %>{} + } else { + *dst = Int<%= pg_byte_size %>{Int<%= pg_bit_size %>: *n, Valid: true} + } + + return nil +} + +type Int<%= pg_byte_size %>Codec struct{} + +func (Int<%= pg_byte_size %>Codec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (Int<%= pg_byte_size %>Codec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (Int<%= pg_byte_size %>Codec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + switch format { + case BinaryFormatCode: + switch value.(type) { + case int<%= pg_bit_size %>: + return encodePlanInt<%= pg_byte_size %>CodecBinaryInt<%= pg_bit_size %>{} + case Int64Valuer: + return encodePlanInt<%= pg_byte_size %>CodecBinaryInt64Valuer{} + } + case TextFormatCode: + switch value.(type) { + case int<%= pg_bit_size %>: + return encodePlanInt<%= pg_byte_size %>CodecTextInt<%= pg_bit_size %>{} + case Int64Valuer: + return encodePlanInt<%= pg_byte_size %>CodecTextInt64Valuer{} + } + } + + return nil +} + +type encodePlanInt<%= pg_byte_size %>CodecBinaryInt<%= pg_bit_size %> struct{} + +func (encodePlanInt<%= pg_byte_size %>CodecBinaryInt<%= pg_bit_size %>) Encode(value any, buf []byte) (newBuf []byte, err error) { + n := value.(int<%= pg_bit_size %>) + return pgio.AppendInt<%= pg_bit_size %>(buf, int<%= pg_bit_size %>(n)), nil +} + +type encodePlanInt<%= pg_byte_size %>CodecTextInt<%= pg_bit_size %> struct{} + +func (encodePlanInt<%= pg_byte_size %>CodecTextInt<%= pg_bit_size %>) Encode(value any, buf []byte) (newBuf []byte, err error) { + n := value.(int<%= pg_bit_size %>) + return append(buf, strconv.FormatInt(int64(n), 10)...), nil +} + +type encodePlanInt<%= pg_byte_size %>CodecBinaryInt64Valuer struct{} + +func (encodePlanInt<%= pg_byte_size %>CodecBinaryInt64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(Int64Valuer).Int64Value() + if err != nil { + return nil, err + } + + if !n.Valid { + return nil, nil + } + + if n.Int64 > math.MaxInt<%= pg_bit_size %> { + return nil, fmt.Errorf("%d is greater than maximum value for int<%= pg_byte_size %>", n.Int64) + } + if n.Int64 < math.MinInt<%= pg_bit_size %> { + return nil, fmt.Errorf("%d is less than minimum value for int<%= pg_byte_size %>", n.Int64) + } + + return pgio.AppendInt<%= pg_bit_size %>(buf, int<%= pg_bit_size %>(n.Int64)), nil +} + +type encodePlanInt<%= pg_byte_size %>CodecTextInt64Valuer struct{} + +func (encodePlanInt<%= pg_byte_size %>CodecTextInt64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(Int64Valuer).Int64Value() + if err != nil { + return nil, err + } + + if !n.Valid { + return nil, nil + } + + if n.Int64 > math.MaxInt<%= pg_bit_size %> { + return nil, fmt.Errorf("%d is greater than maximum value for int<%= pg_byte_size %>", n.Int64) + } + if n.Int64 < math.MinInt<%= pg_bit_size %> { + return nil, fmt.Errorf("%d is less than minimum value for int<%= pg_byte_size %>", n.Int64) + } + + return append(buf, strconv.FormatInt(n.Int64, 10)...), nil +} + +func (Int<%= pg_byte_size %>Codec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case *int8: + return scanPlanBinaryInt<%= pg_byte_size %>ToInt8{} + case *int16: + return scanPlanBinaryInt<%= pg_byte_size %>ToInt16{} + case *int32: + return scanPlanBinaryInt<%= pg_byte_size %>ToInt32{} + case *int64: + return scanPlanBinaryInt<%= pg_byte_size %>ToInt64{} + case *int: + return scanPlanBinaryInt<%= pg_byte_size %>ToInt{} + case *uint8: + return scanPlanBinaryInt<%= pg_byte_size %>ToUint8{} + case *uint16: + return scanPlanBinaryInt<%= pg_byte_size %>ToUint16{} + case *uint32: + return scanPlanBinaryInt<%= pg_byte_size %>ToUint32{} + case *uint64: + return scanPlanBinaryInt<%= pg_byte_size %>ToUint64{} + case *uint: + return scanPlanBinaryInt<%= pg_byte_size %>ToUint{} + case Int64Scanner: + return scanPlanBinaryInt<%= pg_byte_size %>ToInt64Scanner{} + case TextScanner: + return scanPlanBinaryInt<%= pg_byte_size %>ToTextScanner{} + } + case TextFormatCode: + switch target.(type) { + case *int8: + return scanPlanTextAnyToInt8{} + case *int16: + return scanPlanTextAnyToInt16{} + case *int32: + return scanPlanTextAnyToInt32{} + case *int64: + return scanPlanTextAnyToInt64{} + case *int: + return scanPlanTextAnyToInt{} + case *uint8: + return scanPlanTextAnyToUint8{} + case *uint16: + return scanPlanTextAnyToUint16{} + case *uint32: + return scanPlanTextAnyToUint32{} + case *uint64: + return scanPlanTextAnyToUint64{} + case *uint: + return scanPlanTextAnyToUint{} + case Int64Scanner: + return scanPlanTextAnyToInt64Scanner{} + } + } + + return nil +} + +func (c Int<%= pg_byte_size %>Codec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + var n int64 + err := codecScan(c, m, oid, format, src, &n) + if err != nil { + return nil, err + } + return n, nil +} + +func (c Int<%= pg_byte_size %>Codec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var n int<%= pg_bit_size %> + err := codecScan(c, m, oid, format, src, &n) + if err != nil { + return nil, err + } + return n, nil +} + +<%# PostgreSQL binary format integer to fixed size Go integers %> +<% [8, 16, 32, 64].each do |dst_bit_size| %> +type scanPlanBinaryInt<%= pg_byte_size %>ToInt<%= dst_bit_size %> struct{} + +func (scanPlanBinaryInt<%= pg_byte_size %>ToInt<%= dst_bit_size %>) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != <%= pg_byte_size %> { + return fmt.Errorf("invalid length for int<%= pg_byte_size %>: %v", len(src)) + } + + p, ok := (dst).(*int<%= dst_bit_size %>) + if !ok { + return ErrScanTargetTypeChanged + } + + <% if dst_bit_size < pg_bit_size %> + n := int<%= pg_bit_size %>(binary.BigEndian.Uint<%= pg_bit_size %>(src)) + if n < math.MinInt<%= dst_bit_size %> { + return fmt.Errorf("%d is less than minimum value for int<%= dst_bit_size %>", n) + } else if n > math.MaxInt<%= dst_bit_size %> { + return fmt.Errorf("%d is greater than maximum value for int<%= dst_bit_size %>", n) + } + + *p = int<%= dst_bit_size %>(n) + <% elsif dst_bit_size == pg_bit_size %> + *p = int<%= dst_bit_size %>(binary.BigEndian.Uint<%= pg_bit_size %>(src)) + <% else %> + *p = int<%= dst_bit_size %>(int<%= pg_bit_size %>(binary.BigEndian.Uint<%= pg_bit_size %>(src))) + <% end %> + + return nil +} + +type scanPlanBinaryInt<%= pg_byte_size %>ToUint<%= dst_bit_size %> struct{} + +func (scanPlanBinaryInt<%= pg_byte_size %>ToUint<%= dst_bit_size %>) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != <%= pg_byte_size %> { + return fmt.Errorf("invalid length for uint<%= pg_byte_size %>: %v", len(src)) + } + + p, ok := (dst).(*uint<%= dst_bit_size %>) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int<%= pg_bit_size %>(binary.BigEndian.Uint<%= pg_bit_size %>(src)) + if n < 0 { + return fmt.Errorf("%d is less than minimum value for uint<%= dst_bit_size %>", n) + } + <% if dst_bit_size < pg_bit_size %> + if n > math.MaxUint<%= dst_bit_size %> { + return fmt.Errorf("%d is greater than maximum value for uint<%= dst_bit_size %>", n) + } + <% end %> + *p = uint<%= dst_bit_size %>(n) + + return nil +} +<% end %> + +<%# PostgreSQL binary format integer to Go machine integers %> +type scanPlanBinaryInt<%= pg_byte_size %>ToInt struct{} + +func (scanPlanBinaryInt<%= pg_byte_size %>ToInt) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != <%= pg_byte_size %> { + return fmt.Errorf("invalid length for int<%= pg_byte_size %>: %v", len(src)) + } + + p, ok := (dst).(*int) + if !ok { + return ErrScanTargetTypeChanged + } + + <% if 32 < pg_bit_size %> + n := int64(binary.BigEndian.Uint<%= pg_bit_size %>(src)) + if n < math.MinInt { + return fmt.Errorf("%d is less than minimum value for int", n) + } else if n > math.MaxInt { + return fmt.Errorf("%d is greater than maximum value for int", n) + } + + *p = int(n) + <% else %> + *p = int(int<%= pg_bit_size %>(binary.BigEndian.Uint<%= pg_bit_size %>(src))) + <% end %> + + return nil +} + +type scanPlanBinaryInt<%= pg_byte_size %>ToUint struct{} + +func (scanPlanBinaryInt<%= pg_byte_size %>ToUint) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != <%= pg_byte_size %> { + return fmt.Errorf("invalid length for uint<%= pg_byte_size %>: %v", len(src)) + } + + p, ok := (dst).(*uint) + if !ok { + return ErrScanTargetTypeChanged + } + + n := int64(int<%= pg_bit_size %>(binary.BigEndian.Uint<%= pg_bit_size %>(src))) + if n < 0 { + return fmt.Errorf("%d is less than minimum value for uint", n) + } + <% if 32 < pg_bit_size %> + if uint64(n) > math.MaxUint { + return fmt.Errorf("%d is greater than maximum value for uint", n) + } + <% end %> + *p = uint(n) + + return nil +} + +<%# PostgreSQL binary format integer to Go Int64Scanner %> +type scanPlanBinaryInt<%= pg_byte_size %>ToInt64Scanner struct{} + +func (scanPlanBinaryInt<%= pg_byte_size %>ToInt64Scanner) Scan(src []byte, dst any) error { + s, ok := (dst).(Int64Scanner) + if !ok { + return ErrScanTargetTypeChanged + } + + if src == nil { + return s.ScanInt64(Int8{}) + } + + if len(src) != <%= pg_byte_size %> { + return fmt.Errorf("invalid length for int<%= pg_byte_size %>: %v", len(src)) + } + + + n := int64(int<%= pg_bit_size %>(binary.BigEndian.Uint<%= pg_bit_size %>(src))) + + return s.ScanInt64(Int8{Int64: n, Valid: true}) +} + +<%# PostgreSQL binary format integer to Go TextScanner %> +type scanPlanBinaryInt<%= pg_byte_size %>ToTextScanner struct{} + +func (scanPlanBinaryInt<%= pg_byte_size %>ToTextScanner) Scan(src []byte, dst any) error { + s, ok := (dst).(TextScanner) + if !ok { + return ErrScanTargetTypeChanged + } + + if src == nil { + return s.ScanText(Text{}) + } + + if len(src) != <%= pg_byte_size %> { + return fmt.Errorf("invalid length for int<%= pg_byte_size %>: %v", len(src)) + } + + + n := int64(int<%= pg_bit_size %>(binary.BigEndian.Uint<%= pg_bit_size %>(src))) + + return s.ScanText(Text{String: strconv.FormatInt(n, 10), Valid: true}) +} +<% end %> + +<%# Any text to all integer types %> +<% [ + ["8", 8], + ["16", 16], + ["32", 32], + ["64", 64], + ["", 0] +].each do |type_suffix, bit_size| %> +type scanPlanTextAnyToInt<%= type_suffix %> struct{} + +func (scanPlanTextAnyToInt<%= type_suffix %>) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + p, ok := (dst).(*int<%= type_suffix %>) + if !ok { + return ErrScanTargetTypeChanged + } + + n, err := strconv.ParseInt(string(src), 10, <%= bit_size %>) + if err != nil { + return err + } + + *p = int<%= type_suffix %>(n) + return nil +} + +type scanPlanTextAnyToUint<%= type_suffix %> struct{} + +func (scanPlanTextAnyToUint<%= type_suffix %>) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + p, ok := (dst).(*uint<%= type_suffix %>) + if !ok { + return ErrScanTargetTypeChanged + } + + n, err := strconv.ParseUint(string(src), 10, <%= bit_size %>) + if err != nil { + return err + } + + *p = uint<%= type_suffix %>(n) + return nil +} +<% end %> + +type scanPlanTextAnyToInt64Scanner struct{} + +func (scanPlanTextAnyToInt64Scanner) Scan(src []byte, dst any) error { + s, ok := (dst).(Int64Scanner) + if !ok { + return ErrScanTargetTypeChanged + } + + if src == nil { + return s.ScanInt64(Int8{}) + } + + n, err := strconv.ParseInt(string(src), 10, 64) + if err != nil { + return err + } + + err = s.ScanInt64(Int8{Int64: n, Valid: true}) + if err != nil { + return err + } + + return nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/int_test.go.erb b/vendor/github.com/jackc/pgx/v5/pgtype/int_test.go.erb new file mode 100644 index 0000000..ac9a3f1 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/int_test.go.erb @@ -0,0 +1,93 @@ +package pgtype_test + +import ( + "math" + "testing" + + "github.com/jackc/pgx/v5/pgtype" +) + +<% [2, 4, 8].each do |pg_byte_size| %> +<% pg_bit_size = pg_byte_size * 8 %> +func TestInt<%= pg_byte_size %>Codec(t *testing.T) { + pgxtest.RunValueRoundTripTests(context.Background(), t, defaultConnTestRunner, nil, "int<%= pg_byte_size %>", []pgxtest.ValueRoundTripTest{ + {int8(1), new(int<%= pg_bit_size %>), isExpectedEq(int<%= pg_bit_size %>(1))}, + {int16(1), new(int<%= pg_bit_size %>), isExpectedEq(int<%= pg_bit_size %>(1))}, + {int32(1), new(int<%= pg_bit_size %>), isExpectedEq(int<%= pg_bit_size %>(1))}, + {int64(1), new(int<%= pg_bit_size %>), isExpectedEq(int<%= pg_bit_size %>(1))}, + {uint8(1), new(int<%= pg_bit_size %>), isExpectedEq(int<%= pg_bit_size %>(1))}, + {uint16(1), new(int<%= pg_bit_size %>), isExpectedEq(int<%= pg_bit_size %>(1))}, + {uint32(1), new(int<%= pg_bit_size %>), isExpectedEq(int<%= pg_bit_size %>(1))}, + {uint64(1), new(int<%= pg_bit_size %>), isExpectedEq(int<%= pg_bit_size %>(1))}, + {int(1), new(int<%= pg_bit_size %>), isExpectedEq(int<%= pg_bit_size %>(1))}, + {uint(1), new(int<%= pg_bit_size %>), isExpectedEq(int<%= pg_bit_size %>(1))}, + {pgtype.Int<%= pg_byte_size %>{Int<%= pg_bit_size %>: 1, Valid: true}, new(int<%= pg_bit_size %>), isExpectedEq(int<%= pg_bit_size %>(1))}, + {int32(-1), new(pgtype.Int<%= pg_byte_size %>), isExpectedEq(pgtype.Int<%= pg_byte_size %>{Int<%= pg_bit_size %>: -1, Valid: true})}, + {1, new(int8), isExpectedEq(int8(1))}, + {1, new(int16), isExpectedEq(int16(1))}, + {1, new(int32), isExpectedEq(int32(1))}, + {1, new(int64), isExpectedEq(int64(1))}, + {1, new(uint8), isExpectedEq(uint8(1))}, + {1, new(uint16), isExpectedEq(uint16(1))}, + {1, new(uint32), isExpectedEq(uint32(1))}, + {1, new(uint64), isExpectedEq(uint64(1))}, + {1, new(int), isExpectedEq(int(1))}, + {1, new(uint), isExpectedEq(uint(1))}, + {-1, new(int8), isExpectedEq(int8(-1))}, + {-1, new(int16), isExpectedEq(int16(-1))}, + {-1, new(int32), isExpectedEq(int32(-1))}, + {-1, new(int64), isExpectedEq(int64(-1))}, + {-1, new(int), isExpectedEq(int(-1))}, + {math.MinInt<%= pg_bit_size %>, new(int<%= pg_bit_size %>), isExpectedEq(int<%= pg_bit_size %>(math.MinInt<%= pg_bit_size %>))}, + {-1, new(int<%= pg_bit_size %>), isExpectedEq(int<%= pg_bit_size %>(-1))}, + {0, new(int<%= pg_bit_size %>), isExpectedEq(int<%= pg_bit_size %>(0))}, + {1, new(int<%= pg_bit_size %>), isExpectedEq(int<%= pg_bit_size %>(1))}, + {math.MaxInt<%= pg_bit_size %>, new(int<%= pg_bit_size %>), isExpectedEq(int<%= pg_bit_size %>(math.MaxInt<%= pg_bit_size %>))}, + {1, new(pgtype.Int<%= pg_byte_size %>), isExpectedEq(pgtype.Int<%= pg_byte_size %>{Int<%= pg_bit_size %>: 1, Valid: true})}, + {"1", new(string), isExpectedEq("1")}, + {pgtype.Int<%= pg_byte_size %>{}, new(pgtype.Int<%= pg_byte_size %>), isExpectedEq(pgtype.Int<%= pg_byte_size %>{})}, + {nil, new(*int<%= pg_bit_size %>), isExpectedEq((*int<%= pg_bit_size %>)(nil))}, + }) +} + +func TestInt<%= pg_byte_size %>MarshalJSON(t *testing.T) { + successfulTests := []struct { + source pgtype.Int<%= pg_byte_size %> + result string + }{ + {source: pgtype.Int<%= pg_byte_size %>{Int<%= pg_bit_size %>: 0}, result: "null"}, + {source: pgtype.Int<%= pg_byte_size %>{Int<%= pg_bit_size %>: 1, Valid: true}, result: "1"}, + } + for i, tt := range successfulTests { + r, err := tt.source.MarshalJSON() + if err != nil { + t.Errorf("%d: %v", i, err) + } + + if string(r) != tt.result { + t.Errorf("%d: expected %v to convert to %v, but it was %v", i, tt.source, tt.result, string(r)) + } + } +} + +func TestInt<%= pg_byte_size %>UnmarshalJSON(t *testing.T) { + successfulTests := []struct { + source string + result pgtype.Int<%= pg_byte_size %> + }{ + {source: "null", result: pgtype.Int<%= pg_byte_size %>{Int<%= pg_bit_size %>: 0}}, + {source: "1", result: pgtype.Int<%= pg_byte_size %>{Int<%= pg_bit_size %>: 1, Valid: true}}, + } + for i, tt := range successfulTests { + var r pgtype.Int<%= pg_byte_size %> + err := r.UnmarshalJSON([]byte(tt.source)) + if err != nil { + t.Errorf("%d: %v", i, err) + } + + if r != tt.result { + t.Errorf("%d: expected %v to convert to %v, but it was %v", i, tt.source, tt.result, r) + } + } +} +<% end %> diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/integration_benchmark_test.go.erb b/vendor/github.com/jackc/pgx/v5/pgtype/integration_benchmark_test.go.erb new file mode 100644 index 0000000..0175700 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/integration_benchmark_test.go.erb @@ -0,0 +1,62 @@ +package pgtype_test + +import ( + "context" + "testing" + + "github.com/jackc/pgx/v5/pgtype/testutil" + "github.com/jackc/pgx/v5" +) + +<% + [ + ["int4", ["int16", "int32", "int64", "uint64", "pgtype.Int4"], [[1, 1], [1, 10], [10, 1], [100, 10]]], + ["numeric", ["int64", "float64", "pgtype.Numeric"], [[1, 1], [1, 10], [10, 1], [100, 10]]], + ].each do |pg_type, go_types, rows_columns| +%> +<% go_types.each do |go_type| %> +<% rows_columns.each do |rows, columns| %> +<% [["Text", "pgx.TextFormatCode"], ["Binary", "pgx.BinaryFormatCode"]].each do |format_name, format_code| %> +func BenchmarkQuery<%= format_name %>FormatDecode_PG_<%= pg_type %>_to_Go_<%= go_type.gsub(/\W/, "_") %>_<%= rows %>_rows_<%= columns %>_columns(b *testing.B) { + defaultConnTestRunner.RunTest(context.Background(), b, func(ctx context.Context, _ testing.TB, conn *pgx.Conn) { + b.ResetTimer() + var v [<%= columns %>]<%= go_type %> + for i := 0; i < b.N; i++ { + rows, _ := conn.Query( + ctx, + `select <% columns.times do |col_idx| %><% if col_idx != 0 %>, <% end %>n::<%= pg_type %> + <%= col_idx%><% end %> from generate_series(1, <%= rows %>) n`, + []any{pgx.QueryResultFormats{<%= format_code %>}}, + ) + _, err := pgx.ForEachRow(rows, []any{<% columns.times do |col_idx| %><% if col_idx != 0 %>, <% end %>&v[<%= col_idx%>]<% end %>}, func() error { return nil }) + if err != nil { + b.Fatal(err) + } + } + }) +} +<% end %> +<% end %> +<% end %> +<% end %> + +<% [10, 100, 1000].each do |array_size| %> +<% [["Text", "pgx.TextFormatCode"], ["Binary", "pgx.BinaryFormatCode"]].each do |format_name, format_code| %> +func BenchmarkQuery<%= format_name %>FormatDecode_PG_Int4Array_With_Go_Int4Array_<%= array_size %>(b *testing.B) { + defaultConnTestRunner.RunTest(context.Background(), b, func(ctx context.Context, _ testing.TB, conn *pgx.Conn) { + b.ResetTimer() + var v []int32 + for i := 0; i < b.N; i++ { + rows, _ := conn.Query( + ctx, + `select array_agg(n) from generate_series(1, <%= array_size %>) n`, + []any{pgx.QueryResultFormats{<%= format_code %>}}, + ) + _, err := pgx.ForEachRow(rows, []any{&v}, func() error { return nil }) + if err != nil { + b.Fatal(err) + } + } + }) +} +<% end %> +<% end %> diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/integration_benchmark_test_gen.sh b/vendor/github.com/jackc/pgx/v5/pgtype/integration_benchmark_test_gen.sh new file mode 100644 index 0000000..22ac01a --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/integration_benchmark_test_gen.sh @@ -0,0 +1,2 @@ +erb integration_benchmark_test.go.erb > integration_benchmark_test.go +goimports -w integration_benchmark_test.go diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/interval.go b/vendor/github.com/jackc/pgx/v5/pgtype/interval.go new file mode 100644 index 0000000..4b51162 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/interval.go @@ -0,0 +1,297 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "fmt" + "strconv" + "strings" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +const ( + microsecondsPerSecond = 1000000 + microsecondsPerMinute = 60 * microsecondsPerSecond + microsecondsPerHour = 60 * microsecondsPerMinute + microsecondsPerDay = 24 * microsecondsPerHour + microsecondsPerMonth = 30 * microsecondsPerDay +) + +type IntervalScanner interface { + ScanInterval(v Interval) error +} + +type IntervalValuer interface { + IntervalValue() (Interval, error) +} + +type Interval struct { + Microseconds int64 + Days int32 + Months int32 + Valid bool +} + +func (interval *Interval) ScanInterval(v Interval) error { + *interval = v + return nil +} + +func (interval Interval) IntervalValue() (Interval, error) { + return interval, nil +} + +// Scan implements the database/sql Scanner interface. +func (interval *Interval) Scan(src any) error { + if src == nil { + *interval = Interval{} + return nil + } + + switch src := src.(type) { + case string: + return scanPlanTextAnyToIntervalScanner{}.Scan([]byte(src), interval) + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (interval Interval) Value() (driver.Value, error) { + if !interval.Valid { + return nil, nil + } + + buf, err := IntervalCodec{}.PlanEncode(nil, 0, TextFormatCode, interval).Encode(interval, nil) + if err != nil { + return nil, err + } + return string(buf), err +} + +type IntervalCodec struct{} + +func (IntervalCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (IntervalCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (IntervalCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + if _, ok := value.(IntervalValuer); !ok { + return nil + } + + switch format { + case BinaryFormatCode: + return encodePlanIntervalCodecBinary{} + case TextFormatCode: + return encodePlanIntervalCodecText{} + } + + return nil +} + +type encodePlanIntervalCodecBinary struct{} + +func (encodePlanIntervalCodecBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + interval, err := value.(IntervalValuer).IntervalValue() + if err != nil { + return nil, err + } + + if !interval.Valid { + return nil, nil + } + + buf = pgio.AppendInt64(buf, interval.Microseconds) + buf = pgio.AppendInt32(buf, interval.Days) + buf = pgio.AppendInt32(buf, interval.Months) + return buf, nil +} + +type encodePlanIntervalCodecText struct{} + +func (encodePlanIntervalCodecText) Encode(value any, buf []byte) (newBuf []byte, err error) { + interval, err := value.(IntervalValuer).IntervalValue() + if err != nil { + return nil, err + } + + if !interval.Valid { + return nil, nil + } + + if interval.Months != 0 { + buf = append(buf, strconv.FormatInt(int64(interval.Months), 10)...) + buf = append(buf, " mon "...) + } + + if interval.Days != 0 { + buf = append(buf, strconv.FormatInt(int64(interval.Days), 10)...) + buf = append(buf, " day "...) + } + + absMicroseconds := interval.Microseconds + if absMicroseconds < 0 { + absMicroseconds = -absMicroseconds + buf = append(buf, '-') + } + + hours := absMicroseconds / microsecondsPerHour + minutes := (absMicroseconds % microsecondsPerHour) / microsecondsPerMinute + seconds := (absMicroseconds % microsecondsPerMinute) / microsecondsPerSecond + + timeStr := fmt.Sprintf("%02d:%02d:%02d", hours, minutes, seconds) + buf = append(buf, timeStr...) + + microseconds := absMicroseconds % microsecondsPerSecond + if microseconds != 0 { + buf = append(buf, fmt.Sprintf(".%06d", microseconds)...) + } + + return buf, nil +} + +func (IntervalCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case IntervalScanner: + return scanPlanBinaryIntervalToIntervalScanner{} + } + case TextFormatCode: + switch target.(type) { + case IntervalScanner: + return scanPlanTextAnyToIntervalScanner{} + } + } + + return nil +} + +type scanPlanBinaryIntervalToIntervalScanner struct{} + +func (scanPlanBinaryIntervalToIntervalScanner) Scan(src []byte, dst any) error { + scanner := (dst).(IntervalScanner) + + if src == nil { + return scanner.ScanInterval(Interval{}) + } + + if len(src) != 16 { + return fmt.Errorf("Received an invalid size for an interval: %d", len(src)) + } + + microseconds := int64(binary.BigEndian.Uint64(src)) + days := int32(binary.BigEndian.Uint32(src[8:])) + months := int32(binary.BigEndian.Uint32(src[12:])) + + return scanner.ScanInterval(Interval{Microseconds: microseconds, Days: days, Months: months, Valid: true}) +} + +type scanPlanTextAnyToIntervalScanner struct{} + +func (scanPlanTextAnyToIntervalScanner) Scan(src []byte, dst any) error { + scanner := (dst).(IntervalScanner) + + if src == nil { + return scanner.ScanInterval(Interval{}) + } + + var microseconds int64 + var days int32 + var months int32 + + parts := strings.Split(string(src), " ") + + for i := 0; i < len(parts)-1; i += 2 { + scalar, err := strconv.ParseInt(parts[i], 10, 64) + if err != nil { + return fmt.Errorf("bad interval format") + } + + switch parts[i+1] { + case "year", "years": + months += int32(scalar * 12) + case "mon", "mons": + months += int32(scalar) + case "day", "days": + days = int32(scalar) + } + } + + if len(parts)%2 == 1 { + timeParts := strings.SplitN(parts[len(parts)-1], ":", 3) + if len(timeParts) != 3 { + return fmt.Errorf("bad interval format") + } + + var negative bool + if timeParts[0][0] == '-' { + negative = true + timeParts[0] = timeParts[0][1:] + } + + hours, err := strconv.ParseInt(timeParts[0], 10, 64) + if err != nil { + return fmt.Errorf("bad interval hour format: %s", timeParts[0]) + } + + minutes, err := strconv.ParseInt(timeParts[1], 10, 64) + if err != nil { + return fmt.Errorf("bad interval minute format: %s", timeParts[1]) + } + + sec, secFrac, secFracFound := strings.Cut(timeParts[2], ".") + + seconds, err := strconv.ParseInt(sec, 10, 64) + if err != nil { + return fmt.Errorf("bad interval second format: %s", sec) + } + + var uSeconds int64 + if secFracFound { + uSeconds, err = strconv.ParseInt(secFrac, 10, 64) + if err != nil { + return fmt.Errorf("bad interval decimal format: %s", secFrac) + } + + for i := 0; i < 6-len(secFrac); i++ { + uSeconds *= 10 + } + } + + microseconds = hours * microsecondsPerHour + microseconds += minutes * microsecondsPerMinute + microseconds += seconds * microsecondsPerSecond + microseconds += uSeconds + + if negative { + microseconds = -microseconds + } + } + + return scanner.ScanInterval(Interval{Months: months, Days: days, Microseconds: microseconds, Valid: true}) +} + +func (c IntervalCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return codecDecodeToTextFormat(c, m, oid, format, src) +} + +func (c IntervalCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var interval Interval + err := codecScan(c, m, oid, format, src, &interval) + if err != nil { + return nil, err + } + return interval, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/json.go b/vendor/github.com/jackc/pgx/v5/pgtype/json.go new file mode 100644 index 0000000..c2aa0d3 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/json.go @@ -0,0 +1,223 @@ +package pgtype + +import ( + "database/sql" + "database/sql/driver" + "encoding/json" + "fmt" + "reflect" +) + +type JSONCodec struct { + Marshal func(v any) ([]byte, error) + Unmarshal func(data []byte, v any) error +} + +func (*JSONCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (*JSONCodec) PreferredFormat() int16 { + return TextFormatCode +} + +func (c *JSONCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + switch value.(type) { + case string: + return encodePlanJSONCodecEitherFormatString{} + case []byte: + return encodePlanJSONCodecEitherFormatByteSlice{} + + // Handle json.RawMessage specifically because if it is run through json.Marshal it may be mutated. + // e.g. `{"foo": "bar"}` -> `{"foo":"bar"}`. + case json.RawMessage: + return encodePlanJSONCodecEitherFormatJSONRawMessage{} + + // Cannot rely on driver.Valuer being handled later because anything can be marshalled. + // + // https://github.com/jackc/pgx/issues/1430 + // + // Check for driver.Valuer must come before json.Marshaler so that it is guaranteed to be used + // when both are implemented https://github.com/jackc/pgx/issues/1805 + case driver.Valuer: + return &encodePlanDriverValuer{m: m, oid: oid, formatCode: format} + + // Must come before trying wrap encode plans because a pointer to a struct may be unwrapped to a struct that can be + // marshalled. + // + // https://github.com/jackc/pgx/issues/1681 + case json.Marshaler: + return &encodePlanJSONCodecEitherFormatMarshal{ + marshal: c.Marshal, + } + } + + // Because anything can be marshalled the normal wrapping in Map.PlanScan doesn't get a chance to run. So try the + // appropriate wrappers here. + for _, f := range []TryWrapEncodePlanFunc{ + TryWrapDerefPointerEncodePlan, + TryWrapFindUnderlyingTypeEncodePlan, + } { + if wrapperPlan, nextValue, ok := f(value); ok { + if nextPlan := c.PlanEncode(m, oid, format, nextValue); nextPlan != nil { + wrapperPlan.SetNext(nextPlan) + return wrapperPlan + } + } + } + + return &encodePlanJSONCodecEitherFormatMarshal{ + marshal: c.Marshal, + } +} + +type encodePlanJSONCodecEitherFormatString struct{} + +func (encodePlanJSONCodecEitherFormatString) Encode(value any, buf []byte) (newBuf []byte, err error) { + jsonString := value.(string) + buf = append(buf, jsonString...) + return buf, nil +} + +type encodePlanJSONCodecEitherFormatByteSlice struct{} + +func (encodePlanJSONCodecEitherFormatByteSlice) Encode(value any, buf []byte) (newBuf []byte, err error) { + jsonBytes := value.([]byte) + if jsonBytes == nil { + return nil, nil + } + + buf = append(buf, jsonBytes...) + return buf, nil +} + +type encodePlanJSONCodecEitherFormatJSONRawMessage struct{} + +func (encodePlanJSONCodecEitherFormatJSONRawMessage) Encode(value any, buf []byte) (newBuf []byte, err error) { + jsonBytes := value.(json.RawMessage) + if jsonBytes == nil { + return nil, nil + } + + buf = append(buf, jsonBytes...) + return buf, nil +} + +type encodePlanJSONCodecEitherFormatMarshal struct { + marshal func(v any) ([]byte, error) +} + +func (e *encodePlanJSONCodecEitherFormatMarshal) Encode(value any, buf []byte) (newBuf []byte, err error) { + jsonBytes, err := e.marshal(value) + if err != nil { + return nil, err + } + + buf = append(buf, jsonBytes...) + return buf, nil +} + +func (c *JSONCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + switch target.(type) { + case *string: + return scanPlanAnyToString{} + + case **string: + // This is to fix **string scanning. It seems wrong to special case **string, but it's not clear what a better + // solution would be. + // + // https://github.com/jackc/pgx/issues/1470 -- **string + // https://github.com/jackc/pgx/issues/1691 -- ** anything else + + if wrapperPlan, nextDst, ok := TryPointerPointerScanPlan(target); ok { + if nextPlan := m.planScan(oid, format, nextDst); nextPlan != nil { + if _, failed := nextPlan.(*scanPlanFail); !failed { + wrapperPlan.SetNext(nextPlan) + return wrapperPlan + } + } + } + + case *[]byte: + return scanPlanJSONToByteSlice{} + case BytesScanner: + return scanPlanBinaryBytesToBytesScanner{} + + // Cannot rely on sql.Scanner being handled later because scanPlanJSONToJSONUnmarshal will take precedence. + // + // https://github.com/jackc/pgx/issues/1418 + case sql.Scanner: + return &scanPlanSQLScanner{formatCode: format} + } + + return &scanPlanJSONToJSONUnmarshal{ + unmarshal: c.Unmarshal, + } +} + +type scanPlanAnyToString struct{} + +func (scanPlanAnyToString) Scan(src []byte, dst any) error { + p := dst.(*string) + *p = string(src) + return nil +} + +type scanPlanJSONToByteSlice struct{} + +func (scanPlanJSONToByteSlice) Scan(src []byte, dst any) error { + dstBuf := dst.(*[]byte) + if src == nil { + *dstBuf = nil + return nil + } + + *dstBuf = make([]byte, len(src)) + copy(*dstBuf, src) + return nil +} + +type scanPlanJSONToJSONUnmarshal struct { + unmarshal func(data []byte, v any) error +} + +func (s *scanPlanJSONToJSONUnmarshal) Scan(src []byte, dst any) error { + if src == nil { + dstValue := reflect.ValueOf(dst) + if dstValue.Kind() == reflect.Ptr { + el := dstValue.Elem() + switch el.Kind() { + case reflect.Ptr, reflect.Slice, reflect.Map, reflect.Interface: + el.Set(reflect.Zero(el.Type())) + return nil + } + } + + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + elem := reflect.ValueOf(dst).Elem() + elem.Set(reflect.Zero(elem.Type())) + + return s.unmarshal(src, dst) +} + +func (c *JSONCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + dstBuf := make([]byte, len(src)) + copy(dstBuf, src) + return dstBuf, nil +} + +func (c *JSONCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var dst any + err := c.Unmarshal(src, &dst) + return dst, err +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/jsonb.go b/vendor/github.com/jackc/pgx/v5/pgtype/jsonb.go new file mode 100644 index 0000000..4d4eb58 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/jsonb.go @@ -0,0 +1,129 @@ +package pgtype + +import ( + "database/sql/driver" + "fmt" +) + +type JSONBCodec struct { + Marshal func(v any) ([]byte, error) + Unmarshal func(data []byte, v any) error +} + +func (*JSONBCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (*JSONBCodec) PreferredFormat() int16 { + return TextFormatCode +} + +func (c *JSONBCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + switch format { + case BinaryFormatCode: + plan := (&JSONCodec{Marshal: c.Marshal, Unmarshal: c.Unmarshal}).PlanEncode(m, oid, TextFormatCode, value) + if plan != nil { + return &encodePlanJSONBCodecBinaryWrapper{textPlan: plan} + } + case TextFormatCode: + return (&JSONCodec{Marshal: c.Marshal, Unmarshal: c.Unmarshal}).PlanEncode(m, oid, format, value) + } + + return nil +} + +type encodePlanJSONBCodecBinaryWrapper struct { + textPlan EncodePlan +} + +func (plan *encodePlanJSONBCodecBinaryWrapper) Encode(value any, buf []byte) (newBuf []byte, err error) { + buf = append(buf, 1) + return plan.textPlan.Encode(value, buf) +} + +func (c *JSONBCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + switch format { + case BinaryFormatCode: + plan := (&JSONCodec{Marshal: c.Marshal, Unmarshal: c.Unmarshal}).PlanScan(m, oid, TextFormatCode, target) + if plan != nil { + return &scanPlanJSONBCodecBinaryUnwrapper{textPlan: plan} + } + case TextFormatCode: + return (&JSONCodec{Marshal: c.Marshal, Unmarshal: c.Unmarshal}).PlanScan(m, oid, format, target) + } + + return nil +} + +type scanPlanJSONBCodecBinaryUnwrapper struct { + textPlan ScanPlan +} + +func (plan *scanPlanJSONBCodecBinaryUnwrapper) Scan(src []byte, dst any) error { + if src == nil { + return plan.textPlan.Scan(src, dst) + } + + if len(src) == 0 { + return fmt.Errorf("jsonb too short") + } + + if src[0] != 1 { + return fmt.Errorf("unknown jsonb version number %d", src[0]) + } + + return plan.textPlan.Scan(src[1:], dst) +} + +func (c *JSONBCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + switch format { + case BinaryFormatCode: + if len(src) == 0 { + return nil, fmt.Errorf("jsonb too short") + } + + if src[0] != 1 { + return nil, fmt.Errorf("unknown jsonb version number %d", src[0]) + } + + dstBuf := make([]byte, len(src)-1) + copy(dstBuf, src[1:]) + return dstBuf, nil + case TextFormatCode: + dstBuf := make([]byte, len(src)) + copy(dstBuf, src) + return dstBuf, nil + default: + return nil, fmt.Errorf("unknown format code: %v", format) + } +} + +func (c *JSONBCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + switch format { + case BinaryFormatCode: + if len(src) == 0 { + return nil, fmt.Errorf("jsonb too short") + } + + if src[0] != 1 { + return nil, fmt.Errorf("unknown jsonb version number %d", src[0]) + } + + src = src[1:] + case TextFormatCode: + default: + return nil, fmt.Errorf("unknown format code: %v", format) + } + + var dst any + err := c.Unmarshal(src, &dst) + return dst, err +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/line.go b/vendor/github.com/jackc/pgx/v5/pgtype/line.go new file mode 100644 index 0000000..4ae8003 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/line.go @@ -0,0 +1,225 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "fmt" + "math" + "strconv" + "strings" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +type LineScanner interface { + ScanLine(v Line) error +} + +type LineValuer interface { + LineValue() (Line, error) +} + +type Line struct { + A, B, C float64 + Valid bool +} + +func (line *Line) ScanLine(v Line) error { + *line = v + return nil +} + +func (line Line) LineValue() (Line, error) { + return line, nil +} + +func (line *Line) Set(src any) error { + return fmt.Errorf("cannot convert %v to Line", src) +} + +// Scan implements the database/sql Scanner interface. +func (line *Line) Scan(src any) error { + if src == nil { + *line = Line{} + return nil + } + + switch src := src.(type) { + case string: + return scanPlanTextAnyToLineScanner{}.Scan([]byte(src), line) + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (line Line) Value() (driver.Value, error) { + if !line.Valid { + return nil, nil + } + + buf, err := LineCodec{}.PlanEncode(nil, 0, TextFormatCode, line).Encode(line, nil) + if err != nil { + return nil, err + } + return string(buf), err +} + +type LineCodec struct{} + +func (LineCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (LineCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (LineCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + if _, ok := value.(LineValuer); !ok { + return nil + } + + switch format { + case BinaryFormatCode: + return encodePlanLineCodecBinary{} + case TextFormatCode: + return encodePlanLineCodecText{} + } + + return nil +} + +type encodePlanLineCodecBinary struct{} + +func (encodePlanLineCodecBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + line, err := value.(LineValuer).LineValue() + if err != nil { + return nil, err + } + + if !line.Valid { + return nil, nil + } + + buf = pgio.AppendUint64(buf, math.Float64bits(line.A)) + buf = pgio.AppendUint64(buf, math.Float64bits(line.B)) + buf = pgio.AppendUint64(buf, math.Float64bits(line.C)) + return buf, nil +} + +type encodePlanLineCodecText struct{} + +func (encodePlanLineCodecText) Encode(value any, buf []byte) (newBuf []byte, err error) { + line, err := value.(LineValuer).LineValue() + if err != nil { + return nil, err + } + + if !line.Valid { + return nil, nil + } + + buf = append(buf, fmt.Sprintf(`{%s,%s,%s}`, + strconv.FormatFloat(line.A, 'f', -1, 64), + strconv.FormatFloat(line.B, 'f', -1, 64), + strconv.FormatFloat(line.C, 'f', -1, 64), + )...) + return buf, nil +} + +func (LineCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case LineScanner: + return scanPlanBinaryLineToLineScanner{} + } + case TextFormatCode: + switch target.(type) { + case LineScanner: + return scanPlanTextAnyToLineScanner{} + } + } + + return nil +} + +type scanPlanBinaryLineToLineScanner struct{} + +func (scanPlanBinaryLineToLineScanner) Scan(src []byte, dst any) error { + scanner := (dst).(LineScanner) + + if src == nil { + return scanner.ScanLine(Line{}) + } + + if len(src) != 24 { + return fmt.Errorf("invalid length for line: %v", len(src)) + } + + a := binary.BigEndian.Uint64(src) + b := binary.BigEndian.Uint64(src[8:]) + c := binary.BigEndian.Uint64(src[16:]) + + return scanner.ScanLine(Line{ + A: math.Float64frombits(a), + B: math.Float64frombits(b), + C: math.Float64frombits(c), + Valid: true, + }) +} + +type scanPlanTextAnyToLineScanner struct{} + +func (scanPlanTextAnyToLineScanner) Scan(src []byte, dst any) error { + scanner := (dst).(LineScanner) + + if src == nil { + return scanner.ScanLine(Line{}) + } + + if len(src) < 7 { + return fmt.Errorf("invalid length for line: %v", len(src)) + } + + parts := strings.SplitN(string(src[1:len(src)-1]), ",", 3) + if len(parts) < 3 { + return fmt.Errorf("invalid format for line") + } + + a, err := strconv.ParseFloat(parts[0], 64) + if err != nil { + return err + } + + b, err := strconv.ParseFloat(parts[1], 64) + if err != nil { + return err + } + + c, err := strconv.ParseFloat(parts[2], 64) + if err != nil { + return err + } + + return scanner.ScanLine(Line{A: a, B: b, C: c, Valid: true}) +} + +func (c LineCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return codecDecodeToTextFormat(c, m, oid, format, src) +} + +func (c LineCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var line Line + err := codecScan(c, m, oid, format, src, &line) + if err != nil { + return nil, err + } + return line, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/lseg.go b/vendor/github.com/jackc/pgx/v5/pgtype/lseg.go new file mode 100644 index 0000000..05a86e1 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/lseg.go @@ -0,0 +1,238 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "fmt" + "math" + "strconv" + "strings" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +type LsegScanner interface { + ScanLseg(v Lseg) error +} + +type LsegValuer interface { + LsegValue() (Lseg, error) +} + +type Lseg struct { + P [2]Vec2 + Valid bool +} + +func (lseg *Lseg) ScanLseg(v Lseg) error { + *lseg = v + return nil +} + +func (lseg Lseg) LsegValue() (Lseg, error) { + return lseg, nil +} + +// Scan implements the database/sql Scanner interface. +func (lseg *Lseg) Scan(src any) error { + if src == nil { + *lseg = Lseg{} + return nil + } + + switch src := src.(type) { + case string: + return scanPlanTextAnyToLsegScanner{}.Scan([]byte(src), lseg) + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (lseg Lseg) Value() (driver.Value, error) { + if !lseg.Valid { + return nil, nil + } + + buf, err := LsegCodec{}.PlanEncode(nil, 0, TextFormatCode, lseg).Encode(lseg, nil) + if err != nil { + return nil, err + } + return string(buf), err +} + +type LsegCodec struct{} + +func (LsegCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (LsegCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (LsegCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + if _, ok := value.(LsegValuer); !ok { + return nil + } + + switch format { + case BinaryFormatCode: + return encodePlanLsegCodecBinary{} + case TextFormatCode: + return encodePlanLsegCodecText{} + } + + return nil +} + +type encodePlanLsegCodecBinary struct{} + +func (encodePlanLsegCodecBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + lseg, err := value.(LsegValuer).LsegValue() + if err != nil { + return nil, err + } + + if !lseg.Valid { + return nil, nil + } + + buf = pgio.AppendUint64(buf, math.Float64bits(lseg.P[0].X)) + buf = pgio.AppendUint64(buf, math.Float64bits(lseg.P[0].Y)) + buf = pgio.AppendUint64(buf, math.Float64bits(lseg.P[1].X)) + buf = pgio.AppendUint64(buf, math.Float64bits(lseg.P[1].Y)) + return buf, nil +} + +type encodePlanLsegCodecText struct{} + +func (encodePlanLsegCodecText) Encode(value any, buf []byte) (newBuf []byte, err error) { + lseg, err := value.(LsegValuer).LsegValue() + if err != nil { + return nil, err + } + + if !lseg.Valid { + return nil, nil + } + + buf = append(buf, fmt.Sprintf(`[(%s,%s),(%s,%s)]`, + strconv.FormatFloat(lseg.P[0].X, 'f', -1, 64), + strconv.FormatFloat(lseg.P[0].Y, 'f', -1, 64), + strconv.FormatFloat(lseg.P[1].X, 'f', -1, 64), + strconv.FormatFloat(lseg.P[1].Y, 'f', -1, 64), + )...) + return buf, nil +} + +func (LsegCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case LsegScanner: + return scanPlanBinaryLsegToLsegScanner{} + } + case TextFormatCode: + switch target.(type) { + case LsegScanner: + return scanPlanTextAnyToLsegScanner{} + } + } + + return nil +} + +type scanPlanBinaryLsegToLsegScanner struct{} + +func (scanPlanBinaryLsegToLsegScanner) Scan(src []byte, dst any) error { + scanner := (dst).(LsegScanner) + + if src == nil { + return scanner.ScanLseg(Lseg{}) + } + + if len(src) != 32 { + return fmt.Errorf("invalid length for lseg: %v", len(src)) + } + + x1 := binary.BigEndian.Uint64(src) + y1 := binary.BigEndian.Uint64(src[8:]) + x2 := binary.BigEndian.Uint64(src[16:]) + y2 := binary.BigEndian.Uint64(src[24:]) + + return scanner.ScanLseg(Lseg{ + P: [2]Vec2{ + {math.Float64frombits(x1), math.Float64frombits(y1)}, + {math.Float64frombits(x2), math.Float64frombits(y2)}, + }, + Valid: true, + }) +} + +type scanPlanTextAnyToLsegScanner struct{} + +func (scanPlanTextAnyToLsegScanner) Scan(src []byte, dst any) error { + scanner := (dst).(LsegScanner) + + if src == nil { + return scanner.ScanLseg(Lseg{}) + } + + if len(src) < 11 { + return fmt.Errorf("invalid length for lseg: %v", len(src)) + } + + str := string(src[2:]) + + var end int + end = strings.IndexByte(str, ',') + + x1, err := strconv.ParseFloat(str[:end], 64) + if err != nil { + return err + } + + str = str[end+1:] + end = strings.IndexByte(str, ')') + + y1, err := strconv.ParseFloat(str[:end], 64) + if err != nil { + return err + } + + str = str[end+3:] + end = strings.IndexByte(str, ',') + + x2, err := strconv.ParseFloat(str[:end], 64) + if err != nil { + return err + } + + str = str[end+1 : len(str)-2] + + y2, err := strconv.ParseFloat(str, 64) + if err != nil { + return err + } + + return scanner.ScanLseg(Lseg{P: [2]Vec2{{x1, y1}, {x2, y2}}, Valid: true}) +} + +func (c LsegCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return codecDecodeToTextFormat(c, m, oid, format, src) +} + +func (c LsegCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var lseg Lseg + err := codecScan(c, m, oid, format, src, &lseg) + if err != nil { + return nil, err + } + return lseg, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/ltree.go b/vendor/github.com/jackc/pgx/v5/pgtype/ltree.go new file mode 100644 index 0000000..6af3177 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/ltree.go @@ -0,0 +1,122 @@ +package pgtype + +import ( + "database/sql/driver" + "fmt" +) + +type LtreeCodec struct{} + +func (l LtreeCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +// PreferredFormat returns the preferred format. +func (l LtreeCodec) PreferredFormat() int16 { + return TextFormatCode +} + +// PlanEncode returns an EncodePlan for encoding value into PostgreSQL format for oid and format. If no plan can be +// found then nil is returned. +func (l LtreeCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + switch format { + case TextFormatCode: + return (TextCodec)(l).PlanEncode(m, oid, format, value) + case BinaryFormatCode: + switch value.(type) { + case string: + return encodeLtreeCodecBinaryString{} + case []byte: + return encodeLtreeCodecBinaryByteSlice{} + case TextValuer: + return encodeLtreeCodecBinaryTextValuer{} + } + } + + return nil +} + +type encodeLtreeCodecBinaryString struct{} + +func (encodeLtreeCodecBinaryString) Encode(value any, buf []byte) (newBuf []byte, err error) { + ltree := value.(string) + buf = append(buf, 1) + return append(buf, ltree...), nil +} + +type encodeLtreeCodecBinaryByteSlice struct{} + +func (encodeLtreeCodecBinaryByteSlice) Encode(value any, buf []byte) (newBuf []byte, err error) { + ltree := value.([]byte) + buf = append(buf, 1) + return append(buf, ltree...), nil +} + +type encodeLtreeCodecBinaryTextValuer struct{} + +func (encodeLtreeCodecBinaryTextValuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + t, err := value.(TextValuer).TextValue() + if err != nil { + return nil, err + } + if !t.Valid { + return nil, nil + } + + buf = append(buf, 1) + return append(buf, t.String...), nil +} + +// PlanScan returns a ScanPlan for scanning a PostgreSQL value into a destination with the same type as target. If +// no plan can be found then nil is returned. +func (l LtreeCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + switch format { + case TextFormatCode: + return (TextCodec)(l).PlanScan(m, oid, format, target) + case BinaryFormatCode: + switch target.(type) { + case *string: + return scanPlanBinaryLtreeToString{} + case TextScanner: + return scanPlanBinaryLtreeToTextScanner{} + } + } + + return nil +} + +type scanPlanBinaryLtreeToString struct{} + +func (scanPlanBinaryLtreeToString) Scan(src []byte, target any) error { + version := src[0] + if version != 1 { + return fmt.Errorf("unsupported ltree version %d", version) + } + + p := (target).(*string) + *p = string(src[1:]) + + return nil +} + +type scanPlanBinaryLtreeToTextScanner struct{} + +func (scanPlanBinaryLtreeToTextScanner) Scan(src []byte, target any) error { + version := src[0] + if version != 1 { + return fmt.Errorf("unsupported ltree version %d", version) + } + + scanner := (target).(TextScanner) + return scanner.ScanText(Text{String: string(src[1:]), Valid: true}) +} + +// DecodeDatabaseSQLValue returns src decoded into a value compatible with the sql.Scanner interface. +func (l LtreeCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return (TextCodec)(l).DecodeDatabaseSQLValue(m, oid, format, src) +} + +// DecodeValue returns src decoded into its default format. +func (l LtreeCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + return (TextCodec)(l).DecodeValue(m, oid, format, src) +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/macaddr.go b/vendor/github.com/jackc/pgx/v5/pgtype/macaddr.go new file mode 100644 index 0000000..e913ec9 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/macaddr.go @@ -0,0 +1,162 @@ +package pgtype + +import ( + "database/sql/driver" + "net" +) + +type MacaddrCodec struct{} + +func (MacaddrCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (MacaddrCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (MacaddrCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + switch format { + case BinaryFormatCode: + switch value.(type) { + case net.HardwareAddr: + return encodePlanMacaddrCodecBinaryHardwareAddr{} + case TextValuer: + return encodePlanMacAddrCodecTextValuer{} + + } + case TextFormatCode: + switch value.(type) { + case net.HardwareAddr: + return encodePlanMacaddrCodecTextHardwareAddr{} + case TextValuer: + return encodePlanTextCodecTextValuer{} + } + } + + return nil +} + +type encodePlanMacaddrCodecBinaryHardwareAddr struct{} + +func (encodePlanMacaddrCodecBinaryHardwareAddr) Encode(value any, buf []byte) (newBuf []byte, err error) { + addr := value.(net.HardwareAddr) + if addr == nil { + return nil, nil + } + + return append(buf, addr...), nil +} + +type encodePlanMacAddrCodecTextValuer struct{} + +func (encodePlanMacAddrCodecTextValuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + t, err := value.(TextValuer).TextValue() + if err != nil { + return nil, err + } + if !t.Valid { + return nil, nil + } + + addr, err := net.ParseMAC(t.String) + if err != nil { + return nil, err + } + + return append(buf, addr...), nil +} + +type encodePlanMacaddrCodecTextHardwareAddr struct{} + +func (encodePlanMacaddrCodecTextHardwareAddr) Encode(value any, buf []byte) (newBuf []byte, err error) { + addr := value.(net.HardwareAddr) + if addr == nil { + return nil, nil + } + + return append(buf, addr.String()...), nil +} + +func (MacaddrCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + switch format { + case BinaryFormatCode: + switch target.(type) { + case *net.HardwareAddr: + return scanPlanBinaryMacaddrToHardwareAddr{} + case TextScanner: + return scanPlanBinaryMacaddrToTextScanner{} + } + case TextFormatCode: + switch target.(type) { + case *net.HardwareAddr: + return scanPlanTextMacaddrToHardwareAddr{} + case TextScanner: + return scanPlanTextAnyToTextScanner{} + } + } + + return nil +} + +type scanPlanBinaryMacaddrToHardwareAddr struct{} + +func (scanPlanBinaryMacaddrToHardwareAddr) Scan(src []byte, dst any) error { + dstBuf := dst.(*net.HardwareAddr) + if src == nil { + *dstBuf = nil + return nil + } + + *dstBuf = make([]byte, len(src)) + copy(*dstBuf, src) + return nil +} + +type scanPlanBinaryMacaddrToTextScanner struct{} + +func (scanPlanBinaryMacaddrToTextScanner) Scan(src []byte, dst any) error { + scanner := (dst).(TextScanner) + if src == nil { + return scanner.ScanText(Text{}) + } + + return scanner.ScanText(Text{String: net.HardwareAddr(src).String(), Valid: true}) +} + +type scanPlanTextMacaddrToHardwareAddr struct{} + +func (scanPlanTextMacaddrToHardwareAddr) Scan(src []byte, dst any) error { + p := dst.(*net.HardwareAddr) + + if src == nil { + *p = nil + return nil + } + + addr, err := net.ParseMAC(string(src)) + if err != nil { + return err + } + + *p = addr + + return nil +} + +func (c MacaddrCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return codecDecodeToTextFormat(c, m, oid, format, src) +} + +func (c MacaddrCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var addr net.HardwareAddr + err := codecScan(c, m, oid, format, src, &addr) + if err != nil { + return nil, err + } + return addr, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/multirange.go b/vendor/github.com/jackc/pgx/v5/pgtype/multirange.go new file mode 100644 index 0000000..e576378 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/multirange.go @@ -0,0 +1,443 @@ +package pgtype + +import ( + "bytes" + "database/sql/driver" + "encoding/binary" + "fmt" + "reflect" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +// MultirangeGetter is a type that can be converted into a PostgreSQL multirange. +type MultirangeGetter interface { + // IsNull returns true if the value is SQL NULL. + IsNull() bool + + // Len returns the number of elements in the multirange. + Len() int + + // Index returns the element at i. + Index(i int) any + + // IndexType returns a non-nil scan target of the type Index will return. This is used by MultirangeCodec.PlanEncode. + IndexType() any +} + +// MultirangeSetter is a type can be set from a PostgreSQL multirange. +type MultirangeSetter interface { + // ScanNull sets the value to SQL NULL. + ScanNull() error + + // SetLen prepares the value such that ScanIndex can be called for each element. This will remove any existing + // elements. + SetLen(n int) error + + // ScanIndex returns a value usable as a scan target for i. SetLen must be called before ScanIndex. + ScanIndex(i int) any + + // ScanIndexType returns a non-nil scan target of the type ScanIndex will return. This is used by + // MultirangeCodec.PlanScan. + ScanIndexType() any +} + +// MultirangeCodec is a codec for any multirange type. +type MultirangeCodec struct { + ElementType *Type +} + +func (c *MultirangeCodec) FormatSupported(format int16) bool { + return c.ElementType.Codec.FormatSupported(format) +} + +func (c *MultirangeCodec) PreferredFormat() int16 { + return c.ElementType.Codec.PreferredFormat() +} + +func (c *MultirangeCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + multirangeValuer, ok := value.(MultirangeGetter) + if !ok { + return nil + } + + elementType := multirangeValuer.IndexType() + + elementEncodePlan := m.PlanEncode(c.ElementType.OID, format, elementType) + if elementEncodePlan == nil { + return nil + } + + switch format { + case BinaryFormatCode: + return &encodePlanMultirangeCodecBinary{ac: c, m: m, oid: oid} + case TextFormatCode: + return &encodePlanMultirangeCodecText{ac: c, m: m, oid: oid} + } + + return nil +} + +type encodePlanMultirangeCodecText struct { + ac *MultirangeCodec + m *Map + oid uint32 +} + +func (p *encodePlanMultirangeCodecText) Encode(value any, buf []byte) (newBuf []byte, err error) { + multirange := value.(MultirangeGetter) + + if multirange.IsNull() { + return nil, nil + } + + elementCount := multirange.Len() + + buf = append(buf, '{') + + var encodePlan EncodePlan + var lastElemType reflect.Type + inElemBuf := make([]byte, 0, 32) + for i := 0; i < elementCount; i++ { + if i > 0 { + buf = append(buf, ',') + } + + elem := multirange.Index(i) + var elemBuf []byte + if elem != nil { + elemType := reflect.TypeOf(elem) + if lastElemType != elemType { + lastElemType = elemType + encodePlan = p.m.PlanEncode(p.ac.ElementType.OID, TextFormatCode, elem) + if encodePlan == nil { + return nil, fmt.Errorf("unable to encode %v", multirange.Index(i)) + } + } + elemBuf, err = encodePlan.Encode(elem, inElemBuf) + if err != nil { + return nil, err + } + } + + if elemBuf == nil { + return nil, fmt.Errorf("multirange cannot contain NULL element") + } else { + buf = append(buf, elemBuf...) + } + } + + buf = append(buf, '}') + + return buf, nil +} + +type encodePlanMultirangeCodecBinary struct { + ac *MultirangeCodec + m *Map + oid uint32 +} + +func (p *encodePlanMultirangeCodecBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + multirange := value.(MultirangeGetter) + + if multirange.IsNull() { + return nil, nil + } + + elementCount := multirange.Len() + + buf = pgio.AppendInt32(buf, int32(elementCount)) + + var encodePlan EncodePlan + var lastElemType reflect.Type + for i := 0; i < elementCount; i++ { + sp := len(buf) + buf = pgio.AppendInt32(buf, -1) + + elem := multirange.Index(i) + var elemBuf []byte + if elem != nil { + elemType := reflect.TypeOf(elem) + if lastElemType != elemType { + lastElemType = elemType + encodePlan = p.m.PlanEncode(p.ac.ElementType.OID, BinaryFormatCode, elem) + if encodePlan == nil { + return nil, fmt.Errorf("unable to encode %v", multirange.Index(i)) + } + } + elemBuf, err = encodePlan.Encode(elem, buf) + if err != nil { + return nil, err + } + } + + if elemBuf == nil { + return nil, fmt.Errorf("multirange cannot contain NULL element") + } else { + buf = elemBuf + pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) + } + } + + return buf, nil +} + +func (c *MultirangeCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + multirangeScanner, ok := target.(MultirangeSetter) + if !ok { + return nil + } + + elementType := multirangeScanner.ScanIndexType() + + elementScanPlan := m.PlanScan(c.ElementType.OID, format, elementType) + if _, ok := elementScanPlan.(*scanPlanFail); ok { + return nil + } + + return &scanPlanMultirangeCodec{ + multirangeCodec: c, + m: m, + oid: oid, + formatCode: format, + } +} + +func (c *MultirangeCodec) decodeBinary(m *Map, multirangeOID uint32, src []byte, multirange MultirangeSetter) error { + rp := 0 + + elementCount := int(binary.BigEndian.Uint32(src[rp:])) + rp += 4 + + err := multirange.SetLen(elementCount) + if err != nil { + return err + } + + if elementCount == 0 { + return nil + } + + elementScanPlan := c.ElementType.Codec.PlanScan(m, c.ElementType.OID, BinaryFormatCode, multirange.ScanIndex(0)) + if elementScanPlan == nil { + elementScanPlan = m.PlanScan(c.ElementType.OID, BinaryFormatCode, multirange.ScanIndex(0)) + } + + for i := 0; i < elementCount; i++ { + elem := multirange.ScanIndex(i) + elemLen := int(int32(binary.BigEndian.Uint32(src[rp:]))) + rp += 4 + var elemSrc []byte + if elemLen >= 0 { + elemSrc = src[rp : rp+elemLen] + rp += elemLen + } + err = elementScanPlan.Scan(elemSrc, elem) + if err != nil { + return fmt.Errorf("failed to scan multirange element %d: %w", i, err) + } + } + + return nil +} + +func (c *MultirangeCodec) decodeText(m *Map, multirangeOID uint32, src []byte, multirange MultirangeSetter) error { + elements, err := parseUntypedTextMultirange(src) + if err != nil { + return err + } + + err = multirange.SetLen(len(elements)) + if err != nil { + return err + } + + if len(elements) == 0 { + return nil + } + + elementScanPlan := c.ElementType.Codec.PlanScan(m, c.ElementType.OID, TextFormatCode, multirange.ScanIndex(0)) + if elementScanPlan == nil { + elementScanPlan = m.PlanScan(c.ElementType.OID, TextFormatCode, multirange.ScanIndex(0)) + } + + for i, s := range elements { + elem := multirange.ScanIndex(i) + err = elementScanPlan.Scan([]byte(s), elem) + if err != nil { + return err + } + } + + return nil +} + +type scanPlanMultirangeCodec struct { + multirangeCodec *MultirangeCodec + m *Map + oid uint32 + formatCode int16 + elementScanPlan ScanPlan +} + +func (spac *scanPlanMultirangeCodec) Scan(src []byte, dst any) error { + c := spac.multirangeCodec + m := spac.m + oid := spac.oid + formatCode := spac.formatCode + + multirange := dst.(MultirangeSetter) + + if src == nil { + return multirange.ScanNull() + } + + switch formatCode { + case BinaryFormatCode: + return c.decodeBinary(m, oid, src, multirange) + case TextFormatCode: + return c.decodeText(m, oid, src, multirange) + default: + return fmt.Errorf("unknown format code %d", formatCode) + } +} + +func (c *MultirangeCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + switch format { + case TextFormatCode: + return string(src), nil + case BinaryFormatCode: + buf := make([]byte, len(src)) + copy(buf, src) + return buf, nil + default: + return nil, fmt.Errorf("unknown format code %d", format) + } +} + +func (c *MultirangeCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var multirange Multirange[Range[any]] + err := m.PlanScan(oid, format, &multirange).Scan(src, &multirange) + return multirange, err +} + +func parseUntypedTextMultirange(src []byte) ([]string, error) { + elements := make([]string, 0) + + buf := bytes.NewBuffer(src) + + skipWhitespace(buf) + + r, _, err := buf.ReadRune() + if err != nil { + return nil, fmt.Errorf("invalid array: %w", err) + } + + if r != '{' { + return nil, fmt.Errorf("invalid multirange, expected '{' got %v", r) + } + +parseValueLoop: + for { + r, _, err = buf.ReadRune() + if err != nil { + return nil, fmt.Errorf("invalid multirange: %w", err) + } + + switch r { + case ',': // skip range separator + case '}': + break parseValueLoop + default: + buf.UnreadRune() + value, err := parseRange(buf) + if err != nil { + return nil, fmt.Errorf("invalid multirange value: %w", err) + } + elements = append(elements, value) + } + } + + skipWhitespace(buf) + + if buf.Len() > 0 { + return nil, fmt.Errorf("unexpected trailing data: %v", buf.String()) + } + + return elements, nil + +} + +func parseRange(buf *bytes.Buffer) (string, error) { + s := &bytes.Buffer{} + + boundSepRead := false + for { + r, _, err := buf.ReadRune() + if err != nil { + return "", err + } + + switch r { + case ',', '}': + if r == ',' && !boundSepRead { + boundSepRead = true + break + } + buf.UnreadRune() + return s.String(), nil + } + + s.WriteRune(r) + } +} + +// Multirange is a generic multirange type. +// +// T should implement RangeValuer and *T should implement RangeScanner. However, there does not appear to be a way to +// enforce the RangeScanner constraint. +type Multirange[T RangeValuer] []T + +func (r Multirange[T]) IsNull() bool { + return r == nil +} + +func (r Multirange[T]) Len() int { + return len(r) +} + +func (r Multirange[T]) Index(i int) any { + return r[i] +} + +func (r Multirange[T]) IndexType() any { + var zero T + return zero +} + +func (r *Multirange[T]) ScanNull() error { + *r = nil + return nil +} + +func (r *Multirange[T]) SetLen(n int) error { + *r = make([]T, n) + return nil +} + +func (r Multirange[T]) ScanIndex(i int) any { + return &r[i] +} + +func (r Multirange[T]) ScanIndexType() any { + return new(T) +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/numeric.go b/vendor/github.com/jackc/pgx/v5/pgtype/numeric.go new file mode 100644 index 0000000..4dbec78 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/numeric.go @@ -0,0 +1,823 @@ +package pgtype + +import ( + "bytes" + "database/sql/driver" + "encoding/binary" + "fmt" + "math" + "math/big" + "strconv" + "strings" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +// PostgreSQL internal numeric storage uses 16-bit "digits" with base of 10,000 +const nbase = 10000 + +const ( + pgNumericNaN = 0x00000000c0000000 + pgNumericNaNSign = 0xc000 + + pgNumericPosInf = 0x00000000d0000000 + pgNumericPosInfSign = 0xd000 + + pgNumericNegInf = 0x00000000f0000000 + pgNumericNegInfSign = 0xf000 +) + +var big0 *big.Int = big.NewInt(0) +var big1 *big.Int = big.NewInt(1) +var big10 *big.Int = big.NewInt(10) +var big100 *big.Int = big.NewInt(100) +var big1000 *big.Int = big.NewInt(1000) + +var bigNBase *big.Int = big.NewInt(nbase) +var bigNBaseX2 *big.Int = big.NewInt(nbase * nbase) +var bigNBaseX3 *big.Int = big.NewInt(nbase * nbase * nbase) +var bigNBaseX4 *big.Int = big.NewInt(nbase * nbase * nbase * nbase) + +type NumericScanner interface { + ScanNumeric(v Numeric) error +} + +type NumericValuer interface { + NumericValue() (Numeric, error) +} + +type Numeric struct { + Int *big.Int + Exp int32 + NaN bool + InfinityModifier InfinityModifier + Valid bool +} + +func (n *Numeric) ScanNumeric(v Numeric) error { + *n = v + return nil +} + +func (n Numeric) NumericValue() (Numeric, error) { + return n, nil +} + +func (n Numeric) Float64Value() (Float8, error) { + if !n.Valid { + return Float8{}, nil + } else if n.NaN { + return Float8{Float64: math.NaN(), Valid: true}, nil + } else if n.InfinityModifier == Infinity { + return Float8{Float64: math.Inf(1), Valid: true}, nil + } else if n.InfinityModifier == NegativeInfinity { + return Float8{Float64: math.Inf(-1), Valid: true}, nil + } + + buf := make([]byte, 0, 32) + + if n.Int == nil { + buf = append(buf, '0') + } else { + buf = append(buf, n.Int.String()...) + } + buf = append(buf, 'e') + buf = append(buf, strconv.FormatInt(int64(n.Exp), 10)...) + + f, err := strconv.ParseFloat(string(buf), 64) + if err != nil { + return Float8{}, err + } + + return Float8{Float64: f, Valid: true}, nil +} + +func (n *Numeric) ScanInt64(v Int8) error { + if !v.Valid { + *n = Numeric{} + return nil + } + + *n = Numeric{Int: big.NewInt(v.Int64), Valid: true} + return nil +} + +func (n Numeric) Int64Value() (Int8, error) { + if !n.Valid { + return Int8{}, nil + } + + bi, err := n.toBigInt() + if err != nil { + return Int8{}, err + } + + if !bi.IsInt64() { + return Int8{}, fmt.Errorf("cannot convert %v to int64", n) + } + + return Int8{Int64: bi.Int64(), Valid: true}, nil +} + +func (n *Numeric) ScanScientific(src string) error { + if !strings.ContainsAny("eE", src) { + return scanPlanTextAnyToNumericScanner{}.Scan([]byte(src), n) + } + + if bigF, ok := new(big.Float).SetString(string(src)); ok { + smallF, _ := bigF.Float64() + src = strconv.FormatFloat(smallF, 'f', -1, 64) + } + + num, exp, err := parseNumericString(src) + if err != nil { + return err + } + + *n = Numeric{Int: num, Exp: exp, Valid: true} + + return nil +} + +func (n *Numeric) toBigInt() (*big.Int, error) { + if n.Exp == 0 { + return n.Int, nil + } + + num := &big.Int{} + num.Set(n.Int) + if n.Exp > 0 { + mul := &big.Int{} + mul.Exp(big10, big.NewInt(int64(n.Exp)), nil) + num.Mul(num, mul) + return num, nil + } + + div := &big.Int{} + div.Exp(big10, big.NewInt(int64(-n.Exp)), nil) + remainder := &big.Int{} + num.DivMod(num, div, remainder) + if remainder.Cmp(big0) != 0 { + return nil, fmt.Errorf("cannot convert %v to integer", n) + } + return num, nil +} + +func parseNumericString(str string) (n *big.Int, exp int32, err error) { + idx := strings.IndexByte(str, '.') + + if idx == -1 { + for len(str) > 1 && str[len(str)-1] == '0' && str[len(str)-2] != '-' { + str = str[:len(str)-1] + exp++ + } + } else { + exp = int32(-(len(str) - idx - 1)) + str = str[:idx] + str[idx+1:] + } + + accum := &big.Int{} + if _, ok := accum.SetString(str, 10); !ok { + return nil, 0, fmt.Errorf("%s is not a number", str) + } + + return accum, exp, nil +} + +func nbaseDigitsToInt64(src []byte) (accum int64, bytesRead, digitsRead int) { + digits := len(src) / 2 + if digits > 4 { + digits = 4 + } + + rp := 0 + + for i := 0; i < digits; i++ { + if i > 0 { + accum *= nbase + } + accum += int64(binary.BigEndian.Uint16(src[rp:])) + rp += 2 + } + + return accum, rp, digits +} + +// Scan implements the database/sql Scanner interface. +func (n *Numeric) Scan(src any) error { + if src == nil { + *n = Numeric{} + return nil + } + + switch src := src.(type) { + case string: + return scanPlanTextAnyToNumericScanner{}.Scan([]byte(src), n) + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (n Numeric) Value() (driver.Value, error) { + if !n.Valid { + return nil, nil + } + + buf, err := NumericCodec{}.PlanEncode(nil, 0, TextFormatCode, n).Encode(n, nil) + if err != nil { + return nil, err + } + return string(buf), err +} + +func (n Numeric) MarshalJSON() ([]byte, error) { + if !n.Valid { + return []byte("null"), nil + } + + if n.NaN { + return []byte(`"NaN"`), nil + } + + return n.numberTextBytes(), nil +} + +func (n *Numeric) UnmarshalJSON(src []byte) error { + if bytes.Equal(src, []byte(`null`)) { + *n = Numeric{} + return nil + } + if bytes.Equal(src, []byte(`"NaN"`)) { + *n = Numeric{NaN: true, Valid: true} + return nil + } + return scanPlanTextAnyToNumericScanner{}.Scan(src, n) +} + +// numberString returns a string of the number. undefined if NaN, infinite, or NULL +func (n Numeric) numberTextBytes() []byte { + intStr := n.Int.String() + + buf := &bytes.Buffer{} + + if len(intStr) > 0 && intStr[:1] == "-" { + intStr = intStr[1:] + buf.WriteByte('-') + } + + exp := int(n.Exp) + if exp > 0 { + buf.WriteString(intStr) + for i := 0; i < exp; i++ { + buf.WriteByte('0') + } + } else if exp < 0 { + if len(intStr) <= -exp { + buf.WriteString("0.") + leadingZeros := -exp - len(intStr) + for i := 0; i < leadingZeros; i++ { + buf.WriteByte('0') + } + buf.WriteString(intStr) + } else if len(intStr) > -exp { + dpPos := len(intStr) + exp + buf.WriteString(intStr[:dpPos]) + buf.WriteByte('.') + buf.WriteString(intStr[dpPos:]) + } + } else { + buf.WriteString(intStr) + } + + return buf.Bytes() +} + +type NumericCodec struct{} + +func (NumericCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (NumericCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (NumericCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + switch format { + case BinaryFormatCode: + switch value.(type) { + case NumericValuer: + return encodePlanNumericCodecBinaryNumericValuer{} + case Float64Valuer: + return encodePlanNumericCodecBinaryFloat64Valuer{} + case Int64Valuer: + return encodePlanNumericCodecBinaryInt64Valuer{} + } + case TextFormatCode: + switch value.(type) { + case NumericValuer: + return encodePlanNumericCodecTextNumericValuer{} + case Float64Valuer: + return encodePlanNumericCodecTextFloat64Valuer{} + case Int64Valuer: + return encodePlanNumericCodecTextInt64Valuer{} + } + } + + return nil +} + +type encodePlanNumericCodecBinaryNumericValuer struct{} + +func (encodePlanNumericCodecBinaryNumericValuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(NumericValuer).NumericValue() + if err != nil { + return nil, err + } + + return encodeNumericBinary(n, buf) +} + +type encodePlanNumericCodecBinaryFloat64Valuer struct{} + +func (encodePlanNumericCodecBinaryFloat64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(Float64Valuer).Float64Value() + if err != nil { + return nil, err + } + + if !n.Valid { + return nil, nil + } + + if math.IsNaN(n.Float64) { + return encodeNumericBinary(Numeric{NaN: true, Valid: true}, buf) + } else if math.IsInf(n.Float64, 1) { + return encodeNumericBinary(Numeric{InfinityModifier: Infinity, Valid: true}, buf) + } else if math.IsInf(n.Float64, -1) { + return encodeNumericBinary(Numeric{InfinityModifier: NegativeInfinity, Valid: true}, buf) + } + num, exp, err := parseNumericString(strconv.FormatFloat(n.Float64, 'f', -1, 64)) + if err != nil { + return nil, err + } + + return encodeNumericBinary(Numeric{Int: num, Exp: exp, Valid: true}, buf) +} + +type encodePlanNumericCodecBinaryInt64Valuer struct{} + +func (encodePlanNumericCodecBinaryInt64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(Int64Valuer).Int64Value() + if err != nil { + return nil, err + } + + if !n.Valid { + return nil, nil + } + + return encodeNumericBinary(Numeric{Int: big.NewInt(n.Int64), Valid: true}, buf) +} + +func encodeNumericBinary(n Numeric, buf []byte) (newBuf []byte, err error) { + if !n.Valid { + return nil, nil + } + + if n.NaN { + buf = pgio.AppendUint64(buf, pgNumericNaN) + return buf, nil + } else if n.InfinityModifier == Infinity { + buf = pgio.AppendUint64(buf, pgNumericPosInf) + return buf, nil + } else if n.InfinityModifier == NegativeInfinity { + buf = pgio.AppendUint64(buf, pgNumericNegInf) + return buf, nil + } + + var sign int16 + if n.Int.Cmp(big0) < 0 { + sign = 16384 + } + + absInt := &big.Int{} + wholePart := &big.Int{} + fracPart := &big.Int{} + remainder := &big.Int{} + absInt.Abs(n.Int) + + // Normalize absInt and exp to where exp is always a multiple of 4. This makes + // converting to 16-bit base 10,000 digits easier. + var exp int32 + switch n.Exp % 4 { + case 1, -3: + exp = n.Exp - 1 + absInt.Mul(absInt, big10) + case 2, -2: + exp = n.Exp - 2 + absInt.Mul(absInt, big100) + case 3, -1: + exp = n.Exp - 3 + absInt.Mul(absInt, big1000) + default: + exp = n.Exp + } + + if exp < 0 { + divisor := &big.Int{} + divisor.Exp(big10, big.NewInt(int64(-exp)), nil) + wholePart.DivMod(absInt, divisor, fracPart) + fracPart.Add(fracPart, divisor) + } else { + wholePart = absInt + } + + var wholeDigits, fracDigits []int16 + + for wholePart.Cmp(big0) != 0 { + wholePart.DivMod(wholePart, bigNBase, remainder) + wholeDigits = append(wholeDigits, int16(remainder.Int64())) + } + + if fracPart.Cmp(big0) != 0 { + for fracPart.Cmp(big1) != 0 { + fracPart.DivMod(fracPart, bigNBase, remainder) + fracDigits = append(fracDigits, int16(remainder.Int64())) + } + } + + buf = pgio.AppendInt16(buf, int16(len(wholeDigits)+len(fracDigits))) + + var weight int16 + if len(wholeDigits) > 0 { + weight = int16(len(wholeDigits) - 1) + if exp > 0 { + weight += int16(exp / 4) + } + } else { + weight = int16(exp/4) - 1 + int16(len(fracDigits)) + } + buf = pgio.AppendInt16(buf, weight) + + buf = pgio.AppendInt16(buf, sign) + + var dscale int16 + if n.Exp < 0 { + dscale = int16(-n.Exp) + } + buf = pgio.AppendInt16(buf, dscale) + + for i := len(wholeDigits) - 1; i >= 0; i-- { + buf = pgio.AppendInt16(buf, wholeDigits[i]) + } + + for i := len(fracDigits) - 1; i >= 0; i-- { + buf = pgio.AppendInt16(buf, fracDigits[i]) + } + + return buf, nil +} + +type encodePlanNumericCodecTextNumericValuer struct{} + +func (encodePlanNumericCodecTextNumericValuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(NumericValuer).NumericValue() + if err != nil { + return nil, err + } + + return encodeNumericText(n, buf) +} + +type encodePlanNumericCodecTextFloat64Valuer struct{} + +func (encodePlanNumericCodecTextFloat64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(Float64Valuer).Float64Value() + if err != nil { + return nil, err + } + + if !n.Valid { + return nil, nil + } + + if math.IsNaN(n.Float64) { + buf = append(buf, "NaN"...) + } else if math.IsInf(n.Float64, 1) { + buf = append(buf, "Infinity"...) + } else if math.IsInf(n.Float64, -1) { + buf = append(buf, "-Infinity"...) + } else { + buf = append(buf, strconv.FormatFloat(n.Float64, 'f', -1, 64)...) + } + return buf, nil +} + +type encodePlanNumericCodecTextInt64Valuer struct{} + +func (encodePlanNumericCodecTextInt64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + n, err := value.(Int64Valuer).Int64Value() + if err != nil { + return nil, err + } + + if !n.Valid { + return nil, nil + } + + buf = append(buf, strconv.FormatInt(n.Int64, 10)...) + return buf, nil +} + +func encodeNumericText(n Numeric, buf []byte) (newBuf []byte, err error) { + if !n.Valid { + return nil, nil + } + + if n.NaN { + buf = append(buf, "NaN"...) + return buf, nil + } else if n.InfinityModifier == Infinity { + buf = append(buf, "Infinity"...) + return buf, nil + } else if n.InfinityModifier == NegativeInfinity { + buf = append(buf, "-Infinity"...) + return buf, nil + } + + buf = append(buf, n.numberTextBytes()...) + + return buf, nil +} + +func (NumericCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case NumericScanner: + return scanPlanBinaryNumericToNumericScanner{} + case Float64Scanner: + return scanPlanBinaryNumericToFloat64Scanner{} + case Int64Scanner: + return scanPlanBinaryNumericToInt64Scanner{} + case TextScanner: + return scanPlanBinaryNumericToTextScanner{} + } + case TextFormatCode: + switch target.(type) { + case NumericScanner: + return scanPlanTextAnyToNumericScanner{} + case Float64Scanner: + return scanPlanTextAnyToFloat64Scanner{} + case Int64Scanner: + return scanPlanTextAnyToInt64Scanner{} + } + } + + return nil +} + +type scanPlanBinaryNumericToNumericScanner struct{} + +func (scanPlanBinaryNumericToNumericScanner) Scan(src []byte, dst any) error { + scanner := (dst).(NumericScanner) + + if src == nil { + return scanner.ScanNumeric(Numeric{}) + } + + if len(src) < 8 { + return fmt.Errorf("numeric incomplete %v", src) + } + + rp := 0 + ndigits := binary.BigEndian.Uint16(src[rp:]) + rp += 2 + weight := int16(binary.BigEndian.Uint16(src[rp:])) + rp += 2 + sign := binary.BigEndian.Uint16(src[rp:]) + rp += 2 + dscale := int16(binary.BigEndian.Uint16(src[rp:])) + rp += 2 + + if sign == pgNumericNaNSign { + return scanner.ScanNumeric(Numeric{NaN: true, Valid: true}) + } else if sign == pgNumericPosInfSign { + return scanner.ScanNumeric(Numeric{InfinityModifier: Infinity, Valid: true}) + } else if sign == pgNumericNegInfSign { + return scanner.ScanNumeric(Numeric{InfinityModifier: NegativeInfinity, Valid: true}) + } + + if ndigits == 0 { + return scanner.ScanNumeric(Numeric{Int: big.NewInt(0), Valid: true}) + } + + if len(src[rp:]) < int(ndigits)*2 { + return fmt.Errorf("numeric incomplete %v", src) + } + + accum := &big.Int{} + + for i := 0; i < int(ndigits+3)/4; i++ { + int64accum, bytesRead, digitsRead := nbaseDigitsToInt64(src[rp:]) + rp += bytesRead + + if i > 0 { + var mul *big.Int + switch digitsRead { + case 1: + mul = bigNBase + case 2: + mul = bigNBaseX2 + case 3: + mul = bigNBaseX3 + case 4: + mul = bigNBaseX4 + default: + return fmt.Errorf("invalid digitsRead: %d (this can't happen)", digitsRead) + } + accum.Mul(accum, mul) + } + + accum.Add(accum, big.NewInt(int64accum)) + } + + exp := (int32(weight) - int32(ndigits) + 1) * 4 + + if dscale > 0 { + fracNBaseDigits := int16(int32(ndigits) - int32(weight) - 1) + fracDecimalDigits := fracNBaseDigits * 4 + + if dscale > fracDecimalDigits { + multCount := int(dscale - fracDecimalDigits) + for i := 0; i < multCount; i++ { + accum.Mul(accum, big10) + exp-- + } + } else if dscale < fracDecimalDigits { + divCount := int(fracDecimalDigits - dscale) + for i := 0; i < divCount; i++ { + accum.Div(accum, big10) + exp++ + } + } + } + + reduced := &big.Int{} + remainder := &big.Int{} + if exp >= 0 { + for { + reduced.DivMod(accum, big10, remainder) + if remainder.Cmp(big0) != 0 { + break + } + accum.Set(reduced) + exp++ + } + } + + if sign != 0 { + accum.Neg(accum) + } + + return scanner.ScanNumeric(Numeric{Int: accum, Exp: exp, Valid: true}) +} + +type scanPlanBinaryNumericToFloat64Scanner struct{} + +func (scanPlanBinaryNumericToFloat64Scanner) Scan(src []byte, dst any) error { + scanner := (dst).(Float64Scanner) + + if src == nil { + return scanner.ScanFloat64(Float8{}) + } + + var n Numeric + + err := scanPlanBinaryNumericToNumericScanner{}.Scan(src, &n) + if err != nil { + return err + } + + f8, err := n.Float64Value() + if err != nil { + return err + } + + return scanner.ScanFloat64(f8) +} + +type scanPlanBinaryNumericToInt64Scanner struct{} + +func (scanPlanBinaryNumericToInt64Scanner) Scan(src []byte, dst any) error { + scanner := (dst).(Int64Scanner) + + if src == nil { + return scanner.ScanInt64(Int8{}) + } + + var n Numeric + + err := scanPlanBinaryNumericToNumericScanner{}.Scan(src, &n) + if err != nil { + return err + } + + bigInt, err := n.toBigInt() + if err != nil { + return err + } + + if !bigInt.IsInt64() { + return fmt.Errorf("%v is out of range for int64", bigInt) + } + + return scanner.ScanInt64(Int8{Int64: bigInt.Int64(), Valid: true}) +} + +type scanPlanBinaryNumericToTextScanner struct{} + +func (scanPlanBinaryNumericToTextScanner) Scan(src []byte, dst any) error { + scanner := (dst).(TextScanner) + + if src == nil { + return scanner.ScanText(Text{}) + } + + var n Numeric + + err := scanPlanBinaryNumericToNumericScanner{}.Scan(src, &n) + if err != nil { + return err + } + + sbuf, err := encodeNumericText(n, nil) + if err != nil { + return err + } + + return scanner.ScanText(Text{String: string(sbuf), Valid: true}) +} + +type scanPlanTextAnyToNumericScanner struct{} + +func (scanPlanTextAnyToNumericScanner) Scan(src []byte, dst any) error { + scanner := (dst).(NumericScanner) + + if src == nil { + return scanner.ScanNumeric(Numeric{}) + } + + if string(src) == "NaN" { + return scanner.ScanNumeric(Numeric{NaN: true, Valid: true}) + } else if string(src) == "Infinity" { + return scanner.ScanNumeric(Numeric{InfinityModifier: Infinity, Valid: true}) + } else if string(src) == "-Infinity" { + return scanner.ScanNumeric(Numeric{InfinityModifier: NegativeInfinity, Valid: true}) + } + + num, exp, err := parseNumericString(string(src)) + if err != nil { + return err + } + + return scanner.ScanNumeric(Numeric{Int: num, Exp: exp, Valid: true}) +} + +func (c NumericCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + if format == TextFormatCode { + return string(src), nil + } + + var n Numeric + err := codecScan(c, m, oid, format, src, &n) + if err != nil { + return nil, err + } + + buf, err := m.Encode(oid, TextFormatCode, n, nil) + if err != nil { + return nil, err + } + return string(buf), nil +} + +func (c NumericCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var n Numeric + err := codecScan(c, m, oid, format, src, &n) + if err != nil { + return nil, err + } + return n, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/path.go b/vendor/github.com/jackc/pgx/v5/pgtype/path.go new file mode 100644 index 0000000..73e0ec5 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/path.go @@ -0,0 +1,272 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "fmt" + "math" + "strconv" + "strings" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +type PathScanner interface { + ScanPath(v Path) error +} + +type PathValuer interface { + PathValue() (Path, error) +} + +type Path struct { + P []Vec2 + Closed bool + Valid bool +} + +func (path *Path) ScanPath(v Path) error { + *path = v + return nil +} + +func (path Path) PathValue() (Path, error) { + return path, nil +} + +// Scan implements the database/sql Scanner interface. +func (path *Path) Scan(src any) error { + if src == nil { + *path = Path{} + return nil + } + + switch src := src.(type) { + case string: + return scanPlanTextAnyToPathScanner{}.Scan([]byte(src), path) + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (path Path) Value() (driver.Value, error) { + if !path.Valid { + return nil, nil + } + + buf, err := PathCodec{}.PlanEncode(nil, 0, TextFormatCode, path).Encode(path, nil) + if err != nil { + return nil, err + } + + return string(buf), err +} + +type PathCodec struct{} + +func (PathCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (PathCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (PathCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + if _, ok := value.(PathValuer); !ok { + return nil + } + + switch format { + case BinaryFormatCode: + return encodePlanPathCodecBinary{} + case TextFormatCode: + return encodePlanPathCodecText{} + } + + return nil +} + +type encodePlanPathCodecBinary struct{} + +func (encodePlanPathCodecBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + path, err := value.(PathValuer).PathValue() + if err != nil { + return nil, err + } + + if !path.Valid { + return nil, nil + } + + var closeByte byte + if path.Closed { + closeByte = 1 + } + buf = append(buf, closeByte) + + buf = pgio.AppendInt32(buf, int32(len(path.P))) + + for _, p := range path.P { + buf = pgio.AppendUint64(buf, math.Float64bits(p.X)) + buf = pgio.AppendUint64(buf, math.Float64bits(p.Y)) + } + + return buf, nil +} + +type encodePlanPathCodecText struct{} + +func (encodePlanPathCodecText) Encode(value any, buf []byte) (newBuf []byte, err error) { + path, err := value.(PathValuer).PathValue() + if err != nil { + return nil, err + } + + if !path.Valid { + return nil, nil + } + + var startByte, endByte byte + if path.Closed { + startByte = '(' + endByte = ')' + } else { + startByte = '[' + endByte = ']' + } + buf = append(buf, startByte) + + for i, p := range path.P { + if i > 0 { + buf = append(buf, ',') + } + buf = append(buf, fmt.Sprintf(`(%s,%s)`, + strconv.FormatFloat(p.X, 'f', -1, 64), + strconv.FormatFloat(p.Y, 'f', -1, 64), + )...) + } + + buf = append(buf, endByte) + + return buf, nil +} + +func (PathCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case PathScanner: + return scanPlanBinaryPathToPathScanner{} + } + case TextFormatCode: + switch target.(type) { + case PathScanner: + return scanPlanTextAnyToPathScanner{} + } + } + + return nil +} + +type scanPlanBinaryPathToPathScanner struct{} + +func (scanPlanBinaryPathToPathScanner) Scan(src []byte, dst any) error { + scanner := (dst).(PathScanner) + + if src == nil { + return scanner.ScanPath(Path{}) + } + + if len(src) < 5 { + return fmt.Errorf("invalid length for Path: %v", len(src)) + } + + closed := src[0] == 1 + pointCount := int(binary.BigEndian.Uint32(src[1:])) + + rp := 5 + + if 5+pointCount*16 != len(src) { + return fmt.Errorf("invalid length for Path with %d points: %v", pointCount, len(src)) + } + + points := make([]Vec2, pointCount) + for i := 0; i < len(points); i++ { + x := binary.BigEndian.Uint64(src[rp:]) + rp += 8 + y := binary.BigEndian.Uint64(src[rp:]) + rp += 8 + points[i] = Vec2{math.Float64frombits(x), math.Float64frombits(y)} + } + + return scanner.ScanPath(Path{ + P: points, + Closed: closed, + Valid: true, + }) +} + +type scanPlanTextAnyToPathScanner struct{} + +func (scanPlanTextAnyToPathScanner) Scan(src []byte, dst any) error { + scanner := (dst).(PathScanner) + + if src == nil { + return scanner.ScanPath(Path{}) + } + + if len(src) < 7 { + return fmt.Errorf("invalid length for Path: %v", len(src)) + } + + closed := src[0] == '(' + points := make([]Vec2, 0) + + str := string(src[2:]) + + for { + end := strings.IndexByte(str, ',') + x, err := strconv.ParseFloat(str[:end], 64) + if err != nil { + return err + } + + str = str[end+1:] + end = strings.IndexByte(str, ')') + + y, err := strconv.ParseFloat(str[:end], 64) + if err != nil { + return err + } + + points = append(points, Vec2{x, y}) + + if end+3 < len(str) { + str = str[end+3:] + } else { + break + } + } + + return scanner.ScanPath(Path{P: points, Closed: closed, Valid: true}) +} + +func (c PathCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return codecDecodeToTextFormat(c, m, oid, format, src) +} + +func (c PathCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var path Path + err := codecScan(c, m, oid, format, src, &path) + if err != nil { + return nil, err + } + return path, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/pgtype.go b/vendor/github.com/jackc/pgx/v5/pgtype/pgtype.go new file mode 100644 index 0000000..bdd9f05 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/pgtype.go @@ -0,0 +1,2065 @@ +package pgtype + +import ( + "database/sql" + "database/sql/driver" + "errors" + "fmt" + "net" + "net/netip" + "reflect" + "time" +) + +// PostgreSQL oids for common types +const ( + BoolOID = 16 + ByteaOID = 17 + QCharOID = 18 + NameOID = 19 + Int8OID = 20 + Int2OID = 21 + Int4OID = 23 + TextOID = 25 + OIDOID = 26 + TIDOID = 27 + XIDOID = 28 + CIDOID = 29 + JSONOID = 114 + XMLOID = 142 + XMLArrayOID = 143 + JSONArrayOID = 199 + PointOID = 600 + LsegOID = 601 + PathOID = 602 + BoxOID = 603 + PolygonOID = 604 + LineOID = 628 + LineArrayOID = 629 + CIDROID = 650 + CIDRArrayOID = 651 + Float4OID = 700 + Float8OID = 701 + CircleOID = 718 + CircleArrayOID = 719 + UnknownOID = 705 + Macaddr8OID = 774 + MacaddrOID = 829 + InetOID = 869 + BoolArrayOID = 1000 + QCharArrayOID = 1002 + NameArrayOID = 1003 + Int2ArrayOID = 1005 + Int4ArrayOID = 1007 + TextArrayOID = 1009 + TIDArrayOID = 1010 + ByteaArrayOID = 1001 + XIDArrayOID = 1011 + CIDArrayOID = 1012 + BPCharArrayOID = 1014 + VarcharArrayOID = 1015 + Int8ArrayOID = 1016 + PointArrayOID = 1017 + LsegArrayOID = 1018 + PathArrayOID = 1019 + BoxArrayOID = 1020 + Float4ArrayOID = 1021 + Float8ArrayOID = 1022 + PolygonArrayOID = 1027 + OIDArrayOID = 1028 + ACLItemOID = 1033 + ACLItemArrayOID = 1034 + MacaddrArrayOID = 1040 + InetArrayOID = 1041 + BPCharOID = 1042 + VarcharOID = 1043 + DateOID = 1082 + TimeOID = 1083 + TimestampOID = 1114 + TimestampArrayOID = 1115 + DateArrayOID = 1182 + TimeArrayOID = 1183 + TimestamptzOID = 1184 + TimestamptzArrayOID = 1185 + IntervalOID = 1186 + IntervalArrayOID = 1187 + NumericArrayOID = 1231 + TimetzOID = 1266 + TimetzArrayOID = 1270 + BitOID = 1560 + BitArrayOID = 1561 + VarbitOID = 1562 + VarbitArrayOID = 1563 + NumericOID = 1700 + RecordOID = 2249 + RecordArrayOID = 2287 + UUIDOID = 2950 + UUIDArrayOID = 2951 + JSONBOID = 3802 + JSONBArrayOID = 3807 + DaterangeOID = 3912 + DaterangeArrayOID = 3913 + Int4rangeOID = 3904 + Int4rangeArrayOID = 3905 + NumrangeOID = 3906 + NumrangeArrayOID = 3907 + TsrangeOID = 3908 + TsrangeArrayOID = 3909 + TstzrangeOID = 3910 + TstzrangeArrayOID = 3911 + Int8rangeOID = 3926 + Int8rangeArrayOID = 3927 + JSONPathOID = 4072 + JSONPathArrayOID = 4073 + Int4multirangeOID = 4451 + NummultirangeOID = 4532 + TsmultirangeOID = 4533 + TstzmultirangeOID = 4534 + DatemultirangeOID = 4535 + Int8multirangeOID = 4536 + Int4multirangeArrayOID = 6150 + NummultirangeArrayOID = 6151 + TsmultirangeArrayOID = 6152 + TstzmultirangeArrayOID = 6153 + DatemultirangeArrayOID = 6155 + Int8multirangeArrayOID = 6157 +) + +type InfinityModifier int8 + +const ( + Infinity InfinityModifier = 1 + Finite InfinityModifier = 0 + NegativeInfinity InfinityModifier = -Infinity +) + +func (im InfinityModifier) String() string { + switch im { + case Finite: + return "finite" + case Infinity: + return "infinity" + case NegativeInfinity: + return "-infinity" + default: + return "invalid" + } +} + +// PostgreSQL format codes +const ( + TextFormatCode = 0 + BinaryFormatCode = 1 +) + +// A Codec converts between Go and PostgreSQL values. A Codec must not be mutated after it is registered with a Map. +type Codec interface { + // FormatSupported returns true if the format is supported. + FormatSupported(int16) bool + + // PreferredFormat returns the preferred format. + PreferredFormat() int16 + + // PlanEncode returns an EncodePlan for encoding value into PostgreSQL format for oid and format. If no plan can be + // found then nil is returned. + PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan + + // PlanScan returns a ScanPlan for scanning a PostgreSQL value into a destination with the same type as target. If + // no plan can be found then nil is returned. + PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan + + // DecodeDatabaseSQLValue returns src decoded into a value compatible with the sql.Scanner interface. + DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) + + // DecodeValue returns src decoded into its default format. + DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) +} + +type nullAssignmentError struct { + dst any +} + +func (e *nullAssignmentError) Error() string { + return fmt.Sprintf("cannot assign NULL to %T", e.dst) +} + +// Type represents a PostgreSQL data type. It must not be mutated after it is registered with a Map. +type Type struct { + Codec Codec + Name string + OID uint32 +} + +// Map is the mapping between PostgreSQL server types and Go type handling logic. It can encode values for +// transmission to a PostgreSQL server and scan received values. +type Map struct { + oidToType map[uint32]*Type + nameToType map[string]*Type + reflectTypeToName map[reflect.Type]string + oidToFormatCode map[uint32]int16 + + reflectTypeToType map[reflect.Type]*Type + + memoizedScanPlans map[uint32]map[reflect.Type][2]ScanPlan + memoizedEncodePlans map[uint32]map[reflect.Type][2]EncodePlan + + // TryWrapEncodePlanFuncs is a slice of functions that will wrap a value that cannot be encoded by the Codec. Every + // time a wrapper is found the PlanEncode method will be recursively called with the new value. This allows several layers of wrappers + // to be built up. There are default functions placed in this slice by NewMap(). In most cases these functions + // should run last. i.e. Additional functions should typically be prepended not appended. + TryWrapEncodePlanFuncs []TryWrapEncodePlanFunc + + // TryWrapScanPlanFuncs is a slice of functions that will wrap a target that cannot be scanned into by the Codec. Every + // time a wrapper is found the PlanScan method will be recursively called with the new target. This allows several layers of wrappers + // to be built up. There are default functions placed in this slice by NewMap(). In most cases these functions + // should run last. i.e. Additional functions should typically be prepended not appended. + TryWrapScanPlanFuncs []TryWrapScanPlanFunc +} + +// Copy returns a new Map containing the same registered types. +func (m *Map) Copy() *Map { + newMap := NewMap() + for _, type_ := range m.oidToType { + newMap.RegisterType(type_) + } + return newMap +} + +func NewMap() *Map { + defaultMapInitOnce.Do(initDefaultMap) + + return &Map{ + oidToType: make(map[uint32]*Type), + nameToType: make(map[string]*Type), + reflectTypeToName: make(map[reflect.Type]string), + oidToFormatCode: make(map[uint32]int16), + + memoizedScanPlans: make(map[uint32]map[reflect.Type][2]ScanPlan), + memoizedEncodePlans: make(map[uint32]map[reflect.Type][2]EncodePlan), + + TryWrapEncodePlanFuncs: []TryWrapEncodePlanFunc{ + TryWrapDerefPointerEncodePlan, + TryWrapBuiltinTypeEncodePlan, + TryWrapFindUnderlyingTypeEncodePlan, + TryWrapStructEncodePlan, + TryWrapSliceEncodePlan, + TryWrapMultiDimSliceEncodePlan, + TryWrapArrayEncodePlan, + }, + + TryWrapScanPlanFuncs: []TryWrapScanPlanFunc{ + TryPointerPointerScanPlan, + TryWrapBuiltinTypeScanPlan, + TryFindUnderlyingTypeScanPlan, + TryWrapStructScanPlan, + TryWrapPtrSliceScanPlan, + TryWrapPtrMultiDimSliceScanPlan, + TryWrapPtrArrayScanPlan, + }, + } +} + +// RegisterTypes registers multiple data types in the sequence they are provided. +func (m *Map) RegisterTypes(types []*Type) { + for _, t := range types { + m.RegisterType(t) + } +} + +// RegisterType registers a data type with the Map. t must not be mutated after it is registered. +func (m *Map) RegisterType(t *Type) { + m.oidToType[t.OID] = t + m.nameToType[t.Name] = t + m.oidToFormatCode[t.OID] = t.Codec.PreferredFormat() + + // Invalidated by type registration + m.reflectTypeToType = nil + for k := range m.memoizedScanPlans { + delete(m.memoizedScanPlans, k) + } + for k := range m.memoizedEncodePlans { + delete(m.memoizedEncodePlans, k) + } +} + +// RegisterDefaultPgType registers a mapping of a Go type to a PostgreSQL type name. Typically the data type to be +// encoded or decoded is determined by the PostgreSQL OID. But if the OID of a value to be encoded or decoded is +// unknown, this additional mapping will be used by TypeForValue to determine a suitable data type. +func (m *Map) RegisterDefaultPgType(value any, name string) { + m.reflectTypeToName[reflect.TypeOf(value)] = name + + // Invalidated by type registration + m.reflectTypeToType = nil + for k := range m.memoizedScanPlans { + delete(m.memoizedScanPlans, k) + } + for k := range m.memoizedEncodePlans { + delete(m.memoizedEncodePlans, k) + } +} + +// TypeForOID returns the Type registered for the given OID. The returned Type must not be mutated. +func (m *Map) TypeForOID(oid uint32) (*Type, bool) { + if dt, ok := m.oidToType[oid]; ok { + return dt, true + } + + dt, ok := defaultMap.oidToType[oid] + return dt, ok +} + +// TypeForName returns the Type registered for the given name. The returned Type must not be mutated. +func (m *Map) TypeForName(name string) (*Type, bool) { + if dt, ok := m.nameToType[name]; ok { + return dt, true + } + dt, ok := defaultMap.nameToType[name] + return dt, ok +} + +func (m *Map) buildReflectTypeToType() { + m.reflectTypeToType = make(map[reflect.Type]*Type) + + for reflectType, name := range m.reflectTypeToName { + if dt, ok := m.TypeForName(name); ok { + m.reflectTypeToType[reflectType] = dt + } + } +} + +// TypeForValue finds a data type suitable for v. Use RegisterType to register types that can encode and decode +// themselves. Use RegisterDefaultPgType to register that can be handled by a registered data type. The returned Type +// must not be mutated. +func (m *Map) TypeForValue(v any) (*Type, bool) { + if m.reflectTypeToType == nil { + m.buildReflectTypeToType() + } + + if dt, ok := m.reflectTypeToType[reflect.TypeOf(v)]; ok { + return dt, true + } + + dt, ok := defaultMap.reflectTypeToType[reflect.TypeOf(v)] + return dt, ok +} + +// FormatCodeForOID returns the preferred format code for type oid. If the type is not registered it returns the text +// format code. +func (m *Map) FormatCodeForOID(oid uint32) int16 { + if fc, ok := m.oidToFormatCode[oid]; ok { + return fc + } + + if fc, ok := defaultMap.oidToFormatCode[oid]; ok { + return fc + } + + return TextFormatCode +} + +// EncodePlan is a precompiled plan to encode a particular type into a particular OID and format. +type EncodePlan interface { + // Encode appends the encoded bytes of value to buf. If value is the SQL value NULL then append nothing and return + // (nil, nil). The caller of Encode is responsible for writing the correct NULL value or the length of the data + // written. + Encode(value any, buf []byte) (newBuf []byte, err error) +} + +// ScanPlan is a precompiled plan to scan into a type of destination. +type ScanPlan interface { + // Scan scans src into target. src is only valid during the call to Scan. The ScanPlan must not retain a reference to + // src. + Scan(src []byte, target any) error +} + +type scanPlanCodecSQLScanner struct { + c Codec + m *Map + oid uint32 + formatCode int16 +} + +func (plan *scanPlanCodecSQLScanner) Scan(src []byte, dst any) error { + value, err := plan.c.DecodeDatabaseSQLValue(plan.m, plan.oid, plan.formatCode, src) + if err != nil { + return err + } + + scanner := dst.(sql.Scanner) + return scanner.Scan(value) +} + +type scanPlanSQLScanner struct { + formatCode int16 +} + +func (plan *scanPlanSQLScanner) Scan(src []byte, dst any) error { + scanner := dst.(sql.Scanner) + if src == nil { + // This is necessary because interface value []byte:nil does not equal nil:nil for the binary format path and the + // text format path would be converted to empty string. + return scanner.Scan(nil) + } else if plan.formatCode == BinaryFormatCode { + return scanner.Scan(src) + } else { + return scanner.Scan(string(src)) + } +} + +type scanPlanString struct{} + +func (scanPlanString) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + p := (dst).(*string) + *p = string(src) + return nil +} + +type scanPlanAnyTextToBytes struct{} + +func (scanPlanAnyTextToBytes) Scan(src []byte, dst any) error { + dstBuf := dst.(*[]byte) + if src == nil { + *dstBuf = nil + return nil + } + + *dstBuf = make([]byte, len(src)) + copy(*dstBuf, src) + return nil +} + +type scanPlanFail struct { + m *Map + oid uint32 + formatCode int16 +} + +func (plan *scanPlanFail) Scan(src []byte, dst any) error { + // If src is NULL it might be possible to scan into dst even though it is the types are not compatible. While this + // may seem to be a contrived case it can occur when selecting NULL directly. PostgreSQL assigns it the type of text. + // It would be surprising to the caller to have to cast the NULL (e.g. `select null::int`). So try to figure out a + // compatible data type for dst and scan with that. + // + // See https://github.com/jackc/pgx/issues/1326 + if src == nil { + // As a horrible hack try all types to find anything that can scan into dst. + for oid := range plan.m.oidToType { + // using planScan instead of Scan or PlanScan to avoid polluting the planned scan cache. + plan := plan.m.planScan(oid, plan.formatCode, dst) + if _, ok := plan.(*scanPlanFail); !ok { + return plan.Scan(src, dst) + } + } + for oid := range defaultMap.oidToType { + if _, ok := plan.m.oidToType[oid]; !ok { + plan := plan.m.planScan(oid, plan.formatCode, dst) + if _, ok := plan.(*scanPlanFail); !ok { + return plan.Scan(src, dst) + } + } + } + } + + var format string + switch plan.formatCode { + case TextFormatCode: + format = "text" + case BinaryFormatCode: + format = "binary" + default: + format = fmt.Sprintf("unknown %d", plan.formatCode) + } + + var dataTypeName string + if t, ok := plan.m.TypeForOID(plan.oid); ok { + dataTypeName = t.Name + } else { + dataTypeName = "unknown type" + } + + return fmt.Errorf("cannot scan %s (OID %d) in %v format into %T", dataTypeName, plan.oid, format, dst) +} + +// TryWrapScanPlanFunc is a function that tries to create a wrapper plan for target. If successful it returns a plan +// that will convert the target passed to Scan and then call the next plan. nextTarget is target as it will be converted +// by plan. It must be used to find another suitable ScanPlan. When it is found SetNext must be called on plan for it +// to be usabled. ok indicates if a suitable wrapper was found. +type TryWrapScanPlanFunc func(target any) (plan WrappedScanPlanNextSetter, nextTarget any, ok bool) + +type pointerPointerScanPlan struct { + dstType reflect.Type + next ScanPlan +} + +func (plan *pointerPointerScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *pointerPointerScanPlan) Scan(src []byte, dst any) error { + el := reflect.ValueOf(dst).Elem() + if src == nil { + el.Set(reflect.Zero(el.Type())) + return nil + } + + el.Set(reflect.New(el.Type().Elem())) + return plan.next.Scan(src, el.Interface()) +} + +// TryPointerPointerScanPlan handles a pointer to a pointer by setting the target to nil for SQL NULL and allocating and +// scanning for non-NULL. +func TryPointerPointerScanPlan(target any) (plan WrappedScanPlanNextSetter, nextTarget any, ok bool) { + if dstValue := reflect.ValueOf(target); dstValue.Kind() == reflect.Ptr { + elemValue := dstValue.Elem() + if elemValue.Kind() == reflect.Ptr { + plan = &pointerPointerScanPlan{dstType: dstValue.Type()} + return plan, reflect.Zero(elemValue.Type()).Interface(), true + } + } + + return nil, nil, false +} + +// SkipUnderlyingTypePlanner prevents PlanScan and PlanDecode from trying to use the underlying type. +type SkipUnderlyingTypePlanner interface { + SkipUnderlyingTypePlan() +} + +var elemKindToPointerTypes map[reflect.Kind]reflect.Type = map[reflect.Kind]reflect.Type{ + reflect.Int: reflect.TypeOf(new(int)), + reflect.Int8: reflect.TypeOf(new(int8)), + reflect.Int16: reflect.TypeOf(new(int16)), + reflect.Int32: reflect.TypeOf(new(int32)), + reflect.Int64: reflect.TypeOf(new(int64)), + reflect.Uint: reflect.TypeOf(new(uint)), + reflect.Uint8: reflect.TypeOf(new(uint8)), + reflect.Uint16: reflect.TypeOf(new(uint16)), + reflect.Uint32: reflect.TypeOf(new(uint32)), + reflect.Uint64: reflect.TypeOf(new(uint64)), + reflect.Float32: reflect.TypeOf(new(float32)), + reflect.Float64: reflect.TypeOf(new(float64)), + reflect.String: reflect.TypeOf(new(string)), + reflect.Bool: reflect.TypeOf(new(bool)), +} + +type underlyingTypeScanPlan struct { + dstType reflect.Type + nextDstType reflect.Type + next ScanPlan +} + +func (plan *underlyingTypeScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *underlyingTypeScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, reflect.ValueOf(dst).Convert(plan.nextDstType).Interface()) +} + +// TryFindUnderlyingTypeScanPlan tries to convert to a Go builtin type. e.g. If value was of type MyString and +// MyString was defined as a string then a wrapper plan would be returned that converts MyString to string. +func TryFindUnderlyingTypeScanPlan(dst any) (plan WrappedScanPlanNextSetter, nextDst any, ok bool) { + if _, ok := dst.(SkipUnderlyingTypePlanner); ok { + return nil, nil, false + } + + dstValue := reflect.ValueOf(dst) + + if dstValue.Kind() == reflect.Ptr { + var elemValue reflect.Value + if dstValue.IsNil() { + elemValue = reflect.New(dstValue.Type().Elem()).Elem() + } else { + elemValue = dstValue.Elem() + } + nextDstType := elemKindToPointerTypes[elemValue.Kind()] + if nextDstType == nil { + if elemValue.Kind() == reflect.Slice { + if elemValue.Type().Elem().Kind() == reflect.Uint8 { + var v *[]byte + nextDstType = reflect.TypeOf(v) + } + } + + // Get underlying type of any array. + // https://github.com/jackc/pgx/issues/2107 + if elemValue.Kind() == reflect.Array { + nextDstType = reflect.PointerTo(reflect.ArrayOf(elemValue.Len(), elemValue.Type().Elem())) + } + } + + if nextDstType != nil && dstValue.Type() != nextDstType && dstValue.CanConvert(nextDstType) { + return &underlyingTypeScanPlan{dstType: dstValue.Type(), nextDstType: nextDstType}, dstValue.Convert(nextDstType).Interface(), true + } + } + + return nil, nil, false +} + +type WrappedScanPlanNextSetter interface { + SetNext(ScanPlan) + ScanPlan +} + +// TryWrapBuiltinTypeScanPlan tries to wrap a builtin type with a wrapper that provides additional methods. e.g. If +// value was of type int32 then a wrapper plan would be returned that converts target to a value that implements +// Int64Scanner. +func TryWrapBuiltinTypeScanPlan(target any) (plan WrappedScanPlanNextSetter, nextDst any, ok bool) { + switch target := target.(type) { + case *int8: + return &wrapInt8ScanPlan{}, (*int8Wrapper)(target), true + case *int16: + return &wrapInt16ScanPlan{}, (*int16Wrapper)(target), true + case *int32: + return &wrapInt32ScanPlan{}, (*int32Wrapper)(target), true + case *int64: + return &wrapInt64ScanPlan{}, (*int64Wrapper)(target), true + case *int: + return &wrapIntScanPlan{}, (*intWrapper)(target), true + case *uint8: + return &wrapUint8ScanPlan{}, (*uint8Wrapper)(target), true + case *uint16: + return &wrapUint16ScanPlan{}, (*uint16Wrapper)(target), true + case *uint32: + return &wrapUint32ScanPlan{}, (*uint32Wrapper)(target), true + case *uint64: + return &wrapUint64ScanPlan{}, (*uint64Wrapper)(target), true + case *uint: + return &wrapUintScanPlan{}, (*uintWrapper)(target), true + case *float32: + return &wrapFloat32ScanPlan{}, (*float32Wrapper)(target), true + case *float64: + return &wrapFloat64ScanPlan{}, (*float64Wrapper)(target), true + case *string: + return &wrapStringScanPlan{}, (*stringWrapper)(target), true + case *time.Time: + return &wrapTimeScanPlan{}, (*timeWrapper)(target), true + case *time.Duration: + return &wrapDurationScanPlan{}, (*durationWrapper)(target), true + case *net.IPNet: + return &wrapNetIPNetScanPlan{}, (*netIPNetWrapper)(target), true + case *net.IP: + return &wrapNetIPScanPlan{}, (*netIPWrapper)(target), true + case *netip.Prefix: + return &wrapNetipPrefixScanPlan{}, (*netipPrefixWrapper)(target), true + case *netip.Addr: + return &wrapNetipAddrScanPlan{}, (*netipAddrWrapper)(target), true + case *map[string]*string: + return &wrapMapStringToPointerStringScanPlan{}, (*mapStringToPointerStringWrapper)(target), true + case *map[string]string: + return &wrapMapStringToStringScanPlan{}, (*mapStringToStringWrapper)(target), true + case *[16]byte: + return &wrapByte16ScanPlan{}, (*byte16Wrapper)(target), true + case *[]byte: + return &wrapByteSliceScanPlan{}, (*byteSliceWrapper)(target), true + } + + return nil, nil, false +} + +type wrapInt8ScanPlan struct { + next ScanPlan +} + +func (plan *wrapInt8ScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapInt8ScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*int8Wrapper)(dst.(*int8))) +} + +type wrapInt16ScanPlan struct { + next ScanPlan +} + +func (plan *wrapInt16ScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapInt16ScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*int16Wrapper)(dst.(*int16))) +} + +type wrapInt32ScanPlan struct { + next ScanPlan +} + +func (plan *wrapInt32ScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapInt32ScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*int32Wrapper)(dst.(*int32))) +} + +type wrapInt64ScanPlan struct { + next ScanPlan +} + +func (plan *wrapInt64ScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapInt64ScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*int64Wrapper)(dst.(*int64))) +} + +type wrapIntScanPlan struct { + next ScanPlan +} + +func (plan *wrapIntScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapIntScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*intWrapper)(dst.(*int))) +} + +type wrapUint8ScanPlan struct { + next ScanPlan +} + +func (plan *wrapUint8ScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapUint8ScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*uint8Wrapper)(dst.(*uint8))) +} + +type wrapUint16ScanPlan struct { + next ScanPlan +} + +func (plan *wrapUint16ScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapUint16ScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*uint16Wrapper)(dst.(*uint16))) +} + +type wrapUint32ScanPlan struct { + next ScanPlan +} + +func (plan *wrapUint32ScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapUint32ScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*uint32Wrapper)(dst.(*uint32))) +} + +type wrapUint64ScanPlan struct { + next ScanPlan +} + +func (plan *wrapUint64ScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapUint64ScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*uint64Wrapper)(dst.(*uint64))) +} + +type wrapUintScanPlan struct { + next ScanPlan +} + +func (plan *wrapUintScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapUintScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*uintWrapper)(dst.(*uint))) +} + +type wrapFloat32ScanPlan struct { + next ScanPlan +} + +func (plan *wrapFloat32ScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapFloat32ScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*float32Wrapper)(dst.(*float32))) +} + +type wrapFloat64ScanPlan struct { + next ScanPlan +} + +func (plan *wrapFloat64ScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapFloat64ScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*float64Wrapper)(dst.(*float64))) +} + +type wrapStringScanPlan struct { + next ScanPlan +} + +func (plan *wrapStringScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapStringScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*stringWrapper)(dst.(*string))) +} + +type wrapTimeScanPlan struct { + next ScanPlan +} + +func (plan *wrapTimeScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapTimeScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*timeWrapper)(dst.(*time.Time))) +} + +type wrapDurationScanPlan struct { + next ScanPlan +} + +func (plan *wrapDurationScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapDurationScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*durationWrapper)(dst.(*time.Duration))) +} + +type wrapNetIPNetScanPlan struct { + next ScanPlan +} + +func (plan *wrapNetIPNetScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapNetIPNetScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*netIPNetWrapper)(dst.(*net.IPNet))) +} + +type wrapNetIPScanPlan struct { + next ScanPlan +} + +func (plan *wrapNetIPScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapNetIPScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*netIPWrapper)(dst.(*net.IP))) +} + +type wrapNetipPrefixScanPlan struct { + next ScanPlan +} + +func (plan *wrapNetipPrefixScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapNetipPrefixScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*netipPrefixWrapper)(dst.(*netip.Prefix))) +} + +type wrapNetipAddrScanPlan struct { + next ScanPlan +} + +func (plan *wrapNetipAddrScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapNetipAddrScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*netipAddrWrapper)(dst.(*netip.Addr))) +} + +type wrapMapStringToPointerStringScanPlan struct { + next ScanPlan +} + +func (plan *wrapMapStringToPointerStringScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapMapStringToPointerStringScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*mapStringToPointerStringWrapper)(dst.(*map[string]*string))) +} + +type wrapMapStringToStringScanPlan struct { + next ScanPlan +} + +func (plan *wrapMapStringToStringScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapMapStringToStringScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*mapStringToStringWrapper)(dst.(*map[string]string))) +} + +type wrapByte16ScanPlan struct { + next ScanPlan +} + +func (plan *wrapByte16ScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapByte16ScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*byte16Wrapper)(dst.(*[16]byte))) +} + +type wrapByteSliceScanPlan struct { + next ScanPlan +} + +func (plan *wrapByteSliceScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapByteSliceScanPlan) Scan(src []byte, dst any) error { + return plan.next.Scan(src, (*byteSliceWrapper)(dst.(*[]byte))) +} + +type pointerEmptyInterfaceScanPlan struct { + codec Codec + m *Map + oid uint32 + formatCode int16 +} + +func (plan *pointerEmptyInterfaceScanPlan) Scan(src []byte, dst any) error { + value, err := plan.codec.DecodeValue(plan.m, plan.oid, plan.formatCode, src) + if err != nil { + return err + } + + ptrAny := dst.(*any) + *ptrAny = value + + return nil +} + +// TryWrapStructPlan tries to wrap a struct with a wrapper that implements CompositeIndexGetter. +func TryWrapStructScanPlan(target any) (plan WrappedScanPlanNextSetter, nextValue any, ok bool) { + targetValue := reflect.ValueOf(target) + if targetValue.Kind() != reflect.Ptr { + return nil, nil, false + } + + var targetElemValue reflect.Value + if targetValue.IsNil() { + targetElemValue = reflect.Zero(targetValue.Type().Elem()) + } else { + targetElemValue = targetValue.Elem() + } + targetElemType := targetElemValue.Type() + + if targetElemType.Kind() == reflect.Struct { + exportedFields := getExportedFieldValues(targetElemValue) + if len(exportedFields) == 0 { + return nil, nil, false + } + + w := ptrStructWrapper{ + s: target, + exportedFields: exportedFields, + } + return &wrapAnyPtrStructScanPlan{}, &w, true + } + + return nil, nil, false +} + +type wrapAnyPtrStructScanPlan struct { + next ScanPlan +} + +func (plan *wrapAnyPtrStructScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapAnyPtrStructScanPlan) Scan(src []byte, target any) error { + w := ptrStructWrapper{ + s: target, + exportedFields: getExportedFieldValues(reflect.ValueOf(target).Elem()), + } + + return plan.next.Scan(src, &w) +} + +// TryWrapPtrSliceScanPlan tries to wrap a pointer to a single dimension slice. +func TryWrapPtrSliceScanPlan(target any) (plan WrappedScanPlanNextSetter, nextValue any, ok bool) { + // Avoid using reflect path for common types. + switch target := target.(type) { + case *[]int16: + return &wrapPtrSliceScanPlan[int16]{}, (*FlatArray[int16])(target), true + case *[]int32: + return &wrapPtrSliceScanPlan[int32]{}, (*FlatArray[int32])(target), true + case *[]int64: + return &wrapPtrSliceScanPlan[int64]{}, (*FlatArray[int64])(target), true + case *[]float32: + return &wrapPtrSliceScanPlan[float32]{}, (*FlatArray[float32])(target), true + case *[]float64: + return &wrapPtrSliceScanPlan[float64]{}, (*FlatArray[float64])(target), true + case *[]string: + return &wrapPtrSliceScanPlan[string]{}, (*FlatArray[string])(target), true + case *[]time.Time: + return &wrapPtrSliceScanPlan[time.Time]{}, (*FlatArray[time.Time])(target), true + } + + targetType := reflect.TypeOf(target) + if targetType.Kind() != reflect.Ptr { + return nil, nil, false + } + + targetElemType := targetType.Elem() + + if targetElemType.Kind() == reflect.Slice { + slice := reflect.New(targetElemType).Elem() + return &wrapPtrSliceReflectScanPlan{}, &anySliceArrayReflect{slice: slice}, true + } + return nil, nil, false +} + +type wrapPtrSliceScanPlan[T any] struct { + next ScanPlan +} + +func (plan *wrapPtrSliceScanPlan[T]) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapPtrSliceScanPlan[T]) Scan(src []byte, target any) error { + return plan.next.Scan(src, (*FlatArray[T])(target.(*[]T))) +} + +type wrapPtrSliceReflectScanPlan struct { + next ScanPlan +} + +func (plan *wrapPtrSliceReflectScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapPtrSliceReflectScanPlan) Scan(src []byte, target any) error { + return plan.next.Scan(src, &anySliceArrayReflect{slice: reflect.ValueOf(target).Elem()}) +} + +// TryWrapPtrMultiDimSliceScanPlan tries to wrap a pointer to a multi-dimension slice. +func TryWrapPtrMultiDimSliceScanPlan(target any) (plan WrappedScanPlanNextSetter, nextValue any, ok bool) { + targetValue := reflect.ValueOf(target) + if targetValue.Kind() != reflect.Ptr { + return nil, nil, false + } + + targetElemValue := targetValue.Elem() + + if targetElemValue.Kind() == reflect.Slice { + elemElemKind := targetElemValue.Type().Elem().Kind() + if elemElemKind == reflect.Slice { + if !isRagged(targetElemValue) { + return &wrapPtrMultiDimSliceScanPlan{}, &anyMultiDimSliceArray{slice: targetValue.Elem()}, true + } + } + } + + return nil, nil, false +} + +type wrapPtrMultiDimSliceScanPlan struct { + next ScanPlan +} + +func (plan *wrapPtrMultiDimSliceScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapPtrMultiDimSliceScanPlan) Scan(src []byte, target any) error { + return plan.next.Scan(src, &anyMultiDimSliceArray{slice: reflect.ValueOf(target).Elem()}) +} + +// TryWrapPtrArrayScanPlan tries to wrap a pointer to a single dimension array. +func TryWrapPtrArrayScanPlan(target any) (plan WrappedScanPlanNextSetter, nextValue any, ok bool) { + targetValue := reflect.ValueOf(target) + if targetValue.Kind() != reflect.Ptr { + return nil, nil, false + } + + targetElemValue := targetValue.Elem() + + if targetElemValue.Kind() == reflect.Array { + return &wrapPtrArrayReflectScanPlan{}, &anyArrayArrayReflect{array: targetElemValue}, true + } + return nil, nil, false +} + +type wrapPtrArrayReflectScanPlan struct { + next ScanPlan +} + +func (plan *wrapPtrArrayReflectScanPlan) SetNext(next ScanPlan) { plan.next = next } + +func (plan *wrapPtrArrayReflectScanPlan) Scan(src []byte, target any) error { + return plan.next.Scan(src, &anyArrayArrayReflect{array: reflect.ValueOf(target).Elem()}) +} + +// PlanScan prepares a plan to scan a value into target. +func (m *Map) PlanScan(oid uint32, formatCode int16, target any) ScanPlan { + oidMemo := m.memoizedScanPlans[oid] + if oidMemo == nil { + oidMemo = make(map[reflect.Type][2]ScanPlan) + m.memoizedScanPlans[oid] = oidMemo + } + targetReflectType := reflect.TypeOf(target) + typeMemo := oidMemo[targetReflectType] + plan := typeMemo[formatCode] + if plan == nil { + plan = m.planScan(oid, formatCode, target) + typeMemo[formatCode] = plan + oidMemo[targetReflectType] = typeMemo + } + + return plan +} + +func (m *Map) planScan(oid uint32, formatCode int16, target any) ScanPlan { + if target == nil { + return &scanPlanFail{m: m, oid: oid, formatCode: formatCode} + } + + if _, ok := target.(*UndecodedBytes); ok { + return scanPlanAnyToUndecodedBytes{} + } + + switch formatCode { + case BinaryFormatCode: + switch target.(type) { + case *string: + switch oid { + case TextOID, VarcharOID: + return scanPlanString{} + } + } + case TextFormatCode: + switch target.(type) { + case *string: + return scanPlanString{} + case *[]byte: + if oid != ByteaOID { + return scanPlanAnyTextToBytes{} + } + case TextScanner: + return scanPlanTextAnyToTextScanner{} + } + } + + var dt *Type + + if dataType, ok := m.TypeForOID(oid); ok { + dt = dataType + } else if dataType, ok := m.TypeForValue(target); ok { + dt = dataType + oid = dt.OID // Preserve assumed OID in case we are recursively called below. + } + + if dt != nil { + if plan := dt.Codec.PlanScan(m, oid, formatCode, target); plan != nil { + return plan + } + } + + // This needs to happen before trying m.TryWrapScanPlanFuncs. Otherwise, a sql.Scanner would not get called if it was + // defined on a type that could be unwrapped such as `type myString string`. + // + // https://github.com/jackc/pgtype/issues/197 + if _, ok := target.(sql.Scanner); ok { + if dt == nil { + return &scanPlanSQLScanner{formatCode: formatCode} + } else { + return &scanPlanCodecSQLScanner{c: dt.Codec, m: m, oid: oid, formatCode: formatCode} + } + } + + for _, f := range m.TryWrapScanPlanFuncs { + if wrapperPlan, nextDst, ok := f(target); ok { + if nextPlan := m.planScan(oid, formatCode, nextDst); nextPlan != nil { + if _, failed := nextPlan.(*scanPlanFail); !failed { + wrapperPlan.SetNext(nextPlan) + return wrapperPlan + } + } + } + } + + if dt != nil { + if _, ok := target.(*any); ok { + return &pointerEmptyInterfaceScanPlan{codec: dt.Codec, m: m, oid: oid, formatCode: formatCode} + } + } + + return &scanPlanFail{m: m, oid: oid, formatCode: formatCode} +} + +func (m *Map) Scan(oid uint32, formatCode int16, src []byte, dst any) error { + if dst == nil { + return nil + } + + plan := m.PlanScan(oid, formatCode, dst) + return plan.Scan(src, dst) +} + +var ErrScanTargetTypeChanged = errors.New("scan target type changed") + +func codecScan(codec Codec, m *Map, oid uint32, format int16, src []byte, dst any) error { + scanPlan := codec.PlanScan(m, oid, format, dst) + if scanPlan == nil { + return fmt.Errorf("PlanScan did not find a plan") + } + return scanPlan.Scan(src, dst) +} + +func codecDecodeToTextFormat(codec Codec, m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + if format == TextFormatCode { + return string(src), nil + } else { + value, err := codec.DecodeValue(m, oid, format, src) + if err != nil { + return nil, err + } + buf, err := m.Encode(oid, TextFormatCode, value, nil) + if err != nil { + return nil, err + } + return string(buf), nil + } +} + +// PlanEncode returns an Encode plan for encoding value into PostgreSQL format for oid and format. If no plan can be +// found then nil is returned. +func (m *Map) PlanEncode(oid uint32, format int16, value any) EncodePlan { + oidMemo := m.memoizedEncodePlans[oid] + if oidMemo == nil { + oidMemo = make(map[reflect.Type][2]EncodePlan) + m.memoizedEncodePlans[oid] = oidMemo + } + targetReflectType := reflect.TypeOf(value) + typeMemo := oidMemo[targetReflectType] + plan := typeMemo[format] + if plan == nil { + plan = m.planEncode(oid, format, value) + typeMemo[format] = plan + oidMemo[targetReflectType] = typeMemo + } + + return plan +} + +func (m *Map) planEncode(oid uint32, format int16, value any) EncodePlan { + if format == TextFormatCode { + switch value.(type) { + case string: + return encodePlanStringToAnyTextFormat{} + case TextValuer: + return encodePlanTextValuerToAnyTextFormat{} + } + } + + var dt *Type + if dataType, ok := m.TypeForOID(oid); ok { + dt = dataType + } else { + // If no type for the OID was found, then either it is unknowable (e.g. the simple protocol) or it is an + // unregistered type. In either case try to find the type and OID that matches the value (e.g. a []byte would be + // registered to PostgreSQL bytea). + if dataType, ok := m.TypeForValue(value); ok { + dt = dataType + oid = dt.OID // Preserve assumed OID in case we are recursively called below. + } + } + + if dt != nil { + if plan := dt.Codec.PlanEncode(m, oid, format, value); plan != nil { + return plan + } + } + + for _, f := range m.TryWrapEncodePlanFuncs { + if wrapperPlan, nextValue, ok := f(value); ok { + if nextPlan := m.PlanEncode(oid, format, nextValue); nextPlan != nil { + wrapperPlan.SetNext(nextPlan) + return wrapperPlan + } + } + } + + if _, ok := value.(driver.Valuer); ok { + return &encodePlanDriverValuer{m: m, oid: oid, formatCode: format} + } + + return nil +} + +type encodePlanStringToAnyTextFormat struct{} + +func (encodePlanStringToAnyTextFormat) Encode(value any, buf []byte) (newBuf []byte, err error) { + s := value.(string) + return append(buf, s...), nil +} + +type encodePlanTextValuerToAnyTextFormat struct{} + +func (encodePlanTextValuerToAnyTextFormat) Encode(value any, buf []byte) (newBuf []byte, err error) { + t, err := value.(TextValuer).TextValue() + if err != nil { + return nil, err + } + if !t.Valid { + return nil, nil + } + + return append(buf, t.String...), nil +} + +type encodePlanDriverValuer struct { + m *Map + oid uint32 + formatCode int16 +} + +func (plan *encodePlanDriverValuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + dv := value.(driver.Valuer) + if dv == nil { + return nil, nil + } + v, err := dv.Value() + if err != nil { + return nil, err + } + if v == nil { + return nil, nil + } + + newBuf, err = plan.m.Encode(plan.oid, plan.formatCode, v, buf) + if err == nil { + return newBuf, nil + } + + s, ok := v.(string) + if !ok { + return nil, err + } + + var scannedValue any + scanErr := plan.m.Scan(plan.oid, TextFormatCode, []byte(s), &scannedValue) + if scanErr != nil { + return nil, err + } + + // Prevent infinite loop. We can't encode this. See https://github.com/jackc/pgx/issues/1331. + if reflect.TypeOf(value) == reflect.TypeOf(scannedValue) { + return nil, fmt.Errorf("tried to encode %v via encoding to text and scanning but failed due to receiving same type back", value) + } + + var err2 error + newBuf, err2 = plan.m.Encode(plan.oid, BinaryFormatCode, scannedValue, buf) + if err2 != nil { + return nil, err + } + + return newBuf, nil +} + +// TryWrapEncodePlanFunc is a function that tries to create a wrapper plan for value. If successful it returns a plan +// that will convert the value passed to Encode and then call the next plan. nextValue is value as it will be converted +// by plan. It must be used to find another suitable EncodePlan. When it is found SetNext must be called on plan for it +// to be usabled. ok indicates if a suitable wrapper was found. +type TryWrapEncodePlanFunc func(value any) (plan WrappedEncodePlanNextSetter, nextValue any, ok bool) + +type derefPointerEncodePlan struct { + next EncodePlan +} + +func (plan *derefPointerEncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *derefPointerEncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + ptr := reflect.ValueOf(value) + + if ptr.IsNil() { + return nil, nil + } + + return plan.next.Encode(ptr.Elem().Interface(), buf) +} + +// TryWrapDerefPointerEncodePlan tries to dereference a pointer. e.g. If value was of type *string then a wrapper plan +// would be returned that dereferences the value. +func TryWrapDerefPointerEncodePlan(value any) (plan WrappedEncodePlanNextSetter, nextValue any, ok bool) { + if _, ok := value.(driver.Valuer); ok { + return nil, nil, false + } + + if valueType := reflect.TypeOf(value); valueType != nil && valueType.Kind() == reflect.Ptr { + return &derefPointerEncodePlan{}, reflect.New(valueType.Elem()).Elem().Interface(), true + } + + return nil, nil, false +} + +var kindToTypes map[reflect.Kind]reflect.Type = map[reflect.Kind]reflect.Type{ + reflect.Int: reflect.TypeOf(int(0)), + reflect.Int8: reflect.TypeOf(int8(0)), + reflect.Int16: reflect.TypeOf(int16(0)), + reflect.Int32: reflect.TypeOf(int32(0)), + reflect.Int64: reflect.TypeOf(int64(0)), + reflect.Uint: reflect.TypeOf(uint(0)), + reflect.Uint8: reflect.TypeOf(uint8(0)), + reflect.Uint16: reflect.TypeOf(uint16(0)), + reflect.Uint32: reflect.TypeOf(uint32(0)), + reflect.Uint64: reflect.TypeOf(uint64(0)), + reflect.Float32: reflect.TypeOf(float32(0)), + reflect.Float64: reflect.TypeOf(float64(0)), + reflect.String: reflect.TypeOf(""), + reflect.Bool: reflect.TypeOf(false), +} + +var byteSliceType = reflect.TypeOf([]byte{}) + +type underlyingTypeEncodePlan struct { + nextValueType reflect.Type + next EncodePlan +} + +func (plan *underlyingTypeEncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *underlyingTypeEncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(reflect.ValueOf(value).Convert(plan.nextValueType).Interface(), buf) +} + +// TryWrapFindUnderlyingTypeEncodePlan tries to convert to a Go builtin type. e.g. If value was of type MyString and +// MyString was defined as a string then a wrapper plan would be returned that converts MyString to string. +func TryWrapFindUnderlyingTypeEncodePlan(value any) (plan WrappedEncodePlanNextSetter, nextValue any, ok bool) { + if value == nil { + return nil, nil, false + } + + if _, ok := value.(driver.Valuer); ok { + return nil, nil, false + } + + if _, ok := value.(SkipUnderlyingTypePlanner); ok { + return nil, nil, false + } + + refValue := reflect.ValueOf(value) + + nextValueType := kindToTypes[refValue.Kind()] + if nextValueType != nil && refValue.Type() != nextValueType { + return &underlyingTypeEncodePlan{nextValueType: nextValueType}, refValue.Convert(nextValueType).Interface(), true + } + + // []byte is a special case. It is a slice but we treat it as a scalar type. In the case of a named type like + // json.RawMessage which is defined as []byte the underlying type should be considered as []byte. But any other slice + // does not have a special underlying type. + // + // https://github.com/jackc/pgx/issues/1763 + if refValue.Type() != byteSliceType && refValue.Type().AssignableTo(byteSliceType) { + return &underlyingTypeEncodePlan{nextValueType: byteSliceType}, refValue.Convert(byteSliceType).Interface(), true + } + + // Get underlying type of any array. + // https://github.com/jackc/pgx/issues/2107 + if refValue.Kind() == reflect.Array { + underlyingArrayType := reflect.ArrayOf(refValue.Len(), refValue.Type().Elem()) + if refValue.Type() != underlyingArrayType { + return &underlyingTypeEncodePlan{nextValueType: underlyingArrayType}, refValue.Convert(underlyingArrayType).Interface(), true + } + } + + return nil, nil, false +} + +type WrappedEncodePlanNextSetter interface { + SetNext(EncodePlan) + EncodePlan +} + +// TryWrapBuiltinTypeEncodePlan tries to wrap a builtin type with a wrapper that provides additional methods. e.g. If +// value was of type int32 then a wrapper plan would be returned that converts value to a type that implements +// Int64Valuer. +func TryWrapBuiltinTypeEncodePlan(value any) (plan WrappedEncodePlanNextSetter, nextValue any, ok bool) { + if _, ok := value.(driver.Valuer); ok { + return nil, nil, false + } + + switch value := value.(type) { + case int8: + return &wrapInt8EncodePlan{}, int8Wrapper(value), true + case int16: + return &wrapInt16EncodePlan{}, int16Wrapper(value), true + case int32: + return &wrapInt32EncodePlan{}, int32Wrapper(value), true + case int64: + return &wrapInt64EncodePlan{}, int64Wrapper(value), true + case int: + return &wrapIntEncodePlan{}, intWrapper(value), true + case uint8: + return &wrapUint8EncodePlan{}, uint8Wrapper(value), true + case uint16: + return &wrapUint16EncodePlan{}, uint16Wrapper(value), true + case uint32: + return &wrapUint32EncodePlan{}, uint32Wrapper(value), true + case uint64: + return &wrapUint64EncodePlan{}, uint64Wrapper(value), true + case uint: + return &wrapUintEncodePlan{}, uintWrapper(value), true + case float32: + return &wrapFloat32EncodePlan{}, float32Wrapper(value), true + case float64: + return &wrapFloat64EncodePlan{}, float64Wrapper(value), true + case string: + return &wrapStringEncodePlan{}, stringWrapper(value), true + case time.Time: + return &wrapTimeEncodePlan{}, timeWrapper(value), true + case time.Duration: + return &wrapDurationEncodePlan{}, durationWrapper(value), true + case net.IPNet: + return &wrapNetIPNetEncodePlan{}, netIPNetWrapper(value), true + case net.IP: + return &wrapNetIPEncodePlan{}, netIPWrapper(value), true + case netip.Prefix: + return &wrapNetipPrefixEncodePlan{}, netipPrefixWrapper(value), true + case netip.Addr: + return &wrapNetipAddrEncodePlan{}, netipAddrWrapper(value), true + case map[string]*string: + return &wrapMapStringToPointerStringEncodePlan{}, mapStringToPointerStringWrapper(value), true + case map[string]string: + return &wrapMapStringToStringEncodePlan{}, mapStringToStringWrapper(value), true + case [16]byte: + return &wrapByte16EncodePlan{}, byte16Wrapper(value), true + case []byte: + return &wrapByteSliceEncodePlan{}, byteSliceWrapper(value), true + case fmt.Stringer: + return &wrapFmtStringerEncodePlan{}, fmtStringerWrapper{value}, true + } + + return nil, nil, false +} + +type wrapInt8EncodePlan struct { + next EncodePlan +} + +func (plan *wrapInt8EncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapInt8EncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(int8Wrapper(value.(int8)), buf) +} + +type wrapInt16EncodePlan struct { + next EncodePlan +} + +func (plan *wrapInt16EncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapInt16EncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(int16Wrapper(value.(int16)), buf) +} + +type wrapInt32EncodePlan struct { + next EncodePlan +} + +func (plan *wrapInt32EncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapInt32EncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(int32Wrapper(value.(int32)), buf) +} + +type wrapInt64EncodePlan struct { + next EncodePlan +} + +func (plan *wrapInt64EncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapInt64EncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(int64Wrapper(value.(int64)), buf) +} + +type wrapIntEncodePlan struct { + next EncodePlan +} + +func (plan *wrapIntEncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapIntEncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(intWrapper(value.(int)), buf) +} + +type wrapUint8EncodePlan struct { + next EncodePlan +} + +func (plan *wrapUint8EncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapUint8EncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(uint8Wrapper(value.(uint8)), buf) +} + +type wrapUint16EncodePlan struct { + next EncodePlan +} + +func (plan *wrapUint16EncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapUint16EncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(uint16Wrapper(value.(uint16)), buf) +} + +type wrapUint32EncodePlan struct { + next EncodePlan +} + +func (plan *wrapUint32EncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapUint32EncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(uint32Wrapper(value.(uint32)), buf) +} + +type wrapUint64EncodePlan struct { + next EncodePlan +} + +func (plan *wrapUint64EncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapUint64EncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(uint64Wrapper(value.(uint64)), buf) +} + +type wrapUintEncodePlan struct { + next EncodePlan +} + +func (plan *wrapUintEncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapUintEncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(uintWrapper(value.(uint)), buf) +} + +type wrapFloat32EncodePlan struct { + next EncodePlan +} + +func (plan *wrapFloat32EncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapFloat32EncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(float32Wrapper(value.(float32)), buf) +} + +type wrapFloat64EncodePlan struct { + next EncodePlan +} + +func (plan *wrapFloat64EncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapFloat64EncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(float64Wrapper(value.(float64)), buf) +} + +type wrapStringEncodePlan struct { + next EncodePlan +} + +func (plan *wrapStringEncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapStringEncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(stringWrapper(value.(string)), buf) +} + +type wrapTimeEncodePlan struct { + next EncodePlan +} + +func (plan *wrapTimeEncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapTimeEncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(timeWrapper(value.(time.Time)), buf) +} + +type wrapDurationEncodePlan struct { + next EncodePlan +} + +func (plan *wrapDurationEncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapDurationEncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(durationWrapper(value.(time.Duration)), buf) +} + +type wrapNetIPNetEncodePlan struct { + next EncodePlan +} + +func (plan *wrapNetIPNetEncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapNetIPNetEncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(netIPNetWrapper(value.(net.IPNet)), buf) +} + +type wrapNetIPEncodePlan struct { + next EncodePlan +} + +func (plan *wrapNetIPEncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapNetIPEncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(netIPWrapper(value.(net.IP)), buf) +} + +type wrapNetipPrefixEncodePlan struct { + next EncodePlan +} + +func (plan *wrapNetipPrefixEncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapNetipPrefixEncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(netipPrefixWrapper(value.(netip.Prefix)), buf) +} + +type wrapNetipAddrEncodePlan struct { + next EncodePlan +} + +func (plan *wrapNetipAddrEncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapNetipAddrEncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(netipAddrWrapper(value.(netip.Addr)), buf) +} + +type wrapMapStringToPointerStringEncodePlan struct { + next EncodePlan +} + +func (plan *wrapMapStringToPointerStringEncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapMapStringToPointerStringEncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(mapStringToPointerStringWrapper(value.(map[string]*string)), buf) +} + +type wrapMapStringToStringEncodePlan struct { + next EncodePlan +} + +func (plan *wrapMapStringToStringEncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapMapStringToStringEncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(mapStringToStringWrapper(value.(map[string]string)), buf) +} + +type wrapByte16EncodePlan struct { + next EncodePlan +} + +func (plan *wrapByte16EncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapByte16EncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(byte16Wrapper(value.([16]byte)), buf) +} + +type wrapByteSliceEncodePlan struct { + next EncodePlan +} + +func (plan *wrapByteSliceEncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapByteSliceEncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(byteSliceWrapper(value.([]byte)), buf) +} + +type wrapFmtStringerEncodePlan struct { + next EncodePlan +} + +func (plan *wrapFmtStringerEncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapFmtStringerEncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode(fmtStringerWrapper{value.(fmt.Stringer)}, buf) +} + +// TryWrapStructPlan tries to wrap a struct with a wrapper that implements CompositeIndexGetter. +func TryWrapStructEncodePlan(value any) (plan WrappedEncodePlanNextSetter, nextValue any, ok bool) { + if _, ok := value.(driver.Valuer); ok { + return nil, nil, false + } + + if valueType := reflect.TypeOf(value); valueType != nil && valueType.Kind() == reflect.Struct { + exportedFields := getExportedFieldValues(reflect.ValueOf(value)) + if len(exportedFields) == 0 { + return nil, nil, false + } + + w := structWrapper{ + s: value, + exportedFields: exportedFields, + } + return &wrapAnyStructEncodePlan{}, w, true + } + + return nil, nil, false +} + +type wrapAnyStructEncodePlan struct { + next EncodePlan +} + +func (plan *wrapAnyStructEncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapAnyStructEncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + w := structWrapper{ + s: value, + exportedFields: getExportedFieldValues(reflect.ValueOf(value)), + } + + return plan.next.Encode(w, buf) +} + +func getExportedFieldValues(structValue reflect.Value) []reflect.Value { + structType := structValue.Type() + exportedFields := make([]reflect.Value, 0, structValue.NumField()) + for i := 0; i < structType.NumField(); i++ { + sf := structType.Field(i) + if sf.IsExported() { + exportedFields = append(exportedFields, structValue.Field(i)) + } + } + + return exportedFields +} + +func TryWrapSliceEncodePlan(value any) (plan WrappedEncodePlanNextSetter, nextValue any, ok bool) { + if _, ok := value.(driver.Valuer); ok { + return nil, nil, false + } + + // Avoid using reflect path for common types. + switch value := value.(type) { + case []int16: + return &wrapSliceEncodePlan[int16]{}, (FlatArray[int16])(value), true + case []int32: + return &wrapSliceEncodePlan[int32]{}, (FlatArray[int32])(value), true + case []int64: + return &wrapSliceEncodePlan[int64]{}, (FlatArray[int64])(value), true + case []float32: + return &wrapSliceEncodePlan[float32]{}, (FlatArray[float32])(value), true + case []float64: + return &wrapSliceEncodePlan[float64]{}, (FlatArray[float64])(value), true + case []string: + return &wrapSliceEncodePlan[string]{}, (FlatArray[string])(value), true + case []time.Time: + return &wrapSliceEncodePlan[time.Time]{}, (FlatArray[time.Time])(value), true + } + + if valueType := reflect.TypeOf(value); valueType != nil && valueType.Kind() == reflect.Slice { + w := anySliceArrayReflect{ + slice: reflect.ValueOf(value), + } + return &wrapSliceEncodeReflectPlan{}, w, true + } + + return nil, nil, false +} + +type wrapSliceEncodePlan[T any] struct { + next EncodePlan +} + +func (plan *wrapSliceEncodePlan[T]) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapSliceEncodePlan[T]) Encode(value any, buf []byte) (newBuf []byte, err error) { + return plan.next.Encode((FlatArray[T])(value.([]T)), buf) +} + +type wrapSliceEncodeReflectPlan struct { + next EncodePlan +} + +func (plan *wrapSliceEncodeReflectPlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapSliceEncodeReflectPlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + w := anySliceArrayReflect{ + slice: reflect.ValueOf(value), + } + + return plan.next.Encode(w, buf) +} + +func TryWrapMultiDimSliceEncodePlan(value any) (plan WrappedEncodePlanNextSetter, nextValue any, ok bool) { + if _, ok := value.(driver.Valuer); ok { + return nil, nil, false + } + + sliceValue := reflect.ValueOf(value) + if sliceValue.Kind() == reflect.Slice { + valueElemType := sliceValue.Type().Elem() + + if valueElemType.Kind() == reflect.Slice { + if !isRagged(sliceValue) { + w := anyMultiDimSliceArray{ + slice: reflect.ValueOf(value), + } + return &wrapMultiDimSliceEncodePlan{}, &w, true + } + } + } + + return nil, nil, false +} + +type wrapMultiDimSliceEncodePlan struct { + next EncodePlan +} + +func (plan *wrapMultiDimSliceEncodePlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapMultiDimSliceEncodePlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + w := anyMultiDimSliceArray{ + slice: reflect.ValueOf(value), + } + + return plan.next.Encode(&w, buf) +} + +func TryWrapArrayEncodePlan(value any) (plan WrappedEncodePlanNextSetter, nextValue any, ok bool) { + if _, ok := value.(driver.Valuer); ok { + return nil, nil, false + } + + if valueType := reflect.TypeOf(value); valueType != nil && valueType.Kind() == reflect.Array { + w := anyArrayArrayReflect{ + array: reflect.ValueOf(value), + } + return &wrapArrayEncodeReflectPlan{}, w, true + } + + return nil, nil, false +} + +type wrapArrayEncodeReflectPlan struct { + next EncodePlan +} + +func (plan *wrapArrayEncodeReflectPlan) SetNext(next EncodePlan) { plan.next = next } + +func (plan *wrapArrayEncodeReflectPlan) Encode(value any, buf []byte) (newBuf []byte, err error) { + w := anyArrayArrayReflect{ + array: reflect.ValueOf(value), + } + + return plan.next.Encode(w, buf) +} + +func newEncodeError(value any, m *Map, oid uint32, formatCode int16, err error) error { + var format string + switch formatCode { + case TextFormatCode: + format = "text" + case BinaryFormatCode: + format = "binary" + default: + format = fmt.Sprintf("unknown (%d)", formatCode) + } + + var dataTypeName string + if t, ok := m.TypeForOID(oid); ok { + dataTypeName = t.Name + } else { + dataTypeName = "unknown type" + } + + return fmt.Errorf("unable to encode %#v into %s format for %s (OID %d): %w", value, format, dataTypeName, oid, err) +} + +// Encode appends the encoded bytes of value to buf. If value is the SQL value NULL then append nothing and return +// (nil, nil). The caller of Encode is responsible for writing the correct NULL value or the length of the data +// written. +func (m *Map) Encode(oid uint32, formatCode int16, value any, buf []byte) (newBuf []byte, err error) { + if isNil, callNilDriverValuer := isNilDriverValuer(value); isNil { + if callNilDriverValuer { + newBuf, err = (&encodePlanDriverValuer{m: m, oid: oid, formatCode: formatCode}).Encode(value, buf) + if err != nil { + return nil, newEncodeError(value, m, oid, formatCode, err) + } + + return newBuf, nil + } else { + return nil, nil + } + } + + plan := m.PlanEncode(oid, formatCode, value) + if plan == nil { + return nil, newEncodeError(value, m, oid, formatCode, errors.New("cannot find encode plan")) + } + + newBuf, err = plan.Encode(value, buf) + if err != nil { + return nil, newEncodeError(value, m, oid, formatCode, err) + } + + return newBuf, nil +} + +// SQLScanner returns a database/sql.Scanner for v. This is necessary for types like Array[T] and Range[T] where the +// type needs assistance from Map to implement the sql.Scanner interface. It is not necessary for types like Box that +// implement sql.Scanner directly. +// +// This uses the type of v to look up the PostgreSQL OID that v presumably came from. This means v must be registered +// with m by calling RegisterDefaultPgType. +func (m *Map) SQLScanner(v any) sql.Scanner { + if s, ok := v.(sql.Scanner); ok { + return s + } + + return &sqlScannerWrapper{m: m, v: v} +} + +type sqlScannerWrapper struct { + m *Map + v any +} + +func (w *sqlScannerWrapper) Scan(src any) error { + t, ok := w.m.TypeForValue(w.v) + if !ok { + return fmt.Errorf("cannot convert to sql.Scanner: cannot find registered type for %T", w.v) + } + + var bufSrc []byte + if src != nil { + switch src := src.(type) { + case string: + bufSrc = []byte(src) + case []byte: + bufSrc = src + default: + bufSrc = []byte(fmt.Sprint(bufSrc)) + } + } + + return w.m.Scan(t.OID, TextFormatCode, bufSrc, w.v) +} + +// canBeNil returns true if value can be nil. +func canBeNil(value any) bool { + refVal := reflect.ValueOf(value) + kind := refVal.Kind() + switch kind { + case reflect.Chan, reflect.Func, reflect.Map, reflect.Ptr, reflect.UnsafePointer, reflect.Interface, reflect.Slice: + return true + default: + return false + } +} + +// valuerReflectType is a reflect.Type for driver.Valuer. It has confusing syntax because reflect.TypeOf returns nil +// when it's argument is a nil interface value. So we use a pointer to the interface and call Elem to get the actual +// type. Yuck. +// +// This can be simplified in Go 1.22 with reflect.TypeFor. +// +// var valuerReflectType = reflect.TypeFor[driver.Valuer]() +var valuerReflectType = reflect.TypeOf((*driver.Valuer)(nil)).Elem() + +// isNilDriverValuer returns true if value is any type of nil unless it implements driver.Valuer. *T is not considered to implement +// driver.Valuer if it is only implemented by T. +func isNilDriverValuer(value any) (isNil bool, callNilDriverValuer bool) { + if value == nil { + return true, false + } + + refVal := reflect.ValueOf(value) + kind := refVal.Kind() + switch kind { + case reflect.Chan, reflect.Func, reflect.Map, reflect.Ptr, reflect.UnsafePointer, reflect.Interface, reflect.Slice: + if !refVal.IsNil() { + return false, false + } + + if _, ok := value.(driver.Valuer); ok { + if kind == reflect.Ptr { + // The type assertion will succeed if driver.Valuer is implemented on T or *T. Check if it is implemented on *T + // by checking if it is not implemented on *T. + return true, !refVal.Type().Elem().Implements(valuerReflectType) + } else { + return true, true + } + } + + return true, false + default: + return false, false + } +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/pgtype_default.go b/vendor/github.com/jackc/pgx/v5/pgtype/pgtype_default.go new file mode 100644 index 0000000..c812573 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/pgtype_default.go @@ -0,0 +1,229 @@ +package pgtype + +import ( + "encoding/json" + "encoding/xml" + "net" + "net/netip" + "reflect" + "sync" + "time" +) + +var ( + // defaultMap contains default mappings between PostgreSQL server types and Go type handling logic. + defaultMap *Map + defaultMapInitOnce = sync.Once{} +) + +func initDefaultMap() { + defaultMap = &Map{ + oidToType: make(map[uint32]*Type), + nameToType: make(map[string]*Type), + reflectTypeToName: make(map[reflect.Type]string), + oidToFormatCode: make(map[uint32]int16), + + memoizedScanPlans: make(map[uint32]map[reflect.Type][2]ScanPlan), + memoizedEncodePlans: make(map[uint32]map[reflect.Type][2]EncodePlan), + + TryWrapEncodePlanFuncs: []TryWrapEncodePlanFunc{ + TryWrapDerefPointerEncodePlan, + TryWrapBuiltinTypeEncodePlan, + TryWrapFindUnderlyingTypeEncodePlan, + TryWrapStructEncodePlan, + TryWrapSliceEncodePlan, + TryWrapMultiDimSliceEncodePlan, + TryWrapArrayEncodePlan, + }, + + TryWrapScanPlanFuncs: []TryWrapScanPlanFunc{ + TryPointerPointerScanPlan, + TryWrapBuiltinTypeScanPlan, + TryFindUnderlyingTypeScanPlan, + TryWrapStructScanPlan, + TryWrapPtrSliceScanPlan, + TryWrapPtrMultiDimSliceScanPlan, + TryWrapPtrArrayScanPlan, + }, + } + + // Base types + defaultMap.RegisterType(&Type{Name: "aclitem", OID: ACLItemOID, Codec: &TextFormatOnlyCodec{TextCodec{}}}) + defaultMap.RegisterType(&Type{Name: "bit", OID: BitOID, Codec: BitsCodec{}}) + defaultMap.RegisterType(&Type{Name: "bool", OID: BoolOID, Codec: BoolCodec{}}) + defaultMap.RegisterType(&Type{Name: "box", OID: BoxOID, Codec: BoxCodec{}}) + defaultMap.RegisterType(&Type{Name: "bpchar", OID: BPCharOID, Codec: TextCodec{}}) + defaultMap.RegisterType(&Type{Name: "bytea", OID: ByteaOID, Codec: ByteaCodec{}}) + defaultMap.RegisterType(&Type{Name: "char", OID: QCharOID, Codec: QCharCodec{}}) + defaultMap.RegisterType(&Type{Name: "cid", OID: CIDOID, Codec: Uint32Codec{}}) + defaultMap.RegisterType(&Type{Name: "cidr", OID: CIDROID, Codec: InetCodec{}}) + defaultMap.RegisterType(&Type{Name: "circle", OID: CircleOID, Codec: CircleCodec{}}) + defaultMap.RegisterType(&Type{Name: "date", OID: DateOID, Codec: DateCodec{}}) + defaultMap.RegisterType(&Type{Name: "float4", OID: Float4OID, Codec: Float4Codec{}}) + defaultMap.RegisterType(&Type{Name: "float8", OID: Float8OID, Codec: Float8Codec{}}) + defaultMap.RegisterType(&Type{Name: "inet", OID: InetOID, Codec: InetCodec{}}) + defaultMap.RegisterType(&Type{Name: "int2", OID: Int2OID, Codec: Int2Codec{}}) + defaultMap.RegisterType(&Type{Name: "int4", OID: Int4OID, Codec: Int4Codec{}}) + defaultMap.RegisterType(&Type{Name: "int8", OID: Int8OID, Codec: Int8Codec{}}) + defaultMap.RegisterType(&Type{Name: "interval", OID: IntervalOID, Codec: IntervalCodec{}}) + defaultMap.RegisterType(&Type{Name: "json", OID: JSONOID, Codec: &JSONCodec{Marshal: json.Marshal, Unmarshal: json.Unmarshal}}) + defaultMap.RegisterType(&Type{Name: "jsonb", OID: JSONBOID, Codec: &JSONBCodec{Marshal: json.Marshal, Unmarshal: json.Unmarshal}}) + defaultMap.RegisterType(&Type{Name: "jsonpath", OID: JSONPathOID, Codec: &TextFormatOnlyCodec{TextCodec{}}}) + defaultMap.RegisterType(&Type{Name: "line", OID: LineOID, Codec: LineCodec{}}) + defaultMap.RegisterType(&Type{Name: "lseg", OID: LsegOID, Codec: LsegCodec{}}) + defaultMap.RegisterType(&Type{Name: "macaddr8", OID: Macaddr8OID, Codec: MacaddrCodec{}}) + defaultMap.RegisterType(&Type{Name: "macaddr", OID: MacaddrOID, Codec: MacaddrCodec{}}) + defaultMap.RegisterType(&Type{Name: "name", OID: NameOID, Codec: TextCodec{}}) + defaultMap.RegisterType(&Type{Name: "numeric", OID: NumericOID, Codec: NumericCodec{}}) + defaultMap.RegisterType(&Type{Name: "oid", OID: OIDOID, Codec: Uint32Codec{}}) + defaultMap.RegisterType(&Type{Name: "path", OID: PathOID, Codec: PathCodec{}}) + defaultMap.RegisterType(&Type{Name: "point", OID: PointOID, Codec: PointCodec{}}) + defaultMap.RegisterType(&Type{Name: "polygon", OID: PolygonOID, Codec: PolygonCodec{}}) + defaultMap.RegisterType(&Type{Name: "record", OID: RecordOID, Codec: RecordCodec{}}) + defaultMap.RegisterType(&Type{Name: "text", OID: TextOID, Codec: TextCodec{}}) + defaultMap.RegisterType(&Type{Name: "tid", OID: TIDOID, Codec: TIDCodec{}}) + defaultMap.RegisterType(&Type{Name: "time", OID: TimeOID, Codec: TimeCodec{}}) + defaultMap.RegisterType(&Type{Name: "timestamp", OID: TimestampOID, Codec: &TimestampCodec{}}) + defaultMap.RegisterType(&Type{Name: "timestamptz", OID: TimestamptzOID, Codec: &TimestamptzCodec{}}) + defaultMap.RegisterType(&Type{Name: "unknown", OID: UnknownOID, Codec: TextCodec{}}) + defaultMap.RegisterType(&Type{Name: "uuid", OID: UUIDOID, Codec: UUIDCodec{}}) + defaultMap.RegisterType(&Type{Name: "varbit", OID: VarbitOID, Codec: BitsCodec{}}) + defaultMap.RegisterType(&Type{Name: "varchar", OID: VarcharOID, Codec: TextCodec{}}) + defaultMap.RegisterType(&Type{Name: "xid", OID: XIDOID, Codec: Uint32Codec{}}) + defaultMap.RegisterType(&Type{Name: "xml", OID: XMLOID, Codec: &XMLCodec{Marshal: xml.Marshal, Unmarshal: xml.Unmarshal}}) + + // Range types + defaultMap.RegisterType(&Type{Name: "daterange", OID: DaterangeOID, Codec: &RangeCodec{ElementType: defaultMap.oidToType[DateOID]}}) + defaultMap.RegisterType(&Type{Name: "int4range", OID: Int4rangeOID, Codec: &RangeCodec{ElementType: defaultMap.oidToType[Int4OID]}}) + defaultMap.RegisterType(&Type{Name: "int8range", OID: Int8rangeOID, Codec: &RangeCodec{ElementType: defaultMap.oidToType[Int8OID]}}) + defaultMap.RegisterType(&Type{Name: "numrange", OID: NumrangeOID, Codec: &RangeCodec{ElementType: defaultMap.oidToType[NumericOID]}}) + defaultMap.RegisterType(&Type{Name: "tsrange", OID: TsrangeOID, Codec: &RangeCodec{ElementType: defaultMap.oidToType[TimestampOID]}}) + defaultMap.RegisterType(&Type{Name: "tstzrange", OID: TstzrangeOID, Codec: &RangeCodec{ElementType: defaultMap.oidToType[TimestamptzOID]}}) + + // Multirange types + defaultMap.RegisterType(&Type{Name: "datemultirange", OID: DatemultirangeOID, Codec: &MultirangeCodec{ElementType: defaultMap.oidToType[DaterangeOID]}}) + defaultMap.RegisterType(&Type{Name: "int4multirange", OID: Int4multirangeOID, Codec: &MultirangeCodec{ElementType: defaultMap.oidToType[Int4rangeOID]}}) + defaultMap.RegisterType(&Type{Name: "int8multirange", OID: Int8multirangeOID, Codec: &MultirangeCodec{ElementType: defaultMap.oidToType[Int8rangeOID]}}) + defaultMap.RegisterType(&Type{Name: "nummultirange", OID: NummultirangeOID, Codec: &MultirangeCodec{ElementType: defaultMap.oidToType[NumrangeOID]}}) + defaultMap.RegisterType(&Type{Name: "tsmultirange", OID: TsmultirangeOID, Codec: &MultirangeCodec{ElementType: defaultMap.oidToType[TsrangeOID]}}) + defaultMap.RegisterType(&Type{Name: "tstzmultirange", OID: TstzmultirangeOID, Codec: &MultirangeCodec{ElementType: defaultMap.oidToType[TstzrangeOID]}}) + + // Array types + defaultMap.RegisterType(&Type{Name: "_aclitem", OID: ACLItemArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[ACLItemOID]}}) + defaultMap.RegisterType(&Type{Name: "_bit", OID: BitArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[BitOID]}}) + defaultMap.RegisterType(&Type{Name: "_bool", OID: BoolArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[BoolOID]}}) + defaultMap.RegisterType(&Type{Name: "_box", OID: BoxArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[BoxOID]}}) + defaultMap.RegisterType(&Type{Name: "_bpchar", OID: BPCharArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[BPCharOID]}}) + defaultMap.RegisterType(&Type{Name: "_bytea", OID: ByteaArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[ByteaOID]}}) + defaultMap.RegisterType(&Type{Name: "_char", OID: QCharArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[QCharOID]}}) + defaultMap.RegisterType(&Type{Name: "_cid", OID: CIDArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[CIDOID]}}) + defaultMap.RegisterType(&Type{Name: "_cidr", OID: CIDRArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[CIDROID]}}) + defaultMap.RegisterType(&Type{Name: "_circle", OID: CircleArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[CircleOID]}}) + defaultMap.RegisterType(&Type{Name: "_date", OID: DateArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[DateOID]}}) + defaultMap.RegisterType(&Type{Name: "_daterange", OID: DaterangeArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[DaterangeOID]}}) + defaultMap.RegisterType(&Type{Name: "_float4", OID: Float4ArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[Float4OID]}}) + defaultMap.RegisterType(&Type{Name: "_float8", OID: Float8ArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[Float8OID]}}) + defaultMap.RegisterType(&Type{Name: "_inet", OID: InetArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[InetOID]}}) + defaultMap.RegisterType(&Type{Name: "_int2", OID: Int2ArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[Int2OID]}}) + defaultMap.RegisterType(&Type{Name: "_int4", OID: Int4ArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[Int4OID]}}) + defaultMap.RegisterType(&Type{Name: "_int4range", OID: Int4rangeArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[Int4rangeOID]}}) + defaultMap.RegisterType(&Type{Name: "_int8", OID: Int8ArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[Int8OID]}}) + defaultMap.RegisterType(&Type{Name: "_int8range", OID: Int8rangeArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[Int8rangeOID]}}) + defaultMap.RegisterType(&Type{Name: "_interval", OID: IntervalArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[IntervalOID]}}) + defaultMap.RegisterType(&Type{Name: "_json", OID: JSONArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[JSONOID]}}) + defaultMap.RegisterType(&Type{Name: "_jsonb", OID: JSONBArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[JSONBOID]}}) + defaultMap.RegisterType(&Type{Name: "_jsonpath", OID: JSONPathArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[JSONPathOID]}}) + defaultMap.RegisterType(&Type{Name: "_line", OID: LineArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[LineOID]}}) + defaultMap.RegisterType(&Type{Name: "_lseg", OID: LsegArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[LsegOID]}}) + defaultMap.RegisterType(&Type{Name: "_macaddr", OID: MacaddrArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[MacaddrOID]}}) + defaultMap.RegisterType(&Type{Name: "_name", OID: NameArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[NameOID]}}) + defaultMap.RegisterType(&Type{Name: "_numeric", OID: NumericArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[NumericOID]}}) + defaultMap.RegisterType(&Type{Name: "_numrange", OID: NumrangeArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[NumrangeOID]}}) + defaultMap.RegisterType(&Type{Name: "_oid", OID: OIDArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[OIDOID]}}) + defaultMap.RegisterType(&Type{Name: "_path", OID: PathArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[PathOID]}}) + defaultMap.RegisterType(&Type{Name: "_point", OID: PointArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[PointOID]}}) + defaultMap.RegisterType(&Type{Name: "_polygon", OID: PolygonArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[PolygonOID]}}) + defaultMap.RegisterType(&Type{Name: "_record", OID: RecordArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[RecordOID]}}) + defaultMap.RegisterType(&Type{Name: "_text", OID: TextArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[TextOID]}}) + defaultMap.RegisterType(&Type{Name: "_tid", OID: TIDArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[TIDOID]}}) + defaultMap.RegisterType(&Type{Name: "_time", OID: TimeArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[TimeOID]}}) + defaultMap.RegisterType(&Type{Name: "_timestamp", OID: TimestampArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[TimestampOID]}}) + defaultMap.RegisterType(&Type{Name: "_timestamptz", OID: TimestamptzArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[TimestamptzOID]}}) + defaultMap.RegisterType(&Type{Name: "_tsrange", OID: TsrangeArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[TsrangeOID]}}) + defaultMap.RegisterType(&Type{Name: "_tstzrange", OID: TstzrangeArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[TstzrangeOID]}}) + defaultMap.RegisterType(&Type{Name: "_uuid", OID: UUIDArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[UUIDOID]}}) + defaultMap.RegisterType(&Type{Name: "_varbit", OID: VarbitArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[VarbitOID]}}) + defaultMap.RegisterType(&Type{Name: "_varchar", OID: VarcharArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[VarcharOID]}}) + defaultMap.RegisterType(&Type{Name: "_xid", OID: XIDArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[XIDOID]}}) + defaultMap.RegisterType(&Type{Name: "_xml", OID: XMLArrayOID, Codec: &ArrayCodec{ElementType: defaultMap.oidToType[XMLOID]}}) + + // Integer types that directly map to a PostgreSQL type + registerDefaultPgTypeVariants[int16](defaultMap, "int2") + registerDefaultPgTypeVariants[int32](defaultMap, "int4") + registerDefaultPgTypeVariants[int64](defaultMap, "int8") + + // Integer types that do not have a direct match to a PostgreSQL type + registerDefaultPgTypeVariants[int8](defaultMap, "int8") + registerDefaultPgTypeVariants[int](defaultMap, "int8") + registerDefaultPgTypeVariants[uint8](defaultMap, "int8") + registerDefaultPgTypeVariants[uint16](defaultMap, "int8") + registerDefaultPgTypeVariants[uint32](defaultMap, "int8") + registerDefaultPgTypeVariants[uint64](defaultMap, "numeric") + registerDefaultPgTypeVariants[uint](defaultMap, "numeric") + + registerDefaultPgTypeVariants[float32](defaultMap, "float4") + registerDefaultPgTypeVariants[float64](defaultMap, "float8") + + registerDefaultPgTypeVariants[bool](defaultMap, "bool") + registerDefaultPgTypeVariants[time.Time](defaultMap, "timestamptz") + registerDefaultPgTypeVariants[time.Duration](defaultMap, "interval") + registerDefaultPgTypeVariants[string](defaultMap, "text") + registerDefaultPgTypeVariants[json.RawMessage](defaultMap, "json") + registerDefaultPgTypeVariants[[]byte](defaultMap, "bytea") + + registerDefaultPgTypeVariants[net.IP](defaultMap, "inet") + registerDefaultPgTypeVariants[net.IPNet](defaultMap, "cidr") + registerDefaultPgTypeVariants[netip.Addr](defaultMap, "inet") + registerDefaultPgTypeVariants[netip.Prefix](defaultMap, "cidr") + + // pgtype provided structs + registerDefaultPgTypeVariants[Bits](defaultMap, "varbit") + registerDefaultPgTypeVariants[Bool](defaultMap, "bool") + registerDefaultPgTypeVariants[Box](defaultMap, "box") + registerDefaultPgTypeVariants[Circle](defaultMap, "circle") + registerDefaultPgTypeVariants[Date](defaultMap, "date") + registerDefaultPgTypeVariants[Range[Date]](defaultMap, "daterange") + registerDefaultPgTypeVariants[Multirange[Range[Date]]](defaultMap, "datemultirange") + registerDefaultPgTypeVariants[Float4](defaultMap, "float4") + registerDefaultPgTypeVariants[Float8](defaultMap, "float8") + registerDefaultPgTypeVariants[Range[Float8]](defaultMap, "numrange") // There is no PostgreSQL builtin float8range so map it to numrange. + registerDefaultPgTypeVariants[Multirange[Range[Float8]]](defaultMap, "nummultirange") // There is no PostgreSQL builtin float8multirange so map it to nummultirange. + registerDefaultPgTypeVariants[Int2](defaultMap, "int2") + registerDefaultPgTypeVariants[Int4](defaultMap, "int4") + registerDefaultPgTypeVariants[Range[Int4]](defaultMap, "int4range") + registerDefaultPgTypeVariants[Multirange[Range[Int4]]](defaultMap, "int4multirange") + registerDefaultPgTypeVariants[Int8](defaultMap, "int8") + registerDefaultPgTypeVariants[Range[Int8]](defaultMap, "int8range") + registerDefaultPgTypeVariants[Multirange[Range[Int8]]](defaultMap, "int8multirange") + registerDefaultPgTypeVariants[Interval](defaultMap, "interval") + registerDefaultPgTypeVariants[Line](defaultMap, "line") + registerDefaultPgTypeVariants[Lseg](defaultMap, "lseg") + registerDefaultPgTypeVariants[Numeric](defaultMap, "numeric") + registerDefaultPgTypeVariants[Range[Numeric]](defaultMap, "numrange") + registerDefaultPgTypeVariants[Multirange[Range[Numeric]]](defaultMap, "nummultirange") + registerDefaultPgTypeVariants[Path](defaultMap, "path") + registerDefaultPgTypeVariants[Point](defaultMap, "point") + registerDefaultPgTypeVariants[Polygon](defaultMap, "polygon") + registerDefaultPgTypeVariants[TID](defaultMap, "tid") + registerDefaultPgTypeVariants[Text](defaultMap, "text") + registerDefaultPgTypeVariants[Time](defaultMap, "time") + registerDefaultPgTypeVariants[Timestamp](defaultMap, "timestamp") + registerDefaultPgTypeVariants[Timestamptz](defaultMap, "timestamptz") + registerDefaultPgTypeVariants[Range[Timestamp]](defaultMap, "tsrange") + registerDefaultPgTypeVariants[Multirange[Range[Timestamp]]](defaultMap, "tsmultirange") + registerDefaultPgTypeVariants[Range[Timestamptz]](defaultMap, "tstzrange") + registerDefaultPgTypeVariants[Multirange[Range[Timestamptz]]](defaultMap, "tstzmultirange") + registerDefaultPgTypeVariants[UUID](defaultMap, "uuid") + + defaultMap.buildReflectTypeToType() +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/point.go b/vendor/github.com/jackc/pgx/v5/pgtype/point.go new file mode 100644 index 0000000..09b19bb --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/point.go @@ -0,0 +1,266 @@ +package pgtype + +import ( + "bytes" + "database/sql/driver" + "encoding/binary" + "fmt" + "math" + "strconv" + "strings" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +type Vec2 struct { + X float64 + Y float64 +} + +type PointScanner interface { + ScanPoint(v Point) error +} + +type PointValuer interface { + PointValue() (Point, error) +} + +type Point struct { + P Vec2 + Valid bool +} + +func (p *Point) ScanPoint(v Point) error { + *p = v + return nil +} + +func (p Point) PointValue() (Point, error) { + return p, nil +} + +func parsePoint(src []byte) (*Point, error) { + if src == nil || bytes.Equal(src, []byte("null")) { + return &Point{}, nil + } + + if len(src) < 5 { + return nil, fmt.Errorf("invalid length for point: %v", len(src)) + } + if src[0] == '"' && src[len(src)-1] == '"' { + src = src[1 : len(src)-1] + } + sx, sy, found := strings.Cut(string(src[1:len(src)-1]), ",") + if !found { + return nil, fmt.Errorf("invalid format for point") + } + + x, err := strconv.ParseFloat(sx, 64) + if err != nil { + return nil, err + } + + y, err := strconv.ParseFloat(sy, 64) + if err != nil { + return nil, err + } + + return &Point{P: Vec2{x, y}, Valid: true}, nil +} + +// Scan implements the database/sql Scanner interface. +func (dst *Point) Scan(src any) error { + if src == nil { + *dst = Point{} + return nil + } + + switch src := src.(type) { + case string: + return scanPlanTextAnyToPointScanner{}.Scan([]byte(src), dst) + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (src Point) Value() (driver.Value, error) { + if !src.Valid { + return nil, nil + } + + buf, err := PointCodec{}.PlanEncode(nil, 0, TextFormatCode, src).Encode(src, nil) + if err != nil { + return nil, err + } + return string(buf), err +} + +func (src Point) MarshalJSON() ([]byte, error) { + if !src.Valid { + return []byte("null"), nil + } + + var buff bytes.Buffer + buff.WriteByte('"') + buff.WriteString(fmt.Sprintf("(%g,%g)", src.P.X, src.P.Y)) + buff.WriteByte('"') + return buff.Bytes(), nil +} + +func (dst *Point) UnmarshalJSON(point []byte) error { + p, err := parsePoint(point) + if err != nil { + return err + } + *dst = *p + return nil +} + +type PointCodec struct{} + +func (PointCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (PointCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (PointCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + if _, ok := value.(PointValuer); !ok { + return nil + } + + switch format { + case BinaryFormatCode: + return encodePlanPointCodecBinary{} + case TextFormatCode: + return encodePlanPointCodecText{} + } + + return nil +} + +type encodePlanPointCodecBinary struct{} + +func (encodePlanPointCodecBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + point, err := value.(PointValuer).PointValue() + if err != nil { + return nil, err + } + + if !point.Valid { + return nil, nil + } + + buf = pgio.AppendUint64(buf, math.Float64bits(point.P.X)) + buf = pgio.AppendUint64(buf, math.Float64bits(point.P.Y)) + return buf, nil +} + +type encodePlanPointCodecText struct{} + +func (encodePlanPointCodecText) Encode(value any, buf []byte) (newBuf []byte, err error) { + point, err := value.(PointValuer).PointValue() + if err != nil { + return nil, err + } + + if !point.Valid { + return nil, nil + } + + return append(buf, fmt.Sprintf(`(%s,%s)`, + strconv.FormatFloat(point.P.X, 'f', -1, 64), + strconv.FormatFloat(point.P.Y, 'f', -1, 64), + )...), nil +} + +func (PointCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case PointScanner: + return scanPlanBinaryPointToPointScanner{} + } + case TextFormatCode: + switch target.(type) { + case PointScanner: + return scanPlanTextAnyToPointScanner{} + } + } + + return nil +} + +func (c PointCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return codecDecodeToTextFormat(c, m, oid, format, src) +} + +func (c PointCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var point Point + err := codecScan(c, m, oid, format, src, &point) + if err != nil { + return nil, err + } + return point, nil +} + +type scanPlanBinaryPointToPointScanner struct{} + +func (scanPlanBinaryPointToPointScanner) Scan(src []byte, dst any) error { + scanner := (dst).(PointScanner) + + if src == nil { + return scanner.ScanPoint(Point{}) + } + + if len(src) != 16 { + return fmt.Errorf("invalid length for point: %v", len(src)) + } + + x := binary.BigEndian.Uint64(src) + y := binary.BigEndian.Uint64(src[8:]) + + return scanner.ScanPoint(Point{ + P: Vec2{math.Float64frombits(x), math.Float64frombits(y)}, + Valid: true, + }) +} + +type scanPlanTextAnyToPointScanner struct{} + +func (scanPlanTextAnyToPointScanner) Scan(src []byte, dst any) error { + scanner := (dst).(PointScanner) + + if src == nil { + return scanner.ScanPoint(Point{}) + } + + if len(src) < 5 { + return fmt.Errorf("invalid length for point: %v", len(src)) + } + + sx, sy, found := strings.Cut(string(src[1:len(src)-1]), ",") + if !found { + return fmt.Errorf("invalid format for point") + } + + x, err := strconv.ParseFloat(sx, 64) + if err != nil { + return err + } + + y, err := strconv.ParseFloat(sy, 64) + if err != nil { + return err + } + + return scanner.ScanPoint(Point{P: Vec2{x, y}, Valid: true}) +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/polygon.go b/vendor/github.com/jackc/pgx/v5/pgtype/polygon.go new file mode 100644 index 0000000..04b0ba6 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/polygon.go @@ -0,0 +1,253 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "fmt" + "math" + "strconv" + "strings" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +type PolygonScanner interface { + ScanPolygon(v Polygon) error +} + +type PolygonValuer interface { + PolygonValue() (Polygon, error) +} + +type Polygon struct { + P []Vec2 + Valid bool +} + +func (p *Polygon) ScanPolygon(v Polygon) error { + *p = v + return nil +} + +func (p Polygon) PolygonValue() (Polygon, error) { + return p, nil +} + +// Scan implements the database/sql Scanner interface. +func (p *Polygon) Scan(src any) error { + if src == nil { + *p = Polygon{} + return nil + } + + switch src := src.(type) { + case string: + return scanPlanTextAnyToPolygonScanner{}.Scan([]byte(src), p) + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (p Polygon) Value() (driver.Value, error) { + if !p.Valid { + return nil, nil + } + + buf, err := PolygonCodec{}.PlanEncode(nil, 0, TextFormatCode, p).Encode(p, nil) + if err != nil { + return nil, err + } + + return string(buf), err +} + +type PolygonCodec struct{} + +func (PolygonCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (PolygonCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (PolygonCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + if _, ok := value.(PolygonValuer); !ok { + return nil + } + + switch format { + case BinaryFormatCode: + return encodePlanPolygonCodecBinary{} + case TextFormatCode: + return encodePlanPolygonCodecText{} + } + + return nil +} + +type encodePlanPolygonCodecBinary struct{} + +func (encodePlanPolygonCodecBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + polygon, err := value.(PolygonValuer).PolygonValue() + if err != nil { + return nil, err + } + + if !polygon.Valid { + return nil, nil + } + + buf = pgio.AppendInt32(buf, int32(len(polygon.P))) + + for _, p := range polygon.P { + buf = pgio.AppendUint64(buf, math.Float64bits(p.X)) + buf = pgio.AppendUint64(buf, math.Float64bits(p.Y)) + } + + return buf, nil +} + +type encodePlanPolygonCodecText struct{} + +func (encodePlanPolygonCodecText) Encode(value any, buf []byte) (newBuf []byte, err error) { + polygon, err := value.(PolygonValuer).PolygonValue() + if err != nil { + return nil, err + } + + if !polygon.Valid { + return nil, nil + } + + buf = append(buf, '(') + + for i, p := range polygon.P { + if i > 0 { + buf = append(buf, ',') + } + buf = append(buf, fmt.Sprintf(`(%s,%s)`, + strconv.FormatFloat(p.X, 'f', -1, 64), + strconv.FormatFloat(p.Y, 'f', -1, 64), + )...) + } + + buf = append(buf, ')') + + return buf, nil +} + +func (PolygonCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case PolygonScanner: + return scanPlanBinaryPolygonToPolygonScanner{} + } + case TextFormatCode: + switch target.(type) { + case PolygonScanner: + return scanPlanTextAnyToPolygonScanner{} + } + } + + return nil +} + +type scanPlanBinaryPolygonToPolygonScanner struct{} + +func (scanPlanBinaryPolygonToPolygonScanner) Scan(src []byte, dst any) error { + scanner := (dst).(PolygonScanner) + + if src == nil { + return scanner.ScanPolygon(Polygon{}) + } + + if len(src) < 5 { + return fmt.Errorf("invalid length for polygon: %v", len(src)) + } + + pointCount := int(binary.BigEndian.Uint32(src)) + rp := 4 + + if 4+pointCount*16 != len(src) { + return fmt.Errorf("invalid length for Polygon with %d points: %v", pointCount, len(src)) + } + + points := make([]Vec2, pointCount) + for i := 0; i < len(points); i++ { + x := binary.BigEndian.Uint64(src[rp:]) + rp += 8 + y := binary.BigEndian.Uint64(src[rp:]) + rp += 8 + points[i] = Vec2{math.Float64frombits(x), math.Float64frombits(y)} + } + + return scanner.ScanPolygon(Polygon{ + P: points, + Valid: true, + }) +} + +type scanPlanTextAnyToPolygonScanner struct{} + +func (scanPlanTextAnyToPolygonScanner) Scan(src []byte, dst any) error { + scanner := (dst).(PolygonScanner) + + if src == nil { + return scanner.ScanPolygon(Polygon{}) + } + + if len(src) < 7 { + return fmt.Errorf("invalid length for Polygon: %v", len(src)) + } + + points := make([]Vec2, 0) + + str := string(src[2:]) + + for { + end := strings.IndexByte(str, ',') + x, err := strconv.ParseFloat(str[:end], 64) + if err != nil { + return err + } + + str = str[end+1:] + end = strings.IndexByte(str, ')') + + y, err := strconv.ParseFloat(str[:end], 64) + if err != nil { + return err + } + + points = append(points, Vec2{x, y}) + + if end+3 < len(str) { + str = str[end+3:] + } else { + break + } + } + + return scanner.ScanPolygon(Polygon{P: points, Valid: true}) +} + +func (c PolygonCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return codecDecodeToTextFormat(c, m, oid, format, src) +} + +func (c PolygonCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var polygon Polygon + err := codecScan(c, m, oid, format, src, &polygon) + if err != nil { + return nil, err + } + return polygon, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/qchar.go b/vendor/github.com/jackc/pgx/v5/pgtype/qchar.go new file mode 100644 index 0000000..fc40a5b --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/qchar.go @@ -0,0 +1,141 @@ +package pgtype + +import ( + "database/sql/driver" + "fmt" + "math" +) + +// QCharCodec is for PostgreSQL's special 8-bit-only "char" type more akin to the C +// language's char type, or Go's byte type. (Note that the name in PostgreSQL +// itself is "char", in double-quotes, and not char.) It gets used a lot in +// PostgreSQL's system tables to hold a single ASCII character value (eg +// pg_class.relkind). It is named Qchar for quoted char to disambiguate from SQL +// standard type char. +type QCharCodec struct{} + +func (QCharCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (QCharCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (QCharCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + switch format { + case TextFormatCode, BinaryFormatCode: + switch value.(type) { + case byte: + return encodePlanQcharCodecByte{} + case rune: + return encodePlanQcharCodecRune{} + } + } + + return nil +} + +type encodePlanQcharCodecByte struct{} + +func (encodePlanQcharCodecByte) Encode(value any, buf []byte) (newBuf []byte, err error) { + b := value.(byte) + buf = append(buf, b) + return buf, nil +} + +type encodePlanQcharCodecRune struct{} + +func (encodePlanQcharCodecRune) Encode(value any, buf []byte) (newBuf []byte, err error) { + r := value.(rune) + if r > math.MaxUint8 { + return nil, fmt.Errorf(`%v cannot be encoded to "char"`, r) + } + b := byte(r) + buf = append(buf, b) + return buf, nil +} + +func (QCharCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + switch format { + case TextFormatCode, BinaryFormatCode: + switch target.(type) { + case *byte: + return scanPlanQcharCodecByte{} + case *rune: + return scanPlanQcharCodecRune{} + } + } + + return nil +} + +type scanPlanQcharCodecByte struct{} + +func (scanPlanQcharCodecByte) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) > 1 { + return fmt.Errorf(`invalid length for "char": %v`, len(src)) + } + + b := dst.(*byte) + // In the text format the zero value is returned as a zero byte value instead of 0 + if len(src) == 0 { + *b = 0 + } else { + *b = src[0] + } + + return nil +} + +type scanPlanQcharCodecRune struct{} + +func (scanPlanQcharCodecRune) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) > 1 { + return fmt.Errorf(`invalid length for "char": %v`, len(src)) + } + + r := dst.(*rune) + // In the text format the zero value is returned as a zero byte value instead of 0 + if len(src) == 0 { + *r = 0 + } else { + *r = rune(src[0]) + } + + return nil +} + +func (c QCharCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + var r rune + err := codecScan(c, m, oid, format, src, &r) + if err != nil { + return nil, err + } + return string(r), nil +} + +func (c QCharCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var r rune + err := codecScan(c, m, oid, format, src, &r) + if err != nil { + return nil, err + } + return r, nil +} diff --git a/vendor/github.com/jackc/pgtype/range.go b/vendor/github.com/jackc/pgx/v5/pgtype/range.go similarity index 76% rename from vendor/github.com/jackc/pgtype/range.go rename to vendor/github.com/jackc/pgx/v5/pgtype/range.go index e999f6a..16427cc 100644 --- a/vendor/github.com/jackc/pgtype/range.go +++ b/vendor/github.com/jackc/pgx/v5/pgtype/range.go @@ -19,15 +19,15 @@ func (bt BoundType) String() string { return string(bt) } -type UntypedTextRange struct { +type untypedTextRange struct { Lower string Upper string LowerType BoundType UpperType BoundType } -func ParseUntypedTextRange(src string) (*UntypedTextRange, error) { - utr := &UntypedTextRange{} +func parseUntypedTextRange(src string) (*untypedTextRange, error) { + utr := &untypedTextRange{} if src == "empty" { utr.LowerType = Empty utr.UpperType = Empty @@ -40,7 +40,7 @@ func ParseUntypedTextRange(src string) (*UntypedTextRange, error) { r, _, err := buf.ReadRune() if err != nil { - return nil, fmt.Errorf("invalid lower bound: %v", err) + return nil, fmt.Errorf("invalid lower bound: %w", err) } switch r { case '(': @@ -53,7 +53,7 @@ func ParseUntypedTextRange(src string) (*UntypedTextRange, error) { r, _, err = buf.ReadRune() if err != nil { - return nil, fmt.Errorf("invalid lower value: %v", err) + return nil, fmt.Errorf("invalid lower value: %w", err) } buf.UnreadRune() @@ -62,13 +62,13 @@ func ParseUntypedTextRange(src string) (*UntypedTextRange, error) { } else { utr.Lower, err = rangeParseValue(buf) if err != nil { - return nil, fmt.Errorf("invalid lower value: %v", err) + return nil, fmt.Errorf("invalid lower value: %w", err) } } r, _, err = buf.ReadRune() if err != nil { - return nil, fmt.Errorf("missing range separator: %v", err) + return nil, fmt.Errorf("missing range separator: %w", err) } if r != ',' { return nil, fmt.Errorf("missing range separator: %v", r) @@ -76,7 +76,7 @@ func ParseUntypedTextRange(src string) (*UntypedTextRange, error) { r, _, err = buf.ReadRune() if err != nil { - return nil, fmt.Errorf("invalid upper value: %v", err) + return nil, fmt.Errorf("invalid upper value: %w", err) } if r == ')' || r == ']' { @@ -85,12 +85,12 @@ func ParseUntypedTextRange(src string) (*UntypedTextRange, error) { buf.UnreadRune() utr.Upper, err = rangeParseValue(buf) if err != nil { - return nil, fmt.Errorf("invalid upper value: %v", err) + return nil, fmt.Errorf("invalid upper value: %w", err) } r, _, err = buf.ReadRune() if err != nil { - return nil, fmt.Errorf("missing upper bound: %v", err) + return nil, fmt.Errorf("missing upper bound: %w", err) } switch r { case ')': @@ -173,7 +173,7 @@ func rangeParseQuotedValue(buf *bytes.Buffer) (string, error) { } } -type UntypedBinaryRange struct { +type untypedBinaryRange struct { Lower []byte Upper []byte LowerType BoundType @@ -197,8 +197,8 @@ const upperInclusiveMask = 4 const lowerUnboundedMask = 8 const upperUnboundedMask = 16 -func ParseUntypedBinaryRange(src []byte) (*UntypedBinaryRange, error) { - ubr := &UntypedBinaryRange{} +func parseUntypedBinaryRange(src []byte) (*untypedBinaryRange, error) { + ubr := &untypedBinaryRange{} if len(src) == 0 { return nil, fmt.Errorf("range too short: %v", len(src)) @@ -275,3 +275,48 @@ func ParseUntypedBinaryRange(src []byte) (*UntypedBinaryRange, error) { return ubr, nil } + +// Range is a generic range type. +type Range[T any] struct { + Lower T + Upper T + LowerType BoundType + UpperType BoundType + Valid bool +} + +func (r Range[T]) IsNull() bool { + return !r.Valid +} + +func (r Range[T]) BoundTypes() (lower, upper BoundType) { + return r.LowerType, r.UpperType +} + +func (r Range[T]) Bounds() (lower, upper any) { + return &r.Lower, &r.Upper +} + +func (r *Range[T]) ScanNull() error { + *r = Range[T]{} + return nil +} + +func (r *Range[T]) ScanBounds() (lowerTarget, upperTarget any) { + return &r.Lower, &r.Upper +} + +func (r *Range[T]) SetBoundTypes(lower, upper BoundType) error { + if lower == Unbounded || lower == Empty { + var zero T + r.Lower = zero + } + if upper == Unbounded || upper == Empty { + var zero T + r.Upper = zero + } + r.LowerType = lower + r.UpperType = upper + r.Valid = true + return nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/range_codec.go b/vendor/github.com/jackc/pgx/v5/pgtype/range_codec.go new file mode 100644 index 0000000..684f1bf --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/range_codec.go @@ -0,0 +1,379 @@ +package pgtype + +import ( + "database/sql/driver" + "fmt" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +// RangeValuer is a type that can be converted into a PostgreSQL range. +type RangeValuer interface { + // IsNull returns true if the value is SQL NULL. + IsNull() bool + + // BoundTypes returns the lower and upper bound types. + BoundTypes() (lower, upper BoundType) + + // Bounds returns the lower and upper range values. + Bounds() (lower, upper any) +} + +// RangeScanner is a type can be scanned from a PostgreSQL range. +type RangeScanner interface { + // ScanNull sets the value to SQL NULL. + ScanNull() error + + // ScanBounds returns values usable as a scan target. The returned values may not be scanned if the range is empty or + // the bound type is unbounded. + ScanBounds() (lowerTarget, upperTarget any) + + // SetBoundTypes sets the lower and upper bound types. ScanBounds will be called and the returned values scanned + // (if appropriate) before SetBoundTypes is called. If the bound types are unbounded or empty this method must + // also set the bound values. + SetBoundTypes(lower, upper BoundType) error +} + +// RangeCodec is a codec for any range type. +type RangeCodec struct { + ElementType *Type +} + +func (c *RangeCodec) FormatSupported(format int16) bool { + return c.ElementType.Codec.FormatSupported(format) +} + +func (c *RangeCodec) PreferredFormat() int16 { + if c.FormatSupported(BinaryFormatCode) { + return BinaryFormatCode + } + return TextFormatCode +} + +func (c *RangeCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + if _, ok := value.(RangeValuer); !ok { + return nil + } + + switch format { + case BinaryFormatCode: + return &encodePlanRangeCodecRangeValuerToBinary{rc: c, m: m} + case TextFormatCode: + return &encodePlanRangeCodecRangeValuerToText{rc: c, m: m} + } + + return nil +} + +type encodePlanRangeCodecRangeValuerToBinary struct { + rc *RangeCodec + m *Map +} + +func (plan *encodePlanRangeCodecRangeValuerToBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + getter := value.(RangeValuer) + + if getter.IsNull() { + return nil, nil + } + + lowerType, upperType := getter.BoundTypes() + lower, upper := getter.Bounds() + + var rangeType byte + switch lowerType { + case Inclusive: + rangeType |= lowerInclusiveMask + case Unbounded: + rangeType |= lowerUnboundedMask + case Exclusive: + case Empty: + return append(buf, emptyMask), nil + default: + return nil, fmt.Errorf("unknown LowerType: %v", lowerType) + } + + switch upperType { + case Inclusive: + rangeType |= upperInclusiveMask + case Unbounded: + rangeType |= upperUnboundedMask + case Exclusive: + default: + return nil, fmt.Errorf("unknown UpperType: %v", upperType) + } + + buf = append(buf, rangeType) + + if lowerType != Unbounded { + if lower == nil { + return nil, fmt.Errorf("Lower cannot be NULL unless LowerType is Unbounded") + } + + sp := len(buf) + buf = pgio.AppendInt32(buf, -1) + + lowerPlan := plan.m.PlanEncode(plan.rc.ElementType.OID, BinaryFormatCode, lower) + if lowerPlan == nil { + return nil, fmt.Errorf("cannot encode %v as element of range", lower) + } + + buf, err = lowerPlan.Encode(lower, buf) + if err != nil { + return nil, fmt.Errorf("failed to encode %v as element of range: %w", lower, err) + } + if buf == nil { + return nil, fmt.Errorf("Lower cannot be NULL unless LowerType is Unbounded") + } + + pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) + } + + if upperType != Unbounded { + if upper == nil { + return nil, fmt.Errorf("Upper cannot be NULL unless UpperType is Unbounded") + } + + sp := len(buf) + buf = pgio.AppendInt32(buf, -1) + + upperPlan := plan.m.PlanEncode(plan.rc.ElementType.OID, BinaryFormatCode, upper) + if upperPlan == nil { + return nil, fmt.Errorf("cannot encode %v as element of range", upper) + } + + buf, err = upperPlan.Encode(upper, buf) + if err != nil { + return nil, fmt.Errorf("failed to encode %v as element of range: %w", upper, err) + } + if buf == nil { + return nil, fmt.Errorf("Upper cannot be NULL unless UpperType is Unbounded") + } + + pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) + } + + return buf, nil +} + +type encodePlanRangeCodecRangeValuerToText struct { + rc *RangeCodec + m *Map +} + +func (plan *encodePlanRangeCodecRangeValuerToText) Encode(value any, buf []byte) (newBuf []byte, err error) { + getter := value.(RangeValuer) + + if getter.IsNull() { + return nil, nil + } + + lowerType, upperType := getter.BoundTypes() + lower, upper := getter.Bounds() + + switch lowerType { + case Exclusive, Unbounded: + buf = append(buf, '(') + case Inclusive: + buf = append(buf, '[') + case Empty: + return append(buf, "empty"...), nil + default: + return nil, fmt.Errorf("unknown lower bound type %v", lowerType) + } + + if lowerType != Unbounded { + if lower == nil { + return nil, fmt.Errorf("Lower cannot be NULL unless LowerType is Unbounded") + } + + lowerPlan := plan.m.PlanEncode(plan.rc.ElementType.OID, TextFormatCode, lower) + if lowerPlan == nil { + return nil, fmt.Errorf("cannot encode %v as element of range", lower) + } + + buf, err = lowerPlan.Encode(lower, buf) + if err != nil { + return nil, fmt.Errorf("failed to encode %v as element of range: %w", lower, err) + } + if buf == nil { + return nil, fmt.Errorf("Lower cannot be NULL unless LowerType is Unbounded") + } + } + + buf = append(buf, ',') + + if upperType != Unbounded { + if upper == nil { + return nil, fmt.Errorf("Upper cannot be NULL unless UpperType is Unbounded") + } + + upperPlan := plan.m.PlanEncode(plan.rc.ElementType.OID, TextFormatCode, upper) + if upperPlan == nil { + return nil, fmt.Errorf("cannot encode %v as element of range", upper) + } + + buf, err = upperPlan.Encode(upper, buf) + if err != nil { + return nil, fmt.Errorf("failed to encode %v as element of range: %w", upper, err) + } + if buf == nil { + return nil, fmt.Errorf("Upper cannot be NULL unless UpperType is Unbounded") + } + } + + switch upperType { + case Exclusive, Unbounded: + buf = append(buf, ')') + case Inclusive: + buf = append(buf, ']') + default: + return nil, fmt.Errorf("unknown upper bound type %v", upperType) + } + + return buf, nil +} + +func (c *RangeCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + switch format { + case BinaryFormatCode: + switch target.(type) { + case RangeScanner: + return &scanPlanBinaryRangeToRangeScanner{rc: c, m: m} + } + case TextFormatCode: + switch target.(type) { + case RangeScanner: + return &scanPlanTextRangeToRangeScanner{rc: c, m: m} + } + } + + return nil +} + +type scanPlanBinaryRangeToRangeScanner struct { + rc *RangeCodec + m *Map +} + +func (plan *scanPlanBinaryRangeToRangeScanner) Scan(src []byte, target any) error { + rangeScanner := (target).(RangeScanner) + + if src == nil { + return rangeScanner.ScanNull() + } + + ubr, err := parseUntypedBinaryRange(src) + if err != nil { + return err + } + + if ubr.LowerType == Empty { + return rangeScanner.SetBoundTypes(ubr.LowerType, ubr.UpperType) + } + + lowerTarget, upperTarget := rangeScanner.ScanBounds() + + if ubr.LowerType == Inclusive || ubr.LowerType == Exclusive { + lowerPlan := plan.m.PlanScan(plan.rc.ElementType.OID, BinaryFormatCode, lowerTarget) + if lowerPlan == nil { + return fmt.Errorf("cannot scan into %v from range element", lowerTarget) + } + + err = lowerPlan.Scan(ubr.Lower, lowerTarget) + if err != nil { + return fmt.Errorf("cannot scan into %v from range element: %w", lowerTarget, err) + } + } + + if ubr.UpperType == Inclusive || ubr.UpperType == Exclusive { + upperPlan := plan.m.PlanScan(plan.rc.ElementType.OID, BinaryFormatCode, upperTarget) + if upperPlan == nil { + return fmt.Errorf("cannot scan into %v from range element", upperTarget) + } + + err = upperPlan.Scan(ubr.Upper, upperTarget) + if err != nil { + return fmt.Errorf("cannot scan into %v from range element: %w", upperTarget, err) + } + } + + return rangeScanner.SetBoundTypes(ubr.LowerType, ubr.UpperType) +} + +type scanPlanTextRangeToRangeScanner struct { + rc *RangeCodec + m *Map +} + +func (plan *scanPlanTextRangeToRangeScanner) Scan(src []byte, target any) error { + rangeScanner := (target).(RangeScanner) + + if src == nil { + return rangeScanner.ScanNull() + } + + utr, err := parseUntypedTextRange(string(src)) + if err != nil { + return err + } + + if utr.LowerType == Empty { + return rangeScanner.SetBoundTypes(utr.LowerType, utr.UpperType) + } + + lowerTarget, upperTarget := rangeScanner.ScanBounds() + + if utr.LowerType == Inclusive || utr.LowerType == Exclusive { + lowerPlan := plan.m.PlanScan(plan.rc.ElementType.OID, TextFormatCode, lowerTarget) + if lowerPlan == nil { + return fmt.Errorf("cannot scan into %v from range element", lowerTarget) + } + + err = lowerPlan.Scan([]byte(utr.Lower), lowerTarget) + if err != nil { + return fmt.Errorf("cannot scan into %v from range element: %w", lowerTarget, err) + } + } + + if utr.UpperType == Inclusive || utr.UpperType == Exclusive { + upperPlan := plan.m.PlanScan(plan.rc.ElementType.OID, TextFormatCode, upperTarget) + if upperPlan == nil { + return fmt.Errorf("cannot scan into %v from range element", upperTarget) + } + + err = upperPlan.Scan([]byte(utr.Upper), upperTarget) + if err != nil { + return fmt.Errorf("cannot scan into %v from range element: %w", upperTarget, err) + } + } + + return rangeScanner.SetBoundTypes(utr.LowerType, utr.UpperType) +} + +func (c *RangeCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + switch format { + case TextFormatCode: + return string(src), nil + case BinaryFormatCode: + buf := make([]byte, len(src)) + copy(buf, src) + return buf, nil + default: + return nil, fmt.Errorf("unknown format code %d", format) + } +} + +func (c *RangeCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var r Range[any] + err := c.PlanScan(m, oid, format, &r).Scan(src, &r) + return r, err +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/record_codec.go b/vendor/github.com/jackc/pgx/v5/pgtype/record_codec.go new file mode 100644 index 0000000..b3b1660 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/record_codec.go @@ -0,0 +1,125 @@ +package pgtype + +import ( + "database/sql/driver" + "fmt" +) + +// ArrayGetter is a type that can be converted into a PostgreSQL array. + +// RecordCodec is a codec for the generic PostgreSQL record type such as is created with the "row" function. Record can +// only decode the binary format. The text format output format from PostgreSQL does not include type information and +// is therefore impossible to decode. Encoding is impossible because PostgreSQL does not support input of generic +// records. +type RecordCodec struct{} + +func (RecordCodec) FormatSupported(format int16) bool { + return format == BinaryFormatCode +} + +func (RecordCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (RecordCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + return nil +} + +func (RecordCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + if format == BinaryFormatCode { + switch target.(type) { + case CompositeIndexScanner: + return &scanPlanBinaryRecordToCompositeIndexScanner{m: m} + } + } + + return nil +} + +type scanPlanBinaryRecordToCompositeIndexScanner struct { + m *Map +} + +func (plan *scanPlanBinaryRecordToCompositeIndexScanner) Scan(src []byte, target any) error { + targetScanner := (target).(CompositeIndexScanner) + + if src == nil { + return targetScanner.ScanNull() + } + + scanner := NewCompositeBinaryScanner(plan.m, src) + for i := 0; scanner.Next(); i++ { + fieldTarget := targetScanner.ScanIndex(i) + if fieldTarget != nil { + fieldPlan := plan.m.PlanScan(scanner.OID(), BinaryFormatCode, fieldTarget) + if fieldPlan == nil { + return fmt.Errorf("unable to scan OID %d in binary format into %v", scanner.OID(), fieldTarget) + } + + err := fieldPlan.Scan(scanner.Bytes(), fieldTarget) + if err != nil { + return err + } + } + } + + if err := scanner.Err(); err != nil { + return err + } + + return nil +} + +func (RecordCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + switch format { + case TextFormatCode: + return string(src), nil + case BinaryFormatCode: + buf := make([]byte, len(src)) + copy(buf, src) + return buf, nil + default: + return nil, fmt.Errorf("unknown format code %d", format) + } +} + +func (RecordCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + switch format { + case TextFormatCode: + return string(src), nil + case BinaryFormatCode: + scanner := NewCompositeBinaryScanner(m, src) + values := make([]any, scanner.FieldCount()) + for i := 0; scanner.Next(); i++ { + var v any + fieldPlan := m.PlanScan(scanner.OID(), BinaryFormatCode, &v) + if fieldPlan == nil { + return nil, fmt.Errorf("unable to scan OID %d in binary format into %v", scanner.OID(), v) + } + + err := fieldPlan.Scan(scanner.Bytes(), &v) + if err != nil { + return nil, err + } + + values[i] = v + } + + if err := scanner.Err(); err != nil { + return nil, err + } + + return values, nil + default: + return nil, fmt.Errorf("unknown format code %d", format) + } + +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/register_default_pg_types.go b/vendor/github.com/jackc/pgx/v5/pgtype/register_default_pg_types.go new file mode 100644 index 0000000..be1ca4a --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/register_default_pg_types.go @@ -0,0 +1,35 @@ +//go:build !nopgxregisterdefaulttypes + +package pgtype + +func registerDefaultPgTypeVariants[T any](m *Map, name string) { + arrayName := "_" + name + + var value T + m.RegisterDefaultPgType(value, name) // T + m.RegisterDefaultPgType(&value, name) // *T + + var sliceT []T + m.RegisterDefaultPgType(sliceT, arrayName) // []T + m.RegisterDefaultPgType(&sliceT, arrayName) // *[]T + + var slicePtrT []*T + m.RegisterDefaultPgType(slicePtrT, arrayName) // []*T + m.RegisterDefaultPgType(&slicePtrT, arrayName) // *[]*T + + var arrayOfT Array[T] + m.RegisterDefaultPgType(arrayOfT, arrayName) // Array[T] + m.RegisterDefaultPgType(&arrayOfT, arrayName) // *Array[T] + + var arrayOfPtrT Array[*T] + m.RegisterDefaultPgType(arrayOfPtrT, arrayName) // Array[*T] + m.RegisterDefaultPgType(&arrayOfPtrT, arrayName) // *Array[*T] + + var flatArrayOfT FlatArray[T] + m.RegisterDefaultPgType(flatArrayOfT, arrayName) // FlatArray[T] + m.RegisterDefaultPgType(&flatArrayOfT, arrayName) // *FlatArray[T] + + var flatArrayOfPtrT FlatArray[*T] + m.RegisterDefaultPgType(flatArrayOfPtrT, arrayName) // FlatArray[*T] + m.RegisterDefaultPgType(&flatArrayOfPtrT, arrayName) // *FlatArray[*T] +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/register_default_pg_types_disabled.go b/vendor/github.com/jackc/pgx/v5/pgtype/register_default_pg_types_disabled.go new file mode 100644 index 0000000..56fe7c2 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/register_default_pg_types_disabled.go @@ -0,0 +1,6 @@ +//go:build nopgxregisterdefaulttypes + +package pgtype + +func registerDefaultPgTypeVariants[T any](m *Map, name string) { +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/text.go b/vendor/github.com/jackc/pgx/v5/pgtype/text.go new file mode 100644 index 0000000..021ee33 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/text.go @@ -0,0 +1,223 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/json" + "fmt" +) + +type TextScanner interface { + ScanText(v Text) error +} + +type TextValuer interface { + TextValue() (Text, error) +} + +type Text struct { + String string + Valid bool +} + +func (t *Text) ScanText(v Text) error { + *t = v + return nil +} + +func (t Text) TextValue() (Text, error) { + return t, nil +} + +// Scan implements the database/sql Scanner interface. +func (dst *Text) Scan(src any) error { + if src == nil { + *dst = Text{} + return nil + } + + switch src := src.(type) { + case string: + *dst = Text{String: src, Valid: true} + return nil + case []byte: + *dst = Text{String: string(src), Valid: true} + return nil + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (src Text) Value() (driver.Value, error) { + if !src.Valid { + return nil, nil + } + return src.String, nil +} + +func (src Text) MarshalJSON() ([]byte, error) { + if !src.Valid { + return []byte("null"), nil + } + + return json.Marshal(src.String) +} + +func (dst *Text) UnmarshalJSON(b []byte) error { + var s *string + err := json.Unmarshal(b, &s) + if err != nil { + return err + } + + if s == nil { + *dst = Text{} + } else { + *dst = Text{String: *s, Valid: true} + } + + return nil +} + +type TextCodec struct{} + +func (TextCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (TextCodec) PreferredFormat() int16 { + return TextFormatCode +} + +func (TextCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + switch format { + case TextFormatCode, BinaryFormatCode: + switch value.(type) { + case string: + return encodePlanTextCodecString{} + case []byte: + return encodePlanTextCodecByteSlice{} + case TextValuer: + return encodePlanTextCodecTextValuer{} + } + } + + return nil +} + +type encodePlanTextCodecString struct{} + +func (encodePlanTextCodecString) Encode(value any, buf []byte) (newBuf []byte, err error) { + s := value.(string) + buf = append(buf, s...) + return buf, nil +} + +type encodePlanTextCodecByteSlice struct{} + +func (encodePlanTextCodecByteSlice) Encode(value any, buf []byte) (newBuf []byte, err error) { + s := value.([]byte) + buf = append(buf, s...) + return buf, nil +} + +type encodePlanTextCodecStringer struct{} + +func (encodePlanTextCodecStringer) Encode(value any, buf []byte) (newBuf []byte, err error) { + s := value.(fmt.Stringer) + buf = append(buf, s.String()...) + return buf, nil +} + +type encodePlanTextCodecTextValuer struct{} + +func (encodePlanTextCodecTextValuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + text, err := value.(TextValuer).TextValue() + if err != nil { + return nil, err + } + + if !text.Valid { + return nil, nil + } + + buf = append(buf, text.String...) + return buf, nil +} + +func (TextCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case TextFormatCode, BinaryFormatCode: + switch target.(type) { + case *string: + return scanPlanTextAnyToString{} + case *[]byte: + return scanPlanAnyToNewByteSlice{} + case BytesScanner: + return scanPlanAnyToByteScanner{} + case TextScanner: + return scanPlanTextAnyToTextScanner{} + } + } + + return nil +} + +func (c TextCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return c.DecodeValue(m, oid, format, src) +} + +func (c TextCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + return string(src), nil +} + +type scanPlanTextAnyToString struct{} + +func (scanPlanTextAnyToString) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + p := (dst).(*string) + *p = string(src) + + return nil +} + +type scanPlanAnyToNewByteSlice struct{} + +func (scanPlanAnyToNewByteSlice) Scan(src []byte, dst any) error { + p := (dst).(*[]byte) + if src == nil { + *p = nil + } else { + *p = make([]byte, len(src)) + copy(*p, src) + } + + return nil +} + +type scanPlanAnyToByteScanner struct{} + +func (scanPlanAnyToByteScanner) Scan(src []byte, dst any) error { + p := (dst).(BytesScanner) + return p.ScanBytes(src) +} + +type scanPlanTextAnyToTextScanner struct{} + +func (scanPlanTextAnyToTextScanner) Scan(src []byte, dst any) error { + scanner := (dst).(TextScanner) + + if src == nil { + return scanner.ScanText(Text{}) + } + + return scanner.ScanText(Text{String: string(src), Valid: true}) +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/text_format_only_codec.go b/vendor/github.com/jackc/pgx/v5/pgtype/text_format_only_codec.go new file mode 100644 index 0000000..d5e4cdb --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/text_format_only_codec.go @@ -0,0 +1,13 @@ +package pgtype + +type TextFormatOnlyCodec struct { + Codec +} + +func (c *TextFormatOnlyCodec) FormatSupported(format int16) bool { + return format == TextFormatCode && c.Codec.FormatSupported(format) +} + +func (TextFormatOnlyCodec) PreferredFormat() int16 { + return TextFormatCode +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/tid.go b/vendor/github.com/jackc/pgx/v5/pgtype/tid.go new file mode 100644 index 0000000..9bc2c2a --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/tid.go @@ -0,0 +1,241 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "fmt" + "strconv" + "strings" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +type TIDScanner interface { + ScanTID(v TID) error +} + +type TIDValuer interface { + TIDValue() (TID, error) +} + +// TID is PostgreSQL's Tuple Identifier type. +// +// When one does +// +// select ctid, * from some_table; +// +// it is the data type of the ctid hidden system column. +// +// It is currently implemented as a pair unsigned two byte integers. +// Its conversion functions can be found in src/backend/utils/adt/tid.c +// in the PostgreSQL sources. +type TID struct { + BlockNumber uint32 + OffsetNumber uint16 + Valid bool +} + +func (b *TID) ScanTID(v TID) error { + *b = v + return nil +} + +func (b TID) TIDValue() (TID, error) { + return b, nil +} + +// Scan implements the database/sql Scanner interface. +func (dst *TID) Scan(src any) error { + if src == nil { + *dst = TID{} + return nil + } + + switch src := src.(type) { + case string: + return scanPlanTextAnyToTIDScanner{}.Scan([]byte(src), dst) + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (src TID) Value() (driver.Value, error) { + if !src.Valid { + return nil, nil + } + + buf, err := TIDCodec{}.PlanEncode(nil, 0, TextFormatCode, src).Encode(src, nil) + if err != nil { + return nil, err + } + return string(buf), err +} + +type TIDCodec struct{} + +func (TIDCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (TIDCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (TIDCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + if _, ok := value.(TIDValuer); !ok { + return nil + } + + switch format { + case BinaryFormatCode: + return encodePlanTIDCodecBinary{} + case TextFormatCode: + return encodePlanTIDCodecText{} + } + + return nil +} + +type encodePlanTIDCodecBinary struct{} + +func (encodePlanTIDCodecBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + tid, err := value.(TIDValuer).TIDValue() + if err != nil { + return nil, err + } + + if !tid.Valid { + return nil, nil + } + + buf = pgio.AppendUint32(buf, tid.BlockNumber) + buf = pgio.AppendUint16(buf, tid.OffsetNumber) + return buf, nil +} + +type encodePlanTIDCodecText struct{} + +func (encodePlanTIDCodecText) Encode(value any, buf []byte) (newBuf []byte, err error) { + tid, err := value.(TIDValuer).TIDValue() + if err != nil { + return nil, err + } + + if !tid.Valid { + return nil, nil + } + + buf = append(buf, fmt.Sprintf(`(%d,%d)`, tid.BlockNumber, tid.OffsetNumber)...) + return buf, nil +} + +func (TIDCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case TIDScanner: + return scanPlanBinaryTIDToTIDScanner{} + case TextScanner: + return scanPlanBinaryTIDToTextScanner{} + } + case TextFormatCode: + switch target.(type) { + case TIDScanner: + return scanPlanTextAnyToTIDScanner{} + } + } + + return nil +} + +type scanPlanBinaryTIDToTIDScanner struct{} + +func (scanPlanBinaryTIDToTIDScanner) Scan(src []byte, dst any) error { + scanner := (dst).(TIDScanner) + + if src == nil { + return scanner.ScanTID(TID{}) + } + + if len(src) != 6 { + return fmt.Errorf("invalid length for tid: %v", len(src)) + } + + return scanner.ScanTID(TID{ + BlockNumber: binary.BigEndian.Uint32(src), + OffsetNumber: binary.BigEndian.Uint16(src[4:]), + Valid: true, + }) +} + +type scanPlanBinaryTIDToTextScanner struct{} + +func (scanPlanBinaryTIDToTextScanner) Scan(src []byte, dst any) error { + scanner := (dst).(TextScanner) + + if src == nil { + return scanner.ScanText(Text{}) + } + + if len(src) != 6 { + return fmt.Errorf("invalid length for tid: %v", len(src)) + } + + blockNumber := binary.BigEndian.Uint32(src) + offsetNumber := binary.BigEndian.Uint16(src[4:]) + + return scanner.ScanText(Text{ + String: fmt.Sprintf(`(%d,%d)`, blockNumber, offsetNumber), + Valid: true, + }) +} + +type scanPlanTextAnyToTIDScanner struct{} + +func (scanPlanTextAnyToTIDScanner) Scan(src []byte, dst any) error { + scanner := (dst).(TIDScanner) + + if src == nil { + return scanner.ScanTID(TID{}) + } + + if len(src) < 5 { + return fmt.Errorf("invalid length for tid: %v", len(src)) + } + + block, offset, found := strings.Cut(string(src[1:len(src)-1]), ",") + if !found { + return fmt.Errorf("invalid format for tid") + } + + blockNumber, err := strconv.ParseUint(block, 10, 32) + if err != nil { + return err + } + + offsetNumber, err := strconv.ParseUint(offset, 10, 16) + if err != nil { + return err + } + + return scanner.ScanTID(TID{BlockNumber: uint32(blockNumber), OffsetNumber: uint16(offsetNumber), Valid: true}) +} + +func (c TIDCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return codecDecodeToTextFormat(c, m, oid, format, src) +} + +func (c TIDCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var tid TID + err := codecScan(c, m, oid, format, src, &tid) + if err != nil { + return nil, err + } + return tid, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/time.go b/vendor/github.com/jackc/pgx/v5/pgtype/time.go new file mode 100644 index 0000000..f8fd948 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/time.go @@ -0,0 +1,274 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "fmt" + "strconv" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +type TimeScanner interface { + ScanTime(v Time) error +} + +type TimeValuer interface { + TimeValue() (Time, error) +} + +// Time represents the PostgreSQL time type. The PostgreSQL time is a time of day without time zone. +// +// Time is represented as the number of microseconds since midnight in the same way that PostgreSQL does. Other time and +// date types in pgtype can use time.Time as the underlying representation. However, pgtype.Time type cannot due to +// needing to handle 24:00:00. time.Time converts that to 00:00:00 on the following day. +// +// The time with time zone type is not supported. Use of time with time zone is discouraged by the PostgreSQL documentation. +type Time struct { + Microseconds int64 // Number of microseconds since midnight + Valid bool +} + +func (t *Time) ScanTime(v Time) error { + *t = v + return nil +} + +func (t Time) TimeValue() (Time, error) { + return t, nil +} + +// Scan implements the database/sql Scanner interface. +func (t *Time) Scan(src any) error { + if src == nil { + *t = Time{} + return nil + } + + switch src := src.(type) { + case string: + err := scanPlanTextAnyToTimeScanner{}.Scan([]byte(src), t) + if err != nil { + t.Microseconds = 0 + t.Valid = false + } + return err + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (t Time) Value() (driver.Value, error) { + if !t.Valid { + return nil, nil + } + + buf, err := TimeCodec{}.PlanEncode(nil, 0, TextFormatCode, t).Encode(t, nil) + if err != nil { + return nil, err + } + return string(buf), err +} + +type TimeCodec struct{} + +func (TimeCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (TimeCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (TimeCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + if _, ok := value.(TimeValuer); !ok { + return nil + } + + switch format { + case BinaryFormatCode: + return encodePlanTimeCodecBinary{} + case TextFormatCode: + return encodePlanTimeCodecText{} + } + + return nil +} + +type encodePlanTimeCodecBinary struct{} + +func (encodePlanTimeCodecBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + t, err := value.(TimeValuer).TimeValue() + if err != nil { + return nil, err + } + + if !t.Valid { + return nil, nil + } + + return pgio.AppendInt64(buf, t.Microseconds), nil +} + +type encodePlanTimeCodecText struct{} + +func (encodePlanTimeCodecText) Encode(value any, buf []byte) (newBuf []byte, err error) { + t, err := value.(TimeValuer).TimeValue() + if err != nil { + return nil, err + } + + if !t.Valid { + return nil, nil + } + + usec := t.Microseconds + hours := usec / microsecondsPerHour + usec -= hours * microsecondsPerHour + minutes := usec / microsecondsPerMinute + usec -= minutes * microsecondsPerMinute + seconds := usec / microsecondsPerSecond + usec -= seconds * microsecondsPerSecond + + s := fmt.Sprintf("%02d:%02d:%02d.%06d", hours, minutes, seconds, usec) + + return append(buf, s...), nil +} + +func (TimeCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case TimeScanner: + return scanPlanBinaryTimeToTimeScanner{} + case TextScanner: + return scanPlanBinaryTimeToTextScanner{} + } + case TextFormatCode: + switch target.(type) { + case TimeScanner: + return scanPlanTextAnyToTimeScanner{} + } + } + + return nil +} + +type scanPlanBinaryTimeToTimeScanner struct{} + +func (scanPlanBinaryTimeToTimeScanner) Scan(src []byte, dst any) error { + scanner := (dst).(TimeScanner) + + if src == nil { + return scanner.ScanTime(Time{}) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for time: %v", len(src)) + } + + usec := int64(binary.BigEndian.Uint64(src)) + + return scanner.ScanTime(Time{Microseconds: usec, Valid: true}) +} + +type scanPlanBinaryTimeToTextScanner struct{} + +func (scanPlanBinaryTimeToTextScanner) Scan(src []byte, dst any) error { + ts, ok := (dst).(TextScanner) + if !ok { + return ErrScanTargetTypeChanged + } + + if src == nil { + return ts.ScanText(Text{}) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for time: %v", len(src)) + } + + usec := int64(binary.BigEndian.Uint64(src)) + + tim := Time{Microseconds: usec, Valid: true} + + buf, err := TimeCodec{}.PlanEncode(nil, 0, TextFormatCode, tim).Encode(tim, nil) + if err != nil { + return err + } + + return ts.ScanText(Text{String: string(buf), Valid: true}) +} + +type scanPlanTextAnyToTimeScanner struct{} + +func (scanPlanTextAnyToTimeScanner) Scan(src []byte, dst any) error { + scanner := (dst).(TimeScanner) + + if src == nil { + return scanner.ScanTime(Time{}) + } + + s := string(src) + + if len(s) < 8 || s[2] != ':' || s[5] != ':' { + return fmt.Errorf("cannot decode %v into Time", s) + } + + hours, err := strconv.ParseInt(s[0:2], 10, 64) + if err != nil { + return fmt.Errorf("cannot decode %v into Time", s) + } + usec := hours * microsecondsPerHour + + minutes, err := strconv.ParseInt(s[3:5], 10, 64) + if err != nil { + return fmt.Errorf("cannot decode %v into Time", s) + } + usec += minutes * microsecondsPerMinute + + seconds, err := strconv.ParseInt(s[6:8], 10, 64) + if err != nil { + return fmt.Errorf("cannot decode %v into Time", s) + } + usec += seconds * microsecondsPerSecond + + if len(s) > 9 { + if s[8] != '.' || len(s) > 15 { + return fmt.Errorf("cannot decode %v into Time", s) + } + + fraction := s[9:] + n, err := strconv.ParseInt(fraction, 10, 64) + if err != nil { + return fmt.Errorf("cannot decode %v into Time", s) + } + + for i := len(fraction); i < 6; i++ { + n *= 10 + } + + usec += n + } + + return scanner.ScanTime(Time{Microseconds: usec, Valid: true}) +} + +func (c TimeCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + return codecDecodeToTextFormat(c, m, oid, format, src) +} + +func (c TimeCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var t Time + err := codecScan(c, m, oid, format, src, &t) + if err != nil { + return nil, err + } + return t, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/timestamp.go b/vendor/github.com/jackc/pgx/v5/pgtype/timestamp.go new file mode 100644 index 0000000..677a2c6 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/timestamp.go @@ -0,0 +1,356 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "encoding/json" + "fmt" + "strings" + "time" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +const pgTimestampFormat = "2006-01-02 15:04:05.999999999" + +type TimestampScanner interface { + ScanTimestamp(v Timestamp) error +} + +type TimestampValuer interface { + TimestampValue() (Timestamp, error) +} + +// Timestamp represents the PostgreSQL timestamp type. +type Timestamp struct { + Time time.Time // Time zone will be ignored when encoding to PostgreSQL. + InfinityModifier InfinityModifier + Valid bool +} + +func (ts *Timestamp) ScanTimestamp(v Timestamp) error { + *ts = v + return nil +} + +func (ts Timestamp) TimestampValue() (Timestamp, error) { + return ts, nil +} + +// Scan implements the database/sql Scanner interface. +func (ts *Timestamp) Scan(src any) error { + if src == nil { + *ts = Timestamp{} + return nil + } + + switch src := src.(type) { + case string: + return (&scanPlanTextTimestampToTimestampScanner{}).Scan([]byte(src), ts) + case time.Time: + *ts = Timestamp{Time: src, Valid: true} + return nil + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (ts Timestamp) Value() (driver.Value, error) { + if !ts.Valid { + return nil, nil + } + + if ts.InfinityModifier != Finite { + return ts.InfinityModifier.String(), nil + } + return ts.Time, nil +} + +func (ts Timestamp) MarshalJSON() ([]byte, error) { + if !ts.Valid { + return []byte("null"), nil + } + + var s string + + switch ts.InfinityModifier { + case Finite: + s = ts.Time.Format(time.RFC3339Nano) + case Infinity: + s = "infinity" + case NegativeInfinity: + s = "-infinity" + } + + return json.Marshal(s) +} + +func (ts *Timestamp) UnmarshalJSON(b []byte) error { + var s *string + err := json.Unmarshal(b, &s) + if err != nil { + return err + } + + if s == nil { + *ts = Timestamp{} + return nil + } + + switch *s { + case "infinity": + *ts = Timestamp{Valid: true, InfinityModifier: Infinity} + case "-infinity": + *ts = Timestamp{Valid: true, InfinityModifier: -Infinity} + default: + // PostgreSQL uses ISO 8601 for to_json function and casting from a string to timestamptz + tim, err := time.Parse(time.RFC3339Nano, *s) + if err != nil { + return err + } + + *ts = Timestamp{Time: tim, Valid: true} + } + + return nil +} + +type TimestampCodec struct { + // ScanLocation is the location that the time is assumed to be in for scanning. This is different from + // TimestamptzCodec.ScanLocation in that this setting does change the instant in time that the timestamp represents. + ScanLocation *time.Location +} + +func (*TimestampCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (*TimestampCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (*TimestampCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + if _, ok := value.(TimestampValuer); !ok { + return nil + } + + switch format { + case BinaryFormatCode: + return encodePlanTimestampCodecBinary{} + case TextFormatCode: + return encodePlanTimestampCodecText{} + } + + return nil +} + +type encodePlanTimestampCodecBinary struct{} + +func (encodePlanTimestampCodecBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + ts, err := value.(TimestampValuer).TimestampValue() + if err != nil { + return nil, err + } + + if !ts.Valid { + return nil, nil + } + + var microsecSinceY2K int64 + switch ts.InfinityModifier { + case Finite: + t := discardTimeZone(ts.Time) + microsecSinceUnixEpoch := t.Unix()*1000000 + int64(t.Nanosecond())/1000 + microsecSinceY2K = microsecSinceUnixEpoch - microsecFromUnixEpochToY2K + case Infinity: + microsecSinceY2K = infinityMicrosecondOffset + case NegativeInfinity: + microsecSinceY2K = negativeInfinityMicrosecondOffset + } + + buf = pgio.AppendInt64(buf, microsecSinceY2K) + + return buf, nil +} + +type encodePlanTimestampCodecText struct{} + +func (encodePlanTimestampCodecText) Encode(value any, buf []byte) (newBuf []byte, err error) { + ts, err := value.(TimestampValuer).TimestampValue() + if err != nil { + return nil, err + } + + if !ts.Valid { + return nil, nil + } + + var s string + + switch ts.InfinityModifier { + case Finite: + t := discardTimeZone(ts.Time) + + // Year 0000 is 1 BC + bc := false + if year := t.Year(); year <= 0 { + year = -year + 1 + t = time.Date(year, t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), time.UTC) + bc = true + } + + s = t.Truncate(time.Microsecond).Format(pgTimestampFormat) + + if bc { + s = s + " BC" + } + case Infinity: + s = "infinity" + case NegativeInfinity: + s = "-infinity" + } + + buf = append(buf, s...) + + return buf, nil +} + +func discardTimeZone(t time.Time) time.Time { + if t.Location() != time.UTC { + return time.Date(t.Year(), t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), time.UTC) + } + + return t +} + +func (c *TimestampCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case TimestampScanner: + return &scanPlanBinaryTimestampToTimestampScanner{location: c.ScanLocation} + } + case TextFormatCode: + switch target.(type) { + case TimestampScanner: + return &scanPlanTextTimestampToTimestampScanner{location: c.ScanLocation} + } + } + + return nil +} + +type scanPlanBinaryTimestampToTimestampScanner struct{ location *time.Location } + +func (plan *scanPlanBinaryTimestampToTimestampScanner) Scan(src []byte, dst any) error { + scanner := (dst).(TimestampScanner) + + if src == nil { + return scanner.ScanTimestamp(Timestamp{}) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for timestamp: %v", len(src)) + } + + var ts Timestamp + microsecSinceY2K := int64(binary.BigEndian.Uint64(src)) + + switch microsecSinceY2K { + case infinityMicrosecondOffset: + ts = Timestamp{Valid: true, InfinityModifier: Infinity} + case negativeInfinityMicrosecondOffset: + ts = Timestamp{Valid: true, InfinityModifier: -Infinity} + default: + tim := time.Unix( + microsecFromUnixEpochToY2K/1000000+microsecSinceY2K/1000000, + (microsecFromUnixEpochToY2K%1000000*1000)+(microsecSinceY2K%1000000*1000), + ).UTC() + if plan.location != nil { + tim = time.Date(tim.Year(), tim.Month(), tim.Day(), tim.Hour(), tim.Minute(), tim.Second(), tim.Nanosecond(), plan.location) + } + ts = Timestamp{Time: tim, Valid: true} + } + + return scanner.ScanTimestamp(ts) +} + +type scanPlanTextTimestampToTimestampScanner struct{ location *time.Location } + +func (plan *scanPlanTextTimestampToTimestampScanner) Scan(src []byte, dst any) error { + scanner := (dst).(TimestampScanner) + + if src == nil { + return scanner.ScanTimestamp(Timestamp{}) + } + + var ts Timestamp + sbuf := string(src) + switch sbuf { + case "infinity": + ts = Timestamp{Valid: true, InfinityModifier: Infinity} + case "-infinity": + ts = Timestamp{Valid: true, InfinityModifier: -Infinity} + default: + bc := false + if strings.HasSuffix(sbuf, " BC") { + sbuf = sbuf[:len(sbuf)-3] + bc = true + } + tim, err := time.Parse(pgTimestampFormat, sbuf) + if err != nil { + return err + } + + if bc { + year := -tim.Year() + 1 + tim = time.Date(year, tim.Month(), tim.Day(), tim.Hour(), tim.Minute(), tim.Second(), tim.Nanosecond(), tim.Location()) + } + + if plan.location != nil { + tim = time.Date(tim.Year(), tim.Month(), tim.Day(), tim.Hour(), tim.Minute(), tim.Second(), tim.Nanosecond(), plan.location) + } + + ts = Timestamp{Time: tim, Valid: true} + } + + return scanner.ScanTimestamp(ts) +} + +func (c *TimestampCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + var ts Timestamp + err := codecScan(c, m, oid, format, src, &ts) + if err != nil { + return nil, err + } + + if ts.InfinityModifier != Finite { + return ts.InfinityModifier.String(), nil + } + + return ts.Time, nil +} + +func (c *TimestampCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var ts Timestamp + err := codecScan(c, m, oid, format, src, &ts) + if err != nil { + return nil, err + } + + if ts.InfinityModifier != Finite { + return ts.InfinityModifier, nil + } + + return ts.Time, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/timestamptz.go b/vendor/github.com/jackc/pgx/v5/pgtype/timestamptz.go new file mode 100644 index 0000000..7efbcff --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/timestamptz.go @@ -0,0 +1,366 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "encoding/json" + "fmt" + "strings" + "time" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +const pgTimestamptzHourFormat = "2006-01-02 15:04:05.999999999Z07" +const pgTimestamptzMinuteFormat = "2006-01-02 15:04:05.999999999Z07:00" +const pgTimestamptzSecondFormat = "2006-01-02 15:04:05.999999999Z07:00:00" +const microsecFromUnixEpochToY2K = 946684800 * 1000000 + +const ( + negativeInfinityMicrosecondOffset = -9223372036854775808 + infinityMicrosecondOffset = 9223372036854775807 +) + +type TimestamptzScanner interface { + ScanTimestamptz(v Timestamptz) error +} + +type TimestamptzValuer interface { + TimestamptzValue() (Timestamptz, error) +} + +// Timestamptz represents the PostgreSQL timestamptz type. +type Timestamptz struct { + Time time.Time + InfinityModifier InfinityModifier + Valid bool +} + +func (tstz *Timestamptz) ScanTimestamptz(v Timestamptz) error { + *tstz = v + return nil +} + +func (tstz Timestamptz) TimestamptzValue() (Timestamptz, error) { + return tstz, nil +} + +// Scan implements the database/sql Scanner interface. +func (tstz *Timestamptz) Scan(src any) error { + if src == nil { + *tstz = Timestamptz{} + return nil + } + + switch src := src.(type) { + case string: + return (&scanPlanTextTimestamptzToTimestamptzScanner{}).Scan([]byte(src), tstz) + case time.Time: + *tstz = Timestamptz{Time: src, Valid: true} + return nil + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (tstz Timestamptz) Value() (driver.Value, error) { + if !tstz.Valid { + return nil, nil + } + + if tstz.InfinityModifier != Finite { + return tstz.InfinityModifier.String(), nil + } + return tstz.Time, nil +} + +func (tstz Timestamptz) MarshalJSON() ([]byte, error) { + if !tstz.Valid { + return []byte("null"), nil + } + + var s string + + switch tstz.InfinityModifier { + case Finite: + s = tstz.Time.Format(time.RFC3339Nano) + case Infinity: + s = "infinity" + case NegativeInfinity: + s = "-infinity" + } + + return json.Marshal(s) +} + +func (tstz *Timestamptz) UnmarshalJSON(b []byte) error { + var s *string + err := json.Unmarshal(b, &s) + if err != nil { + return err + } + + if s == nil { + *tstz = Timestamptz{} + return nil + } + + switch *s { + case "infinity": + *tstz = Timestamptz{Valid: true, InfinityModifier: Infinity} + case "-infinity": + *tstz = Timestamptz{Valid: true, InfinityModifier: -Infinity} + default: + // PostgreSQL uses ISO 8601 for to_json function and casting from a string to timestamptz + tim, err := time.Parse(time.RFC3339Nano, *s) + if err != nil { + return err + } + + *tstz = Timestamptz{Time: tim, Valid: true} + } + + return nil +} + +type TimestamptzCodec struct { + // ScanLocation is the location to return scanned timestamptz values in. This does not change the instant in time that + // the timestamptz represents. + ScanLocation *time.Location +} + +func (*TimestamptzCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (*TimestamptzCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (*TimestamptzCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + if _, ok := value.(TimestamptzValuer); !ok { + return nil + } + + switch format { + case BinaryFormatCode: + return encodePlanTimestamptzCodecBinary{} + case TextFormatCode: + return encodePlanTimestamptzCodecText{} + } + + return nil +} + +type encodePlanTimestamptzCodecBinary struct{} + +func (encodePlanTimestamptzCodecBinary) Encode(value any, buf []byte) (newBuf []byte, err error) { + ts, err := value.(TimestamptzValuer).TimestamptzValue() + if err != nil { + return nil, err + } + + if !ts.Valid { + return nil, nil + } + + var microsecSinceY2K int64 + switch ts.InfinityModifier { + case Finite: + microsecSinceUnixEpoch := ts.Time.Unix()*1000000 + int64(ts.Time.Nanosecond())/1000 + microsecSinceY2K = microsecSinceUnixEpoch - microsecFromUnixEpochToY2K + case Infinity: + microsecSinceY2K = infinityMicrosecondOffset + case NegativeInfinity: + microsecSinceY2K = negativeInfinityMicrosecondOffset + } + + buf = pgio.AppendInt64(buf, microsecSinceY2K) + + return buf, nil +} + +type encodePlanTimestamptzCodecText struct{} + +func (encodePlanTimestamptzCodecText) Encode(value any, buf []byte) (newBuf []byte, err error) { + ts, err := value.(TimestamptzValuer).TimestamptzValue() + if err != nil { + return nil, err + } + + if !ts.Valid { + return nil, nil + } + + var s string + + switch ts.InfinityModifier { + case Finite: + + t := ts.Time.UTC().Truncate(time.Microsecond) + + // Year 0000 is 1 BC + bc := false + if year := t.Year(); year <= 0 { + year = -year + 1 + t = time.Date(year, t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), time.UTC) + bc = true + } + + s = t.Format(pgTimestamptzSecondFormat) + + if bc { + s = s + " BC" + } + case Infinity: + s = "infinity" + case NegativeInfinity: + s = "-infinity" + } + + buf = append(buf, s...) + + return buf, nil +} + +func (c *TimestamptzCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case TimestamptzScanner: + return &scanPlanBinaryTimestamptzToTimestamptzScanner{location: c.ScanLocation} + } + case TextFormatCode: + switch target.(type) { + case TimestamptzScanner: + return &scanPlanTextTimestamptzToTimestamptzScanner{location: c.ScanLocation} + } + } + + return nil +} + +type scanPlanBinaryTimestamptzToTimestamptzScanner struct{ location *time.Location } + +func (plan *scanPlanBinaryTimestamptzToTimestamptzScanner) Scan(src []byte, dst any) error { + scanner := (dst).(TimestamptzScanner) + + if src == nil { + return scanner.ScanTimestamptz(Timestamptz{}) + } + + if len(src) != 8 { + return fmt.Errorf("invalid length for timestamptz: %v", len(src)) + } + + var tstz Timestamptz + microsecSinceY2K := int64(binary.BigEndian.Uint64(src)) + + switch microsecSinceY2K { + case infinityMicrosecondOffset: + tstz = Timestamptz{Valid: true, InfinityModifier: Infinity} + case negativeInfinityMicrosecondOffset: + tstz = Timestamptz{Valid: true, InfinityModifier: -Infinity} + default: + tim := time.Unix( + microsecFromUnixEpochToY2K/1000000+microsecSinceY2K/1000000, + (microsecFromUnixEpochToY2K%1000000*1000)+(microsecSinceY2K%1000000*1000), + ) + if plan.location != nil { + tim = tim.In(plan.location) + } + tstz = Timestamptz{Time: tim, Valid: true} + } + + return scanner.ScanTimestamptz(tstz) +} + +type scanPlanTextTimestamptzToTimestamptzScanner struct{ location *time.Location } + +func (plan *scanPlanTextTimestamptzToTimestamptzScanner) Scan(src []byte, dst any) error { + scanner := (dst).(TimestamptzScanner) + + if src == nil { + return scanner.ScanTimestamptz(Timestamptz{}) + } + + var tstz Timestamptz + sbuf := string(src) + switch sbuf { + case "infinity": + tstz = Timestamptz{Valid: true, InfinityModifier: Infinity} + case "-infinity": + tstz = Timestamptz{Valid: true, InfinityModifier: -Infinity} + default: + bc := false + if strings.HasSuffix(sbuf, " BC") { + sbuf = sbuf[:len(sbuf)-3] + bc = true + } + + var format string + if len(sbuf) >= 9 && (sbuf[len(sbuf)-9] == '-' || sbuf[len(sbuf)-9] == '+') { + format = pgTimestamptzSecondFormat + } else if len(sbuf) >= 6 && (sbuf[len(sbuf)-6] == '-' || sbuf[len(sbuf)-6] == '+') { + format = pgTimestamptzMinuteFormat + } else { + format = pgTimestamptzHourFormat + } + + tim, err := time.Parse(format, sbuf) + if err != nil { + return err + } + + if bc { + year := -tim.Year() + 1 + tim = time.Date(year, tim.Month(), tim.Day(), tim.Hour(), tim.Minute(), tim.Second(), tim.Nanosecond(), tim.Location()) + } + + if plan.location != nil { + tim = tim.In(plan.location) + } + + tstz = Timestamptz{Time: tim, Valid: true} + } + + return scanner.ScanTimestamptz(tstz) +} + +func (c *TimestamptzCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + var tstz Timestamptz + err := codecScan(c, m, oid, format, src, &tstz) + if err != nil { + return nil, err + } + + if tstz.InfinityModifier != Finite { + return tstz.InfinityModifier.String(), nil + } + + return tstz.Time, nil +} + +func (c *TimestamptzCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var tstz Timestamptz + err := codecScan(c, m, oid, format, src, &tstz) + if err != nil { + return nil, err + } + + if tstz.InfinityModifier != Finite { + return tstz.InfinityModifier, nil + } + + return tstz.Time, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/uint32.go b/vendor/github.com/jackc/pgx/v5/pgtype/uint32.go new file mode 100644 index 0000000..f2b2fa6 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/uint32.go @@ -0,0 +1,325 @@ +package pgtype + +import ( + "database/sql/driver" + "encoding/binary" + "fmt" + "math" + "strconv" + + "github.com/jackc/pgx/v5/internal/pgio" +) + +type Uint32Scanner interface { + ScanUint32(v Uint32) error +} + +type Uint32Valuer interface { + Uint32Value() (Uint32, error) +} + +// Uint32 is the core type that is used to represent PostgreSQL types such as OID, CID, and XID. +type Uint32 struct { + Uint32 uint32 + Valid bool +} + +func (n *Uint32) ScanUint32(v Uint32) error { + *n = v + return nil +} + +func (n Uint32) Uint32Value() (Uint32, error) { + return n, nil +} + +// Scan implements the database/sql Scanner interface. +func (dst *Uint32) Scan(src any) error { + if src == nil { + *dst = Uint32{} + return nil + } + + var n int64 + + switch src := src.(type) { + case int64: + n = src + case string: + un, err := strconv.ParseUint(src, 10, 32) + if err != nil { + return err + } + n = int64(un) + default: + return fmt.Errorf("cannot scan %T", src) + } + + if n < 0 { + return fmt.Errorf("%d is less than the minimum value for Uint32", n) + } + if n > math.MaxUint32 { + return fmt.Errorf("%d is greater than maximum value for Uint32", n) + } + + *dst = Uint32{Uint32: uint32(n), Valid: true} + + return nil +} + +// Value implements the database/sql/driver Valuer interface. +func (src Uint32) Value() (driver.Value, error) { + if !src.Valid { + return nil, nil + } + return int64(src.Uint32), nil +} + +type Uint32Codec struct{} + +func (Uint32Codec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (Uint32Codec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (Uint32Codec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + switch format { + case BinaryFormatCode: + switch value.(type) { + case uint32: + return encodePlanUint32CodecBinaryUint32{} + case Uint32Valuer: + return encodePlanUint32CodecBinaryUint32Valuer{} + case Int64Valuer: + return encodePlanUint32CodecBinaryInt64Valuer{} + } + case TextFormatCode: + switch value.(type) { + case uint32: + return encodePlanUint32CodecTextUint32{} + case Int64Valuer: + return encodePlanUint32CodecTextInt64Valuer{} + } + } + + return nil +} + +type encodePlanUint32CodecBinaryUint32 struct{} + +func (encodePlanUint32CodecBinaryUint32) Encode(value any, buf []byte) (newBuf []byte, err error) { + v := value.(uint32) + return pgio.AppendUint32(buf, v), nil +} + +type encodePlanUint32CodecBinaryUint32Valuer struct{} + +func (encodePlanUint32CodecBinaryUint32Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + v, err := value.(Uint32Valuer).Uint32Value() + if err != nil { + return nil, err + } + + if !v.Valid { + return nil, nil + } + + return pgio.AppendUint32(buf, v.Uint32), nil +} + +type encodePlanUint32CodecBinaryInt64Valuer struct{} + +func (encodePlanUint32CodecBinaryInt64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + v, err := value.(Int64Valuer).Int64Value() + if err != nil { + return nil, err + } + + if !v.Valid { + return nil, nil + } + + if v.Int64 < 0 { + return nil, fmt.Errorf("%d is less than minimum value for uint32", v.Int64) + } + if v.Int64 > math.MaxUint32 { + return nil, fmt.Errorf("%d is greater than maximum value for uint32", v.Int64) + } + + return pgio.AppendUint32(buf, uint32(v.Int64)), nil +} + +type encodePlanUint32CodecTextUint32 struct{} + +func (encodePlanUint32CodecTextUint32) Encode(value any, buf []byte) (newBuf []byte, err error) { + v := value.(uint32) + return append(buf, strconv.FormatUint(uint64(v), 10)...), nil +} + +type encodePlanUint32CodecTextUint32Valuer struct{} + +func (encodePlanUint32CodecTextUint32Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + v, err := value.(Uint32Valuer).Uint32Value() + if err != nil { + return nil, err + } + + if !v.Valid { + return nil, nil + } + + return append(buf, strconv.FormatUint(uint64(v.Uint32), 10)...), nil +} + +type encodePlanUint32CodecTextInt64Valuer struct{} + +func (encodePlanUint32CodecTextInt64Valuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + v, err := value.(Int64Valuer).Int64Value() + if err != nil { + return nil, err + } + + if !v.Valid { + return nil, nil + } + + if v.Int64 < 0 { + return nil, fmt.Errorf("%d is less than minimum value for uint32", v.Int64) + } + if v.Int64 > math.MaxUint32 { + return nil, fmt.Errorf("%d is greater than maximum value for uint32", v.Int64) + } + + return append(buf, strconv.FormatInt(v.Int64, 10)...), nil +} + +func (Uint32Codec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + + switch format { + case BinaryFormatCode: + switch target.(type) { + case *uint32: + return scanPlanBinaryUint32ToUint32{} + case Uint32Scanner: + return scanPlanBinaryUint32ToUint32Scanner{} + case TextScanner: + return scanPlanBinaryUint32ToTextScanner{} + } + case TextFormatCode: + switch target.(type) { + case *uint32: + return scanPlanTextAnyToUint32{} + case Uint32Scanner: + return scanPlanTextAnyToUint32Scanner{} + } + } + + return nil +} + +func (c Uint32Codec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + var n uint32 + err := codecScan(c, m, oid, format, src, &n) + if err != nil { + return nil, err + } + return int64(n), nil +} + +func (c Uint32Codec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var n uint32 + err := codecScan(c, m, oid, format, src, &n) + if err != nil { + return nil, err + } + return n, nil +} + +type scanPlanBinaryUint32ToUint32 struct{} + +func (scanPlanBinaryUint32ToUint32) Scan(src []byte, dst any) error { + if src == nil { + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for uint32: %v", len(src)) + } + + p := (dst).(*uint32) + *p = binary.BigEndian.Uint32(src) + + return nil +} + +type scanPlanBinaryUint32ToUint32Scanner struct{} + +func (scanPlanBinaryUint32ToUint32Scanner) Scan(src []byte, dst any) error { + s, ok := (dst).(Uint32Scanner) + if !ok { + return ErrScanTargetTypeChanged + } + + if src == nil { + return s.ScanUint32(Uint32{}) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for uint32: %v", len(src)) + } + + n := binary.BigEndian.Uint32(src) + + return s.ScanUint32(Uint32{Uint32: n, Valid: true}) +} + +type scanPlanBinaryUint32ToTextScanner struct{} + +func (scanPlanBinaryUint32ToTextScanner) Scan(src []byte, dst any) error { + s, ok := (dst).(TextScanner) + if !ok { + return ErrScanTargetTypeChanged + } + + if src == nil { + return s.ScanText(Text{}) + } + + if len(src) != 4 { + return fmt.Errorf("invalid length for uint32: %v", len(src)) + } + + n := uint64(binary.BigEndian.Uint32(src)) + return s.ScanText(Text{String: strconv.FormatUint(n, 10), Valid: true}) +} + +type scanPlanTextAnyToUint32Scanner struct{} + +func (scanPlanTextAnyToUint32Scanner) Scan(src []byte, dst any) error { + s, ok := (dst).(Uint32Scanner) + if !ok { + return ErrScanTargetTypeChanged + } + + if src == nil { + return s.ScanUint32(Uint32{}) + } + + n, err := strconv.ParseUint(string(src), 10, 32) + if err != nil { + return err + } + + return s.ScanUint32(Uint32{Uint32: uint32(n), Valid: true}) +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/uuid.go b/vendor/github.com/jackc/pgx/v5/pgtype/uuid.go new file mode 100644 index 0000000..d57c0f2 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/uuid.go @@ -0,0 +1,281 @@ +package pgtype + +import ( + "bytes" + "database/sql/driver" + "encoding/hex" + "fmt" +) + +type UUIDScanner interface { + ScanUUID(v UUID) error +} + +type UUIDValuer interface { + UUIDValue() (UUID, error) +} + +type UUID struct { + Bytes [16]byte + Valid bool +} + +func (b *UUID) ScanUUID(v UUID) error { + *b = v + return nil +} + +func (b UUID) UUIDValue() (UUID, error) { + return b, nil +} + +// parseUUID converts a string UUID in standard form to a byte array. +func parseUUID(src string) (dst [16]byte, err error) { + switch len(src) { + case 36: + src = src[0:8] + src[9:13] + src[14:18] + src[19:23] + src[24:] + case 32: + // dashes already stripped, assume valid + default: + // assume invalid. + return dst, fmt.Errorf("cannot parse UUID %v", src) + } + + buf, err := hex.DecodeString(src) + if err != nil { + return dst, err + } + + copy(dst[:], buf) + return dst, err +} + +// encodeUUID converts a uuid byte array to UUID standard string form. +func encodeUUID(src [16]byte) string { + var buf [36]byte + + hex.Encode(buf[0:8], src[:4]) + buf[8] = '-' + hex.Encode(buf[9:13], src[4:6]) + buf[13] = '-' + hex.Encode(buf[14:18], src[6:8]) + buf[18] = '-' + hex.Encode(buf[19:23], src[8:10]) + buf[23] = '-' + hex.Encode(buf[24:], src[10:]) + + return string(buf[:]) +} + +// Scan implements the database/sql Scanner interface. +func (dst *UUID) Scan(src any) error { + if src == nil { + *dst = UUID{} + return nil + } + + switch src := src.(type) { + case string: + buf, err := parseUUID(src) + if err != nil { + return err + } + *dst = UUID{Bytes: buf, Valid: true} + return nil + } + + return fmt.Errorf("cannot scan %T", src) +} + +// Value implements the database/sql/driver Valuer interface. +func (src UUID) Value() (driver.Value, error) { + if !src.Valid { + return nil, nil + } + + return encodeUUID(src.Bytes), nil +} + +func (src UUID) MarshalJSON() ([]byte, error) { + if !src.Valid { + return []byte("null"), nil + } + + var buff bytes.Buffer + buff.WriteByte('"') + buff.WriteString(encodeUUID(src.Bytes)) + buff.WriteByte('"') + return buff.Bytes(), nil +} + +func (dst *UUID) UnmarshalJSON(src []byte) error { + if bytes.Equal(src, []byte("null")) { + *dst = UUID{} + return nil + } + if len(src) != 38 { + return fmt.Errorf("invalid length for UUID: %v", len(src)) + } + buf, err := parseUUID(string(src[1 : len(src)-1])) + if err != nil { + return err + } + *dst = UUID{Bytes: buf, Valid: true} + return nil +} + +type UUIDCodec struct{} + +func (UUIDCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (UUIDCodec) PreferredFormat() int16 { + return BinaryFormatCode +} + +func (UUIDCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + if _, ok := value.(UUIDValuer); !ok { + return nil + } + + switch format { + case BinaryFormatCode: + return encodePlanUUIDCodecBinaryUUIDValuer{} + case TextFormatCode: + return encodePlanUUIDCodecTextUUIDValuer{} + } + + return nil +} + +type encodePlanUUIDCodecBinaryUUIDValuer struct{} + +func (encodePlanUUIDCodecBinaryUUIDValuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + uuid, err := value.(UUIDValuer).UUIDValue() + if err != nil { + return nil, err + } + + if !uuid.Valid { + return nil, nil + } + + return append(buf, uuid.Bytes[:]...), nil +} + +type encodePlanUUIDCodecTextUUIDValuer struct{} + +func (encodePlanUUIDCodecTextUUIDValuer) Encode(value any, buf []byte) (newBuf []byte, err error) { + uuid, err := value.(UUIDValuer).UUIDValue() + if err != nil { + return nil, err + } + + if !uuid.Valid { + return nil, nil + } + + return append(buf, encodeUUID(uuid.Bytes)...), nil +} + +func (UUIDCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + switch format { + case BinaryFormatCode: + switch target.(type) { + case UUIDScanner: + return scanPlanBinaryUUIDToUUIDScanner{} + case TextScanner: + return scanPlanBinaryUUIDToTextScanner{} + } + case TextFormatCode: + switch target.(type) { + case UUIDScanner: + return scanPlanTextAnyToUUIDScanner{} + } + } + + return nil +} + +type scanPlanBinaryUUIDToUUIDScanner struct{} + +func (scanPlanBinaryUUIDToUUIDScanner) Scan(src []byte, dst any) error { + scanner := (dst).(UUIDScanner) + + if src == nil { + return scanner.ScanUUID(UUID{}) + } + + if len(src) != 16 { + return fmt.Errorf("invalid length for UUID: %v", len(src)) + } + + uuid := UUID{Valid: true} + copy(uuid.Bytes[:], src) + + return scanner.ScanUUID(uuid) +} + +type scanPlanBinaryUUIDToTextScanner struct{} + +func (scanPlanBinaryUUIDToTextScanner) Scan(src []byte, dst any) error { + scanner := (dst).(TextScanner) + + if src == nil { + return scanner.ScanText(Text{}) + } + + if len(src) != 16 { + return fmt.Errorf("invalid length for UUID: %v", len(src)) + } + + var buf [16]byte + copy(buf[:], src) + + return scanner.ScanText(Text{String: encodeUUID(buf), Valid: true}) +} + +type scanPlanTextAnyToUUIDScanner struct{} + +func (scanPlanTextAnyToUUIDScanner) Scan(src []byte, dst any) error { + scanner := (dst).(UUIDScanner) + + if src == nil { + return scanner.ScanUUID(UUID{}) + } + + buf, err := parseUUID(string(src)) + if err != nil { + return err + } + + return scanner.ScanUUID(UUID{Bytes: buf, Valid: true}) +} + +func (c UUIDCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + var uuid UUID + err := codecScan(c, m, oid, format, src, &uuid) + if err != nil { + return nil, err + } + + return encodeUUID(uuid.Bytes), nil +} + +func (c UUIDCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var uuid UUID + err := codecScan(c, m, oid, format, src, &uuid) + if err != nil { + return nil, err + } + return uuid.Bytes, nil +} diff --git a/vendor/github.com/jackc/pgx/v5/pgtype/xml.go b/vendor/github.com/jackc/pgx/v5/pgtype/xml.go new file mode 100644 index 0000000..fb4c49a --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgtype/xml.go @@ -0,0 +1,198 @@ +package pgtype + +import ( + "database/sql" + "database/sql/driver" + "encoding/xml" + "fmt" + "reflect" +) + +type XMLCodec struct { + Marshal func(v any) ([]byte, error) + Unmarshal func(data []byte, v any) error +} + +func (*XMLCodec) FormatSupported(format int16) bool { + return format == TextFormatCode || format == BinaryFormatCode +} + +func (*XMLCodec) PreferredFormat() int16 { + return TextFormatCode +} + +func (c *XMLCodec) PlanEncode(m *Map, oid uint32, format int16, value any) EncodePlan { + switch value.(type) { + case string: + return encodePlanXMLCodecEitherFormatString{} + case []byte: + return encodePlanXMLCodecEitherFormatByteSlice{} + + // Cannot rely on driver.Valuer being handled later because anything can be marshalled. + // + // https://github.com/jackc/pgx/issues/1430 + // + // Check for driver.Valuer must come before xml.Marshaler so that it is guaranteed to be used + // when both are implemented https://github.com/jackc/pgx/issues/1805 + case driver.Valuer: + return &encodePlanDriverValuer{m: m, oid: oid, formatCode: format} + + // Must come before trying wrap encode plans because a pointer to a struct may be unwrapped to a struct that can be + // marshalled. + // + // https://github.com/jackc/pgx/issues/1681 + case xml.Marshaler: + return &encodePlanXMLCodecEitherFormatMarshal{ + marshal: c.Marshal, + } + } + + // Because anything can be marshalled the normal wrapping in Map.PlanScan doesn't get a chance to run. So try the + // appropriate wrappers here. + for _, f := range []TryWrapEncodePlanFunc{ + TryWrapDerefPointerEncodePlan, + TryWrapFindUnderlyingTypeEncodePlan, + } { + if wrapperPlan, nextValue, ok := f(value); ok { + if nextPlan := c.PlanEncode(m, oid, format, nextValue); nextPlan != nil { + wrapperPlan.SetNext(nextPlan) + return wrapperPlan + } + } + } + + return &encodePlanXMLCodecEitherFormatMarshal{ + marshal: c.Marshal, + } +} + +type encodePlanXMLCodecEitherFormatString struct{} + +func (encodePlanXMLCodecEitherFormatString) Encode(value any, buf []byte) (newBuf []byte, err error) { + xmlString := value.(string) + buf = append(buf, xmlString...) + return buf, nil +} + +type encodePlanXMLCodecEitherFormatByteSlice struct{} + +func (encodePlanXMLCodecEitherFormatByteSlice) Encode(value any, buf []byte) (newBuf []byte, err error) { + xmlBytes := value.([]byte) + if xmlBytes == nil { + return nil, nil + } + + buf = append(buf, xmlBytes...) + return buf, nil +} + +type encodePlanXMLCodecEitherFormatMarshal struct { + marshal func(v any) ([]byte, error) +} + +func (e *encodePlanXMLCodecEitherFormatMarshal) Encode(value any, buf []byte) (newBuf []byte, err error) { + xmlBytes, err := e.marshal(value) + if err != nil { + return nil, err + } + + buf = append(buf, xmlBytes...) + return buf, nil +} + +func (c *XMLCodec) PlanScan(m *Map, oid uint32, format int16, target any) ScanPlan { + switch target.(type) { + case *string: + return scanPlanAnyToString{} + + case **string: + // This is to fix **string scanning. It seems wrong to special case **string, but it's not clear what a better + // solution would be. + // + // https://github.com/jackc/pgx/issues/1470 -- **string + // https://github.com/jackc/pgx/issues/1691 -- ** anything else + + if wrapperPlan, nextDst, ok := TryPointerPointerScanPlan(target); ok { + if nextPlan := m.planScan(oid, format, nextDst); nextPlan != nil { + if _, failed := nextPlan.(*scanPlanFail); !failed { + wrapperPlan.SetNext(nextPlan) + return wrapperPlan + } + } + } + + case *[]byte: + return scanPlanXMLToByteSlice{} + case BytesScanner: + return scanPlanBinaryBytesToBytesScanner{} + + // Cannot rely on sql.Scanner being handled later because scanPlanXMLToXMLUnmarshal will take precedence. + // + // https://github.com/jackc/pgx/issues/1418 + case sql.Scanner: + return &scanPlanSQLScanner{formatCode: format} + } + + return &scanPlanXMLToXMLUnmarshal{ + unmarshal: c.Unmarshal, + } +} + +type scanPlanXMLToByteSlice struct{} + +func (scanPlanXMLToByteSlice) Scan(src []byte, dst any) error { + dstBuf := dst.(*[]byte) + if src == nil { + *dstBuf = nil + return nil + } + + *dstBuf = make([]byte, len(src)) + copy(*dstBuf, src) + return nil +} + +type scanPlanXMLToXMLUnmarshal struct { + unmarshal func(data []byte, v any) error +} + +func (s *scanPlanXMLToXMLUnmarshal) Scan(src []byte, dst any) error { + if src == nil { + dstValue := reflect.ValueOf(dst) + if dstValue.Kind() == reflect.Ptr { + el := dstValue.Elem() + switch el.Kind() { + case reflect.Ptr, reflect.Slice, reflect.Map, reflect.Interface, reflect.Struct: + el.Set(reflect.Zero(el.Type())) + return nil + } + } + + return fmt.Errorf("cannot scan NULL into %T", dst) + } + + elem := reflect.ValueOf(dst).Elem() + elem.Set(reflect.Zero(elem.Type())) + + return s.unmarshal(src, dst) +} + +func (c *XMLCodec) DecodeDatabaseSQLValue(m *Map, oid uint32, format int16, src []byte) (driver.Value, error) { + if src == nil { + return nil, nil + } + + dstBuf := make([]byte, len(src)) + copy(dstBuf, src) + return dstBuf, nil +} + +func (c *XMLCodec) DecodeValue(m *Map, oid uint32, format int16, src []byte) (any, error) { + if src == nil { + return nil, nil + } + + var dst any + err := c.Unmarshal(src, &dst) + return dst, err +} diff --git a/vendor/github.com/jackc/pgx/v5/pgxpool/batch_results.go b/vendor/github.com/jackc/pgx/v5/pgxpool/batch_results.go new file mode 100644 index 0000000..5d5c681 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgxpool/batch_results.go @@ -0,0 +1,52 @@ +package pgxpool + +import ( + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgconn" +) + +type errBatchResults struct { + err error +} + +func (br errBatchResults) Exec() (pgconn.CommandTag, error) { + return pgconn.CommandTag{}, br.err +} + +func (br errBatchResults) Query() (pgx.Rows, error) { + return errRows{err: br.err}, br.err +} + +func (br errBatchResults) QueryRow() pgx.Row { + return errRow{err: br.err} +} + +func (br errBatchResults) Close() error { + return br.err +} + +type poolBatchResults struct { + br pgx.BatchResults + c *Conn +} + +func (br *poolBatchResults) Exec() (pgconn.CommandTag, error) { + return br.br.Exec() +} + +func (br *poolBatchResults) Query() (pgx.Rows, error) { + return br.br.Query() +} + +func (br *poolBatchResults) QueryRow() pgx.Row { + return br.br.QueryRow() +} + +func (br *poolBatchResults) Close() error { + err := br.br.Close() + if br.c != nil { + br.c.Release() + br.c = nil + } + return err +} diff --git a/vendor/github.com/jackc/pgx/v5/pgxpool/conn.go b/vendor/github.com/jackc/pgx/v5/pgxpool/conn.go new file mode 100644 index 0000000..38c90f3 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgxpool/conn.go @@ -0,0 +1,134 @@ +package pgxpool + +import ( + "context" + "sync/atomic" + + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgconn" + "github.com/jackc/puddle/v2" +) + +// Conn is an acquired *pgx.Conn from a Pool. +type Conn struct { + res *puddle.Resource[*connResource] + p *Pool +} + +// Release returns c to the pool it was acquired from. Once Release has been called, other methods must not be called. +// However, it is safe to call Release multiple times. Subsequent calls after the first will be ignored. +func (c *Conn) Release() { + if c.res == nil { + return + } + + conn := c.Conn() + res := c.res + c.res = nil + + if c.p.releaseTracer != nil { + c.p.releaseTracer.TraceRelease(c.p, TraceReleaseData{Conn: conn}) + } + + if conn.IsClosed() || conn.PgConn().IsBusy() || conn.PgConn().TxStatus() != 'I' { + res.Destroy() + // Signal to the health check to run since we just destroyed a connections + // and we might be below minConns now + c.p.triggerHealthCheck() + return + } + + // If the pool is consistently being used, we might never get to check the + // lifetime of a connection since we only check idle connections in checkConnsHealth + // so we also check the lifetime here and force a health check + if c.p.isExpired(res) { + atomic.AddInt64(&c.p.lifetimeDestroyCount, 1) + res.Destroy() + // Signal to the health check to run since we just destroyed a connections + // and we might be below minConns now + c.p.triggerHealthCheck() + return + } + + if c.p.afterRelease == nil { + res.Release() + return + } + + go func() { + if c.p.afterRelease(conn) { + res.Release() + } else { + res.Destroy() + // Signal to the health check to run since we just destroyed a connections + // and we might be below minConns now + c.p.triggerHealthCheck() + } + }() +} + +// Hijack assumes ownership of the connection from the pool. Caller is responsible for closing the connection. Hijack +// will panic if called on an already released or hijacked connection. +func (c *Conn) Hijack() *pgx.Conn { + if c.res == nil { + panic("cannot hijack already released or hijacked connection") + } + + conn := c.Conn() + res := c.res + c.res = nil + + res.Hijack() + + return conn +} + +func (c *Conn) Exec(ctx context.Context, sql string, arguments ...any) (pgconn.CommandTag, error) { + return c.Conn().Exec(ctx, sql, arguments...) +} + +func (c *Conn) Query(ctx context.Context, sql string, args ...any) (pgx.Rows, error) { + return c.Conn().Query(ctx, sql, args...) +} + +func (c *Conn) QueryRow(ctx context.Context, sql string, args ...any) pgx.Row { + return c.Conn().QueryRow(ctx, sql, args...) +} + +func (c *Conn) SendBatch(ctx context.Context, b *pgx.Batch) pgx.BatchResults { + return c.Conn().SendBatch(ctx, b) +} + +func (c *Conn) CopyFrom(ctx context.Context, tableName pgx.Identifier, columnNames []string, rowSrc pgx.CopyFromSource) (int64, error) { + return c.Conn().CopyFrom(ctx, tableName, columnNames, rowSrc) +} + +// Begin starts a transaction block from the *Conn without explicitly setting a transaction mode (see BeginTx with TxOptions if transaction mode is required). +func (c *Conn) Begin(ctx context.Context) (pgx.Tx, error) { + return c.Conn().Begin(ctx) +} + +// BeginTx starts a transaction block from the *Conn with txOptions determining the transaction mode. +func (c *Conn) BeginTx(ctx context.Context, txOptions pgx.TxOptions) (pgx.Tx, error) { + return c.Conn().BeginTx(ctx, txOptions) +} + +func (c *Conn) Ping(ctx context.Context) error { + return c.Conn().Ping(ctx) +} + +func (c *Conn) Conn() *pgx.Conn { + return c.connResource().conn +} + +func (c *Conn) connResource() *connResource { + return c.res.Value() +} + +func (c *Conn) getPoolRow(r pgx.Row) *poolRow { + return c.connResource().getPoolRow(c, r) +} + +func (c *Conn) getPoolRows(r pgx.Rows) *poolRows { + return c.connResource().getPoolRows(c, r) +} diff --git a/vendor/github.com/jackc/pgx/v5/pgxpool/doc.go b/vendor/github.com/jackc/pgx/v5/pgxpool/doc.go new file mode 100644 index 0000000..099443b --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgxpool/doc.go @@ -0,0 +1,27 @@ +// Package pgxpool is a concurrency-safe connection pool for pgx. +/* +pgxpool implements a nearly identical interface to pgx connections. + +Creating a Pool + +The primary way of creating a pool is with [pgxpool.New]: + + pool, err := pgxpool.New(context.Background(), os.Getenv("DATABASE_URL")) + +The database connection string can be in URL or keyword/value format. PostgreSQL settings, pgx settings, and pool settings can be +specified here. In addition, a config struct can be created by [ParseConfig]. + + config, err := pgxpool.ParseConfig(os.Getenv("DATABASE_URL")) + if err != nil { + // ... + } + config.AfterConnect = func(ctx context.Context, conn *pgx.Conn) error { + // do something with every new connection + } + + pool, err := pgxpool.NewWithConfig(context.Background(), config) + +A pool returns without waiting for any connections to be established. Acquire a connection immediately after creating +the pool to check if a connection can successfully be established. +*/ +package pgxpool diff --git a/vendor/github.com/jackc/pgx/v5/pgxpool/pool.go b/vendor/github.com/jackc/pgx/v5/pgxpool/pool.go new file mode 100644 index 0000000..fdcba72 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgxpool/pool.go @@ -0,0 +1,717 @@ +package pgxpool + +import ( + "context" + "fmt" + "math/rand" + "runtime" + "strconv" + "sync" + "sync/atomic" + "time" + + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgconn" + "github.com/jackc/puddle/v2" +) + +var defaultMaxConns = int32(4) +var defaultMinConns = int32(0) +var defaultMaxConnLifetime = time.Hour +var defaultMaxConnIdleTime = time.Minute * 30 +var defaultHealthCheckPeriod = time.Minute + +type connResource struct { + conn *pgx.Conn + conns []Conn + poolRows []poolRow + poolRowss []poolRows + maxAgeTime time.Time +} + +func (cr *connResource) getConn(p *Pool, res *puddle.Resource[*connResource]) *Conn { + if len(cr.conns) == 0 { + cr.conns = make([]Conn, 128) + } + + c := &cr.conns[len(cr.conns)-1] + cr.conns = cr.conns[0 : len(cr.conns)-1] + + c.res = res + c.p = p + + return c +} + +func (cr *connResource) getPoolRow(c *Conn, r pgx.Row) *poolRow { + if len(cr.poolRows) == 0 { + cr.poolRows = make([]poolRow, 128) + } + + pr := &cr.poolRows[len(cr.poolRows)-1] + cr.poolRows = cr.poolRows[0 : len(cr.poolRows)-1] + + pr.c = c + pr.r = r + + return pr +} + +func (cr *connResource) getPoolRows(c *Conn, r pgx.Rows) *poolRows { + if len(cr.poolRowss) == 0 { + cr.poolRowss = make([]poolRows, 128) + } + + pr := &cr.poolRowss[len(cr.poolRowss)-1] + cr.poolRowss = cr.poolRowss[0 : len(cr.poolRowss)-1] + + pr.c = c + pr.r = r + + return pr +} + +// Pool allows for connection reuse. +type Pool struct { + // 64 bit fields accessed with atomics must be at beginning of struct to guarantee alignment for certain 32-bit + // architectures. See BUGS section of https://pkg.go.dev/sync/atomic and https://github.com/jackc/pgx/issues/1288. + newConnsCount int64 + lifetimeDestroyCount int64 + idleDestroyCount int64 + + p *puddle.Pool[*connResource] + config *Config + beforeConnect func(context.Context, *pgx.ConnConfig) error + afterConnect func(context.Context, *pgx.Conn) error + beforeAcquire func(context.Context, *pgx.Conn) bool + afterRelease func(*pgx.Conn) bool + beforeClose func(*pgx.Conn) + minConns int32 + maxConns int32 + maxConnLifetime time.Duration + maxConnLifetimeJitter time.Duration + maxConnIdleTime time.Duration + healthCheckPeriod time.Duration + + healthCheckChan chan struct{} + + acquireTracer AcquireTracer + releaseTracer ReleaseTracer + + closeOnce sync.Once + closeChan chan struct{} +} + +// Config is the configuration struct for creating a pool. It must be created by [ParseConfig] and then it can be +// modified. +type Config struct { + ConnConfig *pgx.ConnConfig + + // BeforeConnect is called before a new connection is made. It is passed a copy of the underlying pgx.ConnConfig and + // will not impact any existing open connections. + BeforeConnect func(context.Context, *pgx.ConnConfig) error + + // AfterConnect is called after a connection is established, but before it is added to the pool. + AfterConnect func(context.Context, *pgx.Conn) error + + // BeforeAcquire is called before a connection is acquired from the pool. It must return true to allow the + // acquisition or false to indicate that the connection should be destroyed and a different connection should be + // acquired. + BeforeAcquire func(context.Context, *pgx.Conn) bool + + // AfterRelease is called after a connection is released, but before it is returned to the pool. It must return true to + // return the connection to the pool or false to destroy the connection. + AfterRelease func(*pgx.Conn) bool + + // BeforeClose is called right before a connection is closed and removed from the pool. + BeforeClose func(*pgx.Conn) + + // MaxConnLifetime is the duration since creation after which a connection will be automatically closed. + MaxConnLifetime time.Duration + + // MaxConnLifetimeJitter is the duration after MaxConnLifetime to randomly decide to close a connection. + // This helps prevent all connections from being closed at the exact same time, starving the pool. + MaxConnLifetimeJitter time.Duration + + // MaxConnIdleTime is the duration after which an idle connection will be automatically closed by the health check. + MaxConnIdleTime time.Duration + + // MaxConns is the maximum size of the pool. The default is the greater of 4 or runtime.NumCPU(). + MaxConns int32 + + // MinConns is the minimum size of the pool. After connection closes, the pool might dip below MinConns. A low + // number of MinConns might mean the pool is empty after MaxConnLifetime until the health check has a chance + // to create new connections. + MinConns int32 + + // HealthCheckPeriod is the duration between checks of the health of idle connections. + HealthCheckPeriod time.Duration + + createdByParseConfig bool // Used to enforce created by ParseConfig rule. +} + +// Copy returns a deep copy of the config that is safe to use and modify. +// The only exception is the tls.Config: +// according to the tls.Config docs it must not be modified after creation. +func (c *Config) Copy() *Config { + newConfig := new(Config) + *newConfig = *c + newConfig.ConnConfig = c.ConnConfig.Copy() + return newConfig +} + +// ConnString returns the connection string as parsed by pgxpool.ParseConfig into pgxpool.Config. +func (c *Config) ConnString() string { return c.ConnConfig.ConnString() } + +// New creates a new Pool. See [ParseConfig] for information on connString format. +func New(ctx context.Context, connString string) (*Pool, error) { + config, err := ParseConfig(connString) + if err != nil { + return nil, err + } + + return NewWithConfig(ctx, config) +} + +// NewWithConfig creates a new Pool. config must have been created by [ParseConfig]. +func NewWithConfig(ctx context.Context, config *Config) (*Pool, error) { + // Default values are set in ParseConfig. Enforce initial creation by ParseConfig rather than setting defaults from + // zero values. + if !config.createdByParseConfig { + panic("config must be created by ParseConfig") + } + + p := &Pool{ + config: config, + beforeConnect: config.BeforeConnect, + afterConnect: config.AfterConnect, + beforeAcquire: config.BeforeAcquire, + afterRelease: config.AfterRelease, + beforeClose: config.BeforeClose, + minConns: config.MinConns, + maxConns: config.MaxConns, + maxConnLifetime: config.MaxConnLifetime, + maxConnLifetimeJitter: config.MaxConnLifetimeJitter, + maxConnIdleTime: config.MaxConnIdleTime, + healthCheckPeriod: config.HealthCheckPeriod, + healthCheckChan: make(chan struct{}, 1), + closeChan: make(chan struct{}), + } + + if t, ok := config.ConnConfig.Tracer.(AcquireTracer); ok { + p.acquireTracer = t + } + + if t, ok := config.ConnConfig.Tracer.(ReleaseTracer); ok { + p.releaseTracer = t + } + + var err error + p.p, err = puddle.NewPool( + &puddle.Config[*connResource]{ + Constructor: func(ctx context.Context) (*connResource, error) { + atomic.AddInt64(&p.newConnsCount, 1) + connConfig := p.config.ConnConfig.Copy() + + // Connection will continue in background even if Acquire is canceled. Ensure that a connect won't hang forever. + if connConfig.ConnectTimeout <= 0 { + connConfig.ConnectTimeout = 2 * time.Minute + } + + if p.beforeConnect != nil { + if err := p.beforeConnect(ctx, connConfig); err != nil { + return nil, err + } + } + + conn, err := pgx.ConnectConfig(ctx, connConfig) + if err != nil { + return nil, err + } + + if p.afterConnect != nil { + err = p.afterConnect(ctx, conn) + if err != nil { + conn.Close(ctx) + return nil, err + } + } + + jitterSecs := rand.Float64() * config.MaxConnLifetimeJitter.Seconds() + maxAgeTime := time.Now().Add(config.MaxConnLifetime).Add(time.Duration(jitterSecs) * time.Second) + + cr := &connResource{ + conn: conn, + conns: make([]Conn, 64), + poolRows: make([]poolRow, 64), + poolRowss: make([]poolRows, 64), + maxAgeTime: maxAgeTime, + } + + return cr, nil + }, + Destructor: func(value *connResource) { + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + conn := value.conn + if p.beforeClose != nil { + p.beforeClose(conn) + } + conn.Close(ctx) + select { + case <-conn.PgConn().CleanupDone(): + case <-ctx.Done(): + } + cancel() + }, + MaxSize: config.MaxConns, + }, + ) + if err != nil { + return nil, err + } + + go func() { + p.createIdleResources(ctx, int(p.minConns)) + p.backgroundHealthCheck() + }() + + return p, nil +} + +// ParseConfig builds a Config from connString. It parses connString with the same behavior as [pgx.ParseConfig] with the +// addition of the following variables: +// +// - pool_max_conns: integer greater than 0 +// - pool_min_conns: integer 0 or greater +// - pool_max_conn_lifetime: duration string +// - pool_max_conn_idle_time: duration string +// - pool_health_check_period: duration string +// - pool_max_conn_lifetime_jitter: duration string +// +// See Config for definitions of these arguments. +// +// # Example Keyword/Value +// user=jack password=secret host=pg.example.com port=5432 dbname=mydb sslmode=verify-ca pool_max_conns=10 +// +// # Example URL +// postgres://jack:secret@pg.example.com:5432/mydb?sslmode=verify-ca&pool_max_conns=10 +func ParseConfig(connString string) (*Config, error) { + connConfig, err := pgx.ParseConfig(connString) + if err != nil { + return nil, err + } + + config := &Config{ + ConnConfig: connConfig, + createdByParseConfig: true, + } + + if s, ok := config.ConnConfig.Config.RuntimeParams["pool_max_conns"]; ok { + delete(connConfig.Config.RuntimeParams, "pool_max_conns") + n, err := strconv.ParseInt(s, 10, 32) + if err != nil { + return nil, fmt.Errorf("cannot parse pool_max_conns: %w", err) + } + if n < 1 { + return nil, fmt.Errorf("pool_max_conns too small: %d", n) + } + config.MaxConns = int32(n) + } else { + config.MaxConns = defaultMaxConns + if numCPU := int32(runtime.NumCPU()); numCPU > config.MaxConns { + config.MaxConns = numCPU + } + } + + if s, ok := config.ConnConfig.Config.RuntimeParams["pool_min_conns"]; ok { + delete(connConfig.Config.RuntimeParams, "pool_min_conns") + n, err := strconv.ParseInt(s, 10, 32) + if err != nil { + return nil, fmt.Errorf("cannot parse pool_min_conns: %w", err) + } + config.MinConns = int32(n) + } else { + config.MinConns = defaultMinConns + } + + if s, ok := config.ConnConfig.Config.RuntimeParams["pool_max_conn_lifetime"]; ok { + delete(connConfig.Config.RuntimeParams, "pool_max_conn_lifetime") + d, err := time.ParseDuration(s) + if err != nil { + return nil, fmt.Errorf("invalid pool_max_conn_lifetime: %w", err) + } + config.MaxConnLifetime = d + } else { + config.MaxConnLifetime = defaultMaxConnLifetime + } + + if s, ok := config.ConnConfig.Config.RuntimeParams["pool_max_conn_idle_time"]; ok { + delete(connConfig.Config.RuntimeParams, "pool_max_conn_idle_time") + d, err := time.ParseDuration(s) + if err != nil { + return nil, fmt.Errorf("invalid pool_max_conn_idle_time: %w", err) + } + config.MaxConnIdleTime = d + } else { + config.MaxConnIdleTime = defaultMaxConnIdleTime + } + + if s, ok := config.ConnConfig.Config.RuntimeParams["pool_health_check_period"]; ok { + delete(connConfig.Config.RuntimeParams, "pool_health_check_period") + d, err := time.ParseDuration(s) + if err != nil { + return nil, fmt.Errorf("invalid pool_health_check_period: %w", err) + } + config.HealthCheckPeriod = d + } else { + config.HealthCheckPeriod = defaultHealthCheckPeriod + } + + if s, ok := config.ConnConfig.Config.RuntimeParams["pool_max_conn_lifetime_jitter"]; ok { + delete(connConfig.Config.RuntimeParams, "pool_max_conn_lifetime_jitter") + d, err := time.ParseDuration(s) + if err != nil { + return nil, fmt.Errorf("invalid pool_max_conn_lifetime_jitter: %w", err) + } + config.MaxConnLifetimeJitter = d + } + + return config, nil +} + +// Close closes all connections in the pool and rejects future Acquire calls. Blocks until all connections are returned +// to pool and closed. +func (p *Pool) Close() { + p.closeOnce.Do(func() { + close(p.closeChan) + p.p.Close() + }) +} + +func (p *Pool) isExpired(res *puddle.Resource[*connResource]) bool { + return time.Now().After(res.Value().maxAgeTime) +} + +func (p *Pool) triggerHealthCheck() { + go func() { + // Destroy is asynchronous so we give it time to actually remove itself from + // the pool otherwise we might try to check the pool size too soon + time.Sleep(500 * time.Millisecond) + select { + case p.healthCheckChan <- struct{}{}: + default: + } + }() +} + +func (p *Pool) backgroundHealthCheck() { + ticker := time.NewTicker(p.healthCheckPeriod) + defer ticker.Stop() + for { + select { + case <-p.closeChan: + return + case <-p.healthCheckChan: + p.checkHealth() + case <-ticker.C: + p.checkHealth() + } + } +} + +func (p *Pool) checkHealth() { + for { + // If checkMinConns failed we don't destroy any connections since we couldn't + // even get to minConns + if err := p.checkMinConns(); err != nil { + // Should we log this error somewhere? + break + } + if !p.checkConnsHealth() { + // Since we didn't destroy any connections we can stop looping + break + } + // Technically Destroy is asynchronous but 500ms should be enough for it to + // remove it from the underlying pool + select { + case <-p.closeChan: + return + case <-time.After(500 * time.Millisecond): + } + } +} + +// checkConnsHealth will check all idle connections, destroy a connection if +// it's idle or too old, and returns true if any were destroyed +func (p *Pool) checkConnsHealth() bool { + var destroyed bool + totalConns := p.Stat().TotalConns() + resources := p.p.AcquireAllIdle() + for _, res := range resources { + // We're okay going under minConns if the lifetime is up + if p.isExpired(res) && totalConns >= p.minConns { + atomic.AddInt64(&p.lifetimeDestroyCount, 1) + res.Destroy() + destroyed = true + // Since Destroy is async we manually decrement totalConns. + totalConns-- + } else if res.IdleDuration() > p.maxConnIdleTime && totalConns > p.minConns { + atomic.AddInt64(&p.idleDestroyCount, 1) + res.Destroy() + destroyed = true + // Since Destroy is async we manually decrement totalConns. + totalConns-- + } else { + res.ReleaseUnused() + } + } + return destroyed +} + +func (p *Pool) checkMinConns() error { + // TotalConns can include ones that are being destroyed but we should have + // sleep(500ms) around all of the destroys to help prevent that from throwing + // off this check + toCreate := p.minConns - p.Stat().TotalConns() + if toCreate > 0 { + return p.createIdleResources(context.Background(), int(toCreate)) + } + return nil +} + +func (p *Pool) createIdleResources(parentCtx context.Context, targetResources int) error { + ctx, cancel := context.WithCancel(parentCtx) + defer cancel() + + errs := make(chan error, targetResources) + + for i := 0; i < targetResources; i++ { + go func() { + err := p.p.CreateResource(ctx) + // Ignore ErrNotAvailable since it means that the pool has become full since we started creating resource. + if err == puddle.ErrNotAvailable { + err = nil + } + errs <- err + }() + } + + var firstError error + for i := 0; i < targetResources; i++ { + err := <-errs + if err != nil && firstError == nil { + cancel() + firstError = err + } + } + + return firstError +} + +// Acquire returns a connection (*Conn) from the Pool +func (p *Pool) Acquire(ctx context.Context) (c *Conn, err error) { + if p.acquireTracer != nil { + ctx = p.acquireTracer.TraceAcquireStart(ctx, p, TraceAcquireStartData{}) + defer func() { + var conn *pgx.Conn + if c != nil { + conn = c.Conn() + } + p.acquireTracer.TraceAcquireEnd(ctx, p, TraceAcquireEndData{Conn: conn, Err: err}) + }() + } + + for { + res, err := p.p.Acquire(ctx) + if err != nil { + return nil, err + } + + cr := res.Value() + + if res.IdleDuration() > time.Second { + err := cr.conn.Ping(ctx) + if err != nil { + res.Destroy() + continue + } + } + + if p.beforeAcquire == nil || p.beforeAcquire(ctx, cr.conn) { + return cr.getConn(p, res), nil + } + + res.Destroy() + } +} + +// AcquireFunc acquires a *Conn and calls f with that *Conn. ctx will only affect the Acquire. It has no effect on the +// call of f. The return value is either an error acquiring the *Conn or the return value of f. The *Conn is +// automatically released after the call of f. +func (p *Pool) AcquireFunc(ctx context.Context, f func(*Conn) error) error { + conn, err := p.Acquire(ctx) + if err != nil { + return err + } + defer conn.Release() + + return f(conn) +} + +// AcquireAllIdle atomically acquires all currently idle connections. Its intended use is for health check and +// keep-alive functionality. It does not update pool statistics. +func (p *Pool) AcquireAllIdle(ctx context.Context) []*Conn { + resources := p.p.AcquireAllIdle() + conns := make([]*Conn, 0, len(resources)) + for _, res := range resources { + cr := res.Value() + if p.beforeAcquire == nil || p.beforeAcquire(ctx, cr.conn) { + conns = append(conns, cr.getConn(p, res)) + } else { + res.Destroy() + } + } + + return conns +} + +// Reset closes all connections, but leaves the pool open. It is intended for use when an error is detected that would +// disrupt all connections (such as a network interruption or a server state change). +// +// It is safe to reset a pool while connections are checked out. Those connections will be closed when they are returned +// to the pool. +func (p *Pool) Reset() { + p.p.Reset() +} + +// Config returns a copy of config that was used to initialize this pool. +func (p *Pool) Config() *Config { return p.config.Copy() } + +// Stat returns a pgxpool.Stat struct with a snapshot of Pool statistics. +func (p *Pool) Stat() *Stat { + return &Stat{ + s: p.p.Stat(), + newConnsCount: atomic.LoadInt64(&p.newConnsCount), + lifetimeDestroyCount: atomic.LoadInt64(&p.lifetimeDestroyCount), + idleDestroyCount: atomic.LoadInt64(&p.idleDestroyCount), + } +} + +// Exec acquires a connection from the Pool and executes the given SQL. +// SQL can be either a prepared statement name or an SQL string. +// Arguments should be referenced positionally from the SQL string as $1, $2, etc. +// The acquired connection is returned to the pool when the Exec function returns. +func (p *Pool) Exec(ctx context.Context, sql string, arguments ...any) (pgconn.CommandTag, error) { + c, err := p.Acquire(ctx) + if err != nil { + return pgconn.CommandTag{}, err + } + defer c.Release() + + return c.Exec(ctx, sql, arguments...) +} + +// Query acquires a connection and executes a query that returns pgx.Rows. +// Arguments should be referenced positionally from the SQL string as $1, $2, etc. +// See pgx.Rows documentation to close the returned Rows and return the acquired connection to the Pool. +// +// If there is an error, the returned pgx.Rows will be returned in an error state. +// If preferred, ignore the error returned from Query and handle errors using the returned pgx.Rows. +// +// For extra control over how the query is executed, the types QuerySimpleProtocol, QueryResultFormats, and +// QueryResultFormatsByOID may be used as the first args to control exactly how the query is executed. This is rarely +// needed. See the documentation for those types for details. +func (p *Pool) Query(ctx context.Context, sql string, args ...any) (pgx.Rows, error) { + c, err := p.Acquire(ctx) + if err != nil { + return errRows{err: err}, err + } + + rows, err := c.Query(ctx, sql, args...) + if err != nil { + c.Release() + return errRows{err: err}, err + } + + return c.getPoolRows(rows), nil +} + +// QueryRow acquires a connection and executes a query that is expected +// to return at most one row (pgx.Row). Errors are deferred until pgx.Row's +// Scan method is called. If the query selects no rows, pgx.Row's Scan will +// return ErrNoRows. Otherwise, pgx.Row's Scan scans the first selected row +// and discards the rest. The acquired connection is returned to the Pool when +// pgx.Row's Scan method is called. +// +// Arguments should be referenced positionally from the SQL string as $1, $2, etc. +// +// For extra control over how the query is executed, the types QuerySimpleProtocol, QueryResultFormats, and +// QueryResultFormatsByOID may be used as the first args to control exactly how the query is executed. This is rarely +// needed. See the documentation for those types for details. +func (p *Pool) QueryRow(ctx context.Context, sql string, args ...any) pgx.Row { + c, err := p.Acquire(ctx) + if err != nil { + return errRow{err: err} + } + + row := c.QueryRow(ctx, sql, args...) + return c.getPoolRow(row) +} + +func (p *Pool) SendBatch(ctx context.Context, b *pgx.Batch) pgx.BatchResults { + c, err := p.Acquire(ctx) + if err != nil { + return errBatchResults{err: err} + } + + br := c.SendBatch(ctx, b) + return &poolBatchResults{br: br, c: c} +} + +// Begin acquires a connection from the Pool and starts a transaction. Unlike database/sql, the context only affects the begin command. i.e. there is no +// auto-rollback on context cancellation. Begin initiates a transaction block without explicitly setting a transaction mode for the block (see BeginTx with TxOptions if transaction mode is required). +// *pgxpool.Tx is returned, which implements the pgx.Tx interface. +// Commit or Rollback must be called on the returned transaction to finalize the transaction block. +func (p *Pool) Begin(ctx context.Context) (pgx.Tx, error) { + return p.BeginTx(ctx, pgx.TxOptions{}) +} + +// BeginTx acquires a connection from the Pool and starts a transaction with pgx.TxOptions determining the transaction mode. +// Unlike database/sql, the context only affects the begin command. i.e. there is no auto-rollback on context cancellation. +// *pgxpool.Tx is returned, which implements the pgx.Tx interface. +// Commit or Rollback must be called on the returned transaction to finalize the transaction block. +func (p *Pool) BeginTx(ctx context.Context, txOptions pgx.TxOptions) (pgx.Tx, error) { + c, err := p.Acquire(ctx) + if err != nil { + return nil, err + } + + t, err := c.BeginTx(ctx, txOptions) + if err != nil { + c.Release() + return nil, err + } + + return &Tx{t: t, c: c}, nil +} + +func (p *Pool) CopyFrom(ctx context.Context, tableName pgx.Identifier, columnNames []string, rowSrc pgx.CopyFromSource) (int64, error) { + c, err := p.Acquire(ctx) + if err != nil { + return 0, err + } + defer c.Release() + + return c.Conn().CopyFrom(ctx, tableName, columnNames, rowSrc) +} + +// Ping acquires a connection from the Pool and executes an empty sql statement against it. +// If the sql returns without error, the database Ping is considered successful, otherwise, the error is returned. +func (p *Pool) Ping(ctx context.Context) error { + c, err := p.Acquire(ctx) + if err != nil { + return err + } + defer c.Release() + return c.Ping(ctx) +} diff --git a/vendor/github.com/jackc/pgx/v5/pgxpool/rows.go b/vendor/github.com/jackc/pgx/v5/pgxpool/rows.go new file mode 100644 index 0000000..f834b7e --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgxpool/rows.go @@ -0,0 +1,116 @@ +package pgxpool + +import ( + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgconn" +) + +type errRows struct { + err error +} + +func (errRows) Close() {} +func (e errRows) Err() error { return e.err } +func (errRows) CommandTag() pgconn.CommandTag { return pgconn.CommandTag{} } +func (errRows) FieldDescriptions() []pgconn.FieldDescription { return nil } +func (errRows) Next() bool { return false } +func (e errRows) Scan(dest ...any) error { return e.err } +func (e errRows) Values() ([]any, error) { return nil, e.err } +func (e errRows) RawValues() [][]byte { return nil } +func (e errRows) Conn() *pgx.Conn { return nil } + +type errRow struct { + err error +} + +func (e errRow) Scan(dest ...any) error { return e.err } + +type poolRows struct { + r pgx.Rows + c *Conn + err error +} + +func (rows *poolRows) Close() { + rows.r.Close() + if rows.c != nil { + rows.c.Release() + rows.c = nil + } +} + +func (rows *poolRows) Err() error { + if rows.err != nil { + return rows.err + } + return rows.r.Err() +} + +func (rows *poolRows) CommandTag() pgconn.CommandTag { + return rows.r.CommandTag() +} + +func (rows *poolRows) FieldDescriptions() []pgconn.FieldDescription { + return rows.r.FieldDescriptions() +} + +func (rows *poolRows) Next() bool { + if rows.err != nil { + return false + } + + n := rows.r.Next() + if !n { + rows.Close() + } + return n +} + +func (rows *poolRows) Scan(dest ...any) error { + err := rows.r.Scan(dest...) + if err != nil { + rows.Close() + } + return err +} + +func (rows *poolRows) Values() ([]any, error) { + values, err := rows.r.Values() + if err != nil { + rows.Close() + } + return values, err +} + +func (rows *poolRows) RawValues() [][]byte { + return rows.r.RawValues() +} + +func (rows *poolRows) Conn() *pgx.Conn { + return rows.r.Conn() +} + +type poolRow struct { + r pgx.Row + c *Conn + err error +} + +func (row *poolRow) Scan(dest ...any) error { + if row.err != nil { + return row.err + } + + panicked := true + defer func() { + if panicked && row.c != nil { + row.c.Release() + } + }() + err := row.r.Scan(dest...) + panicked = false + if row.c != nil { + row.c.Release() + } + return err +} diff --git a/vendor/github.com/jackc/pgx/v5/pgxpool/stat.go b/vendor/github.com/jackc/pgx/v5/pgxpool/stat.go new file mode 100644 index 0000000..cfa0c4c --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgxpool/stat.go @@ -0,0 +1,84 @@ +package pgxpool + +import ( + "time" + + "github.com/jackc/puddle/v2" +) + +// Stat is a snapshot of Pool statistics. +type Stat struct { + s *puddle.Stat + newConnsCount int64 + lifetimeDestroyCount int64 + idleDestroyCount int64 +} + +// AcquireCount returns the cumulative count of successful acquires from the pool. +func (s *Stat) AcquireCount() int64 { + return s.s.AcquireCount() +} + +// AcquireDuration returns the total duration of all successful acquires from +// the pool. +func (s *Stat) AcquireDuration() time.Duration { + return s.s.AcquireDuration() +} + +// AcquiredConns returns the number of currently acquired connections in the pool. +func (s *Stat) AcquiredConns() int32 { + return s.s.AcquiredResources() +} + +// CanceledAcquireCount returns the cumulative count of acquires from the pool +// that were canceled by a context. +func (s *Stat) CanceledAcquireCount() int64 { + return s.s.CanceledAcquireCount() +} + +// ConstructingConns returns the number of conns with construction in progress in +// the pool. +func (s *Stat) ConstructingConns() int32 { + return s.s.ConstructingResources() +} + +// EmptyAcquireCount returns the cumulative count of successful acquires from the pool +// that waited for a resource to be released or constructed because the pool was +// empty. +func (s *Stat) EmptyAcquireCount() int64 { + return s.s.EmptyAcquireCount() +} + +// IdleConns returns the number of currently idle conns in the pool. +func (s *Stat) IdleConns() int32 { + return s.s.IdleResources() +} + +// MaxConns returns the maximum size of the pool. +func (s *Stat) MaxConns() int32 { + return s.s.MaxResources() +} + +// TotalConns returns the total number of resources currently in the pool. +// The value is the sum of ConstructingConns, AcquiredConns, and +// IdleConns. +func (s *Stat) TotalConns() int32 { + return s.s.TotalResources() +} + +// NewConnsCount returns the cumulative count of new connections opened. +func (s *Stat) NewConnsCount() int64 { + return s.newConnsCount +} + +// MaxLifetimeDestroyCount returns the cumulative count of connections destroyed +// because they exceeded MaxConnLifetime. +func (s *Stat) MaxLifetimeDestroyCount() int64 { + return s.lifetimeDestroyCount +} + +// MaxIdleDestroyCount returns the cumulative count of connections destroyed because +// they exceeded MaxConnIdleTime. +func (s *Stat) MaxIdleDestroyCount() int64 { + return s.idleDestroyCount +} diff --git a/vendor/github.com/jackc/pgx/v5/pgxpool/tracer.go b/vendor/github.com/jackc/pgx/v5/pgxpool/tracer.go new file mode 100644 index 0000000..78b9d15 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgxpool/tracer.go @@ -0,0 +1,33 @@ +package pgxpool + +import ( + "context" + + "github.com/jackc/pgx/v5" +) + +// AcquireTracer traces Acquire. +type AcquireTracer interface { + // TraceAcquireStart is called at the beginning of Acquire. + // The returned context is used for the rest of the call and will be passed to the TraceAcquireEnd. + TraceAcquireStart(ctx context.Context, pool *Pool, data TraceAcquireStartData) context.Context + // TraceAcquireEnd is called when a connection has been acquired. + TraceAcquireEnd(ctx context.Context, pool *Pool, data TraceAcquireEndData) +} + +type TraceAcquireStartData struct{} + +type TraceAcquireEndData struct { + Conn *pgx.Conn + Err error +} + +// ReleaseTracer traces Release. +type ReleaseTracer interface { + // TraceRelease is called at the beginning of Release. + TraceRelease(pool *Pool, data TraceReleaseData) +} + +type TraceReleaseData struct { + Conn *pgx.Conn +} diff --git a/vendor/github.com/jackc/pgx/v5/pgxpool/tx.go b/vendor/github.com/jackc/pgx/v5/pgxpool/tx.go new file mode 100644 index 0000000..b49e7f4 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/pgxpool/tx.go @@ -0,0 +1,83 @@ +package pgxpool + +import ( + "context" + + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgconn" +) + +// Tx represents a database transaction acquired from a Pool. +type Tx struct { + t pgx.Tx + c *Conn +} + +// Begin starts a pseudo nested transaction implemented with a savepoint. +func (tx *Tx) Begin(ctx context.Context) (pgx.Tx, error) { + return tx.t.Begin(ctx) +} + +// Commit commits the transaction and returns the associated connection back to the Pool. Commit will return an error +// where errors.Is(ErrTxClosed) is true if the Tx is already closed, but is otherwise safe to call multiple times. If +// the commit fails with a rollback status (e.g. the transaction was already in a broken state) then ErrTxCommitRollback +// will be returned. +func (tx *Tx) Commit(ctx context.Context) error { + err := tx.t.Commit(ctx) + if tx.c != nil { + tx.c.Release() + tx.c = nil + } + return err +} + +// Rollback rolls back the transaction and returns the associated connection back to the Pool. Rollback will return +// where an error where errors.Is(ErrTxClosed) is true if the Tx is already closed, but is otherwise safe to call +// multiple times. Hence, defer tx.Rollback() is safe even if tx.Commit() will be called first in a non-error condition. +func (tx *Tx) Rollback(ctx context.Context) error { + err := tx.t.Rollback(ctx) + if tx.c != nil { + tx.c.Release() + tx.c = nil + } + return err +} + +func (tx *Tx) CopyFrom(ctx context.Context, tableName pgx.Identifier, columnNames []string, rowSrc pgx.CopyFromSource) (int64, error) { + return tx.t.CopyFrom(ctx, tableName, columnNames, rowSrc) +} + +func (tx *Tx) SendBatch(ctx context.Context, b *pgx.Batch) pgx.BatchResults { + return tx.t.SendBatch(ctx, b) +} + +func (tx *Tx) LargeObjects() pgx.LargeObjects { + return tx.t.LargeObjects() +} + +// Prepare creates a prepared statement with name and sql. If the name is empty, +// an anonymous prepared statement will be used. sql can contain placeholders +// for bound parameters. These placeholders are referenced positionally as $1, $2, etc. +// +// Prepare is idempotent; i.e. it is safe to call Prepare multiple times with the same +// name and sql arguments. This allows a code path to Prepare and Query/Exec without +// needing to first check whether the statement has already been prepared. +func (tx *Tx) Prepare(ctx context.Context, name, sql string) (*pgconn.StatementDescription, error) { + return tx.t.Prepare(ctx, name, sql) +} + +func (tx *Tx) Exec(ctx context.Context, sql string, arguments ...any) (pgconn.CommandTag, error) { + return tx.t.Exec(ctx, sql, arguments...) +} + +func (tx *Tx) Query(ctx context.Context, sql string, args ...any) (pgx.Rows, error) { + return tx.t.Query(ctx, sql, args...) +} + +func (tx *Tx) QueryRow(ctx context.Context, sql string, args ...any) pgx.Row { + return tx.t.QueryRow(ctx, sql, args...) +} + +func (tx *Tx) Conn() *pgx.Conn { + return tx.t.Conn() +} diff --git a/vendor/github.com/jackc/pgx/v5/rows.go b/vendor/github.com/jackc/pgx/v5/rows.go new file mode 100644 index 0000000..f23625d --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/rows.go @@ -0,0 +1,856 @@ +package pgx + +import ( + "context" + "errors" + "fmt" + "reflect" + "strings" + "sync" + "time" + + "github.com/jackc/pgx/v5/pgconn" + "github.com/jackc/pgx/v5/pgtype" +) + +// Rows is the result set returned from *Conn.Query. Rows must be closed before +// the *Conn can be used again. Rows are closed by explicitly calling Close(), +// calling Next() until it returns false, or when a fatal error occurs. +// +// Once a Rows is closed the only methods that may be called are Close(), Err(), +// and CommandTag(). +// +// Rows is an interface instead of a struct to allow tests to mock Query. However, +// adding a method to an interface is technically a breaking change. Because of this +// the Rows interface is partially excluded from semantic version requirements. +// Methods will not be removed or changed, but new methods may be added. +type Rows interface { + // Close closes the rows, making the connection ready for use again. It is safe + // to call Close after rows is already closed. + Close() + + // Err returns any error that occurred while reading. Err must only be called after the Rows is closed (either by + // calling Close or by Next returning false). If it is called early it may return nil even if there was an error + // executing the query. + Err() error + + // CommandTag returns the command tag from this query. It is only available after Rows is closed. + CommandTag() pgconn.CommandTag + + // FieldDescriptions returns the field descriptions of the columns. It may return nil. In particular this can occur + // when there was an error executing the query. + FieldDescriptions() []pgconn.FieldDescription + + // Next prepares the next row for reading. It returns true if there is another + // row and false if no more rows are available or a fatal error has occurred. + // It automatically closes rows when all rows are read. + // + // Callers should check rows.Err() after rows.Next() returns false to detect + // whether result-set reading ended prematurely due to an error. See + // Conn.Query for details. + // + // For simpler error handling, consider using the higher-level pgx v5 + // CollectRows() and ForEachRow() helpers instead. + Next() bool + + // Scan reads the values from the current row into dest values positionally. + // dest can include pointers to core types, values implementing the Scanner + // interface, and nil. nil will skip the value entirely. It is an error to + // call Scan without first calling Next() and checking that it returned true. + Scan(dest ...any) error + + // Values returns the decoded row values. As with Scan(), it is an error to + // call Values without first calling Next() and checking that it returned + // true. + Values() ([]any, error) + + // RawValues returns the unparsed bytes of the row values. The returned data is only valid until the next Next + // call or the Rows is closed. + RawValues() [][]byte + + // Conn returns the underlying *Conn on which the query was executed. This may return nil if Rows did not come from a + // *Conn (e.g. if it was created by RowsFromResultReader) + Conn() *Conn +} + +// Row is a convenience wrapper over Rows that is returned by QueryRow. +// +// Row is an interface instead of a struct to allow tests to mock QueryRow. However, +// adding a method to an interface is technically a breaking change. Because of this +// the Row interface is partially excluded from semantic version requirements. +// Methods will not be removed or changed, but new methods may be added. +type Row interface { + // Scan works the same as Rows. with the following exceptions. If no + // rows were found it returns ErrNoRows. If multiple rows are returned it + // ignores all but the first. + Scan(dest ...any) error +} + +// RowScanner scans an entire row at a time into the RowScanner. +type RowScanner interface { + // ScanRows scans the row. + ScanRow(rows Rows) error +} + +// connRow implements the Row interface for Conn.QueryRow. +type connRow baseRows + +func (r *connRow) Scan(dest ...any) (err error) { + rows := (*baseRows)(r) + + if rows.Err() != nil { + return rows.Err() + } + + for _, d := range dest { + if _, ok := d.(*pgtype.DriverBytes); ok { + rows.Close() + return fmt.Errorf("cannot scan into *pgtype.DriverBytes from QueryRow") + } + } + + if !rows.Next() { + if rows.Err() == nil { + return ErrNoRows + } + return rows.Err() + } + + rows.Scan(dest...) + rows.Close() + return rows.Err() +} + +// baseRows implements the Rows interface for Conn.Query. +type baseRows struct { + typeMap *pgtype.Map + resultReader *pgconn.ResultReader + + values [][]byte + + commandTag pgconn.CommandTag + err error + closed bool + + scanPlans []pgtype.ScanPlan + scanTypes []reflect.Type + + conn *Conn + multiResultReader *pgconn.MultiResultReader + + queryTracer QueryTracer + batchTracer BatchTracer + ctx context.Context + startTime time.Time + sql string + args []any + rowCount int +} + +func (rows *baseRows) FieldDescriptions() []pgconn.FieldDescription { + return rows.resultReader.FieldDescriptions() +} + +func (rows *baseRows) Close() { + if rows.closed { + return + } + + rows.closed = true + + if rows.resultReader != nil { + var closeErr error + rows.commandTag, closeErr = rows.resultReader.Close() + if rows.err == nil { + rows.err = closeErr + } + } + + if rows.multiResultReader != nil { + closeErr := rows.multiResultReader.Close() + if rows.err == nil { + rows.err = closeErr + } + } + + if rows.err != nil && rows.conn != nil && rows.sql != "" { + if sc := rows.conn.statementCache; sc != nil { + sc.Invalidate(rows.sql) + } + + if sc := rows.conn.descriptionCache; sc != nil { + sc.Invalidate(rows.sql) + } + } + + if rows.batchTracer != nil { + rows.batchTracer.TraceBatchQuery(rows.ctx, rows.conn, TraceBatchQueryData{SQL: rows.sql, Args: rows.args, CommandTag: rows.commandTag, Err: rows.err}) + } else if rows.queryTracer != nil { + rows.queryTracer.TraceQueryEnd(rows.ctx, rows.conn, TraceQueryEndData{rows.commandTag, rows.err}) + } +} + +func (rows *baseRows) CommandTag() pgconn.CommandTag { + return rows.commandTag +} + +func (rows *baseRows) Err() error { + return rows.err +} + +// fatal signals an error occurred after the query was sent to the server. It +// closes the rows automatically. +func (rows *baseRows) fatal(err error) { + if rows.err != nil { + return + } + + rows.err = err + rows.Close() +} + +func (rows *baseRows) Next() bool { + if rows.closed { + return false + } + + if rows.resultReader.NextRow() { + rows.rowCount++ + rows.values = rows.resultReader.Values() + return true + } else { + rows.Close() + return false + } +} + +func (rows *baseRows) Scan(dest ...any) error { + m := rows.typeMap + fieldDescriptions := rows.FieldDescriptions() + values := rows.values + + if len(fieldDescriptions) != len(values) { + err := fmt.Errorf("number of field descriptions must equal number of values, got %d and %d", len(fieldDescriptions), len(values)) + rows.fatal(err) + return err + } + + if len(dest) == 1 { + if rc, ok := dest[0].(RowScanner); ok { + err := rc.ScanRow(rows) + if err != nil { + rows.fatal(err) + } + return err + } + } + + if len(fieldDescriptions) != len(dest) { + err := fmt.Errorf("number of field descriptions must equal number of destinations, got %d and %d", len(fieldDescriptions), len(dest)) + rows.fatal(err) + return err + } + + if rows.scanPlans == nil { + rows.scanPlans = make([]pgtype.ScanPlan, len(values)) + rows.scanTypes = make([]reflect.Type, len(values)) + for i := range dest { + rows.scanPlans[i] = m.PlanScan(fieldDescriptions[i].DataTypeOID, fieldDescriptions[i].Format, dest[i]) + rows.scanTypes[i] = reflect.TypeOf(dest[i]) + } + } + + for i, dst := range dest { + if dst == nil { + continue + } + + if rows.scanTypes[i] != reflect.TypeOf(dst) { + rows.scanPlans[i] = m.PlanScan(fieldDescriptions[i].DataTypeOID, fieldDescriptions[i].Format, dest[i]) + rows.scanTypes[i] = reflect.TypeOf(dest[i]) + } + + err := rows.scanPlans[i].Scan(values[i], dst) + if err != nil { + err = ScanArgError{ColumnIndex: i, Err: err} + rows.fatal(err) + return err + } + } + + return nil +} + +func (rows *baseRows) Values() ([]any, error) { + if rows.closed { + return nil, errors.New("rows is closed") + } + + values := make([]any, 0, len(rows.FieldDescriptions())) + + for i := range rows.FieldDescriptions() { + buf := rows.values[i] + fd := &rows.FieldDescriptions()[i] + + if buf == nil { + values = append(values, nil) + continue + } + + if dt, ok := rows.typeMap.TypeForOID(fd.DataTypeOID); ok { + value, err := dt.Codec.DecodeValue(rows.typeMap, fd.DataTypeOID, fd.Format, buf) + if err != nil { + rows.fatal(err) + } + values = append(values, value) + } else { + switch fd.Format { + case TextFormatCode: + values = append(values, string(buf)) + case BinaryFormatCode: + newBuf := make([]byte, len(buf)) + copy(newBuf, buf) + values = append(values, newBuf) + default: + rows.fatal(errors.New("unknown format code")) + } + } + + if rows.Err() != nil { + return nil, rows.Err() + } + } + + return values, rows.Err() +} + +func (rows *baseRows) RawValues() [][]byte { + return rows.values +} + +func (rows *baseRows) Conn() *Conn { + return rows.conn +} + +type ScanArgError struct { + ColumnIndex int + Err error +} + +func (e ScanArgError) Error() string { + return fmt.Sprintf("can't scan into dest[%d]: %v", e.ColumnIndex, e.Err) +} + +func (e ScanArgError) Unwrap() error { + return e.Err +} + +// ScanRow decodes raw row data into dest. It can be used to scan rows read from the lower level pgconn interface. +// +// typeMap - OID to Go type mapping. +// fieldDescriptions - OID and format of values +// values - the raw data as returned from the PostgreSQL server +// dest - the destination that values will be decoded into +func ScanRow(typeMap *pgtype.Map, fieldDescriptions []pgconn.FieldDescription, values [][]byte, dest ...any) error { + if len(fieldDescriptions) != len(values) { + return fmt.Errorf("number of field descriptions must equal number of values, got %d and %d", len(fieldDescriptions), len(values)) + } + if len(fieldDescriptions) != len(dest) { + return fmt.Errorf("number of field descriptions must equal number of destinations, got %d and %d", len(fieldDescriptions), len(dest)) + } + + for i, d := range dest { + if d == nil { + continue + } + + err := typeMap.Scan(fieldDescriptions[i].DataTypeOID, fieldDescriptions[i].Format, values[i], d) + if err != nil { + return ScanArgError{ColumnIndex: i, Err: err} + } + } + + return nil +} + +// RowsFromResultReader returns a Rows that will read from values resultReader and decode with typeMap. It can be used +// to read from the lower level pgconn interface. +func RowsFromResultReader(typeMap *pgtype.Map, resultReader *pgconn.ResultReader) Rows { + return &baseRows{ + typeMap: typeMap, + resultReader: resultReader, + } +} + +// ForEachRow iterates through rows. For each row it scans into the elements of scans and calls fn. If any row +// fails to scan or fn returns an error the query will be aborted and the error will be returned. Rows will be closed +// when ForEachRow returns. +func ForEachRow(rows Rows, scans []any, fn func() error) (pgconn.CommandTag, error) { + defer rows.Close() + + for rows.Next() { + err := rows.Scan(scans...) + if err != nil { + return pgconn.CommandTag{}, err + } + + err = fn() + if err != nil { + return pgconn.CommandTag{}, err + } + } + + if err := rows.Err(); err != nil { + return pgconn.CommandTag{}, err + } + + return rows.CommandTag(), nil +} + +// CollectableRow is the subset of Rows methods that a RowToFunc is allowed to call. +type CollectableRow interface { + FieldDescriptions() []pgconn.FieldDescription + Scan(dest ...any) error + Values() ([]any, error) + RawValues() [][]byte +} + +// RowToFunc is a function that scans or otherwise converts row to a T. +type RowToFunc[T any] func(row CollectableRow) (T, error) + +// AppendRows iterates through rows, calling fn for each row, and appending the results into a slice of T. +// +// This function closes the rows automatically on return. +func AppendRows[T any, S ~[]T](slice S, rows Rows, fn RowToFunc[T]) (S, error) { + defer rows.Close() + + for rows.Next() { + value, err := fn(rows) + if err != nil { + return nil, err + } + slice = append(slice, value) + } + + if err := rows.Err(); err != nil { + return nil, err + } + + return slice, nil +} + +// CollectRows iterates through rows, calling fn for each row, and collecting the results into a slice of T. +// +// This function closes the rows automatically on return. +func CollectRows[T any](rows Rows, fn RowToFunc[T]) ([]T, error) { + return AppendRows([]T{}, rows, fn) +} + +// CollectOneRow calls fn for the first row in rows and returns the result. If no rows are found returns an error where errors.Is(ErrNoRows) is true. +// CollectOneRow is to CollectRows as QueryRow is to Query. +// +// This function closes the rows automatically on return. +func CollectOneRow[T any](rows Rows, fn RowToFunc[T]) (T, error) { + defer rows.Close() + + var value T + var err error + + if !rows.Next() { + if err = rows.Err(); err != nil { + return value, err + } + return value, ErrNoRows + } + + value, err = fn(rows) + if err != nil { + return value, err + } + + rows.Close() + return value, rows.Err() +} + +// CollectExactlyOneRow calls fn for the first row in rows and returns the result. +// - If no rows are found returns an error where errors.Is(ErrNoRows) is true. +// - If more than 1 row is found returns an error where errors.Is(ErrTooManyRows) is true. +// +// This function closes the rows automatically on return. +func CollectExactlyOneRow[T any](rows Rows, fn RowToFunc[T]) (T, error) { + defer rows.Close() + + var ( + err error + value T + ) + + if !rows.Next() { + if err = rows.Err(); err != nil { + return value, err + } + + return value, ErrNoRows + } + + value, err = fn(rows) + if err != nil { + return value, err + } + + if rows.Next() { + var zero T + + return zero, ErrTooManyRows + } + + return value, rows.Err() +} + +// RowTo returns a T scanned from row. +func RowTo[T any](row CollectableRow) (T, error) { + var value T + err := row.Scan(&value) + return value, err +} + +// RowTo returns a the address of a T scanned from row. +func RowToAddrOf[T any](row CollectableRow) (*T, error) { + var value T + err := row.Scan(&value) + return &value, err +} + +// RowToMap returns a map scanned from row. +func RowToMap(row CollectableRow) (map[string]any, error) { + var value map[string]any + err := row.Scan((*mapRowScanner)(&value)) + return value, err +} + +type mapRowScanner map[string]any + +func (rs *mapRowScanner) ScanRow(rows Rows) error { + values, err := rows.Values() + if err != nil { + return err + } + + *rs = make(mapRowScanner, len(values)) + + for i := range values { + (*rs)[string(rows.FieldDescriptions()[i].Name)] = values[i] + } + + return nil +} + +// RowToStructByPos returns a T scanned from row. T must be a struct. T must have the same number a public fields as row +// has fields. The row and T fields will be matched by position. If the "db" struct tag is "-" then the field will be +// ignored. +func RowToStructByPos[T any](row CollectableRow) (T, error) { + var value T + err := (&positionalStructRowScanner{ptrToStruct: &value}).ScanRow(row) + return value, err +} + +// RowToAddrOfStructByPos returns the address of a T scanned from row. T must be a struct. T must have the same number a +// public fields as row has fields. The row and T fields will be matched by position. If the "db" struct tag is "-" then +// the field will be ignored. +func RowToAddrOfStructByPos[T any](row CollectableRow) (*T, error) { + var value T + err := (&positionalStructRowScanner{ptrToStruct: &value}).ScanRow(row) + return &value, err +} + +type positionalStructRowScanner struct { + ptrToStruct any +} + +func (rs *positionalStructRowScanner) ScanRow(rows CollectableRow) error { + typ := reflect.TypeOf(rs.ptrToStruct).Elem() + fields := lookupStructFields(typ) + if len(rows.RawValues()) > len(fields) { + return fmt.Errorf( + "got %d values, but dst struct has only %d fields", + len(rows.RawValues()), + len(fields), + ) + } + scanTargets := setupStructScanTargets(rs.ptrToStruct, fields) + return rows.Scan(scanTargets...) +} + +// Map from reflect.Type -> []structRowField +var positionalStructFieldMap sync.Map + +func lookupStructFields(t reflect.Type) []structRowField { + if cached, ok := positionalStructFieldMap.Load(t); ok { + return cached.([]structRowField) + } + + fieldStack := make([]int, 0, 1) + fields := computeStructFields(t, make([]structRowField, 0, t.NumField()), &fieldStack) + fieldsIface, _ := positionalStructFieldMap.LoadOrStore(t, fields) + return fieldsIface.([]structRowField) +} + +func computeStructFields( + t reflect.Type, + fields []structRowField, + fieldStack *[]int, +) []structRowField { + tail := len(*fieldStack) + *fieldStack = append(*fieldStack, 0) + for i := 0; i < t.NumField(); i++ { + sf := t.Field(i) + (*fieldStack)[tail] = i + // Handle anonymous struct embedding, but do not try to handle embedded pointers. + if sf.Anonymous && sf.Type.Kind() == reflect.Struct { + fields = computeStructFields(sf.Type, fields, fieldStack) + } else if sf.PkgPath == "" { + dbTag, _ := sf.Tag.Lookup(structTagKey) + if dbTag == "-" { + // Field is ignored, skip it. + continue + } + fields = append(fields, structRowField{ + path: append([]int(nil), *fieldStack...), + }) + } + } + *fieldStack = (*fieldStack)[:tail] + return fields +} + +// RowToStructByName returns a T scanned from row. T must be a struct. T must have the same number of named public +// fields as row has fields. The row and T fields will be matched by name. The match is case-insensitive. The database +// column name can be overridden with a "db" struct tag. If the "db" struct tag is "-" then the field will be ignored. +func RowToStructByName[T any](row CollectableRow) (T, error) { + var value T + err := (&namedStructRowScanner{ptrToStruct: &value}).ScanRow(row) + return value, err +} + +// RowToAddrOfStructByName returns the address of a T scanned from row. T must be a struct. T must have the same number +// of named public fields as row has fields. The row and T fields will be matched by name. The match is +// case-insensitive. The database column name can be overridden with a "db" struct tag. If the "db" struct tag is "-" +// then the field will be ignored. +func RowToAddrOfStructByName[T any](row CollectableRow) (*T, error) { + var value T + err := (&namedStructRowScanner{ptrToStruct: &value}).ScanRow(row) + return &value, err +} + +// RowToStructByNameLax returns a T scanned from row. T must be a struct. T must have greater than or equal number of named public +// fields as row has fields. The row and T fields will be matched by name. The match is case-insensitive. The database +// column name can be overridden with a "db" struct tag. If the "db" struct tag is "-" then the field will be ignored. +func RowToStructByNameLax[T any](row CollectableRow) (T, error) { + var value T + err := (&namedStructRowScanner{ptrToStruct: &value, lax: true}).ScanRow(row) + return value, err +} + +// RowToAddrOfStructByNameLax returns the address of a T scanned from row. T must be a struct. T must have greater than or +// equal number of named public fields as row has fields. The row and T fields will be matched by name. The match is +// case-insensitive. The database column name can be overridden with a "db" struct tag. If the "db" struct tag is "-" +// then the field will be ignored. +func RowToAddrOfStructByNameLax[T any](row CollectableRow) (*T, error) { + var value T + err := (&namedStructRowScanner{ptrToStruct: &value, lax: true}).ScanRow(row) + return &value, err +} + +type namedStructRowScanner struct { + ptrToStruct any + lax bool +} + +func (rs *namedStructRowScanner) ScanRow(rows CollectableRow) error { + typ := reflect.TypeOf(rs.ptrToStruct).Elem() + fldDescs := rows.FieldDescriptions() + namedStructFields, err := lookupNamedStructFields(typ, fldDescs) + if err != nil { + return err + } + if !rs.lax && namedStructFields.missingField != "" { + return fmt.Errorf("cannot find field %s in returned row", namedStructFields.missingField) + } + fields := namedStructFields.fields + scanTargets := setupStructScanTargets(rs.ptrToStruct, fields) + return rows.Scan(scanTargets...) +} + +// Map from namedStructFieldMap -> *namedStructFields +var namedStructFieldMap sync.Map + +type namedStructFieldsKey struct { + t reflect.Type + colNames string +} + +type namedStructFields struct { + fields []structRowField + // missingField is the first field from the struct without a corresponding row field. + // This is used to construct the correct error message for non-lax queries. + missingField string +} + +func lookupNamedStructFields( + t reflect.Type, + fldDescs []pgconn.FieldDescription, +) (*namedStructFields, error) { + key := namedStructFieldsKey{ + t: t, + colNames: joinFieldNames(fldDescs), + } + if cached, ok := namedStructFieldMap.Load(key); ok { + return cached.(*namedStructFields), nil + } + + // We could probably do two-levels of caching, where we compute the key -> fields mapping + // for a type only once, cache it by type, then use that to compute the column -> fields + // mapping for a given set of columns. + fieldStack := make([]int, 0, 1) + fields, missingField := computeNamedStructFields( + fldDescs, + t, + make([]structRowField, len(fldDescs)), + &fieldStack, + ) + for i, f := range fields { + if f.path == nil { + return nil, fmt.Errorf( + "struct doesn't have corresponding row field %s", + fldDescs[i].Name, + ) + } + } + + fieldsIface, _ := namedStructFieldMap.LoadOrStore( + key, + &namedStructFields{fields: fields, missingField: missingField}, + ) + return fieldsIface.(*namedStructFields), nil +} + +func joinFieldNames(fldDescs []pgconn.FieldDescription) string { + switch len(fldDescs) { + case 0: + return "" + case 1: + return fldDescs[0].Name + } + + totalSize := len(fldDescs) - 1 // Space for separator bytes. + for _, d := range fldDescs { + totalSize += len(d.Name) + } + var b strings.Builder + b.Grow(totalSize) + b.WriteString(fldDescs[0].Name) + for _, d := range fldDescs[1:] { + b.WriteByte(0) // Join with NUL byte as it's (presumably) not a valid column character. + b.WriteString(d.Name) + } + return b.String() +} + +func computeNamedStructFields( + fldDescs []pgconn.FieldDescription, + t reflect.Type, + fields []structRowField, + fieldStack *[]int, +) ([]structRowField, string) { + var missingField string + tail := len(*fieldStack) + *fieldStack = append(*fieldStack, 0) + for i := 0; i < t.NumField(); i++ { + sf := t.Field(i) + (*fieldStack)[tail] = i + if sf.PkgPath != "" && !sf.Anonymous { + // Field is unexported, skip it. + continue + } + // Handle anonymous struct embedding, but do not try to handle embedded pointers. + if sf.Anonymous && sf.Type.Kind() == reflect.Struct { + var missingSubField string + fields, missingSubField = computeNamedStructFields( + fldDescs, + sf.Type, + fields, + fieldStack, + ) + if missingField == "" { + missingField = missingSubField + } + } else { + dbTag, dbTagPresent := sf.Tag.Lookup(structTagKey) + if dbTagPresent { + dbTag, _, _ = strings.Cut(dbTag, ",") + } + if dbTag == "-" { + // Field is ignored, skip it. + continue + } + colName := dbTag + if !dbTagPresent { + colName = sf.Name + } + fpos := fieldPosByName(fldDescs, colName, !dbTagPresent) + if fpos == -1 { + if missingField == "" { + missingField = colName + } + continue + } + fields[fpos] = structRowField{ + path: append([]int(nil), *fieldStack...), + } + } + } + *fieldStack = (*fieldStack)[:tail] + + return fields, missingField +} + +const structTagKey = "db" + +func fieldPosByName(fldDescs []pgconn.FieldDescription, field string, normalize bool) (i int) { + i = -1 + + if normalize { + field = strings.ReplaceAll(field, "_", "") + } + for i, desc := range fldDescs { + if normalize { + if strings.EqualFold(strings.ReplaceAll(desc.Name, "_", ""), field) { + return i + } + } else { + if desc.Name == field { + return i + } + } + } + return +} + +// structRowField describes a field of a struct. +// +// TODO: It would be a bit more efficient to track the path using the pointer +// offset within the (outermost) struct and use unsafe.Pointer arithmetic to +// construct references when scanning rows. However, it's not clear it's worth +// using unsafe for this. +type structRowField struct { + path []int +} + +func setupStructScanTargets(receiver any, fields []structRowField) []any { + scanTargets := make([]any, len(fields)) + v := reflect.ValueOf(receiver).Elem() + for i, f := range fields { + scanTargets[i] = v.FieldByIndex(f.path).Addr().Interface() + } + return scanTargets +} diff --git a/vendor/github.com/jackc/pgx/v4/stdlib/sql.go b/vendor/github.com/jackc/pgx/v5/stdlib/sql.go similarity index 70% rename from vendor/github.com/jackc/pgx/v4/stdlib/sql.go rename to vendor/github.com/jackc/pgx/v5/stdlib/sql.go index f43ae32..c1d00ab 100644 --- a/vendor/github.com/jackc/pgx/v4/stdlib/sql.go +++ b/vendor/github.com/jackc/pgx/v5/stdlib/sql.go @@ -4,48 +4,65 @@ // // db, err := sql.Open("pgx", "postgres://pgx_md5:secret@localhost:5432/pgx_test?sslmode=disable") // if err != nil { -// return err +// return err // } // -// Or from a DSN string. +// Or from a keyword/value string. // // db, err := sql.Open("pgx", "user=postgres password=secret host=localhost port=5432 database=pgx_test sslmode=disable") // if err != nil { -// return err +// return err // } // +// Or from a *pgxpool.Pool. +// +// pool, err := pgxpool.New(context.Background(), os.Getenv("DATABASE_URL")) +// if err != nil { +// return err +// } +// +// db := stdlib.OpenDBFromPool(pool) +// // Or a pgx.ConnConfig can be used to set configuration not accessible via connection string. In this case the // pgx.ConnConfig must first be registered with the driver. This registration returns a connection string which is used // with sql.Open. // // connConfig, _ := pgx.ParseConfig(os.Getenv("DATABASE_URL")) -// connConfig.Logger = myLogger +// connConfig.Tracer = &tracelog.TraceLog{Logger: myLogger, LogLevel: tracelog.LogLevelInfo} // connStr := stdlib.RegisterConnConfig(connConfig) // db, _ := sql.Open("pgx", connStr) // -// pgx uses standard PostgreSQL positional parameters in queries. e.g. $1, $2. -// It does not support named parameters. +// pgx uses standard PostgreSQL positional parameters in queries. e.g. $1, $2. It does not support named parameters. // // db.QueryRow("select * from users where id=$1", userID) // -// In Go 1.13 and above (*sql.Conn) Raw() can be used to get a *pgx.Conn from the standard -// database/sql.DB connection pool. This allows operations that use pgx specific functionality. +// (*sql.Conn) Raw() can be used to get a *pgx.Conn from the standard database/sql.DB connection pool. This allows +// operations that use pgx specific functionality. // // // Given db is a *sql.DB // conn, err := db.Conn(context.Background()) // if err != nil { -// // handle error from acquiring connection from DB pool +// // handle error from acquiring connection from DB pool // } // -// err = conn.Raw(func(driverConn interface{}) error { -// conn := driverConn.(*stdlib.Conn).Conn() // conn is a *pgx.Conn -// // Do pgx specific stuff with conn -// conn.CopyFrom(...) -// return nil +// err = conn.Raw(func(driverConn any) error { +// conn := driverConn.(*stdlib.Conn).Conn() // conn is a *pgx.Conn +// // Do pgx specific stuff with conn +// conn.CopyFrom(...) +// return nil // }) // if err != nil { -// // handle error that occurred while using *pgx.Conn +// // handle error that occurred while using *pgx.Conn // } +// +// # PostgreSQL Specific Data Types +// +// The pgtype package provides support for PostgreSQL specific types. *pgtype.Map.SQLScanner is an adapter that makes +// these types usable as a sql.Scanner. +// +// m := pgtype.NewMap() +// var a []int64 +// err := db.QueryRow("select '{1,2,3}'::bigint[]").Scan(m.SQLScanner(&a)) package stdlib import ( @@ -58,14 +75,16 @@ import ( "math" "math/rand" "reflect" + "slices" "strconv" "strings" "sync" "time" - "github.com/jackc/pgconn" - "github.com/jackc/pgtype" - "github.com/jackc/pgx/v4" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgconn" + "github.com/jackc/pgx/v5/pgtype" + "github.com/jackc/pgx/v5/pgxpool" ) // Only intrinsic types should be binary format with database/sql. @@ -73,24 +92,17 @@ var databaseSQLResultFormats pgx.QueryResultFormatsByOID var pgxDriver *Driver -type ctxKey int - -var ctxKeyFakeTx ctxKey = 0 - -var ErrNotPgx = errors.New("not pgx *sql.DB") - func init() { pgxDriver = &Driver{ configs: make(map[string]*pgx.ConnConfig), } - fakeTxConns = make(map[*pgx.Conn]*sql.Tx) // if pgx driver was already registered by different pgx major version then we // skip registration under the default name. - if !contains(sql.Drivers(), "pgx") { + if !slices.Contains(sql.Drivers(), "pgx") { sql.Register("pgx", pgxDriver) } - sql.Register("pgx/v4", pgxDriver) + sql.Register("pgx/v5", pgxDriver) databaseSQLResultFormats = pgx.QueryResultFormatsByOID{ pgtype.BoolOID: 1, @@ -109,34 +121,18 @@ func init() { } } -// TODO replace by slices.Contains when experimental package will be merged to stdlib -// https://pkg.go.dev/golang.org/x/exp/slices#Contains -func contains(list []string, y string) bool { - for _, x := range list { - if x == y { - return true - } - } - return false -} - -var ( - fakeTxMutex sync.Mutex - fakeTxConns map[*pgx.Conn]*sql.Tx -) - // OptionOpenDB options for configuring the driver when opening a new db pool. type OptionOpenDB func(*connector) // OptionBeforeConnect provides a callback for before connect. It is passed a shallow copy of the ConnConfig that will -// be used to connect, so only its immediate members should be modified. +// be used to connect, so only its immediate members should be modified. Used only if db is opened with *pgx.ConnConfig. func OptionBeforeConnect(bc func(context.Context, *pgx.ConnConfig) error) OptionOpenDB { return func(dc *connector) { dc.BeforeConnect = bc } } -// OptionAfterConnect provides a callback for after connect. +// OptionAfterConnect provides a callback for after connect. Used only if db is opened with *pgx.ConnConfig. func OptionAfterConnect(ac func(context.Context, *pgx.Conn) error) OptionOpenDB { return func(dc *connector) { dc.AfterConnect = ac @@ -161,7 +157,7 @@ func RandomizeHostOrderFunc(ctx context.Context, connConfig *pgx.ConnConfig) err return nil } - newFallbacks := append([]*pgconn.FallbackConfig{&pgconn.FallbackConfig{ + newFallbacks := append([]*pgconn.FallbackConfig{{ Host: connConfig.Host, Port: connConfig.Port, TLSConfig: connConfig.TLSConfig, @@ -195,13 +191,42 @@ func GetConnector(config pgx.ConnConfig, opts ...OptionOpenDB) driver.Connector return c } +// GetPoolConnector creates a new driver.Connector from the given *pgxpool.Pool. By using this be sure to set the +// maximum idle connections of the *sql.DB created with this connector to zero since they must be managed from the +// *pgxpool.Pool. This is required to avoid acquiring all the connections from the pgxpool and starving any direct +// users of the pgxpool. +func GetPoolConnector(pool *pgxpool.Pool, opts ...OptionOpenDB) driver.Connector { + c := connector{ + pool: pool, + ResetSession: func(context.Context, *pgx.Conn) error { return nil }, // noop reset session by default + driver: pgxDriver, + } + + for _, opt := range opts { + opt(&c) + } + + return c +} + func OpenDB(config pgx.ConnConfig, opts ...OptionOpenDB) *sql.DB { c := GetConnector(config, opts...) return sql.OpenDB(c) } +// OpenDBFromPool creates a new *sql.DB from the given *pgxpool.Pool. Note that this method automatically sets the +// maximum number of idle connections in *sql.DB to zero, since they must be managed from the *pgxpool.Pool. This is +// required to avoid acquiring all the connections from the pgxpool and starving any direct users of the pgxpool. +func OpenDBFromPool(pool *pgxpool.Pool, opts ...OptionOpenDB) *sql.DB { + c := GetPoolConnector(pool, opts...) + db := sql.OpenDB(c) + db.SetMaxIdleConns(0) + return db +} + type connector struct { pgx.ConnConfig + pool *pgxpool.Pool BeforeConnect func(context.Context, *pgx.ConnConfig) error // function to call before creation of every new connection AfterConnect func(context.Context, *pgx.Conn) error // function to call after creation of every new connection ResetSession func(context.Context, *pgx.Conn) error // function is called before a connection is reused @@ -211,25 +236,53 @@ type connector struct { // Connect implement driver.Connector interface func (c connector) Connect(ctx context.Context) (driver.Conn, error) { var ( - err error - conn *pgx.Conn + connConfig pgx.ConnConfig + conn *pgx.Conn + close func(context.Context) error + err error ) - // Create a shallow copy of the config, so that BeforeConnect can safely modify it - connConfig := c.ConnConfig - if err = c.BeforeConnect(ctx, &connConfig); err != nil { - return nil, err - } + if c.pool == nil { + // Create a shallow copy of the config, so that BeforeConnect can safely modify it + connConfig = c.ConnConfig - if conn, err = pgx.ConnectConfig(ctx, &connConfig); err != nil { - return nil, err - } + if err = c.BeforeConnect(ctx, &connConfig); err != nil { + return nil, err + } - if err = c.AfterConnect(ctx, conn); err != nil { - return nil, err + if conn, err = pgx.ConnectConfig(ctx, &connConfig); err != nil { + return nil, err + } + + if err = c.AfterConnect(ctx, conn); err != nil { + return nil, err + } + + close = conn.Close + } else { + var pconn *pgxpool.Conn + + pconn, err = c.pool.Acquire(ctx) + if err != nil { + return nil, err + } + + conn = pconn.Conn() + + close = func(_ context.Context) error { + pconn.Release() + return nil + } } - return &Conn{conn: conn, driver: c.driver, connConfig: connConfig, resetSessionFunc: c.ResetSession}, nil + return &Conn{ + conn: conn, + close: close, + driver: c.driver, + connConfig: connConfig, + resetSessionFunc: c.ResetSession, + psRefCounts: make(map[*pgconn.StatementDescription]int), + }, nil } // Driver implement driver.Connector interface @@ -306,9 +359,11 @@ func (dc *driverConnector) Connect(ctx context.Context) (driver.Conn, error) { c := &Conn{ conn: conn, + close: conn.Close, driver: dc.driver, connConfig: *connConfig, resetSessionFunc: func(context.Context, *pgx.Conn) error { return nil }, + psRefCounts: make(map[*pgconn.StatementDescription]int), } return c, nil @@ -329,11 +384,20 @@ func UnregisterConnConfig(connStr string) { } type Conn struct { - conn *pgx.Conn - psCount int64 // Counter used for creating unique prepared statement names - driver *Driver - connConfig pgx.ConnConfig - resetSessionFunc func(context.Context, *pgx.Conn) error // Function is called before a connection is reused + conn *pgx.Conn + close func(context.Context) error + driver *Driver + connConfig pgx.ConnConfig + resetSessionFunc func(context.Context, *pgx.Conn) error // Function is called before a connection is reused + lastResetSessionTime time.Time + + // psRefCounts contains reference counts for prepared statements. Prepare uses the underlying pgx logic to generate + // deterministic statement names from the statement text. If this query has already been prepared then the existing + // *pgconn.StatementDescription will be returned. However, this means that if Close is called on the returned Stmt + // then the underlying prepared statement will be closed even when the underlying prepared statement is still in use + // by another database/sql Stmt. To prevent this psRefCounts keeps track of how many database/sql statements are using + // the same underlying statement and only closes the underlying statement when the reference count reaches 0. + psRefCounts map[*pgconn.StatementDescription]int } // Conn returns the underlying *pgx.Conn @@ -350,13 +414,11 @@ func (c *Conn) PrepareContext(ctx context.Context, query string) (driver.Stmt, e return nil, driver.ErrBadConn } - name := fmt.Sprintf("pgx_%d", c.psCount) - c.psCount++ - - sd, err := c.conn.Prepare(ctx, name, query) + sd, err := c.conn.Prepare(ctx, query, query) if err != nil { return nil, err } + c.psRefCounts[sd]++ return &Stmt{sd: sd, conn: c}, nil } @@ -364,7 +426,7 @@ func (c *Conn) PrepareContext(ctx context.Context, query string) (driver.Stmt, e func (c *Conn) Close() error { ctx, cancel := context.WithTimeout(context.Background(), time.Second*5) defer cancel() - return c.conn.Close(ctx) + return c.close(ctx) } func (c *Conn) Begin() (driver.Tx, error) { @@ -376,11 +438,6 @@ func (c *Conn) BeginTx(ctx context.Context, opts driver.TxOptions) (driver.Tx, e return nil, driver.ErrBadConn } - if pconn, ok := ctx.Value(ctxKeyFakeTx).(**pgx.Conn); ok { - *pconn = c.conn - return fakeTx{}, nil - } - var pgxOpts pgx.TxOptions switch sql.IsolationLevel(opts.Isolation) { case sql.LevelDefault: @@ -430,7 +487,7 @@ func (c *Conn) QueryContext(ctx context.Context, query string, argsV []driver.Na return nil, driver.ErrBadConn } - args := []interface{}{databaseSQLResultFormats} + args := []any{databaseSQLResultFormats} args = append(args, namedValueToInterface(argsV)...) rows, err := c.conn.Query(ctx, query, args...) @@ -476,6 +533,14 @@ func (c *Conn) ResetSession(ctx context.Context) error { return driver.ErrBadConn } + now := time.Now() + if now.Sub(c.lastResetSessionTime) > time.Second { + if err := c.conn.PgConn().Ping(ctx); err != nil { + return driver.ErrBadConn + } + } + c.lastResetSessionTime = now + return c.resetSessionFunc(ctx, c.conn) } @@ -487,7 +552,16 @@ type Stmt struct { func (s *Stmt) Close() error { ctx, cancel := context.WithTimeout(context.Background(), time.Second*5) defer cancel() - return s.conn.conn.Deallocate(ctx, s.sd.Name) + + refCount := s.conn.psRefCounts[s.sd] + if refCount == 1 { + delete(s.conn.psRefCounts, s.sd) + } else { + s.conn.psRefCounts[s.sd]-- + return nil + } + + return s.conn.conn.Deallocate(ctx, s.sd.SQL) } func (s *Stmt) NumInput() int { @@ -499,7 +573,7 @@ func (s *Stmt) Exec(argsV []driver.Value) (driver.Result, error) { } func (s *Stmt) ExecContext(ctx context.Context, argsV []driver.NamedValue) (driver.Result, error) { - return s.conn.ExecContext(ctx, s.sd.Name, argsV) + return s.conn.ExecContext(ctx, s.sd.SQL, argsV) } func (s *Stmt) Query(argsV []driver.Value) (driver.Rows, error) { @@ -507,7 +581,7 @@ func (s *Stmt) Query(argsV []driver.Value) (driver.Rows, error) { } func (s *Stmt) QueryContext(ctx context.Context, argsV []driver.NamedValue) (driver.Rows, error) { - return s.conn.QueryContext(ctx, s.sd.Name, argsV) + return s.conn.QueryContext(ctx, s.sd.SQL, argsV) } type rowValueFunc func(src []byte) (driver.Value, error) @@ -536,7 +610,7 @@ func (r *Rows) Columns() []string { // ColumnTypeDatabaseTypeName returns the database system type name. If the name is unknown the OID is returned. func (r *Rows) ColumnTypeDatabaseTypeName(index int) string { - if dt, ok := r.conn.conn.ConnInfo().DataTypeForOID(r.rows.FieldDescriptions()[index].DataTypeOID); ok { + if dt, ok := r.conn.conn.TypeMap().TypeForOID(r.rows.FieldDescriptions()[index].DataTypeOID); ok { return strings.ToUpper(dt.Name) } @@ -611,7 +685,7 @@ func (r *Rows) Close() error { } func (r *Rows) Next(dest []driver.Value) error { - ci := r.conn.conn.ConnInfo() + m := r.conn.conn.TypeMap() fieldDescriptions := r.rows.FieldDescriptions() if r.valueFuncs == nil { @@ -624,23 +698,23 @@ func (r *Rows) Next(dest []driver.Value) error { switch fd.DataTypeOID { case pgtype.BoolOID: var d bool - scanPlan := ci.PlanScan(dataTypeOID, format, &d) + scanPlan := m.PlanScan(dataTypeOID, format, &d) r.valueFuncs[i] = func(src []byte) (driver.Value, error) { - err := scanPlan.Scan(ci, dataTypeOID, format, src, &d) + err := scanPlan.Scan(src, &d) return d, err } case pgtype.ByteaOID: var d []byte - scanPlan := ci.PlanScan(dataTypeOID, format, &d) + scanPlan := m.PlanScan(dataTypeOID, format, &d) r.valueFuncs[i] = func(src []byte) (driver.Value, error) { - err := scanPlan.Scan(ci, dataTypeOID, format, src, &d) + err := scanPlan.Scan(src, &d) return d, err } - case pgtype.CIDOID: - var d pgtype.CID - scanPlan := ci.PlanScan(dataTypeOID, format, &d) + case pgtype.CIDOID, pgtype.OIDOID, pgtype.XIDOID: + var d pgtype.Uint32 + scanPlan := m.PlanScan(dataTypeOID, format, &d) r.valueFuncs[i] = func(src []byte) (driver.Value, error) { - err := scanPlan.Scan(ci, dataTypeOID, format, src, &d) + err := scanPlan.Scan(src, &d) if err != nil { return nil, err } @@ -648,9 +722,9 @@ func (r *Rows) Next(dest []driver.Value) error { } case pgtype.DateOID: var d pgtype.Date - scanPlan := ci.PlanScan(dataTypeOID, format, &d) + scanPlan := m.PlanScan(dataTypeOID, format, &d) r.valueFuncs[i] = func(src []byte) (driver.Value, error) { - err := scanPlan.Scan(ci, dataTypeOID, format, src, &d) + err := scanPlan.Scan(src, &d) if err != nil { return nil, err } @@ -658,74 +732,54 @@ func (r *Rows) Next(dest []driver.Value) error { } case pgtype.Float4OID: var d float32 - scanPlan := ci.PlanScan(dataTypeOID, format, &d) + scanPlan := m.PlanScan(dataTypeOID, format, &d) r.valueFuncs[i] = func(src []byte) (driver.Value, error) { - err := scanPlan.Scan(ci, dataTypeOID, format, src, &d) + err := scanPlan.Scan(src, &d) return float64(d), err } case pgtype.Float8OID: var d float64 - scanPlan := ci.PlanScan(dataTypeOID, format, &d) + scanPlan := m.PlanScan(dataTypeOID, format, &d) r.valueFuncs[i] = func(src []byte) (driver.Value, error) { - err := scanPlan.Scan(ci, dataTypeOID, format, src, &d) + err := scanPlan.Scan(src, &d) return d, err } case pgtype.Int2OID: var d int16 - scanPlan := ci.PlanScan(dataTypeOID, format, &d) + scanPlan := m.PlanScan(dataTypeOID, format, &d) r.valueFuncs[i] = func(src []byte) (driver.Value, error) { - err := scanPlan.Scan(ci, dataTypeOID, format, src, &d) + err := scanPlan.Scan(src, &d) return int64(d), err } case pgtype.Int4OID: var d int32 - scanPlan := ci.PlanScan(dataTypeOID, format, &d) + scanPlan := m.PlanScan(dataTypeOID, format, &d) r.valueFuncs[i] = func(src []byte) (driver.Value, error) { - err := scanPlan.Scan(ci, dataTypeOID, format, src, &d) + err := scanPlan.Scan(src, &d) return int64(d), err } case pgtype.Int8OID: var d int64 - scanPlan := ci.PlanScan(dataTypeOID, format, &d) + scanPlan := m.PlanScan(dataTypeOID, format, &d) r.valueFuncs[i] = func(src []byte) (driver.Value, error) { - err := scanPlan.Scan(ci, dataTypeOID, format, src, &d) + err := scanPlan.Scan(src, &d) return d, err } - case pgtype.JSONOID: - var d pgtype.JSON - scanPlan := ci.PlanScan(dataTypeOID, format, &d) - r.valueFuncs[i] = func(src []byte) (driver.Value, error) { - err := scanPlan.Scan(ci, dataTypeOID, format, src, &d) - if err != nil { - return nil, err - } - return d.Value() - } - case pgtype.JSONBOID: - var d pgtype.JSONB - scanPlan := ci.PlanScan(dataTypeOID, format, &d) - r.valueFuncs[i] = func(src []byte) (driver.Value, error) { - err := scanPlan.Scan(ci, dataTypeOID, format, src, &d) - if err != nil { - return nil, err - } - return d.Value() - } - case pgtype.OIDOID: - var d pgtype.OIDValue - scanPlan := ci.PlanScan(dataTypeOID, format, &d) + case pgtype.JSONOID, pgtype.JSONBOID: + var d []byte + scanPlan := m.PlanScan(dataTypeOID, format, &d) r.valueFuncs[i] = func(src []byte) (driver.Value, error) { - err := scanPlan.Scan(ci, dataTypeOID, format, src, &d) + err := scanPlan.Scan(src, &d) if err != nil { return nil, err } - return d.Value() + return d, nil } case pgtype.TimestampOID: var d pgtype.Timestamp - scanPlan := ci.PlanScan(dataTypeOID, format, &d) + scanPlan := m.PlanScan(dataTypeOID, format, &d) r.valueFuncs[i] = func(src []byte) (driver.Value, error) { - err := scanPlan.Scan(ci, dataTypeOID, format, src, &d) + err := scanPlan.Scan(src, &d) if err != nil { return nil, err } @@ -733,29 +787,29 @@ func (r *Rows) Next(dest []driver.Value) error { } case pgtype.TimestamptzOID: var d pgtype.Timestamptz - scanPlan := ci.PlanScan(dataTypeOID, format, &d) + scanPlan := m.PlanScan(dataTypeOID, format, &d) r.valueFuncs[i] = func(src []byte) (driver.Value, error) { - err := scanPlan.Scan(ci, dataTypeOID, format, src, &d) + err := scanPlan.Scan(src, &d) if err != nil { return nil, err } return d.Value() } - case pgtype.XIDOID: - var d pgtype.XID - scanPlan := ci.PlanScan(dataTypeOID, format, &d) + case pgtype.XMLOID: + var d []byte + scanPlan := m.PlanScan(dataTypeOID, format, &d) r.valueFuncs[i] = func(src []byte) (driver.Value, error) { - err := scanPlan.Scan(ci, dataTypeOID, format, src, &d) + err := scanPlan.Scan(src, &d) if err != nil { return nil, err } - return d.Value() + return d, nil } default: var d string - scanPlan := ci.PlanScan(dataTypeOID, format, &d) + scanPlan := m.PlanScan(dataTypeOID, format, &d) r.valueFuncs[i] = func(src []byte) (driver.Value, error) { - err := scanPlan.Scan(ci, dataTypeOID, format, src, &d) + err := scanPlan.Scan(src, &d) return d, err } } @@ -783,7 +837,7 @@ func (r *Rows) Next(dest []driver.Value) error { var err error dest[i], err = r.valueFuncs[i](rv) if err != nil { - return fmt.Errorf("convert field %d failed: %v", i, err) + return fmt.Errorf("convert field %d failed: %w", i, err) } } else { dest[i] = nil @@ -793,11 +847,11 @@ func (r *Rows) Next(dest []driver.Value) error { return nil } -func valueToInterface(argsV []driver.Value) []interface{} { - args := make([]interface{}, 0, len(argsV)) +func valueToInterface(argsV []driver.Value) []any { + args := make([]any, 0, len(argsV)) for _, v := range argsV { if v != nil { - args = append(args, v.(interface{})) + args = append(args, v.(any)) } else { args = append(args, nil) } @@ -805,11 +859,11 @@ func valueToInterface(argsV []driver.Value) []interface{} { return args } -func namedValueToInterface(argsV []driver.NamedValue) []interface{} { - args := make([]interface{}, 0, len(argsV)) +func namedValueToInterface(argsV []driver.NamedValue) []any { + args := make([]any, 0, len(argsV)) for _, v := range argsV { if v.Value != nil { - args = append(args, v.Value.(interface{})) + args = append(args, v.Value.(any)) } else { args = append(args, nil) } @@ -825,55 +879,3 @@ type wrapTx struct { func (wtx wrapTx) Commit() error { return wtx.tx.Commit(wtx.ctx) } func (wtx wrapTx) Rollback() error { return wtx.tx.Rollback(wtx.ctx) } - -type fakeTx struct{} - -func (fakeTx) Commit() error { return nil } - -func (fakeTx) Rollback() error { return nil } - -// AcquireConn acquires a *pgx.Conn from database/sql connection pool. It must be released with ReleaseConn. -// -// In Go 1.13 this functionality has been incorporated into the standard library in the db.Conn.Raw() method. -func AcquireConn(db *sql.DB) (*pgx.Conn, error) { - var conn *pgx.Conn - ctx := context.WithValue(context.Background(), ctxKeyFakeTx, &conn) - tx, err := db.BeginTx(ctx, nil) - if err != nil { - return nil, err - } - if conn == nil { - tx.Rollback() - return nil, ErrNotPgx - } - - fakeTxMutex.Lock() - fakeTxConns[conn] = tx - fakeTxMutex.Unlock() - - return conn, nil -} - -// ReleaseConn releases a *pgx.Conn acquired with AcquireConn. -func ReleaseConn(db *sql.DB, conn *pgx.Conn) error { - var tx *sql.Tx - var ok bool - - if conn.PgConn().IsBusy() || conn.PgConn().TxStatus() != 'I' { - ctx, cancel := context.WithTimeout(context.Background(), time.Second) - defer cancel() - conn.Close(ctx) - } - - fakeTxMutex.Lock() - tx, ok = fakeTxConns[conn] - if ok { - delete(fakeTxConns, conn) - fakeTxMutex.Unlock() - } else { - fakeTxMutex.Unlock() - return fmt.Errorf("can't release conn that is not acquired") - } - - return tx.Rollback() -} diff --git a/vendor/github.com/jackc/pgx/v5/tracer.go b/vendor/github.com/jackc/pgx/v5/tracer.go new file mode 100644 index 0000000..58ca99f --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/tracer.go @@ -0,0 +1,107 @@ +package pgx + +import ( + "context" + + "github.com/jackc/pgx/v5/pgconn" +) + +// QueryTracer traces Query, QueryRow, and Exec. +type QueryTracer interface { + // TraceQueryStart is called at the beginning of Query, QueryRow, and Exec calls. The returned context is used for the + // rest of the call and will be passed to TraceQueryEnd. + TraceQueryStart(ctx context.Context, conn *Conn, data TraceQueryStartData) context.Context + + TraceQueryEnd(ctx context.Context, conn *Conn, data TraceQueryEndData) +} + +type TraceQueryStartData struct { + SQL string + Args []any +} + +type TraceQueryEndData struct { + CommandTag pgconn.CommandTag + Err error +} + +// BatchTracer traces SendBatch. +type BatchTracer interface { + // TraceBatchStart is called at the beginning of SendBatch calls. The returned context is used for the + // rest of the call and will be passed to TraceBatchQuery and TraceBatchEnd. + TraceBatchStart(ctx context.Context, conn *Conn, data TraceBatchStartData) context.Context + + TraceBatchQuery(ctx context.Context, conn *Conn, data TraceBatchQueryData) + TraceBatchEnd(ctx context.Context, conn *Conn, data TraceBatchEndData) +} + +type TraceBatchStartData struct { + Batch *Batch +} + +type TraceBatchQueryData struct { + SQL string + Args []any + CommandTag pgconn.CommandTag + Err error +} + +type TraceBatchEndData struct { + Err error +} + +// CopyFromTracer traces CopyFrom. +type CopyFromTracer interface { + // TraceCopyFromStart is called at the beginning of CopyFrom calls. The returned context is used for the + // rest of the call and will be passed to TraceCopyFromEnd. + TraceCopyFromStart(ctx context.Context, conn *Conn, data TraceCopyFromStartData) context.Context + + TraceCopyFromEnd(ctx context.Context, conn *Conn, data TraceCopyFromEndData) +} + +type TraceCopyFromStartData struct { + TableName Identifier + ColumnNames []string +} + +type TraceCopyFromEndData struct { + CommandTag pgconn.CommandTag + Err error +} + +// PrepareTracer traces Prepare. +type PrepareTracer interface { + // TracePrepareStart is called at the beginning of Prepare calls. The returned context is used for the + // rest of the call and will be passed to TracePrepareEnd. + TracePrepareStart(ctx context.Context, conn *Conn, data TracePrepareStartData) context.Context + + TracePrepareEnd(ctx context.Context, conn *Conn, data TracePrepareEndData) +} + +type TracePrepareStartData struct { + Name string + SQL string +} + +type TracePrepareEndData struct { + AlreadyPrepared bool + Err error +} + +// ConnectTracer traces Connect and ConnectConfig. +type ConnectTracer interface { + // TraceConnectStart is called at the beginning of Connect and ConnectConfig calls. The returned context is used for + // the rest of the call and will be passed to TraceConnectEnd. + TraceConnectStart(ctx context.Context, data TraceConnectStartData) context.Context + + TraceConnectEnd(ctx context.Context, data TraceConnectEndData) +} + +type TraceConnectStartData struct { + ConnConfig *ConnConfig +} + +type TraceConnectEndData struct { + Conn *Conn + Err error +} diff --git a/vendor/github.com/jackc/pgx/v4/tx.go b/vendor/github.com/jackc/pgx/v5/tx.go similarity index 70% rename from vendor/github.com/jackc/pgx/v4/tx.go rename to vendor/github.com/jackc/pgx/v5/tx.go index 9ecaa17..8feeb51 100644 --- a/vendor/github.com/jackc/pgx/v4/tx.go +++ b/vendor/github.com/jackc/pgx/v5/tx.go @@ -1,13 +1,13 @@ package pgx import ( - "bytes" "context" "errors" "fmt" "strconv" + "strings" - "github.com/jackc/pgconn" + "github.com/jackc/pgx/v5/pgconn" ) // TxIsoLevel is the transaction isolation level (serializable, repeatable read, read committed or read uncommitted) @@ -44,6 +44,10 @@ type TxOptions struct { IsoLevel TxIsoLevel AccessMode TxAccessMode DeferrableMode TxDeferrableMode + + // BeginQuery is the SQL query that will be executed to begin the transaction. This allows using non-standard syntax + // such as BEGIN PRIORITY HIGH with CockroachDB. If set this will override the other settings. + BeginQuery string } var emptyTxOptions TxOptions @@ -52,16 +56,26 @@ func (txOptions TxOptions) beginSQL() string { if txOptions == emptyTxOptions { return "begin" } - buf := &bytes.Buffer{} + + if txOptions.BeginQuery != "" { + return txOptions.BeginQuery + } + + var buf strings.Builder + buf.Grow(64) // 64 - maximum length of string with available options buf.WriteString("begin") + if txOptions.IsoLevel != "" { - fmt.Fprintf(buf, " isolation level %s", txOptions.IsoLevel) + buf.WriteString(" isolation level ") + buf.WriteString(string(txOptions.IsoLevel)) } if txOptions.AccessMode != "" { - fmt.Fprintf(buf, " %s", txOptions.AccessMode) + buf.WriteByte(' ') + buf.WriteString(string(txOptions.AccessMode)) } if txOptions.DeferrableMode != "" { - fmt.Fprintf(buf, " %s", txOptions.DeferrableMode) + buf.WriteByte(' ') + buf.WriteString(string(txOptions.DeferrableMode)) } return buf.String() @@ -94,39 +108,6 @@ func (c *Conn) BeginTx(ctx context.Context, txOptions TxOptions) (Tx, error) { return &dbTx{conn: c}, nil } -// BeginFunc starts a transaction and calls f. If f does not return an error the transaction is committed. If f returns -// an error the transaction is rolled back. The context will be used when executing the transaction control statements -// (BEGIN, ROLLBACK, and COMMIT) but does not otherwise affect the execution of f. -func (c *Conn) BeginFunc(ctx context.Context, f func(Tx) error) (err error) { - return c.BeginTxFunc(ctx, TxOptions{}, f) -} - -// BeginTxFunc starts a transaction with txOptions determining the transaction mode and calls f. If f does not return -// an error the transaction is committed. If f returns an error the transaction is rolled back. The context will be -// used when executing the transaction control statements (BEGIN, ROLLBACK, and COMMIT) but does not otherwise affect -// the execution of f. -func (c *Conn) BeginTxFunc(ctx context.Context, txOptions TxOptions, f func(Tx) error) (err error) { - var tx Tx - tx, err = c.BeginTx(ctx, txOptions) - if err != nil { - return err - } - defer func() { - rollbackErr := tx.Rollback(ctx) - if rollbackErr != nil && !errors.Is(rollbackErr, ErrTxClosed) { - err = rollbackErr - } - }() - - fErr := f(tx) - if fErr != nil { - _ = tx.Rollback(ctx) // ignore rollback error as there is already an error to return - return fErr - } - - return tx.Commit(ctx) -} - // Tx represents a database transaction. // // Tx is an interface instead of a struct to enable connection pools to be implemented without relying on internal pgx @@ -138,20 +119,17 @@ type Tx interface { // Begin starts a pseudo nested transaction. Begin(ctx context.Context) (Tx, error) - // BeginFunc starts a pseudo nested transaction and executes f. If f does not return an err the pseudo nested - // transaction will be committed. If it does then it will be rolled back. - BeginFunc(ctx context.Context, f func(Tx) error) (err error) - // Commit commits the transaction if this is a real transaction or releases the savepoint if this is a pseudo nested - // transaction. Commit will return ErrTxClosed if the Tx is already closed, but is otherwise safe to call multiple - // times. If the commit fails with a rollback status (e.g. the transaction was already in a broken state) then - // ErrTxCommitRollback will be returned. + // transaction. Commit will return an error where errors.Is(ErrTxClosed) is true if the Tx is already closed, but is + // otherwise safe to call multiple times. If the commit fails with a rollback status (e.g. the transaction was already + // in a broken state) then an error where errors.Is(ErrTxCommitRollback) is true will be returned. Commit(ctx context.Context) error // Rollback rolls back the transaction if this is a real transaction or rolls back to the savepoint if this is a - // pseudo nested transaction. Rollback will return ErrTxClosed if the Tx is already closed, but is otherwise safe to - // call multiple times. Hence, a defer tx.Rollback() is safe even if tx.Commit() will be called first in a non-error - // condition. Any other failure of a real transaction will result in the connection being closed. + // pseudo nested transaction. Rollback will return an error where errors.Is(ErrTxClosed) is true if the Tx is already + // closed, but is otherwise safe to call multiple times. Hence, a defer tx.Rollback() is safe even if tx.Commit() will + // be called first in a non-error condition. Any other failure of a real transaction will result in the connection + // being closed. Rollback(ctx context.Context) error CopyFrom(ctx context.Context, tableName Identifier, columnNames []string, rowSrc CopyFromSource) (int64, error) @@ -160,10 +138,9 @@ type Tx interface { Prepare(ctx context.Context, name, sql string) (*pgconn.StatementDescription, error) - Exec(ctx context.Context, sql string, arguments ...interface{}) (commandTag pgconn.CommandTag, err error) - Query(ctx context.Context, sql string, args ...interface{}) (Rows, error) - QueryRow(ctx context.Context, sql string, args ...interface{}) Row - QueryFunc(ctx context.Context, sql string, args []interface{}, scans []interface{}, f func(QueryFuncRow) error) (pgconn.CommandTag, error) + Exec(ctx context.Context, sql string, arguments ...any) (commandTag pgconn.CommandTag, err error) + Query(ctx context.Context, sql string, args ...any) (Rows, error) + QueryRow(ctx context.Context, sql string, args ...any) Row // Conn returns the underlying *Conn that on which this transaction is executing. Conn() *Conn @@ -175,7 +152,6 @@ type Tx interface { // called on the dbTx. type dbTx struct { conn *Conn - err error savepointNum int64 closed bool } @@ -195,32 +171,6 @@ func (tx *dbTx) Begin(ctx context.Context) (Tx, error) { return &dbSimulatedNestedTx{tx: tx, savepointNum: tx.savepointNum}, nil } -func (tx *dbTx) BeginFunc(ctx context.Context, f func(Tx) error) (err error) { - if tx.closed { - return ErrTxClosed - } - - var savepoint Tx - savepoint, err = tx.Begin(ctx) - if err != nil { - return err - } - defer func() { - rollbackErr := savepoint.Rollback(ctx) - if rollbackErr != nil && !errors.Is(rollbackErr, ErrTxClosed) { - err = rollbackErr - } - }() - - fErr := f(savepoint) - if fErr != nil { - _ = savepoint.Rollback(ctx) // ignore rollback error as there is already an error to return - return fErr - } - - return savepoint.Commit(ctx) -} - // Commit commits the transaction. func (tx *dbTx) Commit(ctx context.Context) error { if tx.closed { @@ -235,7 +185,7 @@ func (tx *dbTx) Commit(ctx context.Context) error { } return err } - if string(commandTag) == "ROLLBACK" { + if commandTag.String() == "ROLLBACK" { return ErrTxCommitRollback } @@ -263,7 +213,7 @@ func (tx *dbTx) Rollback(ctx context.Context) error { } // Exec delegates to the underlying *Conn -func (tx *dbTx) Exec(ctx context.Context, sql string, arguments ...interface{}) (commandTag pgconn.CommandTag, err error) { +func (tx *dbTx) Exec(ctx context.Context, sql string, arguments ...any) (commandTag pgconn.CommandTag, err error) { if tx.closed { return pgconn.CommandTag{}, ErrTxClosed } @@ -281,29 +231,20 @@ func (tx *dbTx) Prepare(ctx context.Context, name, sql string) (*pgconn.Statemen } // Query delegates to the underlying *Conn -func (tx *dbTx) Query(ctx context.Context, sql string, args ...interface{}) (Rows, error) { +func (tx *dbTx) Query(ctx context.Context, sql string, args ...any) (Rows, error) { if tx.closed { // Because checking for errors can be deferred to the *Rows, build one with the error err := ErrTxClosed - return &connRows{closed: true, err: err}, err + return &baseRows{closed: true, err: err}, err } return tx.conn.Query(ctx, sql, args...) } // QueryRow delegates to the underlying *Conn -func (tx *dbTx) QueryRow(ctx context.Context, sql string, args ...interface{}) Row { +func (tx *dbTx) QueryRow(ctx context.Context, sql string, args ...any) Row { rows, _ := tx.Query(ctx, sql, args...) - return (*connRow)(rows.(*connRows)) -} - -// QueryFunc delegates to the underlying *Conn. -func (tx *dbTx) QueryFunc(ctx context.Context, sql string, args []interface{}, scans []interface{}, f func(QueryFuncRow) error) (pgconn.CommandTag, error) { - if tx.closed { - return nil, ErrTxClosed - } - - return tx.conn.QueryFunc(ctx, sql, args, scans, f) + return (*connRow)(rows.(*baseRows)) } // CopyFrom delegates to the underlying *Conn @@ -349,14 +290,6 @@ func (sp *dbSimulatedNestedTx) Begin(ctx context.Context) (Tx, error) { return sp.tx.Begin(ctx) } -func (sp *dbSimulatedNestedTx) BeginFunc(ctx context.Context, f func(Tx) error) (err error) { - if sp.closed { - return ErrTxClosed - } - - return sp.tx.BeginFunc(ctx, f) -} - // Commit releases the savepoint essentially committing the pseudo nested transaction. func (sp *dbSimulatedNestedTx) Commit(ctx context.Context) error { if sp.closed { @@ -382,9 +315,9 @@ func (sp *dbSimulatedNestedTx) Rollback(ctx context.Context) error { } // Exec delegates to the underlying Tx -func (sp *dbSimulatedNestedTx) Exec(ctx context.Context, sql string, arguments ...interface{}) (commandTag pgconn.CommandTag, err error) { +func (sp *dbSimulatedNestedTx) Exec(ctx context.Context, sql string, arguments ...any) (commandTag pgconn.CommandTag, err error) { if sp.closed { - return nil, ErrTxClosed + return pgconn.CommandTag{}, ErrTxClosed } return sp.tx.Exec(ctx, sql, arguments...) @@ -400,29 +333,20 @@ func (sp *dbSimulatedNestedTx) Prepare(ctx context.Context, name, sql string) (* } // Query delegates to the underlying Tx -func (sp *dbSimulatedNestedTx) Query(ctx context.Context, sql string, args ...interface{}) (Rows, error) { +func (sp *dbSimulatedNestedTx) Query(ctx context.Context, sql string, args ...any) (Rows, error) { if sp.closed { // Because checking for errors can be deferred to the *Rows, build one with the error err := ErrTxClosed - return &connRows{closed: true, err: err}, err + return &baseRows{closed: true, err: err}, err } return sp.tx.Query(ctx, sql, args...) } // QueryRow delegates to the underlying Tx -func (sp *dbSimulatedNestedTx) QueryRow(ctx context.Context, sql string, args ...interface{}) Row { +func (sp *dbSimulatedNestedTx) QueryRow(ctx context.Context, sql string, args ...any) Row { rows, _ := sp.Query(ctx, sql, args...) - return (*connRow)(rows.(*connRows)) -} - -// QueryFunc delegates to the underlying Tx. -func (sp *dbSimulatedNestedTx) QueryFunc(ctx context.Context, sql string, args []interface{}, scans []interface{}, f func(QueryFuncRow) error) (pgconn.CommandTag, error) { - if sp.closed { - return nil, ErrTxClosed - } - - return sp.tx.QueryFunc(ctx, sql, args, scans, f) + return (*connRow)(rows.(*baseRows)) } // CopyFrom delegates to the underlying *Conn @@ -450,3 +374,59 @@ func (sp *dbSimulatedNestedTx) LargeObjects() LargeObjects { func (sp *dbSimulatedNestedTx) Conn() *Conn { return sp.tx.Conn() } + +// BeginFunc calls Begin on db and then calls fn. If fn does not return an error then it calls Commit on db. If fn +// returns an error it calls Rollback on db. The context will be used when executing the transaction control statements +// (BEGIN, ROLLBACK, and COMMIT) but does not otherwise affect the execution of fn. +func BeginFunc( + ctx context.Context, + db interface { + Begin(ctx context.Context) (Tx, error) + }, + fn func(Tx) error, +) (err error) { + var tx Tx + tx, err = db.Begin(ctx) + if err != nil { + return err + } + + return beginFuncExec(ctx, tx, fn) +} + +// BeginTxFunc calls BeginTx on db and then calls fn. If fn does not return an error then it calls Commit on db. If fn +// returns an error it calls Rollback on db. The context will be used when executing the transaction control statements +// (BEGIN, ROLLBACK, and COMMIT) but does not otherwise affect the execution of fn. +func BeginTxFunc( + ctx context.Context, + db interface { + BeginTx(ctx context.Context, txOptions TxOptions) (Tx, error) + }, + txOptions TxOptions, + fn func(Tx) error, +) (err error) { + var tx Tx + tx, err = db.BeginTx(ctx, txOptions) + if err != nil { + return err + } + + return beginFuncExec(ctx, tx, fn) +} + +func beginFuncExec(ctx context.Context, tx Tx, fn func(Tx) error) (err error) { + defer func() { + rollbackErr := tx.Rollback(ctx) + if rollbackErr != nil && !errors.Is(rollbackErr, ErrTxClosed) { + err = rollbackErr + } + }() + + fErr := fn(tx) + if fErr != nil { + _ = tx.Rollback(ctx) // ignore rollback error as there is already an error to return + return fErr + } + + return tx.Commit(ctx) +} diff --git a/vendor/github.com/jackc/pgx/v5/values.go b/vendor/github.com/jackc/pgx/v5/values.go new file mode 100644 index 0000000..6e2ff30 --- /dev/null +++ b/vendor/github.com/jackc/pgx/v5/values.go @@ -0,0 +1,63 @@ +package pgx + +import ( + "errors" + + "github.com/jackc/pgx/v5/internal/pgio" + "github.com/jackc/pgx/v5/pgtype" +) + +// PostgreSQL format codes +const ( + TextFormatCode = 0 + BinaryFormatCode = 1 +) + +func convertSimpleArgument(m *pgtype.Map, arg any) (any, error) { + buf, err := m.Encode(0, TextFormatCode, arg, []byte{}) + if err != nil { + return nil, err + } + if buf == nil { + return nil, nil + } + return string(buf), nil +} + +func encodeCopyValue(m *pgtype.Map, buf []byte, oid uint32, arg any) ([]byte, error) { + sp := len(buf) + buf = pgio.AppendInt32(buf, -1) + argBuf, err := m.Encode(oid, BinaryFormatCode, arg, buf) + if err != nil { + if argBuf2, err2 := tryScanStringCopyValueThenEncode(m, buf, oid, arg); err2 == nil { + argBuf = argBuf2 + } else { + return nil, err + } + } + + if argBuf != nil { + buf = argBuf + pgio.SetInt32(buf[sp:], int32(len(buf[sp:])-4)) + } + return buf, nil +} + +func tryScanStringCopyValueThenEncode(m *pgtype.Map, buf []byte, oid uint32, arg any) ([]byte, error) { + s, ok := arg.(string) + if !ok { + textBuf, err := m.Encode(oid, TextFormatCode, arg, nil) + if err != nil { + return nil, errors.New("not a string and cannot be encoded as text") + } + s = string(textBuf) + } + + var v any + err := m.Scan(oid, TextFormatCode, []byte(s), &v) + if err != nil { + return nil, err + } + + return m.Encode(oid, BinaryFormatCode, v, buf) +} diff --git a/vendor/github.com/jackc/puddle/v2/CHANGELOG.md b/vendor/github.com/jackc/puddle/v2/CHANGELOG.md new file mode 100644 index 0000000..d0d202c --- /dev/null +++ b/vendor/github.com/jackc/puddle/v2/CHANGELOG.md @@ -0,0 +1,79 @@ +# 2.2.2 (September 10, 2024) + +* Add empty acquire time to stats (Maxim Ivanov) +* Stop importing nanotime from runtime via linkname (maypok86) + +# 2.2.1 (July 15, 2023) + +* Fix: CreateResource cannot overflow pool. This changes documented behavior of CreateResource. Previously, + CreateResource could create a resource even if the pool was full. This could cause the pool to overflow. While this + was documented, it was documenting incorrect behavior. CreateResource now returns an error if the pool is full. + +# 2.2.0 (February 11, 2023) + +* Use Go 1.19 atomics and drop go.uber.org/atomic dependency + +# 2.1.2 (November 12, 2022) + +* Restore support to Go 1.18 via go.uber.org/atomic + +# 2.1.1 (November 11, 2022) + +* Fix create resource concurrently with Stat call race + +# 2.1.0 (October 28, 2022) + +* Concurrency control is now implemented with a semaphore. This simplifies some internal logic, resolves a few error conditions (including a deadlock), and improves performance. (Jan Dubsky) +* Go 1.19 is now required for the improved atomic support. + +# 2.0.1 (October 28, 2022) + +* Fix race condition when Close is called concurrently with multiple constructors + +# 2.0.0 (September 17, 2022) + +* Use generics instead of interface{} (Столяров Владимир Алексеевич) +* Add Reset +* Do not cancel resource construction when Acquire is canceled +* NewPool takes Config + +# 1.3.0 (August 27, 2022) + +* Acquire creates resources in background to allow creation to continue after Acquire is canceled (James Hartig) + +# 1.2.1 (December 2, 2021) + +* TryAcquire now does not block when background constructing resource + +# 1.2.0 (November 20, 2021) + +* Add TryAcquire (A. Jensen) +* Fix: remove memory leak / unintentionally pinned memory when shrinking slices (Alexander Staubo) +* Fix: Do not leave pool locked after panic from nil context + +# 1.1.4 (September 11, 2021) + +* Fix: Deadlock in CreateResource if pool was closed during resource acquisition (Dmitriy Matrenichev) + +# 1.1.3 (December 3, 2020) + +* Fix: Failed resource creation could cause concurrent Acquire to hang. (Evgeny Vanslov) + +# 1.1.2 (September 26, 2020) + +* Fix: Resource.Destroy no longer removes itself from the pool before its destructor has completed. +* Fix: Prevent crash when pool is closed while resource is being created. + +# 1.1.1 (April 2, 2020) + +* Pool.Close can be safely called multiple times +* AcquireAllIDle immediately returns nil if pool is closed +* CreateResource checks if pool is closed before taking any action +* Fix potential race condition when CreateResource and Close are called concurrently. CreateResource now checks if pool is closed before adding newly created resource to pool. + +# 1.1.0 (February 5, 2020) + +* Use runtime.nanotime for faster tracking of acquire time and last usage time. +* Track resource idle time to enable client health check logic. (Patrick Ellul) +* Add CreateResource to construct a new resource without acquiring it. (Patrick Ellul) +* Fix deadlock race when acquire is cancelled. (Michael Tharp) diff --git a/vendor/github.com/jackc/chunkreader/v2/LICENSE b/vendor/github.com/jackc/puddle/v2/LICENSE similarity index 96% rename from vendor/github.com/jackc/chunkreader/v2/LICENSE rename to vendor/github.com/jackc/puddle/v2/LICENSE index c1c4f50..bcc286c 100644 --- a/vendor/github.com/jackc/chunkreader/v2/LICENSE +++ b/vendor/github.com/jackc/puddle/v2/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2019 Jack Christensen +Copyright (c) 2018 Jack Christensen MIT License diff --git a/vendor/github.com/jackc/puddle/v2/README.md b/vendor/github.com/jackc/puddle/v2/README.md new file mode 100644 index 0000000..fa82a9d --- /dev/null +++ b/vendor/github.com/jackc/puddle/v2/README.md @@ -0,0 +1,80 @@ +[![Go Reference](https://pkg.go.dev/badge/github.com/jackc/puddle/v2.svg)](https://pkg.go.dev/github.com/jackc/puddle/v2) +![Build Status](https://github.com/jackc/puddle/actions/workflows/ci.yml/badge.svg) + +# Puddle + +Puddle is a tiny generic resource pool library for Go that uses the standard +context library to signal cancellation of acquires. It is designed to contain +the minimum functionality required for a resource pool. It can be used directly +or it can be used as the base for a domain specific resource pool. For example, +a database connection pool may use puddle internally and implement health checks +and keep-alive behavior without needing to implement any concurrent code of its +own. + +## Features + +* Acquire cancellation via context standard library +* Statistics API for monitoring pool pressure +* No dependencies outside of standard library and golang.org/x/sync +* High performance +* 100% test coverage of reachable code + +## Example Usage + +```go +package main + +import ( + "context" + "log" + "net" + + "github.com/jackc/puddle/v2" +) + +func main() { + constructor := func(context.Context) (net.Conn, error) { + return net.Dial("tcp", "127.0.0.1:8080") + } + destructor := func(value net.Conn) { + value.Close() + } + maxPoolSize := int32(10) + + pool, err := puddle.NewPool(&puddle.Config[net.Conn]{Constructor: constructor, Destructor: destructor, MaxSize: maxPoolSize}) + if err != nil { + log.Fatal(err) + } + + // Acquire resource from the pool. + res, err := pool.Acquire(context.Background()) + if err != nil { + log.Fatal(err) + } + + // Use resource. + _, err = res.Value().Write([]byte{1}) + if err != nil { + log.Fatal(err) + } + + // Release when done. + res.Release() +} +``` + +## Status + +Puddle is stable and feature complete. + +* Bug reports and fixes are welcome. +* New features will usually not be accepted if they can be feasibly implemented in a wrapper. +* Performance optimizations will usually not be accepted unless the performance issue rises to the level of a bug. + +## Supported Go Versions + +puddle supports the same versions of Go that are supported by the Go project. For [Go](https://golang.org/doc/devel/release.html#policy) that is the two most recent major releases. This means puddle supports Go 1.19 and higher. + +## License + +MIT diff --git a/vendor/github.com/jackc/puddle/v2/context.go b/vendor/github.com/jackc/puddle/v2/context.go new file mode 100644 index 0000000..e19d2a6 --- /dev/null +++ b/vendor/github.com/jackc/puddle/v2/context.go @@ -0,0 +1,24 @@ +package puddle + +import ( + "context" + "time" +) + +// valueCancelCtx combines two contexts into one. One context is used for values and the other is used for cancellation. +type valueCancelCtx struct { + valueCtx context.Context + cancelCtx context.Context +} + +func (ctx *valueCancelCtx) Deadline() (time.Time, bool) { return ctx.cancelCtx.Deadline() } +func (ctx *valueCancelCtx) Done() <-chan struct{} { return ctx.cancelCtx.Done() } +func (ctx *valueCancelCtx) Err() error { return ctx.cancelCtx.Err() } +func (ctx *valueCancelCtx) Value(key any) any { return ctx.valueCtx.Value(key) } + +func newValueCancelCtx(valueCtx, cancelContext context.Context) context.Context { + return &valueCancelCtx{ + valueCtx: valueCtx, + cancelCtx: cancelContext, + } +} diff --git a/vendor/github.com/jackc/puddle/v2/doc.go b/vendor/github.com/jackc/puddle/v2/doc.go new file mode 100644 index 0000000..818e4a6 --- /dev/null +++ b/vendor/github.com/jackc/puddle/v2/doc.go @@ -0,0 +1,11 @@ +// Package puddle is a generic resource pool with type-parametrized api. +/* + +Puddle is a tiny generic resource pool library for Go that uses the standard +context library to signal cancellation of acquires. It is designed to contain +the minimum functionality a resource pool needs that cannot be implemented +without concurrency concerns. For example, a database connection pool may use +puddle internally and implement health checks and keep-alive behavior without +needing to implement any concurrent code of its own. +*/ +package puddle diff --git a/vendor/github.com/jackc/puddle/v2/internal/genstack/gen_stack.go b/vendor/github.com/jackc/puddle/v2/internal/genstack/gen_stack.go new file mode 100644 index 0000000..7e4660c --- /dev/null +++ b/vendor/github.com/jackc/puddle/v2/internal/genstack/gen_stack.go @@ -0,0 +1,85 @@ +package genstack + +// GenStack implements a generational stack. +// +// GenStack works as common stack except for the fact that all elements in the +// older generation are guaranteed to be popped before any element in the newer +// generation. New elements are always pushed to the current (newest) +// generation. +// +// We could also say that GenStack behaves as a stack in case of a single +// generation, but it behaves as a queue of individual generation stacks. +type GenStack[T any] struct { + // We can represent arbitrary number of generations using 2 stacks. The + // new stack stores all new pushes and the old stack serves all reads. + // Old stack can represent multiple generations. If old == new, then all + // elements pushed in previous (not current) generations have already + // been popped. + + old *stack[T] + new *stack[T] +} + +// NewGenStack creates a new empty GenStack. +func NewGenStack[T any]() *GenStack[T] { + s := &stack[T]{} + return &GenStack[T]{ + old: s, + new: s, + } +} + +func (s *GenStack[T]) Pop() (T, bool) { + // Pushes always append to the new stack, so if the old once becomes + // empty, it will remail empty forever. + if s.old.len() == 0 && s.old != s.new { + s.old = s.new + } + + if s.old.len() == 0 { + var zero T + return zero, false + } + + return s.old.pop(), true +} + +// Push pushes a new element at the top of the stack. +func (s *GenStack[T]) Push(v T) { s.new.push(v) } + +// NextGen starts a new stack generation. +func (s *GenStack[T]) NextGen() { + if s.old == s.new { + s.new = &stack[T]{} + return + } + + // We need to pop from the old stack to the top of the new stack. Let's + // have an example: + // + // Old: 4 3 2 1 + // New: 8 7 6 5 + // PopOrder: 1 2 3 4 5 6 7 8 + // + // + // To preserve pop order, we have to take all elements from the old + // stack and push them to the top of new stack: + // + // New: 8 7 6 5 4 3 2 1 + // + s.new.push(s.old.takeAll()...) + + // We have the old stack allocated and empty, so why not to reuse it as + // new new stack. + s.old, s.new = s.new, s.old +} + +// Len returns number of elements in the stack. +func (s *GenStack[T]) Len() int { + l := s.old.len() + if s.old != s.new { + l += s.new.len() + } + + return l +} diff --git a/vendor/github.com/jackc/puddle/v2/internal/genstack/stack.go b/vendor/github.com/jackc/puddle/v2/internal/genstack/stack.go new file mode 100644 index 0000000..dbced0c --- /dev/null +++ b/vendor/github.com/jackc/puddle/v2/internal/genstack/stack.go @@ -0,0 +1,39 @@ +package genstack + +// stack is a wrapper around an array implementing a stack. +// +// We cannot use slice to represent the stack because append might change the +// pointer value of the slice. That would be an issue in GenStack +// implementation. +type stack[T any] struct { + arr []T +} + +// push pushes a new element at the top of a stack. +func (s *stack[T]) push(vs ...T) { s.arr = append(s.arr, vs...) } + +// pop pops the stack top-most element. +// +// If stack length is zero, this method panics. +func (s *stack[T]) pop() T { + idx := s.len() - 1 + val := s.arr[idx] + + // Avoid memory leak + var zero T + s.arr[idx] = zero + + s.arr = s.arr[:idx] + return val +} + +// takeAll returns all elements in the stack in order as they are stored - i.e. +// the top-most stack element is the last one. +func (s *stack[T]) takeAll() []T { + arr := s.arr + s.arr = nil + return arr +} + +// len returns number of elements in the stack. +func (s *stack[T]) len() int { return len(s.arr) } diff --git a/vendor/github.com/jackc/puddle/v2/log.go b/vendor/github.com/jackc/puddle/v2/log.go new file mode 100644 index 0000000..b21b946 --- /dev/null +++ b/vendor/github.com/jackc/puddle/v2/log.go @@ -0,0 +1,32 @@ +package puddle + +import "unsafe" + +type ints interface { + int | int8 | int16 | int32 | int64 | uint | uint8 | uint16 | uint32 | uint64 +} + +// log2Int returns log2 of an integer. This function panics if val < 0. For val +// == 0, returns 0. +func log2Int[T ints](val T) uint8 { + if val <= 0 { + panic("log2 of non-positive number does not exist") + } + + return log2IntRange(val, 0, uint8(8*unsafe.Sizeof(val))) +} + +func log2IntRange[T ints](val T, begin, end uint8) uint8 { + length := end - begin + if length == 1 { + return begin + } + + delim := begin + length/2 + mask := T(1) << delim + if mask > val { + return log2IntRange(val, begin, delim) + } else { + return log2IntRange(val, delim, end) + } +} diff --git a/vendor/github.com/jackc/puddle/v2/nanotime.go b/vendor/github.com/jackc/puddle/v2/nanotime.go new file mode 100644 index 0000000..8a5351a --- /dev/null +++ b/vendor/github.com/jackc/puddle/v2/nanotime.go @@ -0,0 +1,16 @@ +package puddle + +import "time" + +// nanotime returns the time in nanoseconds since process start. +// +// This approach, described at +// https://github.com/golang/go/issues/61765#issuecomment-1672090302, +// is fast, monotonic, and portable, and avoids the previous +// dependence on runtime.nanotime using the (unsafe) linkname hack. +// In particular, time.Since does less work than time.Now. +func nanotime() int64 { + return time.Since(globalStart).Nanoseconds() +} + +var globalStart = time.Now() diff --git a/vendor/github.com/jackc/puddle/v2/pool.go b/vendor/github.com/jackc/puddle/v2/pool.go new file mode 100644 index 0000000..c411d2f --- /dev/null +++ b/vendor/github.com/jackc/puddle/v2/pool.go @@ -0,0 +1,710 @@ +package puddle + +import ( + "context" + "errors" + "sync" + "sync/atomic" + "time" + + "github.com/jackc/puddle/v2/internal/genstack" + "golang.org/x/sync/semaphore" +) + +const ( + resourceStatusConstructing = 0 + resourceStatusIdle = iota + resourceStatusAcquired = iota + resourceStatusHijacked = iota +) + +// ErrClosedPool occurs on an attempt to acquire a connection from a closed pool +// or a pool that is closed while the acquire is waiting. +var ErrClosedPool = errors.New("closed pool") + +// ErrNotAvailable occurs on an attempt to acquire a resource from a pool +// that is at maximum capacity and has no available resources. +var ErrNotAvailable = errors.New("resource not available") + +// Constructor is a function called by the pool to construct a resource. +type Constructor[T any] func(ctx context.Context) (res T, err error) + +// Destructor is a function called by the pool to destroy a resource. +type Destructor[T any] func(res T) + +// Resource is the resource handle returned by acquiring from the pool. +type Resource[T any] struct { + value T + pool *Pool[T] + creationTime time.Time + lastUsedNano int64 + poolResetCount int + status byte +} + +// Value returns the resource value. +func (res *Resource[T]) Value() T { + if !(res.status == resourceStatusAcquired || res.status == resourceStatusHijacked) { + panic("tried to access resource that is not acquired or hijacked") + } + return res.value +} + +// Release returns the resource to the pool. res must not be subsequently used. +func (res *Resource[T]) Release() { + if res.status != resourceStatusAcquired { + panic("tried to release resource that is not acquired") + } + res.pool.releaseAcquiredResource(res, nanotime()) +} + +// ReleaseUnused returns the resource to the pool without updating when it was last used used. i.e. LastUsedNanotime +// will not change. res must not be subsequently used. +func (res *Resource[T]) ReleaseUnused() { + if res.status != resourceStatusAcquired { + panic("tried to release resource that is not acquired") + } + res.pool.releaseAcquiredResource(res, res.lastUsedNano) +} + +// Destroy returns the resource to the pool for destruction. res must not be +// subsequently used. +func (res *Resource[T]) Destroy() { + if res.status != resourceStatusAcquired { + panic("tried to destroy resource that is not acquired") + } + go res.pool.destroyAcquiredResource(res) +} + +// Hijack assumes ownership of the resource from the pool. Caller is responsible +// for cleanup of resource value. +func (res *Resource[T]) Hijack() { + if res.status != resourceStatusAcquired { + panic("tried to hijack resource that is not acquired") + } + res.pool.hijackAcquiredResource(res) +} + +// CreationTime returns when the resource was created by the pool. +func (res *Resource[T]) CreationTime() time.Time { + if !(res.status == resourceStatusAcquired || res.status == resourceStatusHijacked) { + panic("tried to access resource that is not acquired or hijacked") + } + return res.creationTime +} + +// LastUsedNanotime returns when Release was last called on the resource measured in nanoseconds from an arbitrary time +// (a monotonic time). Returns creation time if Release has never been called. This is only useful to compare with +// other calls to LastUsedNanotime. In almost all cases, IdleDuration should be used instead. +func (res *Resource[T]) LastUsedNanotime() int64 { + if !(res.status == resourceStatusAcquired || res.status == resourceStatusHijacked) { + panic("tried to access resource that is not acquired or hijacked") + } + + return res.lastUsedNano +} + +// IdleDuration returns the duration since Release was last called on the resource. This is equivalent to subtracting +// LastUsedNanotime to the current nanotime. +func (res *Resource[T]) IdleDuration() time.Duration { + if !(res.status == resourceStatusAcquired || res.status == resourceStatusHijacked) { + panic("tried to access resource that is not acquired or hijacked") + } + + return time.Duration(nanotime() - res.lastUsedNano) +} + +// Pool is a concurrency-safe resource pool. +type Pool[T any] struct { + // mux is the pool internal lock. Any modification of shared state of + // the pool (but Acquires of acquireSem) must be performed only by + // holder of the lock. Long running operations are not allowed when mux + // is held. + mux sync.Mutex + // acquireSem provides an allowance to acquire a resource. + // + // Releases are allowed only when caller holds mux. Acquires have to + // happen before mux is locked (doesn't apply to semaphore.TryAcquire in + // AcquireAllIdle). + acquireSem *semaphore.Weighted + destructWG sync.WaitGroup + + allResources resList[T] + idleResources *genstack.GenStack[*Resource[T]] + + constructor Constructor[T] + destructor Destructor[T] + maxSize int32 + + acquireCount int64 + acquireDuration time.Duration + emptyAcquireCount int64 + emptyAcquireWaitTime time.Duration + canceledAcquireCount atomic.Int64 + + resetCount int + + baseAcquireCtx context.Context + cancelBaseAcquireCtx context.CancelFunc + closed bool +} + +type Config[T any] struct { + Constructor Constructor[T] + Destructor Destructor[T] + MaxSize int32 +} + +// NewPool creates a new pool. Returns an error iff MaxSize is less than 1. +func NewPool[T any](config *Config[T]) (*Pool[T], error) { + if config.MaxSize < 1 { + return nil, errors.New("MaxSize must be >= 1") + } + + baseAcquireCtx, cancelBaseAcquireCtx := context.WithCancel(context.Background()) + + return &Pool[T]{ + acquireSem: semaphore.NewWeighted(int64(config.MaxSize)), + idleResources: genstack.NewGenStack[*Resource[T]](), + maxSize: config.MaxSize, + constructor: config.Constructor, + destructor: config.Destructor, + baseAcquireCtx: baseAcquireCtx, + cancelBaseAcquireCtx: cancelBaseAcquireCtx, + }, nil +} + +// Close destroys all resources in the pool and rejects future Acquire calls. +// Blocks until all resources are returned to pool and destroyed. +func (p *Pool[T]) Close() { + defer p.destructWG.Wait() + + p.mux.Lock() + defer p.mux.Unlock() + + if p.closed { + return + } + p.closed = true + p.cancelBaseAcquireCtx() + + for res, ok := p.idleResources.Pop(); ok; res, ok = p.idleResources.Pop() { + p.allResources.remove(res) + go p.destructResourceValue(res.value) + } +} + +// Stat is a snapshot of Pool statistics. +type Stat struct { + constructingResources int32 + acquiredResources int32 + idleResources int32 + maxResources int32 + acquireCount int64 + acquireDuration time.Duration + emptyAcquireCount int64 + emptyAcquireWaitTime time.Duration + canceledAcquireCount int64 +} + +// TotalResources returns the total number of resources currently in the pool. +// The value is the sum of ConstructingResources, AcquiredResources, and +// IdleResources. +func (s *Stat) TotalResources() int32 { + return s.constructingResources + s.acquiredResources + s.idleResources +} + +// ConstructingResources returns the number of resources with construction in progress in +// the pool. +func (s *Stat) ConstructingResources() int32 { + return s.constructingResources +} + +// AcquiredResources returns the number of currently acquired resources in the pool. +func (s *Stat) AcquiredResources() int32 { + return s.acquiredResources +} + +// IdleResources returns the number of currently idle resources in the pool. +func (s *Stat) IdleResources() int32 { + return s.idleResources +} + +// MaxResources returns the maximum size of the pool. +func (s *Stat) MaxResources() int32 { + return s.maxResources +} + +// AcquireCount returns the cumulative count of successful acquires from the pool. +func (s *Stat) AcquireCount() int64 { + return s.acquireCount +} + +// AcquireDuration returns the total duration of all successful acquires from +// the pool. +func (s *Stat) AcquireDuration() time.Duration { + return s.acquireDuration +} + +// EmptyAcquireCount returns the cumulative count of successful acquires from the pool +// that waited for a resource to be released or constructed because the pool was +// empty. +func (s *Stat) EmptyAcquireCount() int64 { + return s.emptyAcquireCount +} + +// EmptyAcquireWaitTime returns the cumulative time waited for successful acquires +// from the pool for a resource to be released or constructed because the pool was +// empty. +func (s *Stat) EmptyAcquireWaitTime() time.Duration { + return s.emptyAcquireWaitTime +} + +// CanceledAcquireCount returns the cumulative count of acquires from the pool +// that were canceled by a context. +func (s *Stat) CanceledAcquireCount() int64 { + return s.canceledAcquireCount +} + +// Stat returns the current pool statistics. +func (p *Pool[T]) Stat() *Stat { + p.mux.Lock() + defer p.mux.Unlock() + + s := &Stat{ + maxResources: p.maxSize, + acquireCount: p.acquireCount, + emptyAcquireCount: p.emptyAcquireCount, + emptyAcquireWaitTime: p.emptyAcquireWaitTime, + canceledAcquireCount: p.canceledAcquireCount.Load(), + acquireDuration: p.acquireDuration, + } + + for _, res := range p.allResources { + switch res.status { + case resourceStatusConstructing: + s.constructingResources += 1 + case resourceStatusIdle: + s.idleResources += 1 + case resourceStatusAcquired: + s.acquiredResources += 1 + } + } + + return s +} + +// tryAcquireIdleResource checks if there is any idle resource. If there is +// some, this method removes it from idle list and returns it. If the idle pool +// is empty, this method returns nil and doesn't modify the idleResources slice. +// +// WARNING: Caller of this method must hold the pool mutex! +func (p *Pool[T]) tryAcquireIdleResource() *Resource[T] { + res, ok := p.idleResources.Pop() + if !ok { + return nil + } + + res.status = resourceStatusAcquired + return res +} + +// createNewResource creates a new resource and inserts it into list of pool +// resources. +// +// WARNING: Caller of this method must hold the pool mutex! +func (p *Pool[T]) createNewResource() *Resource[T] { + res := &Resource[T]{ + pool: p, + creationTime: time.Now(), + lastUsedNano: nanotime(), + poolResetCount: p.resetCount, + status: resourceStatusConstructing, + } + + p.allResources.append(res) + p.destructWG.Add(1) + + return res +} + +// Acquire gets a resource from the pool. If no resources are available and the pool is not at maximum capacity it will +// create a new resource. If the pool is at maximum capacity it will block until a resource is available. ctx can be +// used to cancel the Acquire. +// +// If Acquire creates a new resource the resource constructor function will receive a context that delegates Value() to +// ctx. Canceling ctx will cause Acquire to return immediately but it will not cancel the resource creation. This avoids +// the problem of it being impossible to create resources when the time to create a resource is greater than any one +// caller of Acquire is willing to wait. +func (p *Pool[T]) Acquire(ctx context.Context) (_ *Resource[T], err error) { + select { + case <-ctx.Done(): + p.canceledAcquireCount.Add(1) + return nil, ctx.Err() + default: + } + + return p.acquire(ctx) +} + +// acquire is a continuation of Acquire function that doesn't check context +// validity. +// +// This function exists solely only for benchmarking purposes. +func (p *Pool[T]) acquire(ctx context.Context) (*Resource[T], error) { + startNano := nanotime() + + var waitedForLock bool + if !p.acquireSem.TryAcquire(1) { + waitedForLock = true + err := p.acquireSem.Acquire(ctx, 1) + if err != nil { + p.canceledAcquireCount.Add(1) + return nil, err + } + } + + p.mux.Lock() + if p.closed { + p.acquireSem.Release(1) + p.mux.Unlock() + return nil, ErrClosedPool + } + + // If a resource is available in the pool. + if res := p.tryAcquireIdleResource(); res != nil { + waitTime := time.Duration(nanotime() - startNano) + if waitedForLock { + p.emptyAcquireCount += 1 + p.emptyAcquireWaitTime += waitTime + } + p.acquireCount += 1 + p.acquireDuration += waitTime + p.mux.Unlock() + return res, nil + } + + if len(p.allResources) >= int(p.maxSize) { + // Unreachable code. + panic("bug: semaphore allowed more acquires than pool allows") + } + + // The resource is not idle, but there is enough space to create one. + res := p.createNewResource() + p.mux.Unlock() + + res, err := p.initResourceValue(ctx, res) + if err != nil { + return nil, err + } + + p.mux.Lock() + defer p.mux.Unlock() + + p.emptyAcquireCount += 1 + p.acquireCount += 1 + waitTime := time.Duration(nanotime() - startNano) + p.acquireDuration += waitTime + p.emptyAcquireWaitTime += waitTime + + return res, nil +} + +func (p *Pool[T]) initResourceValue(ctx context.Context, res *Resource[T]) (*Resource[T], error) { + // Create the resource in a goroutine to immediately return from Acquire + // if ctx is canceled without also canceling the constructor. + // + // See: + // - https://github.com/jackc/pgx/issues/1287 + // - https://github.com/jackc/pgx/issues/1259 + constructErrChan := make(chan error) + go func() { + constructorCtx := newValueCancelCtx(ctx, p.baseAcquireCtx) + value, err := p.constructor(constructorCtx) + if err != nil { + p.mux.Lock() + p.allResources.remove(res) + p.destructWG.Done() + + // The resource won't be acquired because its + // construction failed. We have to allow someone else to + // take that resouce. + p.acquireSem.Release(1) + p.mux.Unlock() + + select { + case constructErrChan <- err: + case <-ctx.Done(): + // The caller is cancelled, so no-one awaits the + // error. This branch avoid goroutine leak. + } + return + } + + // The resource is already in p.allResources where it might be read. So we need to acquire the lock to update its + // status. + p.mux.Lock() + res.value = value + res.status = resourceStatusAcquired + p.mux.Unlock() + + // This select works because the channel is unbuffered. + select { + case constructErrChan <- nil: + case <-ctx.Done(): + p.releaseAcquiredResource(res, res.lastUsedNano) + } + }() + + select { + case <-ctx.Done(): + p.canceledAcquireCount.Add(1) + return nil, ctx.Err() + case err := <-constructErrChan: + if err != nil { + return nil, err + } + return res, nil + } +} + +// TryAcquire gets a resource from the pool if one is immediately available. If not, it returns ErrNotAvailable. If no +// resources are available but the pool has room to grow, a resource will be created in the background. ctx is only +// used to cancel the background creation. +func (p *Pool[T]) TryAcquire(ctx context.Context) (*Resource[T], error) { + if !p.acquireSem.TryAcquire(1) { + return nil, ErrNotAvailable + } + + p.mux.Lock() + defer p.mux.Unlock() + + if p.closed { + p.acquireSem.Release(1) + return nil, ErrClosedPool + } + + // If a resource is available now + if res := p.tryAcquireIdleResource(); res != nil { + p.acquireCount += 1 + return res, nil + } + + if len(p.allResources) >= int(p.maxSize) { + // Unreachable code. + panic("bug: semaphore allowed more acquires than pool allows") + } + + res := p.createNewResource() + go func() { + value, err := p.constructor(ctx) + + p.mux.Lock() + defer p.mux.Unlock() + // We have to create the resource and only then release the + // semaphore - For the time being there is no resource that + // someone could acquire. + defer p.acquireSem.Release(1) + + if err != nil { + p.allResources.remove(res) + p.destructWG.Done() + return + } + + res.value = value + res.status = resourceStatusIdle + p.idleResources.Push(res) + }() + + return nil, ErrNotAvailable +} + +// acquireSemAll tries to acquire num free tokens from sem. This function is +// guaranteed to acquire at least the lowest number of tokens that has been +// available in the semaphore during runtime of this function. +// +// For the time being, semaphore doesn't allow to acquire all tokens atomically +// (see https://github.com/golang/sync/pull/19). We simulate this by trying all +// powers of 2 that are less or equal to num. +// +// For example, let's immagine we have 19 free tokens in the semaphore which in +// total has 24 tokens (i.e. the maxSize of the pool is 24 resources). Then if +// num is 24, the log2Uint(24) is 4 and we try to acquire 16, 8, 4, 2 and 1 +// tokens. Out of those, the acquire of 16, 2 and 1 tokens will succeed. +// +// Naturally, Acquires and Releases of the semaphore might take place +// concurrently. For this reason, it's not guaranteed that absolutely all free +// tokens in the semaphore will be acquired. But it's guaranteed that at least +// the minimal number of tokens that has been present over the whole process +// will be acquired. This is sufficient for the use-case we have in this +// package. +// +// TODO: Replace this with acquireSem.TryAcquireAll() if it gets to +// upstream. https://github.com/golang/sync/pull/19 +func acquireSemAll(sem *semaphore.Weighted, num int) int { + if sem.TryAcquire(int64(num)) { + return num + } + + var acquired int + for i := int(log2Int(num)); i >= 0; i-- { + val := 1 << i + if sem.TryAcquire(int64(val)) { + acquired += val + } + } + + return acquired +} + +// AcquireAllIdle acquires all currently idle resources. Its intended use is for +// health check and keep-alive functionality. It does not update pool +// statistics. +func (p *Pool[T]) AcquireAllIdle() []*Resource[T] { + p.mux.Lock() + defer p.mux.Unlock() + + if p.closed { + return nil + } + + numIdle := p.idleResources.Len() + if numIdle == 0 { + return nil + } + + // In acquireSemAll we use only TryAcquire and not Acquire. Because + // TryAcquire cannot block, the fact that we hold mutex locked and try + // to acquire semaphore cannot result in dead-lock. + // + // Because the mutex is locked, no parallel Release can run. This + // implies that the number of tokens can only decrease because some + // Acquire/TryAcquire call can consume the semaphore token. Consequently + // acquired is always less or equal to numIdle. Moreover if acquired < + // numIdle, then there are some parallel Acquire/TryAcquire calls that + // will take the remaining idle connections. + acquired := acquireSemAll(p.acquireSem, numIdle) + + idle := make([]*Resource[T], acquired) + for i := range idle { + res, _ := p.idleResources.Pop() + res.status = resourceStatusAcquired + idle[i] = res + } + + // We have to bump the generation to ensure that Acquire/TryAcquire + // calls running in parallel (those which caused acquired < numIdle) + // will consume old connections and not freshly released connections + // instead. + p.idleResources.NextGen() + + return idle +} + +// CreateResource constructs a new resource without acquiring it. It goes straight in the IdlePool. If the pool is full +// it returns an error. It can be useful to maintain warm resources under little load. +func (p *Pool[T]) CreateResource(ctx context.Context) error { + if !p.acquireSem.TryAcquire(1) { + return ErrNotAvailable + } + + p.mux.Lock() + if p.closed { + p.acquireSem.Release(1) + p.mux.Unlock() + return ErrClosedPool + } + + if len(p.allResources) >= int(p.maxSize) { + p.acquireSem.Release(1) + p.mux.Unlock() + return ErrNotAvailable + } + + res := p.createNewResource() + p.mux.Unlock() + + value, err := p.constructor(ctx) + p.mux.Lock() + defer p.mux.Unlock() + defer p.acquireSem.Release(1) + if err != nil { + p.allResources.remove(res) + p.destructWG.Done() + return err + } + + res.value = value + res.status = resourceStatusIdle + + // If closed while constructing resource then destroy it and return an error + if p.closed { + go p.destructResourceValue(res.value) + return ErrClosedPool + } + + p.idleResources.Push(res) + + return nil +} + +// Reset destroys all resources, but leaves the pool open. It is intended for use when an error is detected that would +// disrupt all resources (such as a network interruption or a server state change). +// +// It is safe to reset a pool while resources are checked out. Those resources will be destroyed when they are returned +// to the pool. +func (p *Pool[T]) Reset() { + p.mux.Lock() + defer p.mux.Unlock() + + p.resetCount++ + + for res, ok := p.idleResources.Pop(); ok; res, ok = p.idleResources.Pop() { + p.allResources.remove(res) + go p.destructResourceValue(res.value) + } +} + +// releaseAcquiredResource returns res to the the pool. +func (p *Pool[T]) releaseAcquiredResource(res *Resource[T], lastUsedNano int64) { + p.mux.Lock() + defer p.mux.Unlock() + defer p.acquireSem.Release(1) + + if p.closed || res.poolResetCount != p.resetCount { + p.allResources.remove(res) + go p.destructResourceValue(res.value) + } else { + res.lastUsedNano = lastUsedNano + res.status = resourceStatusIdle + p.idleResources.Push(res) + } +} + +// Remove removes res from the pool and closes it. If res is not part of the +// pool Remove will panic. +func (p *Pool[T]) destroyAcquiredResource(res *Resource[T]) { + p.destructResourceValue(res.value) + + p.mux.Lock() + defer p.mux.Unlock() + defer p.acquireSem.Release(1) + + p.allResources.remove(res) +} + +func (p *Pool[T]) hijackAcquiredResource(res *Resource[T]) { + p.mux.Lock() + defer p.mux.Unlock() + defer p.acquireSem.Release(1) + + p.allResources.remove(res) + res.status = resourceStatusHijacked + p.destructWG.Done() // not responsible for destructing hijacked resources +} + +func (p *Pool[T]) destructResourceValue(value T) { + p.destructor(value) + p.destructWG.Done() +} diff --git a/vendor/github.com/jackc/puddle/v2/resource_list.go b/vendor/github.com/jackc/puddle/v2/resource_list.go new file mode 100644 index 0000000..b243095 --- /dev/null +++ b/vendor/github.com/jackc/puddle/v2/resource_list.go @@ -0,0 +1,28 @@ +package puddle + +type resList[T any] []*Resource[T] + +func (l *resList[T]) append(val *Resource[T]) { *l = append(*l, val) } + +func (l *resList[T]) popBack() *Resource[T] { + idx := len(*l) - 1 + val := (*l)[idx] + (*l)[idx] = nil // Avoid memory leak + *l = (*l)[:idx] + + return val +} + +func (l *resList[T]) remove(val *Resource[T]) { + for i, elem := range *l { + if elem == val { + lastIdx := len(*l) - 1 + (*l)[i] = (*l)[lastIdx] + (*l)[lastIdx] = nil // Avoid memory leak + (*l) = (*l)[:lastIdx] + return + } + } + + panic("BUG: removeResource could not find res in slice") +} diff --git a/vendor/github.com/klauspost/cpuid/v2/README.md b/vendor/github.com/klauspost/cpuid/v2/README.md index accd7ab..21508ed 100644 --- a/vendor/github.com/klauspost/cpuid/v2/README.md +++ b/vendor/github.com/klauspost/cpuid/v2/README.md @@ -9,10 +9,7 @@ You can access the CPU information by accessing the shared CPU variable of the c Package home: https://github.com/klauspost/cpuid [![PkgGoDev](https://pkg.go.dev/badge/github.com/klauspost/cpuid)](https://pkg.go.dev/github.com/klauspost/cpuid/v2) -[![Build Status][3]][4] - -[3]: https://travis-ci.org/klauspost/cpuid.svg?branch=master -[4]: https://travis-ci.org/klauspost/cpuid +[![Go](https://github.com/klauspost/cpuid/actions/workflows/go.yml/badge.svg)](https://github.com/klauspost/cpuid/actions/workflows/go.yml) ## installing @@ -285,7 +282,12 @@ Exit Code 1 | AMXINT8 | Tile computational operations on 8-bit integers | | AMXFP16 | Tile computational operations on FP16 numbers | | AMXTILE | Tile architecture | +| APX_F | Intel APX | | AVX | AVX functions | +| AVX10 | If set the Intel AVX10 Converged Vector ISA is supported | +| AVX10_128 | If set indicates that AVX10 128-bit vector support is present | +| AVX10_256 | If set indicates that AVX10 256-bit vector support is present | +| AVX10_512 | If set indicates that AVX10 512-bit vector support is present | | AVX2 | AVX2 functions | | AVX512BF16 | AVX-512 BFLOAT16 Instructions | | AVX512BITALG | AVX-512 Bit Algorithms | @@ -308,6 +310,7 @@ Exit Code 1 | AVXSLOW | Indicates the CPU performs 2 128 bit operations instead of one | | AVXVNNI | AVX (VEX encoded) VNNI neural network instructions | | AVXVNNIINT8 | AVX-VNNI-INT8 instructions | +| AVXVNNIINT16 | AVX-VNNI-INT16 instructions | | BHI_CTRL | Branch History Injection and Intra-mode Branch Target Injection / CVE-2022-0001, CVE-2022-0002 / INTEL-SA-00598 | | BMI1 | Bit Manipulation Instruction Set 1 | | BMI2 | Bit Manipulation Instruction Set 2 | @@ -365,6 +368,8 @@ Exit Code 1 | IDPRED_CTRL | IPRED_DIS | | INT_WBINVD | WBINVD/WBNOINVD are interruptible. | | INVLPGB | NVLPGB and TLBSYNC instruction supported | +| KEYLOCKER | Key locker | +| KEYLOCKERW | Key locker wide | | LAHF | LAHF/SAHF in long mode | | LAM | If set, CPU supports Linear Address Masking | | LBRVIRT | LBR virtualization | @@ -380,7 +385,7 @@ Exit Code 1 | MOVDIRI | Move Doubleword as Direct Store | | MOVSB_ZL | Fast Zero-Length MOVSB | | MPX | Intel MPX (Memory Protection Extensions) | -| MOVU | MOVU SSE instructions are more efficient and should be preferred to SSE MOVL/MOVH. MOVUPS is more efficient than MOVLPS/MOVHPS. MOVUPD is more efficient than MOVLPD/MOVHPD | +| MOVU | MOVU SSE instructions are more efficient and should be preferred to SSE MOVL/MOVH. MOVUPS is more efficient than MOVLPS/MOVHPS. MOVUPD is more efficient than MOVLPD/MOVHPD | | MSRIRC | Instruction Retired Counter MSR available | | MSRLIST | Read/Write List of Model Specific Registers | | MSR_PAGEFLUSH | Page Flush MSR available | diff --git a/vendor/github.com/klauspost/cpuid/v2/cpuid.go b/vendor/github.com/klauspost/cpuid/v2/cpuid.go index d015c74..53bc18c 100644 --- a/vendor/github.com/klauspost/cpuid/v2/cpuid.go +++ b/vendor/github.com/klauspost/cpuid/v2/cpuid.go @@ -67,188 +67,201 @@ const ( // Keep index -1 as unknown UNKNOWN = -1 - // Add features - ADX FeatureID = iota // Intel ADX (Multi-Precision Add-Carry Instruction Extensions) - AESNI // Advanced Encryption Standard New Instructions - AMD3DNOW // AMD 3DNOW - AMD3DNOWEXT // AMD 3DNowExt - AMXBF16 // Tile computational operations on BFLOAT16 numbers - AMXFP16 // Tile computational operations on FP16 numbers - AMXINT8 // Tile computational operations on 8-bit integers - AMXTILE // Tile architecture - AVX // AVX functions - AVX2 // AVX2 functions - AVX512BF16 // AVX-512 BFLOAT16 Instructions - AVX512BITALG // AVX-512 Bit Algorithms - AVX512BW // AVX-512 Byte and Word Instructions - AVX512CD // AVX-512 Conflict Detection Instructions - AVX512DQ // AVX-512 Doubleword and Quadword Instructions - AVX512ER // AVX-512 Exponential and Reciprocal Instructions - AVX512F // AVX-512 Foundation - AVX512FP16 // AVX-512 FP16 Instructions - AVX512IFMA // AVX-512 Integer Fused Multiply-Add Instructions - AVX512PF // AVX-512 Prefetch Instructions - AVX512VBMI // AVX-512 Vector Bit Manipulation Instructions - AVX512VBMI2 // AVX-512 Vector Bit Manipulation Instructions, Version 2 - AVX512VL // AVX-512 Vector Length Extensions - AVX512VNNI // AVX-512 Vector Neural Network Instructions - AVX512VP2INTERSECT // AVX-512 Intersect for D/Q - AVX512VPOPCNTDQ // AVX-512 Vector Population Count Doubleword and Quadword - AVXIFMA // AVX-IFMA instructions - AVXNECONVERT // AVX-NE-CONVERT instructions - AVXSLOW // Indicates the CPU performs 2 128 bit operations instead of one - AVXVNNI // AVX (VEX encoded) VNNI neural network instructions - AVXVNNIINT8 // AVX-VNNI-INT8 instructions - BHI_CTRL // Branch History Injection and Intra-mode Branch Target Injection / CVE-2022-0001, CVE-2022-0002 / INTEL-SA-00598 - BMI1 // Bit Manipulation Instruction Set 1 - BMI2 // Bit Manipulation Instruction Set 2 - CETIBT // Intel CET Indirect Branch Tracking - CETSS // Intel CET Shadow Stack - CLDEMOTE // Cache Line Demote - CLMUL // Carry-less Multiplication - CLZERO // CLZERO instruction supported - CMOV // i686 CMOV - CMPCCXADD // CMPCCXADD instructions - CMPSB_SCADBS_SHORT // Fast short CMPSB and SCASB - CMPXCHG8 // CMPXCHG8 instruction - CPBOOST // Core Performance Boost - CPPC // AMD: Collaborative Processor Performance Control - CX16 // CMPXCHG16B Instruction - EFER_LMSLE_UNS // AMD: =Core::X86::Msr::EFER[LMSLE] is not supported, and MBZ - ENQCMD // Enqueue Command - ERMS // Enhanced REP MOVSB/STOSB - F16C // Half-precision floating-point conversion - FLUSH_L1D // Flush L1D cache - FMA3 // Intel FMA 3. Does not imply AVX. - FMA4 // Bulldozer FMA4 functions - FP128 // AMD: When set, the internal FP/SIMD execution datapath is no more than 128-bits wide - FP256 // AMD: When set, the internal FP/SIMD execution datapath is no more than 256-bits wide - FSRM // Fast Short Rep Mov - FXSR // FXSAVE, FXRESTOR instructions, CR4 bit 9 - FXSROPT // FXSAVE/FXRSTOR optimizations - GFNI // Galois Field New Instructions. May require other features (AVX, AVX512VL,AVX512F) based on usage. - HLE // Hardware Lock Elision - HRESET // If set CPU supports history reset and the IA32_HRESET_ENABLE MSR - HTT // Hyperthreading (enabled) - HWA // Hardware assert supported. Indicates support for MSRC001_10 - HYBRID_CPU // This part has CPUs of more than one type. - HYPERVISOR // This bit has been reserved by Intel & AMD for use by hypervisors - IA32_ARCH_CAP // IA32_ARCH_CAPABILITIES MSR (Intel) - IA32_CORE_CAP // IA32_CORE_CAPABILITIES MSR - IBPB // Indirect Branch Restricted Speculation (IBRS) and Indirect Branch Predictor Barrier (IBPB) - IBRS // AMD: Indirect Branch Restricted Speculation - IBRS_PREFERRED // AMD: IBRS is preferred over software solution - IBRS_PROVIDES_SMP // AMD: IBRS provides Same Mode Protection - IBS // Instruction Based Sampling (AMD) - IBSBRNTRGT // Instruction Based Sampling Feature (AMD) - IBSFETCHSAM // Instruction Based Sampling Feature (AMD) - IBSFFV // Instruction Based Sampling Feature (AMD) - IBSOPCNT // Instruction Based Sampling Feature (AMD) - IBSOPCNTEXT // Instruction Based Sampling Feature (AMD) - IBSOPSAM // Instruction Based Sampling Feature (AMD) - IBSRDWROPCNT // Instruction Based Sampling Feature (AMD) - IBSRIPINVALIDCHK // Instruction Based Sampling Feature (AMD) - IBS_FETCH_CTLX // AMD: IBS fetch control extended MSR supported - IBS_OPDATA4 // AMD: IBS op data 4 MSR supported - IBS_OPFUSE // AMD: Indicates support for IbsOpFuse - IBS_PREVENTHOST // Disallowing IBS use by the host supported - IBS_ZEN4 // AMD: Fetch and Op IBS support IBS extensions added with Zen4 - IDPRED_CTRL // IPRED_DIS - INT_WBINVD // WBINVD/WBNOINVD are interruptible. - INVLPGB // NVLPGB and TLBSYNC instruction supported - LAHF // LAHF/SAHF in long mode - LAM // If set, CPU supports Linear Address Masking - LBRVIRT // LBR virtualization - LZCNT // LZCNT instruction - MCAOVERFLOW // MCA overflow recovery support. - MCDT_NO // Processor do not exhibit MXCSR Configuration Dependent Timing behavior and do not need to mitigate it. - MCOMMIT // MCOMMIT instruction supported - MD_CLEAR // VERW clears CPU buffers - MMX // standard MMX - MMXEXT // SSE integer functions or AMD MMX ext - MOVBE // MOVBE instruction (big-endian) - MOVDIR64B // Move 64 Bytes as Direct Store - MOVDIRI // Move Doubleword as Direct Store - MOVSB_ZL // Fast Zero-Length MOVSB - MOVU // AMD: MOVU SSE instructions are more efficient and should be preferred to SSE MOVL/MOVH. MOVUPS is more efficient than MOVLPS/MOVHPS. MOVUPD is more efficient than MOVLPD/MOVHPD - MPX // Intel MPX (Memory Protection Extensions) - MSRIRC // Instruction Retired Counter MSR available - MSRLIST // Read/Write List of Model Specific Registers - MSR_PAGEFLUSH // Page Flush MSR available - NRIPS // Indicates support for NRIP save on VMEXIT - NX // NX (No-Execute) bit - OSXSAVE // XSAVE enabled by OS - PCONFIG // PCONFIG for Intel Multi-Key Total Memory Encryption - POPCNT // POPCNT instruction - PPIN // AMD: Protected Processor Inventory Number support. Indicates that Protected Processor Inventory Number (PPIN) capability can be enabled - PREFETCHI // PREFETCHIT0/1 instructions - PSFD // Predictive Store Forward Disable - RDPRU // RDPRU instruction supported - RDRAND // RDRAND instruction is available - RDSEED // RDSEED instruction is available - RDTSCP // RDTSCP Instruction - RRSBA_CTRL // Restricted RSB Alternate - RTM // Restricted Transactional Memory - RTM_ALWAYS_ABORT // Indicates that the loaded microcode is forcing RTM abort. - SERIALIZE // Serialize Instruction Execution - SEV // AMD Secure Encrypted Virtualization supported - SEV_64BIT // AMD SEV guest execution only allowed from a 64-bit host - SEV_ALTERNATIVE // AMD SEV Alternate Injection supported - SEV_DEBUGSWAP // Full debug state swap supported for SEV-ES guests - SEV_ES // AMD SEV Encrypted State supported - SEV_RESTRICTED // AMD SEV Restricted Injection supported - SEV_SNP // AMD SEV Secure Nested Paging supported - SGX // Software Guard Extensions - SGXLC // Software Guard Extensions Launch Control - SHA // Intel SHA Extensions - SME // AMD Secure Memory Encryption supported - SME_COHERENT // AMD Hardware cache coherency across encryption domains enforced - SPEC_CTRL_SSBD // Speculative Store Bypass Disable - SRBDS_CTRL // SRBDS mitigation MSR available - SSE // SSE functions - SSE2 // P4 SSE functions - SSE3 // Prescott SSE3 functions - SSE4 // Penryn SSE4.1 functions - SSE42 // Nehalem SSE4.2 functions - SSE4A // AMD Barcelona microarchitecture SSE4a instructions - SSSE3 // Conroe SSSE3 functions - STIBP // Single Thread Indirect Branch Predictors - STIBP_ALWAYSON // AMD: Single Thread Indirect Branch Prediction Mode has Enhanced Performance and may be left Always On - STOSB_SHORT // Fast short STOSB - SUCCOR // Software uncorrectable error containment and recovery capability. - SVM // AMD Secure Virtual Machine - SVMDA // Indicates support for the SVM decode assists. - SVMFBASID // SVM, Indicates that TLB flush events, including CR3 writes and CR4.PGE toggles, flush only the current ASID's TLB entries. Also indicates support for the extended VMCBTLB_Control - SVML // AMD SVM lock. Indicates support for SVM-Lock. - SVMNP // AMD SVM nested paging - SVMPF // SVM pause intercept filter. Indicates support for the pause intercept filter - SVMPFT // SVM PAUSE filter threshold. Indicates support for the PAUSE filter cycle count threshold - SYSCALL // System-Call Extension (SCE): SYSCALL and SYSRET instructions. - SYSEE // SYSENTER and SYSEXIT instructions - TBM // AMD Trailing Bit Manipulation - TDX_GUEST // Intel Trust Domain Extensions Guest - TLB_FLUSH_NESTED // AMD: Flushing includes all the nested translations for guest translations - TME // Intel Total Memory Encryption. The following MSRs are supported: IA32_TME_CAPABILITY, IA32_TME_ACTIVATE, IA32_TME_EXCLUDE_MASK, and IA32_TME_EXCLUDE_BASE. - TOPEXT // TopologyExtensions: topology extensions support. Indicates support for CPUID Fn8000_001D_EAX_x[N:0]-CPUID Fn8000_001E_EDX. - TSCRATEMSR // MSR based TSC rate control. Indicates support for MSR TSC ratio MSRC000_0104 - TSXLDTRK // Intel TSX Suspend Load Address Tracking - VAES // Vector AES. AVX(512) versions requires additional checks. - VMCBCLEAN // VMCB clean bits. Indicates support for VMCB clean bits. - VMPL // AMD VM Permission Levels supported - VMSA_REGPROT // AMD VMSA Register Protection supported - VMX // Virtual Machine Extensions - VPCLMULQDQ // Carry-Less Multiplication Quadword. Requires AVX for 3 register versions. - VTE // AMD Virtual Transparent Encryption supported - WAITPKG // TPAUSE, UMONITOR, UMWAIT - WBNOINVD // Write Back and Do Not Invalidate Cache - WRMSRNS // Non-Serializing Write to Model Specific Register - X87 // FPU - XGETBV1 // Supports XGETBV with ECX = 1 - XOP // Bulldozer XOP functions - XSAVE // XSAVE, XRESTOR, XSETBV, XGETBV - XSAVEC // Supports XSAVEC and the compacted form of XRSTOR. - XSAVEOPT // XSAVEOPT available - XSAVES // Supports XSAVES/XRSTORS and IA32_XSS + // x86 features + ADX FeatureID = iota // Intel ADX (Multi-Precision Add-Carry Instruction Extensions) + AESNI // Advanced Encryption Standard New Instructions + AMD3DNOW // AMD 3DNOW + AMD3DNOWEXT // AMD 3DNowExt + AMXBF16 // Tile computational operations on BFLOAT16 numbers + AMXFP16 // Tile computational operations on FP16 numbers + AMXINT8 // Tile computational operations on 8-bit integers + AMXTILE // Tile architecture + APX_F // Intel APX + AVX // AVX functions + AVX10 // If set the Intel AVX10 Converged Vector ISA is supported + AVX10_128 // If set indicates that AVX10 128-bit vector support is present + AVX10_256 // If set indicates that AVX10 256-bit vector support is present + AVX10_512 // If set indicates that AVX10 512-bit vector support is present + AVX2 // AVX2 functions + AVX512BF16 // AVX-512 BFLOAT16 Instructions + AVX512BITALG // AVX-512 Bit Algorithms + AVX512BW // AVX-512 Byte and Word Instructions + AVX512CD // AVX-512 Conflict Detection Instructions + AVX512DQ // AVX-512 Doubleword and Quadword Instructions + AVX512ER // AVX-512 Exponential and Reciprocal Instructions + AVX512F // AVX-512 Foundation + AVX512FP16 // AVX-512 FP16 Instructions + AVX512IFMA // AVX-512 Integer Fused Multiply-Add Instructions + AVX512PF // AVX-512 Prefetch Instructions + AVX512VBMI // AVX-512 Vector Bit Manipulation Instructions + AVX512VBMI2 // AVX-512 Vector Bit Manipulation Instructions, Version 2 + AVX512VL // AVX-512 Vector Length Extensions + AVX512VNNI // AVX-512 Vector Neural Network Instructions + AVX512VP2INTERSECT // AVX-512 Intersect for D/Q + AVX512VPOPCNTDQ // AVX-512 Vector Population Count Doubleword and Quadword + AVXIFMA // AVX-IFMA instructions + AVXNECONVERT // AVX-NE-CONVERT instructions + AVXSLOW // Indicates the CPU performs 2 128 bit operations instead of one + AVXVNNI // AVX (VEX encoded) VNNI neural network instructions + AVXVNNIINT8 // AVX-VNNI-INT8 instructions + AVXVNNIINT16 // AVX-VNNI-INT16 instructions + BHI_CTRL // Branch History Injection and Intra-mode Branch Target Injection / CVE-2022-0001, CVE-2022-0002 / INTEL-SA-00598 + BMI1 // Bit Manipulation Instruction Set 1 + BMI2 // Bit Manipulation Instruction Set 2 + CETIBT // Intel CET Indirect Branch Tracking + CETSS // Intel CET Shadow Stack + CLDEMOTE // Cache Line Demote + CLMUL // Carry-less Multiplication + CLZERO // CLZERO instruction supported + CMOV // i686 CMOV + CMPCCXADD // CMPCCXADD instructions + CMPSB_SCADBS_SHORT // Fast short CMPSB and SCASB + CMPXCHG8 // CMPXCHG8 instruction + CPBOOST // Core Performance Boost + CPPC // AMD: Collaborative Processor Performance Control + CX16 // CMPXCHG16B Instruction + EFER_LMSLE_UNS // AMD: =Core::X86::Msr::EFER[LMSLE] is not supported, and MBZ + ENQCMD // Enqueue Command + ERMS // Enhanced REP MOVSB/STOSB + F16C // Half-precision floating-point conversion + FLUSH_L1D // Flush L1D cache + FMA3 // Intel FMA 3. Does not imply AVX. + FMA4 // Bulldozer FMA4 functions + FP128 // AMD: When set, the internal FP/SIMD execution datapath is no more than 128-bits wide + FP256 // AMD: When set, the internal FP/SIMD execution datapath is no more than 256-bits wide + FSRM // Fast Short Rep Mov + FXSR // FXSAVE, FXRESTOR instructions, CR4 bit 9 + FXSROPT // FXSAVE/FXRSTOR optimizations + GFNI // Galois Field New Instructions. May require other features (AVX, AVX512VL,AVX512F) based on usage. + HLE // Hardware Lock Elision + HRESET // If set CPU supports history reset and the IA32_HRESET_ENABLE MSR + HTT // Hyperthreading (enabled) + HWA // Hardware assert supported. Indicates support for MSRC001_10 + HYBRID_CPU // This part has CPUs of more than one type. + HYPERVISOR // This bit has been reserved by Intel & AMD for use by hypervisors + IA32_ARCH_CAP // IA32_ARCH_CAPABILITIES MSR (Intel) + IA32_CORE_CAP // IA32_CORE_CAPABILITIES MSR + IBPB // Indirect Branch Restricted Speculation (IBRS) and Indirect Branch Predictor Barrier (IBPB) + IBPB_BRTYPE // Indicates that MSR 49h (PRED_CMD) bit 0 (IBPB) flushes all branch type predictions from the CPU branch predictor + IBRS // AMD: Indirect Branch Restricted Speculation + IBRS_PREFERRED // AMD: IBRS is preferred over software solution + IBRS_PROVIDES_SMP // AMD: IBRS provides Same Mode Protection + IBS // Instruction Based Sampling (AMD) + IBSBRNTRGT // Instruction Based Sampling Feature (AMD) + IBSFETCHSAM // Instruction Based Sampling Feature (AMD) + IBSFFV // Instruction Based Sampling Feature (AMD) + IBSOPCNT // Instruction Based Sampling Feature (AMD) + IBSOPCNTEXT // Instruction Based Sampling Feature (AMD) + IBSOPSAM // Instruction Based Sampling Feature (AMD) + IBSRDWROPCNT // Instruction Based Sampling Feature (AMD) + IBSRIPINVALIDCHK // Instruction Based Sampling Feature (AMD) + IBS_FETCH_CTLX // AMD: IBS fetch control extended MSR supported + IBS_OPDATA4 // AMD: IBS op data 4 MSR supported + IBS_OPFUSE // AMD: Indicates support for IbsOpFuse + IBS_PREVENTHOST // Disallowing IBS use by the host supported + IBS_ZEN4 // AMD: Fetch and Op IBS support IBS extensions added with Zen4 + IDPRED_CTRL // IPRED_DIS + INT_WBINVD // WBINVD/WBNOINVD are interruptible. + INVLPGB // NVLPGB and TLBSYNC instruction supported + KEYLOCKER // Key locker + KEYLOCKERW // Key locker wide + LAHF // LAHF/SAHF in long mode + LAM // If set, CPU supports Linear Address Masking + LBRVIRT // LBR virtualization + LZCNT // LZCNT instruction + MCAOVERFLOW // MCA overflow recovery support. + MCDT_NO // Processor do not exhibit MXCSR Configuration Dependent Timing behavior and do not need to mitigate it. + MCOMMIT // MCOMMIT instruction supported + MD_CLEAR // VERW clears CPU buffers + MMX // standard MMX + MMXEXT // SSE integer functions or AMD MMX ext + MOVBE // MOVBE instruction (big-endian) + MOVDIR64B // Move 64 Bytes as Direct Store + MOVDIRI // Move Doubleword as Direct Store + MOVSB_ZL // Fast Zero-Length MOVSB + MOVU // AMD: MOVU SSE instructions are more efficient and should be preferred to SSE MOVL/MOVH. MOVUPS is more efficient than MOVLPS/MOVHPS. MOVUPD is more efficient than MOVLPD/MOVHPD + MPX // Intel MPX (Memory Protection Extensions) + MSRIRC // Instruction Retired Counter MSR available + MSRLIST // Read/Write List of Model Specific Registers + MSR_PAGEFLUSH // Page Flush MSR available + NRIPS // Indicates support for NRIP save on VMEXIT + NX // NX (No-Execute) bit + OSXSAVE // XSAVE enabled by OS + PCONFIG // PCONFIG for Intel Multi-Key Total Memory Encryption + POPCNT // POPCNT instruction + PPIN // AMD: Protected Processor Inventory Number support. Indicates that Protected Processor Inventory Number (PPIN) capability can be enabled + PREFETCHI // PREFETCHIT0/1 instructions + PSFD // Predictive Store Forward Disable + RDPRU // RDPRU instruction supported + RDRAND // RDRAND instruction is available + RDSEED // RDSEED instruction is available + RDTSCP // RDTSCP Instruction + RRSBA_CTRL // Restricted RSB Alternate + RTM // Restricted Transactional Memory + RTM_ALWAYS_ABORT // Indicates that the loaded microcode is forcing RTM abort. + SBPB // Indicates support for the Selective Branch Predictor Barrier + SERIALIZE // Serialize Instruction Execution + SEV // AMD Secure Encrypted Virtualization supported + SEV_64BIT // AMD SEV guest execution only allowed from a 64-bit host + SEV_ALTERNATIVE // AMD SEV Alternate Injection supported + SEV_DEBUGSWAP // Full debug state swap supported for SEV-ES guests + SEV_ES // AMD SEV Encrypted State supported + SEV_RESTRICTED // AMD SEV Restricted Injection supported + SEV_SNP // AMD SEV Secure Nested Paging supported + SGX // Software Guard Extensions + SGXLC // Software Guard Extensions Launch Control + SHA // Intel SHA Extensions + SME // AMD Secure Memory Encryption supported + SME_COHERENT // AMD Hardware cache coherency across encryption domains enforced + SPEC_CTRL_SSBD // Speculative Store Bypass Disable + SRBDS_CTRL // SRBDS mitigation MSR available + SRSO_MSR_FIX // Indicates that software may use MSR BP_CFG[BpSpecReduce] to mitigate SRSO. + SRSO_NO // Indicates the CPU is not subject to the SRSO vulnerability + SRSO_USER_KERNEL_NO // Indicates the CPU is not subject to the SRSO vulnerability across user/kernel boundaries + SSE // SSE functions + SSE2 // P4 SSE functions + SSE3 // Prescott SSE3 functions + SSE4 // Penryn SSE4.1 functions + SSE42 // Nehalem SSE4.2 functions + SSE4A // AMD Barcelona microarchitecture SSE4a instructions + SSSE3 // Conroe SSSE3 functions + STIBP // Single Thread Indirect Branch Predictors + STIBP_ALWAYSON // AMD: Single Thread Indirect Branch Prediction Mode has Enhanced Performance and may be left Always On + STOSB_SHORT // Fast short STOSB + SUCCOR // Software uncorrectable error containment and recovery capability. + SVM // AMD Secure Virtual Machine + SVMDA // Indicates support for the SVM decode assists. + SVMFBASID // SVM, Indicates that TLB flush events, including CR3 writes and CR4.PGE toggles, flush only the current ASID's TLB entries. Also indicates support for the extended VMCBTLB_Control + SVML // AMD SVM lock. Indicates support for SVM-Lock. + SVMNP // AMD SVM nested paging + SVMPF // SVM pause intercept filter. Indicates support for the pause intercept filter + SVMPFT // SVM PAUSE filter threshold. Indicates support for the PAUSE filter cycle count threshold + SYSCALL // System-Call Extension (SCE): SYSCALL and SYSRET instructions. + SYSEE // SYSENTER and SYSEXIT instructions + TBM // AMD Trailing Bit Manipulation + TDX_GUEST // Intel Trust Domain Extensions Guest + TLB_FLUSH_NESTED // AMD: Flushing includes all the nested translations for guest translations + TME // Intel Total Memory Encryption. The following MSRs are supported: IA32_TME_CAPABILITY, IA32_TME_ACTIVATE, IA32_TME_EXCLUDE_MASK, and IA32_TME_EXCLUDE_BASE. + TOPEXT // TopologyExtensions: topology extensions support. Indicates support for CPUID Fn8000_001D_EAX_x[N:0]-CPUID Fn8000_001E_EDX. + TSCRATEMSR // MSR based TSC rate control. Indicates support for MSR TSC ratio MSRC000_0104 + TSXLDTRK // Intel TSX Suspend Load Address Tracking + VAES // Vector AES. AVX(512) versions requires additional checks. + VMCBCLEAN // VMCB clean bits. Indicates support for VMCB clean bits. + VMPL // AMD VM Permission Levels supported + VMSA_REGPROT // AMD VMSA Register Protection supported + VMX // Virtual Machine Extensions + VPCLMULQDQ // Carry-Less Multiplication Quadword. Requires AVX for 3 register versions. + VTE // AMD Virtual Transparent Encryption supported + WAITPKG // TPAUSE, UMONITOR, UMWAIT + WBNOINVD // Write Back and Do Not Invalidate Cache + WRMSRNS // Non-Serializing Write to Model Specific Register + X87 // FPU + XGETBV1 // Supports XGETBV with ECX = 1 + XOP // Bulldozer XOP functions + XSAVE // XSAVE, XRESTOR, XSETBV, XGETBV + XSAVEC // Supports XSAVEC and the compacted form of XRSTOR. + XSAVEOPT // XSAVEOPT available + XSAVES // Supports XSAVES/XRSTORS and IA32_XSS // ARM features: AESARM // AES instructions @@ -302,9 +315,11 @@ type CPUInfo struct { L2 int // L2 Cache (per core or shared). Will be -1 if undetected L3 int // L3 Cache (per core, per ccx or shared). Will be -1 if undetected } - SGX SGXSupport - maxFunc uint32 - maxExFunc uint32 + SGX SGXSupport + AMDMemEncryption AMDMemEncryptionSupport + AVX10Level uint8 + maxFunc uint32 + maxExFunc uint32 } var cpuid func(op uint32) (eax, ebx, ecx, edx uint32) @@ -1071,6 +1086,32 @@ func hasSGX(available, lc bool) (rval SGXSupport) { return } +type AMDMemEncryptionSupport struct { + Available bool + CBitPossition uint32 + NumVMPL uint32 + PhysAddrReduction uint32 + NumEntryptedGuests uint32 + MinSevNoEsAsid uint32 +} + +func hasAMDMemEncryption(available bool) (rval AMDMemEncryptionSupport) { + rval.Available = available + if !available { + return + } + + _, b, c, d := cpuidex(0x8000001f, 0) + + rval.CBitPossition = b & 0x3f + rval.PhysAddrReduction = (b >> 6) & 0x3F + rval.NumVMPL = (b >> 12) & 0xf + rval.NumEntryptedGuests = c + rval.MinSevNoEsAsid = d + + return +} + func support() flagSet { var fs flagSet mfi := maxFunctionID() @@ -1165,6 +1206,7 @@ func support() flagSet { fs.setIf(ecx&(1<<10) != 0, VPCLMULQDQ) fs.setIf(ecx&(1<<13) != 0, TME) fs.setIf(ecx&(1<<25) != 0, CLDEMOTE) + fs.setIf(ecx&(1<<23) != 0, KEYLOCKER) fs.setIf(ecx&(1<<27) != 0, MOVDIRI) fs.setIf(ecx&(1<<28) != 0, MOVDIR64B) fs.setIf(ecx&(1<<29) != 0, ENQCMD) @@ -1201,7 +1243,10 @@ func support() flagSet { // CPUID.(EAX=7, ECX=1).EDX fs.setIf(edx1&(1<<4) != 0, AVXVNNIINT8) fs.setIf(edx1&(1<<5) != 0, AVXNECONVERT) + fs.setIf(edx1&(1<<10) != 0, AVXVNNIINT16) fs.setIf(edx1&(1<<14) != 0, PREFETCHI) + fs.setIf(edx1&(1<<19) != 0, AVX10) + fs.setIf(edx1&(1<<21) != 0, APX_F) // Only detect AVX-512 features if XGETBV is supported if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) { @@ -1252,6 +1297,19 @@ func support() flagSet { fs.setIf(edx&(1<<4) != 0, BHI_CTRL) fs.setIf(edx&(1<<5) != 0, MCDT_NO) + // Add keylocker features. + if fs.inSet(KEYLOCKER) && mfi >= 0x19 { + _, ebx, _, _ := cpuidex(0x19, 0) + fs.setIf(ebx&5 == 5, KEYLOCKERW) // Bit 0 and 2 (1+4) + } + + // Add AVX10 features. + if fs.inSet(AVX10) && mfi >= 0x24 { + _, ebx, _, _ := cpuidex(0x24, 0) + fs.setIf(ebx&(1<<16) != 0, AVX10_128) + fs.setIf(ebx&(1<<17) != 0, AVX10_256) + fs.setIf(ebx&(1<<18) != 0, AVX10_512) + } } // Processor Extended State Enumeration Sub-leaf (EAX = 0DH, ECX = 1) @@ -1394,6 +1452,29 @@ func support() flagSet { fs.setIf((a>>24)&1 == 1, VMSA_REGPROT) } + if maxExtendedFunction() >= 0x80000021 && vend == AMD { + a, _, _, _ := cpuid(0x80000021) + fs.setIf((a>>31)&1 == 1, SRSO_MSR_FIX) + fs.setIf((a>>30)&1 == 1, SRSO_USER_KERNEL_NO) + fs.setIf((a>>29)&1 == 1, SRSO_NO) + fs.setIf((a>>28)&1 == 1, IBPB_BRTYPE) + fs.setIf((a>>27)&1 == 1, SBPB) + } + + if mfi >= 0x20 { + // Microsoft has decided to purposefully hide the information + // of the guest TEE when VMs are being created using Hyper-V. + // + // This leads us to check for the Hyper-V cpuid features + // (0x4000000C), and then for the `ebx` value set. + // + // For Intel TDX, `ebx` is set as `0xbe3`, being 3 the part + // we're mostly interested about,according to: + // https://github.com/torvalds/linux/blob/d2f51b3516dade79269ff45eae2a7668ae711b25/arch/x86/include/asm/hyperv-tlfs.h#L169-L174 + _, ebx, _, _ := cpuid(0x4000000C) + fs.setIf(ebx == 0xbe3, TDX_GUEST) + } + if mfi >= 0x21 { // Intel Trusted Domain Extensions Guests have their own cpuid leaf (0x21). _, ebx, ecx, edx := cpuid(0x21) @@ -1404,6 +1485,14 @@ func support() flagSet { return fs } +func (c *CPUInfo) supportAVX10() uint8 { + if c.maxFunc >= 0x24 && c.featureSet.inSet(AVX10) { + _, ebx, _, _ := cpuidex(0x24, 0) + return uint8(ebx) + } + return 0 +} + func valAsString(values ...uint32) []byte { r := make([]byte, 4*len(values)) for i, v := range values { diff --git a/vendor/github.com/klauspost/cpuid/v2/detect_x86.go b/vendor/github.com/klauspost/cpuid/v2/detect_x86.go index c946824..799b400 100644 --- a/vendor/github.com/klauspost/cpuid/v2/detect_x86.go +++ b/vendor/github.com/klauspost/cpuid/v2/detect_x86.go @@ -27,10 +27,12 @@ func addInfo(c *CPUInfo, safe bool) { c.Family, c.Model, c.Stepping = familyModel() c.featureSet = support() c.SGX = hasSGX(c.featureSet.inSet(SGX), c.featureSet.inSet(SGXLC)) + c.AMDMemEncryption = hasAMDMemEncryption(c.featureSet.inSet(SME) || c.featureSet.inSet(SEV)) c.ThreadsPerCore = threadsPerCore() c.LogicalCores = logicalCores() c.PhysicalCores = physicalCores() c.VendorID, c.VendorString = vendorID() + c.AVX10Level = c.supportAVX10() c.cacheSize() c.frequencies() } diff --git a/vendor/github.com/klauspost/cpuid/v2/featureid_string.go b/vendor/github.com/klauspost/cpuid/v2/featureid_string.go index 024c706..3a25603 100644 --- a/vendor/github.com/klauspost/cpuid/v2/featureid_string.go +++ b/vendor/github.com/klauspost/cpuid/v2/featureid_string.go @@ -16,210 +16,223 @@ func _() { _ = x[AMXFP16-6] _ = x[AMXINT8-7] _ = x[AMXTILE-8] - _ = x[AVX-9] - _ = x[AVX2-10] - _ = x[AVX512BF16-11] - _ = x[AVX512BITALG-12] - _ = x[AVX512BW-13] - _ = x[AVX512CD-14] - _ = x[AVX512DQ-15] - _ = x[AVX512ER-16] - _ = x[AVX512F-17] - _ = x[AVX512FP16-18] - _ = x[AVX512IFMA-19] - _ = x[AVX512PF-20] - _ = x[AVX512VBMI-21] - _ = x[AVX512VBMI2-22] - _ = x[AVX512VL-23] - _ = x[AVX512VNNI-24] - _ = x[AVX512VP2INTERSECT-25] - _ = x[AVX512VPOPCNTDQ-26] - _ = x[AVXIFMA-27] - _ = x[AVXNECONVERT-28] - _ = x[AVXSLOW-29] - _ = x[AVXVNNI-30] - _ = x[AVXVNNIINT8-31] - _ = x[BHI_CTRL-32] - _ = x[BMI1-33] - _ = x[BMI2-34] - _ = x[CETIBT-35] - _ = x[CETSS-36] - _ = x[CLDEMOTE-37] - _ = x[CLMUL-38] - _ = x[CLZERO-39] - _ = x[CMOV-40] - _ = x[CMPCCXADD-41] - _ = x[CMPSB_SCADBS_SHORT-42] - _ = x[CMPXCHG8-43] - _ = x[CPBOOST-44] - _ = x[CPPC-45] - _ = x[CX16-46] - _ = x[EFER_LMSLE_UNS-47] - _ = x[ENQCMD-48] - _ = x[ERMS-49] - _ = x[F16C-50] - _ = x[FLUSH_L1D-51] - _ = x[FMA3-52] - _ = x[FMA4-53] - _ = x[FP128-54] - _ = x[FP256-55] - _ = x[FSRM-56] - _ = x[FXSR-57] - _ = x[FXSROPT-58] - _ = x[GFNI-59] - _ = x[HLE-60] - _ = x[HRESET-61] - _ = x[HTT-62] - _ = x[HWA-63] - _ = x[HYBRID_CPU-64] - _ = x[HYPERVISOR-65] - _ = x[IA32_ARCH_CAP-66] - _ = x[IA32_CORE_CAP-67] - _ = x[IBPB-68] - _ = x[IBRS-69] - _ = x[IBRS_PREFERRED-70] - _ = x[IBRS_PROVIDES_SMP-71] - _ = x[IBS-72] - _ = x[IBSBRNTRGT-73] - _ = x[IBSFETCHSAM-74] - _ = x[IBSFFV-75] - _ = x[IBSOPCNT-76] - _ = x[IBSOPCNTEXT-77] - _ = x[IBSOPSAM-78] - _ = x[IBSRDWROPCNT-79] - _ = x[IBSRIPINVALIDCHK-80] - _ = x[IBS_FETCH_CTLX-81] - _ = x[IBS_OPDATA4-82] - _ = x[IBS_OPFUSE-83] - _ = x[IBS_PREVENTHOST-84] - _ = x[IBS_ZEN4-85] - _ = x[IDPRED_CTRL-86] - _ = x[INT_WBINVD-87] - _ = x[INVLPGB-88] - _ = x[LAHF-89] - _ = x[LAM-90] - _ = x[LBRVIRT-91] - _ = x[LZCNT-92] - _ = x[MCAOVERFLOW-93] - _ = x[MCDT_NO-94] - _ = x[MCOMMIT-95] - _ = x[MD_CLEAR-96] - _ = x[MMX-97] - _ = x[MMXEXT-98] - _ = x[MOVBE-99] - _ = x[MOVDIR64B-100] - _ = x[MOVDIRI-101] - _ = x[MOVSB_ZL-102] - _ = x[MOVU-103] - _ = x[MPX-104] - _ = x[MSRIRC-105] - _ = x[MSRLIST-106] - _ = x[MSR_PAGEFLUSH-107] - _ = x[NRIPS-108] - _ = x[NX-109] - _ = x[OSXSAVE-110] - _ = x[PCONFIG-111] - _ = x[POPCNT-112] - _ = x[PPIN-113] - _ = x[PREFETCHI-114] - _ = x[PSFD-115] - _ = x[RDPRU-116] - _ = x[RDRAND-117] - _ = x[RDSEED-118] - _ = x[RDTSCP-119] - _ = x[RRSBA_CTRL-120] - _ = x[RTM-121] - _ = x[RTM_ALWAYS_ABORT-122] - _ = x[SERIALIZE-123] - _ = x[SEV-124] - _ = x[SEV_64BIT-125] - _ = x[SEV_ALTERNATIVE-126] - _ = x[SEV_DEBUGSWAP-127] - _ = x[SEV_ES-128] - _ = x[SEV_RESTRICTED-129] - _ = x[SEV_SNP-130] - _ = x[SGX-131] - _ = x[SGXLC-132] - _ = x[SHA-133] - _ = x[SME-134] - _ = x[SME_COHERENT-135] - _ = x[SPEC_CTRL_SSBD-136] - _ = x[SRBDS_CTRL-137] - _ = x[SSE-138] - _ = x[SSE2-139] - _ = x[SSE3-140] - _ = x[SSE4-141] - _ = x[SSE42-142] - _ = x[SSE4A-143] - _ = x[SSSE3-144] - _ = x[STIBP-145] - _ = x[STIBP_ALWAYSON-146] - _ = x[STOSB_SHORT-147] - _ = x[SUCCOR-148] - _ = x[SVM-149] - _ = x[SVMDA-150] - _ = x[SVMFBASID-151] - _ = x[SVML-152] - _ = x[SVMNP-153] - _ = x[SVMPF-154] - _ = x[SVMPFT-155] - _ = x[SYSCALL-156] - _ = x[SYSEE-157] - _ = x[TBM-158] - _ = x[TDX_GUEST-159] - _ = x[TLB_FLUSH_NESTED-160] - _ = x[TME-161] - _ = x[TOPEXT-162] - _ = x[TSCRATEMSR-163] - _ = x[TSXLDTRK-164] - _ = x[VAES-165] - _ = x[VMCBCLEAN-166] - _ = x[VMPL-167] - _ = x[VMSA_REGPROT-168] - _ = x[VMX-169] - _ = x[VPCLMULQDQ-170] - _ = x[VTE-171] - _ = x[WAITPKG-172] - _ = x[WBNOINVD-173] - _ = x[WRMSRNS-174] - _ = x[X87-175] - _ = x[XGETBV1-176] - _ = x[XOP-177] - _ = x[XSAVE-178] - _ = x[XSAVEC-179] - _ = x[XSAVEOPT-180] - _ = x[XSAVES-181] - _ = x[AESARM-182] - _ = x[ARMCPUID-183] - _ = x[ASIMD-184] - _ = x[ASIMDDP-185] - _ = x[ASIMDHP-186] - _ = x[ASIMDRDM-187] - _ = x[ATOMICS-188] - _ = x[CRC32-189] - _ = x[DCPOP-190] - _ = x[EVTSTRM-191] - _ = x[FCMA-192] - _ = x[FP-193] - _ = x[FPHP-194] - _ = x[GPA-195] - _ = x[JSCVT-196] - _ = x[LRCPC-197] - _ = x[PMULL-198] - _ = x[SHA1-199] - _ = x[SHA2-200] - _ = x[SHA3-201] - _ = x[SHA512-202] - _ = x[SM3-203] - _ = x[SM4-204] - _ = x[SVE-205] - _ = x[lastID-206] + _ = x[APX_F-9] + _ = x[AVX-10] + _ = x[AVX10-11] + _ = x[AVX10_128-12] + _ = x[AVX10_256-13] + _ = x[AVX10_512-14] + _ = x[AVX2-15] + _ = x[AVX512BF16-16] + _ = x[AVX512BITALG-17] + _ = x[AVX512BW-18] + _ = x[AVX512CD-19] + _ = x[AVX512DQ-20] + _ = x[AVX512ER-21] + _ = x[AVX512F-22] + _ = x[AVX512FP16-23] + _ = x[AVX512IFMA-24] + _ = x[AVX512PF-25] + _ = x[AVX512VBMI-26] + _ = x[AVX512VBMI2-27] + _ = x[AVX512VL-28] + _ = x[AVX512VNNI-29] + _ = x[AVX512VP2INTERSECT-30] + _ = x[AVX512VPOPCNTDQ-31] + _ = x[AVXIFMA-32] + _ = x[AVXNECONVERT-33] + _ = x[AVXSLOW-34] + _ = x[AVXVNNI-35] + _ = x[AVXVNNIINT8-36] + _ = x[AVXVNNIINT16-37] + _ = x[BHI_CTRL-38] + _ = x[BMI1-39] + _ = x[BMI2-40] + _ = x[CETIBT-41] + _ = x[CETSS-42] + _ = x[CLDEMOTE-43] + _ = x[CLMUL-44] + _ = x[CLZERO-45] + _ = x[CMOV-46] + _ = x[CMPCCXADD-47] + _ = x[CMPSB_SCADBS_SHORT-48] + _ = x[CMPXCHG8-49] + _ = x[CPBOOST-50] + _ = x[CPPC-51] + _ = x[CX16-52] + _ = x[EFER_LMSLE_UNS-53] + _ = x[ENQCMD-54] + _ = x[ERMS-55] + _ = x[F16C-56] + _ = x[FLUSH_L1D-57] + _ = x[FMA3-58] + _ = x[FMA4-59] + _ = x[FP128-60] + _ = x[FP256-61] + _ = x[FSRM-62] + _ = x[FXSR-63] + _ = x[FXSROPT-64] + _ = x[GFNI-65] + _ = x[HLE-66] + _ = x[HRESET-67] + _ = x[HTT-68] + _ = x[HWA-69] + _ = x[HYBRID_CPU-70] + _ = x[HYPERVISOR-71] + _ = x[IA32_ARCH_CAP-72] + _ = x[IA32_CORE_CAP-73] + _ = x[IBPB-74] + _ = x[IBPB_BRTYPE-75] + _ = x[IBRS-76] + _ = x[IBRS_PREFERRED-77] + _ = x[IBRS_PROVIDES_SMP-78] + _ = x[IBS-79] + _ = x[IBSBRNTRGT-80] + _ = x[IBSFETCHSAM-81] + _ = x[IBSFFV-82] + _ = x[IBSOPCNT-83] + _ = x[IBSOPCNTEXT-84] + _ = x[IBSOPSAM-85] + _ = x[IBSRDWROPCNT-86] + _ = x[IBSRIPINVALIDCHK-87] + _ = x[IBS_FETCH_CTLX-88] + _ = x[IBS_OPDATA4-89] + _ = x[IBS_OPFUSE-90] + _ = x[IBS_PREVENTHOST-91] + _ = x[IBS_ZEN4-92] + _ = x[IDPRED_CTRL-93] + _ = x[INT_WBINVD-94] + _ = x[INVLPGB-95] + _ = x[KEYLOCKER-96] + _ = x[KEYLOCKERW-97] + _ = x[LAHF-98] + _ = x[LAM-99] + _ = x[LBRVIRT-100] + _ = x[LZCNT-101] + _ = x[MCAOVERFLOW-102] + _ = x[MCDT_NO-103] + _ = x[MCOMMIT-104] + _ = x[MD_CLEAR-105] + _ = x[MMX-106] + _ = x[MMXEXT-107] + _ = x[MOVBE-108] + _ = x[MOVDIR64B-109] + _ = x[MOVDIRI-110] + _ = x[MOVSB_ZL-111] + _ = x[MOVU-112] + _ = x[MPX-113] + _ = x[MSRIRC-114] + _ = x[MSRLIST-115] + _ = x[MSR_PAGEFLUSH-116] + _ = x[NRIPS-117] + _ = x[NX-118] + _ = x[OSXSAVE-119] + _ = x[PCONFIG-120] + _ = x[POPCNT-121] + _ = x[PPIN-122] + _ = x[PREFETCHI-123] + _ = x[PSFD-124] + _ = x[RDPRU-125] + _ = x[RDRAND-126] + _ = x[RDSEED-127] + _ = x[RDTSCP-128] + _ = x[RRSBA_CTRL-129] + _ = x[RTM-130] + _ = x[RTM_ALWAYS_ABORT-131] + _ = x[SBPB-132] + _ = x[SERIALIZE-133] + _ = x[SEV-134] + _ = x[SEV_64BIT-135] + _ = x[SEV_ALTERNATIVE-136] + _ = x[SEV_DEBUGSWAP-137] + _ = x[SEV_ES-138] + _ = x[SEV_RESTRICTED-139] + _ = x[SEV_SNP-140] + _ = x[SGX-141] + _ = x[SGXLC-142] + _ = x[SHA-143] + _ = x[SME-144] + _ = x[SME_COHERENT-145] + _ = x[SPEC_CTRL_SSBD-146] + _ = x[SRBDS_CTRL-147] + _ = x[SRSO_MSR_FIX-148] + _ = x[SRSO_NO-149] + _ = x[SRSO_USER_KERNEL_NO-150] + _ = x[SSE-151] + _ = x[SSE2-152] + _ = x[SSE3-153] + _ = x[SSE4-154] + _ = x[SSE42-155] + _ = x[SSE4A-156] + _ = x[SSSE3-157] + _ = x[STIBP-158] + _ = x[STIBP_ALWAYSON-159] + _ = x[STOSB_SHORT-160] + _ = x[SUCCOR-161] + _ = x[SVM-162] + _ = x[SVMDA-163] + _ = x[SVMFBASID-164] + _ = x[SVML-165] + _ = x[SVMNP-166] + _ = x[SVMPF-167] + _ = x[SVMPFT-168] + _ = x[SYSCALL-169] + _ = x[SYSEE-170] + _ = x[TBM-171] + _ = x[TDX_GUEST-172] + _ = x[TLB_FLUSH_NESTED-173] + _ = x[TME-174] + _ = x[TOPEXT-175] + _ = x[TSCRATEMSR-176] + _ = x[TSXLDTRK-177] + _ = x[VAES-178] + _ = x[VMCBCLEAN-179] + _ = x[VMPL-180] + _ = x[VMSA_REGPROT-181] + _ = x[VMX-182] + _ = x[VPCLMULQDQ-183] + _ = x[VTE-184] + _ = x[WAITPKG-185] + _ = x[WBNOINVD-186] + _ = x[WRMSRNS-187] + _ = x[X87-188] + _ = x[XGETBV1-189] + _ = x[XOP-190] + _ = x[XSAVE-191] + _ = x[XSAVEC-192] + _ = x[XSAVEOPT-193] + _ = x[XSAVES-194] + _ = x[AESARM-195] + _ = x[ARMCPUID-196] + _ = x[ASIMD-197] + _ = x[ASIMDDP-198] + _ = x[ASIMDHP-199] + _ = x[ASIMDRDM-200] + _ = x[ATOMICS-201] + _ = x[CRC32-202] + _ = x[DCPOP-203] + _ = x[EVTSTRM-204] + _ = x[FCMA-205] + _ = x[FP-206] + _ = x[FPHP-207] + _ = x[GPA-208] + _ = x[JSCVT-209] + _ = x[LRCPC-210] + _ = x[PMULL-211] + _ = x[SHA1-212] + _ = x[SHA2-213] + _ = x[SHA3-214] + _ = x[SHA512-215] + _ = x[SM3-216] + _ = x[SM4-217] + _ = x[SVE-218] + _ = x[lastID-219] _ = x[firstID-0] } -const _FeatureID_name = "firstIDADXAESNIAMD3DNOWAMD3DNOWEXTAMXBF16AMXFP16AMXINT8AMXTILEAVXAVX2AVX512BF16AVX512BITALGAVX512BWAVX512CDAVX512DQAVX512ERAVX512FAVX512FP16AVX512IFMAAVX512PFAVX512VBMIAVX512VBMI2AVX512VLAVX512VNNIAVX512VP2INTERSECTAVX512VPOPCNTDQAVXIFMAAVXNECONVERTAVXSLOWAVXVNNIAVXVNNIINT8BHI_CTRLBMI1BMI2CETIBTCETSSCLDEMOTECLMULCLZEROCMOVCMPCCXADDCMPSB_SCADBS_SHORTCMPXCHG8CPBOOSTCPPCCX16EFER_LMSLE_UNSENQCMDERMSF16CFLUSH_L1DFMA3FMA4FP128FP256FSRMFXSRFXSROPTGFNIHLEHRESETHTTHWAHYBRID_CPUHYPERVISORIA32_ARCH_CAPIA32_CORE_CAPIBPBIBRSIBRS_PREFERREDIBRS_PROVIDES_SMPIBSIBSBRNTRGTIBSFETCHSAMIBSFFVIBSOPCNTIBSOPCNTEXTIBSOPSAMIBSRDWROPCNTIBSRIPINVALIDCHKIBS_FETCH_CTLXIBS_OPDATA4IBS_OPFUSEIBS_PREVENTHOSTIBS_ZEN4IDPRED_CTRLINT_WBINVDINVLPGBLAHFLAMLBRVIRTLZCNTMCAOVERFLOWMCDT_NOMCOMMITMD_CLEARMMXMMXEXTMOVBEMOVDIR64BMOVDIRIMOVSB_ZLMOVUMPXMSRIRCMSRLISTMSR_PAGEFLUSHNRIPSNXOSXSAVEPCONFIGPOPCNTPPINPREFETCHIPSFDRDPRURDRANDRDSEEDRDTSCPRRSBA_CTRLRTMRTM_ALWAYS_ABORTSERIALIZESEVSEV_64BITSEV_ALTERNATIVESEV_DEBUGSWAPSEV_ESSEV_RESTRICTEDSEV_SNPSGXSGXLCSHASMESME_COHERENTSPEC_CTRL_SSBDSRBDS_CTRLSSESSE2SSE3SSE4SSE42SSE4ASSSE3STIBPSTIBP_ALWAYSONSTOSB_SHORTSUCCORSVMSVMDASVMFBASIDSVMLSVMNPSVMPFSVMPFTSYSCALLSYSEETBMTDX_GUESTTLB_FLUSH_NESTEDTMETOPEXTTSCRATEMSRTSXLDTRKVAESVMCBCLEANVMPLVMSA_REGPROTVMXVPCLMULQDQVTEWAITPKGWBNOINVDWRMSRNSX87XGETBV1XOPXSAVEXSAVECXSAVEOPTXSAVESAESARMARMCPUIDASIMDASIMDDPASIMDHPASIMDRDMATOMICSCRC32DCPOPEVTSTRMFCMAFPFPHPGPAJSCVTLRCPCPMULLSHA1SHA2SHA3SHA512SM3SM4SVElastID" +const _FeatureID_name = "firstIDADXAESNIAMD3DNOWAMD3DNOWEXTAMXBF16AMXFP16AMXINT8AMXTILEAPX_FAVXAVX10AVX10_128AVX10_256AVX10_512AVX2AVX512BF16AVX512BITALGAVX512BWAVX512CDAVX512DQAVX512ERAVX512FAVX512FP16AVX512IFMAAVX512PFAVX512VBMIAVX512VBMI2AVX512VLAVX512VNNIAVX512VP2INTERSECTAVX512VPOPCNTDQAVXIFMAAVXNECONVERTAVXSLOWAVXVNNIAVXVNNIINT8AVXVNNIINT16BHI_CTRLBMI1BMI2CETIBTCETSSCLDEMOTECLMULCLZEROCMOVCMPCCXADDCMPSB_SCADBS_SHORTCMPXCHG8CPBOOSTCPPCCX16EFER_LMSLE_UNSENQCMDERMSF16CFLUSH_L1DFMA3FMA4FP128FP256FSRMFXSRFXSROPTGFNIHLEHRESETHTTHWAHYBRID_CPUHYPERVISORIA32_ARCH_CAPIA32_CORE_CAPIBPBIBPB_BRTYPEIBRSIBRS_PREFERREDIBRS_PROVIDES_SMPIBSIBSBRNTRGTIBSFETCHSAMIBSFFVIBSOPCNTIBSOPCNTEXTIBSOPSAMIBSRDWROPCNTIBSRIPINVALIDCHKIBS_FETCH_CTLXIBS_OPDATA4IBS_OPFUSEIBS_PREVENTHOSTIBS_ZEN4IDPRED_CTRLINT_WBINVDINVLPGBKEYLOCKERKEYLOCKERWLAHFLAMLBRVIRTLZCNTMCAOVERFLOWMCDT_NOMCOMMITMD_CLEARMMXMMXEXTMOVBEMOVDIR64BMOVDIRIMOVSB_ZLMOVUMPXMSRIRCMSRLISTMSR_PAGEFLUSHNRIPSNXOSXSAVEPCONFIGPOPCNTPPINPREFETCHIPSFDRDPRURDRANDRDSEEDRDTSCPRRSBA_CTRLRTMRTM_ALWAYS_ABORTSBPBSERIALIZESEVSEV_64BITSEV_ALTERNATIVESEV_DEBUGSWAPSEV_ESSEV_RESTRICTEDSEV_SNPSGXSGXLCSHASMESME_COHERENTSPEC_CTRL_SSBDSRBDS_CTRLSRSO_MSR_FIXSRSO_NOSRSO_USER_KERNEL_NOSSESSE2SSE3SSE4SSE42SSE4ASSSE3STIBPSTIBP_ALWAYSONSTOSB_SHORTSUCCORSVMSVMDASVMFBASIDSVMLSVMNPSVMPFSVMPFTSYSCALLSYSEETBMTDX_GUESTTLB_FLUSH_NESTEDTMETOPEXTTSCRATEMSRTSXLDTRKVAESVMCBCLEANVMPLVMSA_REGPROTVMXVPCLMULQDQVTEWAITPKGWBNOINVDWRMSRNSX87XGETBV1XOPXSAVEXSAVECXSAVEOPTXSAVESAESARMARMCPUIDASIMDASIMDDPASIMDHPASIMDRDMATOMICSCRC32DCPOPEVTSTRMFCMAFPFPHPGPAJSCVTLRCPCPMULLSHA1SHA2SHA3SHA512SM3SM4SVElastID" -var _FeatureID_index = [...]uint16{0, 7, 10, 15, 23, 34, 41, 48, 55, 62, 65, 69, 79, 91, 99, 107, 115, 123, 130, 140, 150, 158, 168, 179, 187, 197, 215, 230, 237, 249, 256, 263, 274, 282, 286, 290, 296, 301, 309, 314, 320, 324, 333, 351, 359, 366, 370, 374, 388, 394, 398, 402, 411, 415, 419, 424, 429, 433, 437, 444, 448, 451, 457, 460, 463, 473, 483, 496, 509, 513, 517, 531, 548, 551, 561, 572, 578, 586, 597, 605, 617, 633, 647, 658, 668, 683, 691, 702, 712, 719, 723, 726, 733, 738, 749, 756, 763, 771, 774, 780, 785, 794, 801, 809, 813, 816, 822, 829, 842, 847, 849, 856, 863, 869, 873, 882, 886, 891, 897, 903, 909, 919, 922, 938, 947, 950, 959, 974, 987, 993, 1007, 1014, 1017, 1022, 1025, 1028, 1040, 1054, 1064, 1067, 1071, 1075, 1079, 1084, 1089, 1094, 1099, 1113, 1124, 1130, 1133, 1138, 1147, 1151, 1156, 1161, 1167, 1174, 1179, 1182, 1191, 1207, 1210, 1216, 1226, 1234, 1238, 1247, 1251, 1263, 1266, 1276, 1279, 1286, 1294, 1301, 1304, 1311, 1314, 1319, 1325, 1333, 1339, 1345, 1353, 1358, 1365, 1372, 1380, 1387, 1392, 1397, 1404, 1408, 1410, 1414, 1417, 1422, 1427, 1432, 1436, 1440, 1444, 1450, 1453, 1456, 1459, 1465} +var _FeatureID_index = [...]uint16{0, 7, 10, 15, 23, 34, 41, 48, 55, 62, 67, 70, 75, 84, 93, 102, 106, 116, 128, 136, 144, 152, 160, 167, 177, 187, 195, 205, 216, 224, 234, 252, 267, 274, 286, 293, 300, 311, 323, 331, 335, 339, 345, 350, 358, 363, 369, 373, 382, 400, 408, 415, 419, 423, 437, 443, 447, 451, 460, 464, 468, 473, 478, 482, 486, 493, 497, 500, 506, 509, 512, 522, 532, 545, 558, 562, 573, 577, 591, 608, 611, 621, 632, 638, 646, 657, 665, 677, 693, 707, 718, 728, 743, 751, 762, 772, 779, 788, 798, 802, 805, 812, 817, 828, 835, 842, 850, 853, 859, 864, 873, 880, 888, 892, 895, 901, 908, 921, 926, 928, 935, 942, 948, 952, 961, 965, 970, 976, 982, 988, 998, 1001, 1017, 1021, 1030, 1033, 1042, 1057, 1070, 1076, 1090, 1097, 1100, 1105, 1108, 1111, 1123, 1137, 1147, 1159, 1166, 1185, 1188, 1192, 1196, 1200, 1205, 1210, 1215, 1220, 1234, 1245, 1251, 1254, 1259, 1268, 1272, 1277, 1282, 1288, 1295, 1300, 1303, 1312, 1328, 1331, 1337, 1347, 1355, 1359, 1368, 1372, 1384, 1387, 1397, 1400, 1407, 1415, 1422, 1425, 1432, 1435, 1440, 1446, 1454, 1460, 1466, 1474, 1479, 1486, 1493, 1501, 1508, 1513, 1518, 1525, 1529, 1531, 1535, 1538, 1543, 1548, 1553, 1557, 1561, 1565, 1571, 1574, 1577, 1580, 1586} func (i FeatureID) String() string { if i < 0 || i >= FeatureID(len(_FeatureID_index)-1) { diff --git a/vendor/github.com/klauspost/reedsolomon/README.md b/vendor/github.com/klauspost/reedsolomon/README.md index e9c148f..d94512a 100644 --- a/vendor/github.com/klauspost/reedsolomon/README.md +++ b/vendor/github.com/klauspost/reedsolomon/README.md @@ -25,6 +25,10 @@ Using Go modules is recommended. # Changes +## 2024 + + * Auto-generation of SVE and NEON routines for ARM based on AVX2 code. This results in a speedup of 2x for SVE (as measured using Graviton 3 on AWS) and a speedup of 1.5x as compared to the existing NEON-accelerated code. + ## 2022 * [GFNI](https://github.com/klauspost/reedsolomon/pull/224) support for amd64, for up to 3x faster processing. @@ -534,6 +538,21 @@ BenchmarkGaloisXor128K-160 862.02 7905.00 9.17x BenchmarkGaloisXor1M-160 784.60 6296.65 8.03x ``` +# Legal + +> None of section below is legal advice. Seek your own legal counsel. +> As stated by the [LICENSE](LICENSE) the authors will not be held reliable for any use of this library. +> Users are encouraged to independently verify they comply with all legal requirements. + +As can be seen in [recent news](https://www.datanami.com/2023/10/16/cloudera-hit-with-240-million-judgement-over-erasure-coding/) +there has been lawsuits related to possible patents of aspects of erasure coding functionality. + +As a possible mitigation it is possible to use the tag `nopshufb` when compiling any code which includes this package. +This will remove all inclusion and use of `PSHUFB` and equivalent on other platforms. + +This is done by adding `-tags=nopshufb` to `go build` and similar commands that produce binary output. + +The removed code may not be infringing and even after `-tags=nopshufb` there may still be infringing code left. # Links * [Backblaze Open Sources Reed-Solomon Erasure Coding Source Code](https://www.backblaze.com/blog/reed-solomon/). @@ -543,8 +562,9 @@ BenchmarkGaloisXor1M-160 784.60 6296.65 8.03x * [Reed-Solomon Erasure Coding in Haskell](https://github.com/NicolasT/reedsolomon). Haskell port of the package with similar performance. * [reed-solomon-erasure](https://github.com/darrenldl/reed-solomon-erasure). Compatible Rust implementation. * [go-erasure](https://github.com/somethingnew2-0/go-erasure). A similar library using cgo, slower in my tests. -* [Screaming Fast Galois Field Arithmetic](http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf). Basis for SSE3 optimizations. +* [Screaming Fast Galois Field Arithmetic](https://www.snia.org/sites/default/files/files2/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf). Basis for SSE3 optimizations. * [Leopard-RS](https://github.com/catid/leopard) C library used as basis for GF16 implementation. +* [reed-solomon-simd](https://github.com/AndersTrier/reed-solomon-simd) Leopard-RS Rust implementation. # License diff --git a/vendor/github.com/klauspost/reedsolomon/galois.go b/vendor/github.com/klauspost/reedsolomon/galois.go index 479fa44..bbc521f 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois.go +++ b/vendor/github.com/klauspost/reedsolomon/galois.go @@ -62,21 +62,17 @@ var logTable = [fieldSize]byte{ /** * Inverse of the logarithm table. Maps integer logarithms - * to members of the field. There is no entry for 255 - * because the highest log is 254. + * to members of the field. Entry 255 is the same as entry 0 sue to mod 255. * * This table was generated by `go run gentables.go` + * Table has been truncated to 256 bytes, since no lookups are bigger. */ -var expTable = []byte{0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 0x8f, 0x3, 0x6, 0xc, 0x18, 0x30, 0x60, 0xc0, 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 0x46, 0x8c, 0x5, 0xa, 0x14, 0x28, 0x50, 0xa0, 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 0x65, 0xca, 0x89, 0xf, 0x1e, 0x3c, 0x78, 0xf0, 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 0xd, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x7, 0xe, 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x9, 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0xb, 0x16, 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 0x8f, 0x3, 0x6, 0xc, 0x18, 0x30, 0x60, 0xc0, 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 0x46, 0x8c, 0x5, 0xa, 0x14, 0x28, 0x50, 0xa0, 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 0x65, 0xca, 0x89, 0xf, 0x1e, 0x3c, 0x78, 0xf0, 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 0xd, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x7, 0xe, 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x9, 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0xb, 0x16, 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e} +var expTable = [256]byte{0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 0x8f, 0x3, 0x6, 0xc, 0x18, 0x30, 0x60, 0xc0, 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 0x46, 0x8c, 0x5, 0xa, 0x14, 0x28, 0x50, 0xa0, 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 0x65, 0xca, 0x89, 0xf, 0x1e, 0x3c, 0x78, 0xf0, 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 0xd, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x7, 0xe, 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x9, 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0xb, 0x16, 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x1} func galAdd(a, b byte) byte { return a ^ b } -func galSub(a, b byte) byte { - return a ^ b -} - // Table from https://github.com/templexxx/reedsolomon var invTable = [256]byte{0x0, 0x1, 0x8e, 0xf4, 0x47, 0xa7, 0x7a, 0xba, 0xad, 0x9d, 0xdd, 0x98, 0x3d, 0xaa, 0x5d, 0x96, 0xd8, 0x72, 0xc0, 0x58, 0xe0, 0x3e, 0x4c, 0x66, 0x90, 0xde, 0x55, 0x80, 0xa0, 0x83, 0x4b, 0x2a, 0x6c, 0xed, 0x39, 0x51, 0x60, 0x56, 0x2c, 0x8a, 0x70, 0xd0, 0x1f, 0x4a, 0x26, 0x8b, 0x33, 0x6e, 0x48, 0x89, 0x6f, 0x2e, 0xa4, 0xc3, 0x40, 0x5e, 0x50, 0x22, 0xcf, 0xa9, 0xab, 0xc, 0x15, 0xe1, 0x36, 0x5f, 0xf8, 0xd5, 0x92, 0x4e, 0xa6, 0x4, 0x30, 0x88, 0x2b, 0x1e, 0x16, 0x67, 0x45, 0x93, 0x38, 0x23, 0x68, 0x8c, 0x81, 0x1a, 0x25, 0x61, 0x13, 0xc1, 0xcb, 0x63, 0x97, 0xe, 0x37, 0x41, 0x24, 0x57, 0xca, 0x5b, 0xb9, 0xc4, 0x17, 0x4d, 0x52, 0x8d, 0xef, 0xb3, 0x20, 0xec, 0x2f, 0x32, 0x28, 0xd1, 0x11, 0xd9, 0xe9, 0xfb, 0xda, 0x79, 0xdb, 0x77, 0x6, 0xbb, 0x84, 0xcd, 0xfe, 0xfc, 0x1b, 0x54, 0xa1, 0x1d, 0x7c, 0xcc, 0xe4, 0xb0, 0x49, 0x31, 0x27, 0x2d, 0x53, 0x69, 0x2, 0xf5, 0x18, 0xdf, 0x44, 0x4f, 0x9b, 0xbc, 0xf, 0x5c, 0xb, 0xdc, 0xbd, 0x94, 0xac, 0x9, 0xc7, 0xa2, 0x1c, 0x82, 0x9f, 0xc6, 0x34, 0xc2, 0x46, 0x5, 0xce, 0x3b, 0xd, 0x3c, 0x9c, 0x8, 0xbe, 0xb7, 0x87, 0xe5, 0xee, 0x6b, 0xeb, 0xf2, 0xbf, 0xaf, 0xc5, 0x64, 0x7, 0x7b, 0x95, 0x9a, 0xae, 0xb6, 0x12, 0x59, 0xa5, 0x35, 0x65, 0xb8, 0xa3, 0x9e, 0xd2, 0xf7, 0x62, 0x5a, 0x85, 0x7d, 0xa8, 0x3a, 0x29, 0x71, 0xc8, 0xf6, 0xf9, 0x43, 0xd7, 0xd6, 0x10, 0x73, 0x76, 0x78, 0x99, 0xa, 0x19, 0x91, 0x14, 0x3f, 0xe6, 0xf0, 0x86, 0xb1, 0xe2, 0xf1, 0xfa, 0x74, 0xf3, 0xb4, 0x6d, 0x21, 0xb2, 0x6a, 0xe3, 0xe7, 0xb5, 0xea, 0x3, 0x8f, 0xd3, 0xc9, 0x42, 0xd4, 0xe8, 0x75, 0x7f, 0xff, 0x7e, 0xfd} @@ -883,6 +879,15 @@ func galDivide(a, b byte) byte { if logResult < 0 { logResult += 255 } + return expTable[uint8(logResult)] +} + +// galOneOver is the same as galDivide(1, a). +func galOneOver(a byte) byte { + if a == 0 { + panic("Argument 'divisor' is 0") + } + logResult := logTable[a] ^ 255 return expTable[logResult] } @@ -902,17 +907,17 @@ func galExp(a byte, n int) byte { for logResult >= 255 { logResult -= 255 } - return expTable[logResult] + return expTable[uint8(logResult)] } -func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) []byte { - if !avx2CodeGen { +func genCodeGenMatrix(matrixRows [][]byte, inputs, inIdx, outputs, vectorLength int, dst []byte) []byte { + if !codeGen { panic("codegen not enabled") } total := inputs * outputs // Duplicated in+out - wantBytes := total * 32 * 2 + wantBytes := total * vectorLength * 2 if cap(dst) < wantBytes { dst = AllocAligned(1, wantBytes)[0] } else { @@ -920,15 +925,16 @@ func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) } for i, row := range matrixRows[:outputs] { for j, idx := range row[inIdx : inIdx+inputs] { - dstIdx := (j*outputs + i) * 64 + dstIdx := (j*outputs + i) * vectorLength * 2 dstPart := dst[dstIdx:] - dstPart = dstPart[:64] + dstPart = dstPart[:vectorLength*2] lo := mulTableLow[idx][:] hi := mulTableHigh[idx][:] - copy(dstPart[:16], lo) - copy(dstPart[16:32], lo) - copy(dstPart[32:48], hi) - copy(dstPart[48:64], hi) + + for k := 0; k < vectorLength; k += 16 { + copy(dstPart[k:k+16], lo) + copy(dstPart[vectorLength*2-(k+16):vectorLength*2-k], hi) + } } } return dst @@ -937,7 +943,7 @@ func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) var gf2p811dMulMatrices = [256]uint64{0, 0x102040810204080, 0x8001828488102040, 0x8103868c983060c0, 0x408041c2c4881020, 0x418245cad4a850a0, 0xc081c3464c983060, 0xc183c74e5cb870e0, 0x2040a061e2c48810, 0x2142a469f2e4c890, 0xa04122e56ad4a850, 0xa14326ed7af4e8d0, 0x60c0e1a3264c9830, 0x61c2e5ab366cd8b0, 0xe0c16327ae5cb870, 0xe1c3672fbe7cf8f0, 0x102050b071e2c488, 0x112254b861c28408, 0x9021d234f9f2e4c8, 0x9123d63ce9d2a448, 0x50a01172b56ad4a8, 0x51a2157aa54a9428, 0xd0a193f63d7af4e8, 0xd1a397fe2d5ab468, 0x3060f0d193264c98, 0x3162f4d983060c18, 0xb06172551b366cd8, 0xb163765d0b162c58, 0x70e0b11357ae5cb8, 0x71e2b51b478e1c38, 0xf0e13397dfbe7cf8, 0xf1e3379fcf9e3c78, 0x8810a8d83871e2c4, 0x8912acd02851a244, 0x8112a5cb061c284, 0x9132e54a0418204, 0xc890e91afcf9f2e4, 0xc992ed12ecd9b264, 0x48916b9e74e9d2a4, 0x49936f9664c99224, 0xa85008b9dab56ad4, 0xa9520cb1ca952a54, 0x28518a3d52a54a94, 0x29538e3542850a14, 0xe8d0497b1e3d7af4, 0xe9d24d730e1d3a74, 0x68d1cbff962d5ab4, 0x69d3cff7860d1a34, 0x9830f8684993264c, 0x9932fc6059b366cc, 0x18317aecc183060c, 0x19337ee4d1a3468c, 0xd8b0b9aa8d1b366c, 0xd9b2bda29d3b76ec, 0x58b13b2e050b162c, 0x59b33f26152b56ac, 0xb8705809ab57ae5c, 0xb9725c01bb77eedc, 0x3871da8d23478e1c, 0x3973de853367ce9c, 0xf8f019cb6fdfbe7c, 0xf9f21dc37ffffefc, 0x78f19b4fe7cf9e3c, 0x79f39f47f7efdebc, 0xc488d46c1c3871e2, 0xc58ad0640c183162, 0x448956e8942851a2, 0x458b52e084081122, 0x840895aed8b061c2, 0x850a91a6c8902142, 0x409172a50a04182, 0x50b132240800102, 0xe4c8740dfefcf9f2, 0xe5ca7005eedcb972, 0x64c9f68976ecd9b2, 0x65cbf28166cc9932, 0xa44835cf3a74e9d2, 0xa54a31c72a54a952, 0x2449b74bb264c992, 0x254bb343a2448912, 0xd4a884dc6ddab56a, 0xd5aa80d47dfaf5ea, 0x54a90658e5ca952a, 0x55ab0250f5ead5aa, 0x9428c51ea952a54a, 0x952ac116b972e5ca, 0x1429479a2142850a, 0x152b43923162c58a, 0xf4e824bd8f1e3d7a, 0xf5ea20b59f3e7dfa, 0x74e9a639070e1d3a, 0x75eba231172e5dba, 0xb468657f4b962d5a, 0xb56a61775bb66dda, 0x3469e7fbc3860d1a, 0x356be3f3d3a64d9a, 0x4c987cb424499326, 0x4d9a78bc3469d3a6, 0xcc99fe30ac59b366, 0xcd9bfa38bc79f3e6, 0xc183d76e0c18306, 0xd1a397ef0e1c386, 0x8c19bff268d1a346, 0x8d1bbbfa78f1e3c6, 0x6cd8dcd5c68d1b36, 0x6ddad8ddd6ad5bb6, 0xecd95e514e9d3b76, 0xeddb5a595ebd7bf6, 0x2c589d1702050b16, 0x2d5a991f12254b96, 0xac591f938a152b56, 0xad5b1b9b9a356bd6, 0x5cb82c0455ab57ae, 0x5dba280c458b172e, 0xdcb9ae80ddbb77ee, 0xddbbaa88cd9b376e, 0x1c386dc69123478e, 0x1d3a69ce8103070e, 0x9c39ef42193367ce, 0x9d3beb4a0913274e, 0x7cf88c65b76fdfbe, 0x7dfa886da74f9f3e, 0xfcf90ee13f7ffffe, 0xfdfb0ae92f5fbf7e, 0x3c78cda773e7cf9e, 0x3d7ac9af63c78f1e, 0xbc794f23fbf7efde, 0xbd7b4b2bebd7af5e, 0xe2c46a368e1c3871, 0xe3c66e3e9e3c78f1, 0x62c5e8b2060c1831, 0x63c7ecba162c58b1, 0xa2442bf44a942851, 0xa3462ffc5ab468d1, 0x2245a970c2840811, 0x2347ad78d2a44891, 0xc284ca576cd8b061, 0xc386ce5f7cf8f0e1, 0x428548d3e4c89021, 0x43874cdbf4e8d0a1, 0x82048b95a850a041, 0x83068f9db870e0c1, 0x205091120408001, 0x3070d193060c081, 0xf2e43a86fffefcf9, 0xf3e63e8eefdebc79, 0x72e5b80277eedcb9, 0x73e7bc0a67ce9c39, 0xb2647b443b76ecd9, 0xb3667f4c2b56ac59, 0x3265f9c0b366cc99, 0x3367fdc8a3468c19, 0xd2a49ae71d3a74e9, 0xd3a69eef0d1a3469, 0x52a51863952a54a9, 0x53a71c6b850a1429, 0x9224db25d9b264c9, 0x9326df2dc9922449, 0x122559a151a24489, 0x13275da941820409, 0x6ad4c2eeb66ddab5, 0x6bd6c6e6a64d9a35, 0xead5406a3e7dfaf5, 0xebd744622e5dba75, 0x2a54832c72e5ca95, 0x2b56872462c58a15, 0xaa5501a8faf5ead5, 0xab5705a0ead5aa55, 0x4a94628f54a952a5, 0x4b96668744891225, 0xca95e00bdcb972e5, 0xcb97e403cc993265, 0xa14234d90214285, 0xb16274580010205, 0x8a15a1c9183162c5, 0x8b17a5c108112245, 0x7af4925ec78f1e3d, 0x7bf69656d7af5ebd, 0xfaf510da4f9f3e7d, 0xfbf714d25fbf7efd, 0x3a74d39c03070e1d, 0x3b76d79413274e9d, 0xba7551188b172e5d, 0xbb7755109b376edd, 0x5ab4323f254b962d, 0x5bb63637356bd6ad, 0xdab5b0bbad5bb66d, 0xdbb7b4b3bd7bf6ed, 0x1a3473fde1c3860d, 0x1b3677f5f1e3c68d, 0x9a35f17969d3a64d, 0x9b37f57179f3e6cd, 0x264cbe5a92244993, 0x274eba5282040913, 0xa64d3cde1a3469d3, 0xa74f38d60a142953, 0x66ccff9856ac59b3, 0x67cefb90468c1933, 0xe6cd7d1cdebc79f3, 0xe7cf7914ce9c3973, 0x60c1e3b70e0c183, 0x70e1a3360c08103, 0x860d9cbff8f0e1c3, 0x870f98b7e8d0a143, 0x468c5ff9b468d1a3, 0x478e5bf1a4489123, 0xc68ddd7d3c78f1e3, 0xc78fd9752c58b163, 0x366ceeeae3c68d1b, 0x376eeae2f3e6cd9b, 0xb66d6c6e6bd6ad5b, 0xb76f68667bf6eddb, 0x76ecaf28274e9d3b, 0x77eeab20376eddbb, 0xf6ed2dacaf5ebd7b, 0xf7ef29a4bf7efdfb, 0x162c4e8b0102050b, 0x172e4a831122458b, 0x962dcc0f8912254b, 0x972fc807993265cb, 0x56ac0f49c58a152b, 0x57ae0b41d5aa55ab, 0xd6ad8dcd4d9a356b, 0xd7af89c55dba75eb, 0xae5c1682aa55ab57, 0xaf5e128aba75ebd7, 0x2e5d940622458b17, 0x2f5f900e3265cb97, 0xeedc57406eddbb77, 0xefde53487efdfbf7, 0x6eddd5c4e6cd9b37, 0x6fdfd1ccf6eddbb7, 0x8e1cb6e348912347, 0x8f1eb2eb58b163c7, 0xe1d3467c0810307, 0xf1f306fd0a14387, 0xce9cf7218c193367, 0xcf9ef3299c3973e7, 0x4e9d75a504091327, 0x4f9f71ad142953a7, 0xbe7c4632dbb76fdf, 0xbf7e423acb972f5f, 0x3e7dc4b653a74f9f, 0x3f7fc0be43870f1f, 0xfefc07f01f3f7fff, 0xfffe03f80f1f3f7f, 0x7efd8574972f5fbf, 0x7fff817c870f1f3f, 0x9e3ce6533973e7cf, 0x9f3ee25b2953a74f, 0x1e3d64d7b163c78f, 0x1f3f60dfa143870f, 0xdebca791fdfbf7ef, 0xdfbea399eddbb76f, 0x5ebd251575ebd7af, 0x5fbf211d65cb972f} func genGFNIMatrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []uint64) []uint64 { - if !avx2CodeGen { + if !codeGen { panic("codegen not enabled") } total := inputs * outputs diff --git a/vendor/github.com/klauspost/reedsolomon/galois_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_amd64.go index 9f84276..8025560 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_amd64.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.go @@ -1,10 +1,11 @@ -//go:build !noasm && !appengine && !gccgo -// +build !noasm,!appengine,!gccgo +//go:build !noasm && !appengine && !gccgo && !nopshufb // Copyright 2015, Klaus Post, see LICENSE for details. package reedsolomon +const pshufb = true + //go:noescape func galMulSSSE3(low, high, in, out []byte) @@ -17,21 +18,12 @@ func galMulAVX2Xor(low, high, in, out []byte) //go:noescape func galMulAVX2(low, high, in, out []byte) -//go:noescape -func sSE2XorSlice(in, out []byte) - //go:noescape func galMulAVX2Xor_64(low, high, in, out []byte) //go:noescape func galMulAVX2_64(low, high, in, out []byte) -//go:noescape -func sSE2XorSlice_64(in, out []byte) - -//go:noescape -func avx2XorSlice_64(in, out []byte) - // This is what the assembler routines do in blocks of 16 bytes: /* func galMulSSSE3(low, high, in, out []byte) { @@ -61,20 +53,32 @@ func galMulSlice(c byte, in, out []byte, o *options) { } if o.useAVX2 { if len(in) >= bigSwitchover { - galMulAVX2_64(mulTableLow[c][:], mulTableHigh[c][:], in, out) done := (len(in) >> 6) << 6 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + galMulAVX2_64(mulTableLow[c][:], mulTableHigh[c][:], in, out) in = in[done:] out = out[done:] } if len(in) > 32 { - galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out) done := (len(in) >> 5) << 5 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out) in = in[done:] out = out[done:] } } else if o.useSSSE3 { - galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out) done := (len(in) >> 4) << 4 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out) in = in[done:] out = out[done:] } @@ -93,20 +97,32 @@ func galMulSliceXor(c byte, in, out []byte, o *options) { if o.useAVX2 { if len(in) >= bigSwitchover { - galMulAVX2Xor_64(mulTableLow[c][:], mulTableHigh[c][:], in, out) done := (len(in) >> 6) << 6 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + galMulAVX2Xor_64(mulTableLow[c][:], mulTableHigh[c][:], in, out) in = in[done:] out = out[done:] } if len(in) >= 32 { - galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out) done := (len(in) >> 5) << 5 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out) in = in[done:] out = out[done:] } } else if o.useSSSE3 { - galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out) done := (len(in) >> 4) << 4 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out) in = in[done:] out = out[done:] } @@ -125,20 +141,32 @@ func sliceXor(in, out []byte, o *options) { if o.useSSE2 { if len(in) >= bigSwitchover { if o.useAVX2 { - avx2XorSlice_64(in, out) done := (len(in) >> 6) << 6 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + avx2XorSlice_64(in, out) in = in[done:] out = out[done:] } else { - sSE2XorSlice_64(in, out) done := (len(in) >> 6) << 6 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + sSE2XorSlice_64(in, out) in = in[done:] out = out[done:] } } if len(in) >= 16 { - sSE2XorSlice(in, out) done := (len(in) >> 4) << 4 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + sSE2XorSlice(in, out) in = in[done:] out = out[done:] } @@ -233,7 +261,7 @@ func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *optio return } - if false && o.useGFNI { + if false && o.useAvx512GFNI { // Note that these currently require that length is multiple of 64. t01 := gf2p811dMulMatrices[log_m01] t23 := gf2p811dMulMatrices[log_m23] @@ -388,7 +416,7 @@ func fftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *option return } - if false && o.useGFNI { + if false && o.useAvx512GFNI { t01 := gf2p811dMulMatrices[log_m01] t23 := gf2p811dMulMatrices[log_m23] t02 := gf2p811dMulMatrices[log_m02] @@ -470,9 +498,17 @@ func fftDIT2(x, y []byte, log_m ffe, o *options) { } if o.useAVX2 { tmp := &multiply256LUT[log_m] + if raceEnabled { + raceReadSlice(y) + raceWriteSlice(x) + } fftDIT2_avx2(x, y, tmp) } else if o.useSSSE3 { tmp := &multiply256LUT[log_m] + if raceEnabled { + raceReadSlice(y) + raceWriteSlice(x) + } fftDIT2_ssse3(x, y, tmp) } else { // Reference version: @@ -488,11 +524,15 @@ func fftDIT28(x, y []byte, log_m ffe8, o *options) { } if o.useAVX2 { + done := (len(y) >> 6) << 6 + if raceEnabled { + raceReadSlice(y[:done]) + raceWriteSlice(x[:done]) + } fftDIT28_avx2(x, y, &multiply256LUT8[log_m]) if len(x)&63 == 0 { return } - done := (len(y) >> 6) << 6 y = y[done:] x = x[done:] } @@ -507,11 +547,15 @@ func ifftDIT28(x, y []byte, log_m ffe8, o *options) { } if o.useAVX2 { + done := (len(y) >> 6) << 6 + if raceEnabled { + raceReadSlice(y[:done]) + raceWriteSlice(x[:done]) + } ifftDIT28_avx2(x, y, &multiply256LUT8[log_m]) if len(x)&63 == 0 { return } - done := (len(y) >> 6) << 6 y = y[done:] x = x[done:] } @@ -522,14 +566,22 @@ func ifftDIT28(x, y []byte, log_m ffe8, o *options) { func mulAdd8(x, y []byte, log_m ffe8, o *options) { if o.useAVX2 { t := &multiply256LUT8[log_m] - galMulAVX2Xor_64(t[:16], t[16:32], y, x) done := (len(y) >> 6) << 6 + if raceEnabled { + raceReadSlice(y[:done]) + raceWriteSlice(x[:done]) + } + galMulAVX2Xor_64(t[:16], t[16:32], y, x) y = y[done:] x = x[done:] } else if o.useSSSE3 { t := &multiply256LUT8[log_m] - galMulSSSE3Xor(t[:16], t[16:32], y, x) done := (len(y) >> 4) << 4 + if raceEnabled { + raceReadSlice(y[:done]) + raceWriteSlice(x[:done]) + } + galMulSSSE3Xor(t[:16], t[16:32], y, x) y = y[done:] x = x[done:] } @@ -543,9 +595,19 @@ func ifftDIT2(x, y []byte, log_m ffe, o *options) { } if o.useAVX2 { tmp := &multiply256LUT[log_m] + if raceEnabled { + raceReadSlice(y) + raceWriteSlice(x) + } + ifftDIT2_avx2(x, y, tmp) } else if o.useSSSE3 { tmp := &multiply256LUT[log_m] + if raceEnabled { + raceReadSlice(y) + raceWriteSlice(x) + } + ifftDIT2_ssse3(x, y, tmp) } else { // Reference version: @@ -560,9 +622,17 @@ func mulgf16(x, y []byte, log_m ffe, o *options) { } if o.useAVX2 { tmp := &multiply256LUT[log_m] + if raceEnabled { + raceReadSlice(y) + raceWriteSlice(x) + } mulgf16_avx2(x, y, tmp) } else if o.useSSSE3 { tmp := &multiply256LUT[log_m] + if raceEnabled { + raceReadSlice(y) + raceWriteSlice(x) + } mulgf16_ssse3(x, y, tmp) } else { refMul(x, y, log_m) @@ -572,14 +642,23 @@ func mulgf16(x, y []byte, log_m ffe, o *options) { func mulgf8(out, in []byte, log_m ffe8, o *options) { if o.useAVX2 { t := &multiply256LUT8[log_m] - galMulAVX2_64(t[:16], t[16:32], in, out) done := (len(in) >> 6) << 6 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + + galMulAVX2_64(t[:16], t[16:32], in, out) in = in[done:] out = out[done:] } else if o.useSSSE3 { t := &multiply256LUT8[log_m] - galMulSSSE3(t[:16], t[16:32], in, out) done := (len(in) >> 4) << 4 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + galMulSSSE3(t[:16], t[16:32], in, out) in = in[done:] out = out[done:] } diff --git a/vendor/github.com/klauspost/reedsolomon/galois_amd64.s b/vendor/github.com/klauspost/reedsolomon/galois_amd64.s index 3e97c7c..18e08c3 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_amd64.s +++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.s @@ -1,6 +1,7 @@ //+build !noasm //+build !appengine //+build !gccgo +//+build !nopshufb // Copyright 2015, Klaus Post, see LICENSE for details. @@ -215,28 +216,6 @@ done_avx2: VZEROUPPER RET -// func sSE2XorSlice(in, out []byte) -TEXT ·sSE2XorSlice(SB), 7, $0 - MOVQ in+0(FP), SI // SI: &in - MOVQ in_len+8(FP), R9 // R9: len(in) - MOVQ out+24(FP), DX // DX: &out - SHRQ $4, R9 // len(in) / 16 - CMPQ R9, $0 - JEQ done_xor_sse2 - -loopback_xor_sse2: - MOVOU (SI), X0 // in[x] - MOVOU (DX), X1 // out[x] - PXOR X0, X1 - MOVOU X1, (DX) - ADDQ $16, SI // in+=16 - ADDQ $16, DX // out+=16 - SUBQ $1, R9 - JNZ loopback_xor_sse2 - -done_xor_sse2: - RET - // func galMulAVX2Xor_64(low, high, in, out []byte) TEXT ·galMulAVX2Xor_64(SB), 7, $0 MOVQ low+0(FP), SI // SI: &low @@ -329,66 +308,3 @@ loopback_avx2_64: done_avx2_64: VZEROUPPER RET - -// func sSE2XorSlice_64(in, out []byte) -TEXT ·sSE2XorSlice_64(SB), 7, $0 - MOVQ in+0(FP), SI // SI: &in - MOVQ in_len+8(FP), R9 // R9: len(in) - MOVQ out+24(FP), DX // DX: &out - SHRQ $6, R9 // len(in) / 64 - CMPQ R9, $0 - JEQ done_xor_sse2_64 - -loopback_xor_sse2_64: - MOVOU (SI), X0 // in[x] - MOVOU 16(SI), X2 // in[x] - MOVOU 32(SI), X4 // in[x] - MOVOU 48(SI), X6 // in[x] - MOVOU (DX), X1 // out[x] - MOVOU 16(DX), X3 // out[x] - MOVOU 32(DX), X5 // out[x] - MOVOU 48(DX), X7 // out[x] - PXOR X0, X1 - PXOR X2, X3 - PXOR X4, X5 - PXOR X6, X7 - MOVOU X1, (DX) - MOVOU X3, 16(DX) - MOVOU X5, 32(DX) - MOVOU X7, 48(DX) - ADDQ $64, SI // in+=64 - ADDQ $64, DX // out+=64 - SUBQ $1, R9 - JNZ loopback_xor_sse2_64 - -done_xor_sse2_64: - RET - -// func avx2XorSlice_64(in, out []byte) -TEXT ·avx2XorSlice_64(SB), 7, $0 - MOVQ in+0(FP), SI // SI: &in - MOVQ in_len+8(FP), R9 // R9: len(in) - MOVQ out+24(FP), DX // DX: &out - SHRQ $6, R9 // len(in) / 64 - CMPQ R9, $0 - JEQ done_xor_avx2_64 - -loopback_xor_avx2_64: - VMOVDQU (SI), Y0 - VMOVDQU 32(SI), Y2 - VMOVDQU (DX), Y1 - VMOVDQU 32(DX), Y3 - VPXOR Y0, Y1, Y1 - VPXOR Y2, Y3, Y3 - VMOVDQU Y1, (DX) - VMOVDQU Y3, 32(DX) - - ADDQ $64, SI // in+=64 - ADDQ $64, DX // out+=64 - SUBQ $1, R9 - JNZ loopback_xor_avx2_64 - VZEROUPPER - -done_xor_avx2_64: - - RET diff --git a/vendor/github.com/klauspost/reedsolomon/galois_arm64.go b/vendor/github.com/klauspost/reedsolomon/galois_arm64.go index 9ab2794..d860525 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_arm64.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_arm64.go @@ -1,19 +1,31 @@ -//go:build !noasm && !appengine && !gccgo -// +build !noasm,!appengine,!gccgo +//go:build !noasm && !appengine && !gccgo && !nopshufb // Copyright 2015, Klaus Post, see LICENSE for details. // Copyright 2017, Minio, Inc. package reedsolomon +const pshufb = true + //go:noescape func galMulNEON(low, high, in, out []byte) //go:noescape func galMulXorNEON(low, high, in, out []byte) -//go:noescape -func galXorNEON(in, out []byte) +func getVectorLength() (vl, pl uint64) + +func init() { + if defaultOptions.useSVE { + if vl, _ := getVectorLength(); vl <= 256 { + // set vector length in bytes + defaultOptions.vectorLength = int(vl) >> 3 + } else { + // disable SVE for hardware implementatons over 256 bits (only know to be Fujitsu A64FX atm) + defaultOptions.useSVE = false + } + } +} func galMulSlice(c byte, in, out []byte, o *options) { if c == 1 { @@ -21,8 +33,12 @@ func galMulSlice(c byte, in, out []byte, o *options) { return } var done int - galMulNEON(mulTableLow[c][:], mulTableHigh[c][:], in, out) done = (len(in) >> 5) << 5 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + galMulNEON(mulTableLow[c][:], mulTableHigh[c][:], in, out) remain := len(in) - done if remain > 0 { @@ -38,9 +54,12 @@ func galMulSliceXor(c byte, in, out []byte, o *options) { sliceXor(in, out, o) return } - var done int + done := (len(in) >> 5) << 5 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } galMulXorNEON(mulTableLow[c][:], mulTableHigh[c][:], in, out) - done = (len(in) >> 5) << 5 remain := len(in) - done if remain > 0 { @@ -51,20 +70,6 @@ func galMulSliceXor(c byte, in, out []byte, o *options) { } } -// simple slice xor -func sliceXor(in, out []byte, o *options) { - - galXorNEON(in, out) - done := (len(in) >> 5) << 5 - - remain := len(in) - done - if remain > 0 { - for i := done; i < len(in); i++ { - out[i] ^= in[i] - } - } -} - // 4-way butterfly func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) @@ -90,7 +95,7 @@ func fftDIT2(x, y []byte, log_m ffe, o *options) { // Reference version: refMulAdd(x, y, log_m) // 64 byte aligned, always full. - galXorNEON(x, y) + xorSliceNEON(x, y) } // 2-way butterfly forward @@ -103,7 +108,7 @@ func fftDIT28(x, y []byte, log_m ffe8, o *options) { // 2-way butterfly func ifftDIT2(x, y []byte, log_m ffe, o *options) { // 64 byte aligned, always full. - galXorNEON(x, y) + xorSliceNEON(x, y) // Reference version: refMulAdd(x, y, log_m) } diff --git a/vendor/github.com/klauspost/reedsolomon/galois_arm64.s b/vendor/github.com/klauspost/reedsolomon/galois_arm64.s index 3ae3237..ca3c912 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_arm64.s +++ b/vendor/github.com/klauspost/reedsolomon/galois_arm64.s @@ -1,10 +1,13 @@ //+build !noasm //+build !appengine //+build !gccgo +//+build !nopshufb // Copyright 2015, Klaus Post, see LICENSE for details. // Copyright 2017, Minio, Inc. +#include "textflag.h" + #define LOAD(LO1, LO2, HI1, HI2) \ VLD1.P 32(R1), [LO1.B16, LO2.B16] \ \ @@ -100,28 +103,13 @@ loopXor: completeXor: RET -// func galXorNEON(in, out []byte) -TEXT ·galXorNEON(SB), 7, $0 - MOVD in_base+0(FP), R1 - MOVD in_len+8(FP), R2 // length of message - MOVD out_base+24(FP), R5 - SUBS $32, R2 - BMI completeXor - -loopXor: - // Main loop - VLD1.P 32(R1), [V0.B16, V1.B16] - VLD1 (R5), [V20.B16, V21.B16] - - VEOR V20.B16, V0.B16, V4.B16 - VEOR V21.B16, V1.B16, V5.B16 - - // Store result - VST1.P [V4.D2, V5.D2], 32(R5) - - SUBS $32, R2 - BPL loopXor - -completeXor: - RET - +TEXT ·getVectorLength(SB), NOSPLIT, $0 + WORD $0xd2800002 // mov x2, #0 + WORD $0x04225022 // addvl x2, x2, #1 + WORD $0xd37df042 // lsl x2, x2, #3 + WORD $0xd2800003 // mov x3, #0 + WORD $0x04635023 // addpl x3, x3, #1 + WORD $0xd37df063 // lsl x3, x3, #3 + MOVD R2, vl+0(FP) + MOVD R3, pl+8(FP) + RET diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.go index 5f53c3b..dac9b13 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.go @@ -1,16 +1,19 @@ // Code generated by command: go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon. DO NOT EDIT. -//go:build !appengine && !noasm && !nogen && gc +//go:build !appengine && !noasm && !nogen && !nopshufb && gc package reedsolomon func _dummy_() -// mulAvxTwo_1x1 takes 1 inputs and produces 1 outputs. -// The output is initialized to 0. -// //go:noescape -func mulAvxTwo_1x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func sSE2XorSlice(in []byte, out []byte) + +//go:noescape +func sSE2XorSlice_64(in []byte, out []byte) + +//go:noescape +func avx2XorSlice_64(in []byte, out []byte) // mulAvxTwo_1x1_64 takes 1 inputs and produces 1 outputs. // The output is initialized to 0. @@ -24,27 +27,27 @@ func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x1 takes 1 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x1_64Xor takes 1 inputs and produces 1 outputs. // //go:noescape func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_1x1Xor takes 1 inputs and produces 1 outputs. +// mulAvxGFNI_1x1Xor takes 1 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_1x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x1_64Xor takes 1 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_1x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_1x2 takes 1 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_1x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_1x2_64 takes 1 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -57,27 +60,27 @@ func mulAvxTwo_1x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x2 takes 1 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x2_64Xor takes 1 inputs and produces 2 outputs. // //go:noescape func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_1x2Xor takes 1 inputs and produces 2 outputs. +// mulAvxGFNI_1x2Xor takes 1 inputs and produces 2 outputs. // //go:noescape -func mulAvxTwo_1x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x2_64Xor takes 1 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_1x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_1x3 takes 1 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_1x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_1x3_64 takes 1 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -90,15 +93,21 @@ func mulAvxTwo_1x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x3 takes 1 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x3_64Xor takes 1 inputs and produces 3 outputs. // //go:noescape func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_1x3Xor takes 1 inputs and produces 3 outputs. +// mulAvxGFNI_1x3Xor takes 1 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_1x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x3_64Xor takes 1 inputs and produces 3 outputs. // @@ -117,11 +126,22 @@ func mulAvxTwo_1x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x4 takes 1 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x4_64Xor takes 1 inputs and produces 4 outputs. // //go:noescape func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x4Xor takes 1 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x4Xor takes 1 inputs and produces 4 outputs. // //go:noescape @@ -139,11 +159,22 @@ func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x5 takes 1 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x5_64Xor takes 1 inputs and produces 5 outputs. // //go:noescape func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x5Xor takes 1 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x5Xor takes 1 inputs and produces 5 outputs. // //go:noescape @@ -161,11 +192,22 @@ func mulAvxTwo_1x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x6 takes 1 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x6_64Xor takes 1 inputs and produces 6 outputs. // //go:noescape func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x6Xor takes 1 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x6Xor takes 1 inputs and produces 6 outputs. // //go:noescape @@ -183,11 +225,22 @@ func mulAvxTwo_1x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x7 takes 1 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x7_64Xor takes 1 inputs and produces 7 outputs. // //go:noescape func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x7Xor takes 1 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x7Xor takes 1 inputs and produces 7 outputs. // //go:noescape @@ -205,11 +258,22 @@ func mulAvxTwo_1x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x8 takes 1 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x8_64Xor takes 1 inputs and produces 8 outputs. // //go:noescape func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x8Xor takes 1 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x8Xor takes 1 inputs and produces 8 outputs. // //go:noescape @@ -227,11 +291,22 @@ func mulAvxTwo_1x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x9 takes 1 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x9_64Xor takes 1 inputs and produces 9 outputs. // //go:noescape func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x9Xor takes 1 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x9Xor takes 1 inputs and produces 9 outputs. // //go:noescape @@ -249,21 +324,26 @@ func mulAvxTwo_1x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x10 takes 1 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x10_64Xor takes 1 inputs and produces 10 outputs. // //go:noescape func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_1x10Xor takes 1 inputs and produces 10 outputs. +// mulAvxGFNI_1x10Xor takes 1 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x1 takes 2 inputs and produces 1 outputs. -// The output is initialized to 0. +// mulAvxTwo_1x10Xor takes 1 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_2x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x1_64 takes 2 inputs and produces 1 outputs. // The output is initialized to 0. @@ -277,27 +357,27 @@ func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x1 takes 2 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x1_64Xor takes 2 inputs and produces 1 outputs. // //go:noescape func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x1Xor takes 2 inputs and produces 1 outputs. +// mulAvxGFNI_2x1Xor takes 2 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_2x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x1_64Xor takes 2 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_2x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x2 takes 2 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_2x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_2x2_64 takes 2 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -310,27 +390,27 @@ func mulAvxTwo_2x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x2 takes 2 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x2_64Xor takes 2 inputs and produces 2 outputs. // //go:noescape func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x2Xor takes 2 inputs and produces 2 outputs. +// mulAvxGFNI_2x2Xor takes 2 inputs and produces 2 outputs. // //go:noescape -func mulAvxTwo_2x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x2_64Xor takes 2 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_2x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x3 takes 2 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_2x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_2x3_64 takes 2 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -343,15 +423,21 @@ func mulAvxTwo_2x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x3 takes 2 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x3_64Xor takes 2 inputs and produces 3 outputs. // //go:noescape func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x3Xor takes 2 inputs and produces 3 outputs. +// mulAvxGFNI_2x3Xor takes 2 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_2x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x3_64Xor takes 2 inputs and produces 3 outputs. // @@ -370,11 +456,22 @@ func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x4 takes 2 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x4_64Xor takes 2 inputs and produces 4 outputs. // //go:noescape func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x4Xor takes 2 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x4Xor takes 2 inputs and produces 4 outputs. // //go:noescape @@ -392,11 +489,22 @@ func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x5 takes 2 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x5_64Xor takes 2 inputs and produces 5 outputs. // //go:noescape func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x5Xor takes 2 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x5Xor takes 2 inputs and produces 5 outputs. // //go:noescape @@ -414,11 +522,22 @@ func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x6 takes 2 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x6_64Xor takes 2 inputs and produces 6 outputs. // //go:noescape func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x6Xor takes 2 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x6Xor takes 2 inputs and produces 6 outputs. // //go:noescape @@ -436,11 +555,22 @@ func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x7 takes 2 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x7_64Xor takes 2 inputs and produces 7 outputs. // //go:noescape func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x7Xor takes 2 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x7Xor takes 2 inputs and produces 7 outputs. // //go:noescape @@ -458,11 +588,22 @@ func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x8 takes 2 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x8_64Xor takes 2 inputs and produces 8 outputs. // //go:noescape func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x8Xor takes 2 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x8Xor takes 2 inputs and produces 8 outputs. // //go:noescape @@ -480,11 +621,22 @@ func mulAvxTwo_2x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x9 takes 2 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x9_64Xor takes 2 inputs and produces 9 outputs. // //go:noescape func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x9Xor takes 2 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x9Xor takes 2 inputs and produces 9 outputs. // //go:noescape @@ -502,21 +654,26 @@ func mulAvxTwo_2x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x10 takes 2 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x10_64Xor takes 2 inputs and produces 10 outputs. // //go:noescape func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x10Xor takes 2 inputs and produces 10 outputs. +// mulAvxGFNI_2x10Xor takes 2 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x1 takes 3 inputs and produces 1 outputs. -// The output is initialized to 0. +// mulAvxTwo_2x10Xor takes 2 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_3x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x1_64 takes 3 inputs and produces 1 outputs. // The output is initialized to 0. @@ -530,27 +687,27 @@ func mulAvxTwo_3x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x1 takes 3 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x1_64Xor takes 3 inputs and produces 1 outputs. // //go:noescape func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x1Xor takes 3 inputs and produces 1 outputs. +// mulAvxGFNI_3x1Xor takes 3 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_3x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x1_64Xor takes 3 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_3x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x2 takes 3 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_3x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_3x2_64 takes 3 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -563,27 +720,27 @@ func mulAvxTwo_3x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x2 takes 3 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x2_64Xor takes 3 inputs and produces 2 outputs. // //go:noescape func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x2Xor takes 3 inputs and produces 2 outputs. +// mulAvxGFNI_3x2Xor takes 3 inputs and produces 2 outputs. // //go:noescape -func mulAvxTwo_3x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x2_64Xor takes 3 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_3x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x3 takes 3 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_3x3_64 takes 3 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -596,15 +753,21 @@ func mulAvxTwo_3x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x3 takes 3 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x3_64Xor takes 3 inputs and produces 3 outputs. // //go:noescape func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x3Xor takes 3 inputs and produces 3 outputs. +// mulAvxGFNI_3x3Xor takes 3 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_3x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x3_64Xor takes 3 inputs and produces 3 outputs. // @@ -623,11 +786,22 @@ func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x4 takes 3 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x4_64Xor takes 3 inputs and produces 4 outputs. // //go:noescape func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x4Xor takes 3 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x4Xor takes 3 inputs and produces 4 outputs. // //go:noescape @@ -645,11 +819,22 @@ func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x5 takes 3 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x5_64Xor takes 3 inputs and produces 5 outputs. // //go:noescape func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x5Xor takes 3 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x5Xor takes 3 inputs and produces 5 outputs. // //go:noescape @@ -667,11 +852,22 @@ func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x6 takes 3 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x6_64Xor takes 3 inputs and produces 6 outputs. // //go:noescape func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x6Xor takes 3 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x6Xor takes 3 inputs and produces 6 outputs. // //go:noescape @@ -689,11 +885,22 @@ func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x7 takes 3 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x7_64Xor takes 3 inputs and produces 7 outputs. // //go:noescape func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x7Xor takes 3 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x7Xor takes 3 inputs and produces 7 outputs. // //go:noescape @@ -711,11 +918,22 @@ func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x8 takes 3 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x8_64Xor takes 3 inputs and produces 8 outputs. // //go:noescape func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x8Xor takes 3 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x8Xor takes 3 inputs and produces 8 outputs. // //go:noescape @@ -733,11 +951,22 @@ func mulAvxTwo_3x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x9 takes 3 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x9_64Xor takes 3 inputs and produces 9 outputs. // //go:noescape func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x9Xor takes 3 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x9Xor takes 3 inputs and produces 9 outputs. // //go:noescape @@ -755,21 +984,26 @@ func mulAvxTwo_3x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x10 takes 3 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x10_64Xor takes 3 inputs and produces 10 outputs. // //go:noescape func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x10Xor takes 3 inputs and produces 10 outputs. +// mulAvxGFNI_3x10Xor takes 3 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x1 takes 4 inputs and produces 1 outputs. -// The output is initialized to 0. +// mulAvxTwo_3x10Xor takes 3 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x1_64 takes 4 inputs and produces 1 outputs. // The output is initialized to 0. @@ -783,27 +1017,27 @@ func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x1 takes 4 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x1_64Xor takes 4 inputs and produces 1 outputs. // //go:noescape func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x1Xor takes 4 inputs and produces 1 outputs. +// mulAvxGFNI_4x1Xor takes 4 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_4x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x1_64Xor takes 4 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_4x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x2 takes 4 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_4x2_64 takes 4 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -816,27 +1050,27 @@ func mulAvxTwo_4x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x2 takes 4 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x2_64Xor takes 4 inputs and produces 2 outputs. // //go:noescape func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x2Xor takes 4 inputs and produces 2 outputs. +// mulAvxGFNI_4x2Xor takes 4 inputs and produces 2 outputs. // //go:noescape -func mulAvxTwo_4x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x2_64Xor takes 4 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_4x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x3 takes 4 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_4x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_4x3_64 takes 4 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -849,15 +1083,21 @@ func mulAvxTwo_4x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x3 takes 4 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x3_64Xor takes 4 inputs and produces 3 outputs. // //go:noescape func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x3Xor takes 4 inputs and produces 3 outputs. +// mulAvxGFNI_4x3Xor takes 4 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_4x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x3_64Xor takes 4 inputs and produces 3 outputs. // @@ -876,11 +1116,22 @@ func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x4 takes 4 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x4_64Xor takes 4 inputs and produces 4 outputs. // //go:noescape func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x4Xor takes 4 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x4Xor takes 4 inputs and produces 4 outputs. // //go:noescape @@ -898,11 +1149,22 @@ func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x5 takes 4 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x5_64Xor takes 4 inputs and produces 5 outputs. // //go:noescape func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x5Xor takes 4 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x5Xor takes 4 inputs and produces 5 outputs. // //go:noescape @@ -920,10 +1182,21 @@ func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulGFNI_4x6_64Xor takes 4 inputs and produces 6 outputs. +// mulAvxGFNI_4x6 takes 4 inputs and produces 6 outputs. +// The output is initialized to 0. // //go:noescape -func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x6_64Xor takes 4 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x6Xor takes 4 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x6Xor takes 4 inputs and produces 6 outputs. // @@ -942,11 +1215,22 @@ func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x7 takes 4 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x7_64Xor takes 4 inputs and produces 7 outputs. // //go:noescape func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x7Xor takes 4 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x7Xor takes 4 inputs and produces 7 outputs. // //go:noescape @@ -964,11 +1248,22 @@ func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x8 takes 4 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x8_64Xor takes 4 inputs and produces 8 outputs. // //go:noescape func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x8Xor takes 4 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x8Xor takes 4 inputs and produces 8 outputs. // //go:noescape @@ -986,11 +1281,22 @@ func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x9 takes 4 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x9_64Xor takes 4 inputs and produces 9 outputs. // //go:noescape func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x9Xor takes 4 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x9Xor takes 4 inputs and produces 9 outputs. // //go:noescape @@ -1008,21 +1314,26 @@ func mulAvxTwo_4x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x10 takes 4 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x10_64Xor takes 4 inputs and produces 10 outputs. // //go:noescape func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x10Xor takes 4 inputs and produces 10 outputs. +// mulAvxGFNI_4x10Xor takes 4 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x1 takes 5 inputs and produces 1 outputs. -// The output is initialized to 0. +// mulAvxTwo_4x10Xor takes 4 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x1_64 takes 5 inputs and produces 1 outputs. // The output is initialized to 0. @@ -1036,27 +1347,27 @@ func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x1 takes 5 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x1_64Xor takes 5 inputs and produces 1 outputs. // //go:noescape func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x1Xor takes 5 inputs and produces 1 outputs. +// mulAvxGFNI_5x1Xor takes 5 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_5x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x1_64Xor takes 5 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_5x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x2 takes 5 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_5x2_64 takes 5 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -1069,27 +1380,27 @@ func mulAvxTwo_5x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x2 takes 5 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x2_64Xor takes 5 inputs and produces 2 outputs. // //go:noescape func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x2Xor takes 5 inputs and produces 2 outputs. +// mulAvxGFNI_5x2Xor takes 5 inputs and produces 2 outputs. // //go:noescape -func mulAvxTwo_5x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x2_64Xor takes 5 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_5x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x3 takes 5 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_5x3_64 takes 5 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -1102,15 +1413,21 @@ func mulAvxTwo_5x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x3 takes 5 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x3_64Xor takes 5 inputs and produces 3 outputs. // //go:noescape func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x3Xor takes 5 inputs and produces 3 outputs. +// mulAvxGFNI_5x3Xor takes 5 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_5x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x3_64Xor takes 5 inputs and produces 3 outputs. // @@ -1129,11 +1446,22 @@ func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x4 takes 5 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x4_64Xor takes 5 inputs and produces 4 outputs. // //go:noescape func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x4Xor takes 5 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x4Xor takes 5 inputs and produces 4 outputs. // //go:noescape @@ -1151,11 +1479,22 @@ func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x5 takes 5 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x5_64Xor takes 5 inputs and produces 5 outputs. // //go:noescape func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x5Xor takes 5 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x5Xor takes 5 inputs and produces 5 outputs. // //go:noescape @@ -1173,11 +1512,22 @@ func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x6 takes 5 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x6_64Xor takes 5 inputs and produces 6 outputs. // //go:noescape func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x6Xor takes 5 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x6Xor takes 5 inputs and produces 6 outputs. // //go:noescape @@ -1195,11 +1545,22 @@ func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x7 takes 5 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x7_64Xor takes 5 inputs and produces 7 outputs. // //go:noescape func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x7Xor takes 5 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x7Xor takes 5 inputs and produces 7 outputs. // //go:noescape @@ -1217,11 +1578,22 @@ func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x8 takes 5 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x8_64Xor takes 5 inputs and produces 8 outputs. // //go:noescape func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x8Xor takes 5 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x8Xor takes 5 inputs and produces 8 outputs. // //go:noescape @@ -1239,11 +1611,22 @@ func mulAvxTwo_5x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x9 takes 5 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x9_64Xor takes 5 inputs and produces 9 outputs. // //go:noescape func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x9Xor takes 5 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x9Xor takes 5 inputs and produces 9 outputs. // //go:noescape @@ -1261,21 +1644,26 @@ func mulAvxTwo_5x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x10 takes 5 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x10_64Xor takes 5 inputs and produces 10 outputs. // //go:noescape func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x10Xor takes 5 inputs and produces 10 outputs. +// mulAvxGFNI_5x10Xor takes 5 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x1 takes 6 inputs and produces 1 outputs. -// The output is initialized to 0. +// mulAvxTwo_5x10Xor takes 5 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x1_64 takes 6 inputs and produces 1 outputs. // The output is initialized to 0. @@ -1289,27 +1677,27 @@ func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x1 takes 6 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x1_64Xor takes 6 inputs and produces 1 outputs. // //go:noescape func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x1Xor takes 6 inputs and produces 1 outputs. +// mulAvxGFNI_6x1Xor takes 6 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_6x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x1_64Xor takes 6 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_6x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x2 takes 6 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_6x2_64 takes 6 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -1322,27 +1710,27 @@ func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x2 takes 6 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x2_64Xor takes 6 inputs and produces 2 outputs. // //go:noescape func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x2Xor takes 6 inputs and produces 2 outputs. +// mulAvxGFNI_6x2Xor takes 6 inputs and produces 2 outputs. // //go:noescape -func mulAvxTwo_6x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x2_64Xor takes 6 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_6x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x3 takes 6 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_6x3_64 takes 6 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -1355,15 +1743,21 @@ func mulAvxTwo_6x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x3 takes 6 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x3_64Xor takes 6 inputs and produces 3 outputs. // //go:noescape func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x3Xor takes 6 inputs and produces 3 outputs. +// mulAvxGFNI_6x3Xor takes 6 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_6x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x3_64Xor takes 6 inputs and produces 3 outputs. // @@ -1382,11 +1776,22 @@ func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x4 takes 6 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x4_64Xor takes 6 inputs and produces 4 outputs. // //go:noescape func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x4Xor takes 6 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x4Xor takes 6 inputs and produces 4 outputs. // //go:noescape @@ -1404,11 +1809,22 @@ func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x5 takes 6 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x5_64Xor takes 6 inputs and produces 5 outputs. // //go:noescape func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x5Xor takes 6 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x5Xor takes 6 inputs and produces 5 outputs. // //go:noescape @@ -1426,11 +1842,22 @@ func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x6 takes 6 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x6_64Xor takes 6 inputs and produces 6 outputs. // //go:noescape func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x6Xor takes 6 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x6Xor takes 6 inputs and produces 6 outputs. // //go:noescape @@ -1448,11 +1875,22 @@ func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x7 takes 6 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x7_64Xor takes 6 inputs and produces 7 outputs. // //go:noescape func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x7Xor takes 6 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x7Xor takes 6 inputs and produces 7 outputs. // //go:noescape @@ -1470,11 +1908,22 @@ func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x8 takes 6 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x8_64Xor takes 6 inputs and produces 8 outputs. // //go:noescape func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x8Xor takes 6 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x8Xor takes 6 inputs and produces 8 outputs. // //go:noescape @@ -1492,11 +1941,22 @@ func mulAvxTwo_6x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x9 takes 6 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x9_64Xor takes 6 inputs and produces 9 outputs. // //go:noescape func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x9Xor takes 6 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x9Xor takes 6 inputs and produces 9 outputs. // //go:noescape @@ -1514,21 +1974,26 @@ func mulAvxTwo_6x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x10 takes 6 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x10_64Xor takes 6 inputs and produces 10 outputs. // //go:noescape func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x10Xor takes 6 inputs and produces 10 outputs. +// mulAvxGFNI_6x10Xor takes 6 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x1 takes 7 inputs and produces 1 outputs. -// The output is initialized to 0. +// mulAvxTwo_6x10Xor takes 6 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x1_64 takes 7 inputs and produces 1 outputs. // The output is initialized to 0. @@ -1542,27 +2007,27 @@ func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x1 takes 7 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x1_64Xor takes 7 inputs and produces 1 outputs. // //go:noescape func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x1Xor takes 7 inputs and produces 1 outputs. +// mulAvxGFNI_7x1Xor takes 7 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_7x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x1_64Xor takes 7 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_7x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x2 takes 7 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_7x2_64 takes 7 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -1575,27 +2040,27 @@ func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x2 takes 7 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x2_64Xor takes 7 inputs and produces 2 outputs. // //go:noescape func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x2Xor takes 7 inputs and produces 2 outputs. +// mulAvxGFNI_7x2Xor takes 7 inputs and produces 2 outputs. // //go:noescape -func mulAvxTwo_7x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x2_64Xor takes 7 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_7x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x3 takes 7 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_7x3_64 takes 7 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -1608,15 +2073,21 @@ func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x3 takes 7 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x3_64Xor takes 7 inputs and produces 3 outputs. // //go:noescape func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x3Xor takes 7 inputs and produces 3 outputs. +// mulAvxGFNI_7x3Xor takes 7 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_7x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x3_64Xor takes 7 inputs and produces 3 outputs. // @@ -1635,11 +2106,22 @@ func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x4 takes 7 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x4_64Xor takes 7 inputs and produces 4 outputs. // //go:noescape func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x4Xor takes 7 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x4Xor takes 7 inputs and produces 4 outputs. // //go:noescape @@ -1657,11 +2139,22 @@ func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x5 takes 7 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x5_64Xor takes 7 inputs and produces 5 outputs. // //go:noescape func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x5Xor takes 7 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x5Xor takes 7 inputs and produces 5 outputs. // //go:noescape @@ -1679,11 +2172,22 @@ func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x6 takes 7 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x6_64Xor takes 7 inputs and produces 6 outputs. // //go:noescape func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x6Xor takes 7 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x6Xor takes 7 inputs and produces 6 outputs. // //go:noescape @@ -1701,11 +2205,22 @@ func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x7 takes 7 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x7_64Xor takes 7 inputs and produces 7 outputs. // //go:noescape func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x7Xor takes 7 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x7Xor takes 7 inputs and produces 7 outputs. // //go:noescape @@ -1723,11 +2238,22 @@ func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x8 takes 7 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x8_64Xor takes 7 inputs and produces 8 outputs. // //go:noescape func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x8Xor takes 7 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x8Xor takes 7 inputs and produces 8 outputs. // //go:noescape @@ -1745,11 +2271,22 @@ func mulAvxTwo_7x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x9 takes 7 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x9_64Xor takes 7 inputs and produces 9 outputs. // //go:noescape func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x9Xor takes 7 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x9Xor takes 7 inputs and produces 9 outputs. // //go:noescape @@ -1767,21 +2304,26 @@ func mulAvxTwo_7x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x10 takes 7 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x10_64Xor takes 7 inputs and produces 10 outputs. // //go:noescape func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x10Xor takes 7 inputs and produces 10 outputs. +// mulAvxGFNI_7x10Xor takes 7 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x1 takes 8 inputs and produces 1 outputs. -// The output is initialized to 0. +// mulAvxTwo_7x10Xor takes 7 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x1_64 takes 8 inputs and produces 1 outputs. // The output is initialized to 0. @@ -1795,27 +2337,27 @@ func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x1 takes 8 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x1_64Xor takes 8 inputs and produces 1 outputs. // //go:noescape func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x1Xor takes 8 inputs and produces 1 outputs. +// mulAvxGFNI_8x1Xor takes 8 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_8x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x1_64Xor takes 8 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_8x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x2 takes 8 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_8x2_64 takes 8 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -1828,27 +2370,27 @@ func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x2 takes 8 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x2_64Xor takes 8 inputs and produces 2 outputs. // //go:noescape func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x2Xor takes 8 inputs and produces 2 outputs. +// mulAvxGFNI_8x2Xor takes 8 inputs and produces 2 outputs. // //go:noescape -func mulAvxTwo_8x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x2_64Xor takes 8 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_8x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x3 takes 8 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_8x3_64 takes 8 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -1861,15 +2403,21 @@ func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x3 takes 8 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x3_64Xor takes 8 inputs and produces 3 outputs. // //go:noescape func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x3Xor takes 8 inputs and produces 3 outputs. +// mulAvxGFNI_8x3Xor takes 8 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_8x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x3_64Xor takes 8 inputs and produces 3 outputs. // @@ -1888,11 +2436,22 @@ func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x4 takes 8 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x4_64Xor takes 8 inputs and produces 4 outputs. // //go:noescape func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x4Xor takes 8 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x4Xor takes 8 inputs and produces 4 outputs. // //go:noescape @@ -1910,11 +2469,22 @@ func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x5 takes 8 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x5_64Xor takes 8 inputs and produces 5 outputs. // //go:noescape func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x5Xor takes 8 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x5Xor takes 8 inputs and produces 5 outputs. // //go:noescape @@ -1932,11 +2502,22 @@ func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x6 takes 8 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x6_64Xor takes 8 inputs and produces 6 outputs. // //go:noescape func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x6Xor takes 8 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x6Xor takes 8 inputs and produces 6 outputs. // //go:noescape @@ -1954,11 +2535,22 @@ func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x7 takes 8 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x7_64Xor takes 8 inputs and produces 7 outputs. // //go:noescape func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x7Xor takes 8 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x7Xor takes 8 inputs and produces 7 outputs. // //go:noescape @@ -1976,11 +2568,22 @@ func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x8 takes 8 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x8_64Xor takes 8 inputs and produces 8 outputs. // //go:noescape func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x8Xor takes 8 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x8Xor takes 8 inputs and produces 8 outputs. // //go:noescape @@ -1998,11 +2601,22 @@ func mulAvxTwo_8x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x9 takes 8 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x9_64Xor takes 8 inputs and produces 9 outputs. // //go:noescape func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x9Xor takes 8 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x9Xor takes 8 inputs and produces 9 outputs. // //go:noescape @@ -2020,21 +2634,26 @@ func mulAvxTwo_8x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x10 takes 8 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x10_64Xor takes 8 inputs and produces 10 outputs. // //go:noescape func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x10Xor takes 8 inputs and produces 10 outputs. +// mulAvxGFNI_8x10Xor takes 8 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x1 takes 9 inputs and produces 1 outputs. -// The output is initialized to 0. +// mulAvxTwo_8x10Xor takes 8 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x1_64 takes 9 inputs and produces 1 outputs. // The output is initialized to 0. @@ -2048,27 +2667,27 @@ func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x1 takes 9 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x1_64Xor takes 9 inputs and produces 1 outputs. // //go:noescape func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x1Xor takes 9 inputs and produces 1 outputs. +// mulAvxGFNI_9x1Xor takes 9 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_9x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x1_64Xor takes 9 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_9x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x2 takes 9 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_9x2_64 takes 9 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -2081,27 +2700,27 @@ func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x2 takes 9 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x2_64Xor takes 9 inputs and produces 2 outputs. // //go:noescape func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x2Xor takes 9 inputs and produces 2 outputs. +// mulAvxGFNI_9x2Xor takes 9 inputs and produces 2 outputs. // //go:noescape -func mulAvxTwo_9x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x2_64Xor takes 9 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_9x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x3 takes 9 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_9x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_9x3_64 takes 9 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -2114,15 +2733,21 @@ func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x3 takes 9 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x3_64Xor takes 9 inputs and produces 3 outputs. // //go:noescape func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x3Xor takes 9 inputs and produces 3 outputs. +// mulAvxGFNI_9x3Xor takes 9 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_9x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x3_64Xor takes 9 inputs and produces 3 outputs. // @@ -2141,11 +2766,22 @@ func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x4 takes 9 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x4_64Xor takes 9 inputs and produces 4 outputs. // //go:noescape func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x4Xor takes 9 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x4Xor takes 9 inputs and produces 4 outputs. // //go:noescape @@ -2163,11 +2799,22 @@ func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x5 takes 9 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x5_64Xor takes 9 inputs and produces 5 outputs. // //go:noescape func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x5Xor takes 9 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x5Xor takes 9 inputs and produces 5 outputs. // //go:noescape @@ -2185,11 +2832,22 @@ func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x6 takes 9 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x6_64Xor takes 9 inputs and produces 6 outputs. // //go:noescape func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x6Xor takes 9 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x6Xor takes 9 inputs and produces 6 outputs. // //go:noescape @@ -2207,11 +2865,22 @@ func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x7 takes 9 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x7_64Xor takes 9 inputs and produces 7 outputs. // //go:noescape func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x7Xor takes 9 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x7Xor takes 9 inputs and produces 7 outputs. // //go:noescape @@ -2229,11 +2898,22 @@ func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x8 takes 9 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x8_64Xor takes 9 inputs and produces 8 outputs. // //go:noescape func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x8Xor takes 9 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x8Xor takes 9 inputs and produces 8 outputs. // //go:noescape @@ -2251,11 +2931,22 @@ func mulAvxTwo_9x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x9 takes 9 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x9_64Xor takes 9 inputs and produces 9 outputs. // //go:noescape func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x9Xor takes 9 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x9Xor takes 9 inputs and produces 9 outputs. // //go:noescape @@ -2273,21 +2964,26 @@ func mulAvxTwo_9x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x10 takes 9 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x10_64Xor takes 9 inputs and produces 10 outputs. // //go:noescape func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x10Xor takes 9 inputs and produces 10 outputs. +// mulAvxGFNI_9x10Xor takes 9 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_10x1 takes 10 inputs and produces 1 outputs. -// The output is initialized to 0. +// mulAvxTwo_9x10Xor takes 9 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x1_64 takes 10 inputs and produces 1 outputs. // The output is initialized to 0. @@ -2301,27 +2997,27 @@ func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n in //go:noescape func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x1 takes 10 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x1_64Xor takes 10 inputs and produces 1 outputs. // //go:noescape func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_10x1Xor takes 10 inputs and produces 1 outputs. +// mulAvxGFNI_10x1Xor takes 10 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_10x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x1_64Xor takes 10 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_10x2 takes 10 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_10x2_64 takes 10 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -2334,27 +3030,27 @@ func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n in //go:noescape func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x2 takes 10 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x2_64Xor takes 10 inputs and produces 2 outputs. // //go:noescape func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_10x2Xor takes 10 inputs and produces 2 outputs. +// mulAvxGFNI_10x2Xor takes 10 inputs and produces 2 outputs. // //go:noescape -func mulAvxTwo_10x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x2_64Xor takes 10 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_10x3 takes 10 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_10x3_64 takes 10 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -2367,15 +3063,21 @@ func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n in //go:noescape func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x3 takes 10 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x3_64Xor takes 10 inputs and produces 3 outputs. // //go:noescape func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_10x3Xor takes 10 inputs and produces 3 outputs. +// mulAvxGFNI_10x3Xor takes 10 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_10x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x3_64Xor takes 10 inputs and produces 3 outputs. // @@ -2394,11 +3096,22 @@ func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x4 takes 10 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x4_64Xor takes 10 inputs and produces 4 outputs. // //go:noescape func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x4Xor takes 10 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x4Xor takes 10 inputs and produces 4 outputs. // //go:noescape @@ -2416,11 +3129,22 @@ func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x5 takes 10 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x5_64Xor takes 10 inputs and produces 5 outputs. // //go:noescape func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x5Xor takes 10 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x5Xor takes 10 inputs and produces 5 outputs. // //go:noescape @@ -2438,11 +3162,22 @@ func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x6 takes 10 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x6_64Xor takes 10 inputs and produces 6 outputs. // //go:noescape func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x6Xor takes 10 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x6Xor takes 10 inputs and produces 6 outputs. // //go:noescape @@ -2460,11 +3195,22 @@ func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x7 takes 10 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x7_64Xor takes 10 inputs and produces 7 outputs. // //go:noescape func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x7Xor takes 10 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x7Xor takes 10 inputs and produces 7 outputs. // //go:noescape @@ -2482,11 +3228,22 @@ func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x8 takes 10 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x8_64Xor takes 10 inputs and produces 8 outputs. // //go:noescape func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x8Xor takes 10 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x8Xor takes 10 inputs and produces 8 outputs. // //go:noescape @@ -2504,11 +3261,22 @@ func mulAvxTwo_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x9 takes 10 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x9_64Xor takes 10 inputs and produces 9 outputs. // //go:noescape func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x9Xor takes 10 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x9Xor takes 10 inputs and produces 9 outputs. // //go:noescape @@ -2526,11 +3294,22 @@ func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x10 takes 10 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x10_64Xor takes 10 inputs and produces 10 outputs. // //go:noescape func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x10Xor takes 10 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x10Xor takes 10 inputs and produces 10 outputs. // //go:noescape diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.s b/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.s index 3a2acac..8ff74bf 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.s +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.s @@ -1,6 +1,6 @@ // Code generated by command: go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon. DO NOT EDIT. -//go:build !appengine && !noasm && !nogen && gc +//go:build !appengine && !noasm && !nogen && !nopshufb && gc #include "textflag.h" @@ -18,55 +18,96 @@ TEXT ·_dummy_(SB), $0 #endif RET -// func mulAvxTwo_1x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_1x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 6 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_1x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ (DX), DX - MOVQ start+72(FP), BX +// sSE2XorSlice will XOR in with out and store in out. +// Processes 16 bytes/loop. - // Add start offset to output - ADDQ BX, DX +// func sSE2XorSlice(in []byte, out []byte) +// Requires: SSE2 +TEXT ·sSE2XorSlice(SB), $0-48 + MOVQ in_base+0(FP), AX + MOVQ out_base+24(FP), CX + MOVQ in_len+8(FP), DX + SHRQ $0x04, DX + JZ end - // Add start offset to input - ADDQ BX, CX - MOVQ $0x0000000f, BX - MOVQ BX, X3 - VPBROADCASTB X3, Y3 +loop: + MOVOU (AX), X0 + MOVOU (CX), X1 + PXOR X0, X1 + MOVOU X1, (CX) + ADDQ $0x10, AX + ADDQ $0x10, CX + DECQ DX + JNZ loop + +end: + RET + +// sSE2XorSlice_64 will XOR in with out and store in out. +// Processes 64 bytes/loop. + +// func sSE2XorSlice_64(in []byte, out []byte) +// Requires: SSE2 +TEXT ·sSE2XorSlice_64(SB), $0-48 + MOVQ in_base+0(FP), AX + MOVQ out_base+24(FP), CX + MOVQ in_len+8(FP), DX + SHRQ $0x06, DX + JZ end -mulAvxTwo_1x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (CX), Y2 - ADDQ $0x20, CX - VPSRLQ $0x04, Y2, Y4 - VPAND Y3, Y2, Y2 - VPAND Y3, Y4, Y4 - VPSHUFB Y2, Y0, Y2 - VPSHUFB Y4, Y1, Y4 - VPXOR Y2, Y4, Y2 +loop: + MOVOU (AX), X0 + MOVOU 16(AX), X2 + MOVOU 32(AX), X4 + MOVOU 48(AX), X6 + MOVOU (CX), X1 + MOVOU 16(CX), X3 + MOVOU 32(CX), X5 + MOVOU 48(CX), X7 + PXOR X0, X1 + PXOR X2, X3 + PXOR X4, X5 + PXOR X6, X7 + MOVOU X1, (CX) + MOVOU X3, 16(CX) + MOVOU X5, 32(CX) + MOVOU X7, 48(CX) + ADDQ $0x40, AX + ADDQ $0x40, CX + DECQ DX + JNZ loop + +end: + RET + +// avx2XorSlice_64 will XOR in with out and store in out. +// Processes 64 bytes/loop. + +// func avx2XorSlice_64(in []byte, out []byte) +// Requires: AVX, AVX2 +TEXT ·avx2XorSlice_64(SB), $0-48 + MOVQ in_base+0(FP), AX + MOVQ out_base+24(FP), CX + MOVQ in_len+8(FP), DX + SHRQ $0x06, DX + JZ end - // Store 1 outputs - VMOVDQU Y2, (DX) - ADDQ $0x20, DX +loop: + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y2 + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y3 + VPXOR Y0, Y1, Y1 + VPXOR Y2, Y3, Y3 + VMOVDQU Y1, (CX) + VMOVDQU Y3, 32(CX) + ADDQ $0x40, AX + ADDQ $0x40, CX + DECQ DX + JNZ loop - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_1x1_loop +end: VZEROUPPER - -mulAvxTwo_1x1_end: RET // func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -85,7 +126,6 @@ TEXT ·mulAvxTwo_1x1_64(SB), $0-88 MOVQ in_base+24(FP), CX MOVQ (CX), CX MOVQ out_base+48(FP), DX - MOVQ out_base+48(FP), DX MOVQ (DX), DX MOVQ start+72(FP), BX @@ -172,6 +212,49 @@ mulGFNI_1x1_64_loop: mulGFNI_1x1_64_end: RET +// func mulAvxGFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x1_end + VBROADCASTSD (CX), Y0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulAvxGFNI_1x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y1 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y1, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x1_loop + VZEROUPPER + +mulAvxGFNI_1x1_end: + RET + // func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x1_64Xor(SB), $0-88 @@ -219,56 +302,51 @@ mulGFNI_1x1_64Xor_loop: mulGFNI_1x1_64Xor_end: RET -// func mulAvxTwo_1x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_1x1Xor(SB), NOSPLIT, $0-88 +// func mulAvxGFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 6 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_1x1Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ (DX), DX - MOVQ start+72(FP), BX + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x1Xor_end + VBROADCASTSD (CX), Y0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX // Add start offset to output ADDQ BX, DX // Add start offset to input - ADDQ BX, CX - MOVQ $0x0000000f, BX - MOVQ BX, X3 - VPBROADCASTB X3, Y3 + ADDQ BX, CX + +mulAvxGFNI_1x1Xor_loop: + // Load 1 outputs + VMOVDQU (DX), Y1 -mulAvxTwo_1x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (CX), Y4 - ADDQ $0x20, CX - VPSRLQ $0x04, Y4, Y5 - VPAND Y3, Y4, Y4 - VPAND Y3, Y5, Y5 - VMOVDQU (DX), Y2 - VPSHUFB Y4, Y0, Y4 - VPSHUFB Y5, Y1, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (CX), Y2 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y2, Y2 + VXORPD Y1, Y2, Y1 // Store 1 outputs - VMOVDQU Y2, (DX) + VMOVDQU Y1, (DX) ADDQ $0x20, DX // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x1Xor_loop + JNZ mulAvxGFNI_1x1Xor_loop VZEROUPPER -mulAvxTwo_1x1Xor_end: +mulAvxGFNI_1x1Xor_end: RET // func mulAvxTwo_1x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -287,7 +365,6 @@ TEXT ·mulAvxTwo_1x1_64Xor(SB), $0-88 MOVQ in_base+24(FP), CX MOVQ (CX), CX MOVQ out_base+48(FP), DX - MOVQ out_base+48(FP), DX MOVQ (DX), DX MOVQ start+72(FP), BX @@ -335,66 +412,6 @@ mulAvxTwo_1x1_64Xor_loop: mulAvxTwo_1x1_64Xor_end: RET -// func mulAvxTwo_1x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_1x2(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 11 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_1x2_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), DX - MOVQ start+72(FP), SI - - // Add start offset to output - ADDQ SI, BX - ADDQ SI, DX - - // Add start offset to input - ADDQ SI, CX - MOVQ $0x0000000f, SI - MOVQ SI, X6 - VPBROADCASTB X6, Y6 - -mulAvxTwo_1x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (CX), Y8 - ADDQ $0x20, CX - VPSRLQ $0x04, Y8, Y9 - VPAND Y6, Y8, Y8 - VPAND Y6, Y9, Y9 - VPSHUFB Y8, Y0, Y5 - VPSHUFB Y9, Y1, Y7 - VPXOR Y5, Y7, Y4 - VPSHUFB Y8, Y2, Y5 - VPSHUFB Y9, Y3, Y7 - VPXOR Y5, Y7, Y5 - - // Store 2 outputs - VMOVDQU Y4, (BX) - ADDQ $0x20, BX - VMOVDQU Y5, (DX) - ADDQ $0x20, DX - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_1x2_loop - VZEROUPPER - -mulAvxTwo_1x2_end: - RET - // func mulAvxTwo_1x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x2_64(SB), $0-88 @@ -409,7 +426,6 @@ TEXT ·mulAvxTwo_1x2_64(SB), $0-88 MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX - MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), BX MOVQ start+72(FP), DI @@ -517,6 +533,55 @@ mulGFNI_1x2_64_loop: mulGFNI_1x2_64_end: RET +// func mulAvxGFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulAvxGFNI_1x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y2 + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + VMOVDQU Y3, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x2_loop + VZEROUPPER + +mulAvxGFNI_1x2_end: + RET + // func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x2_64Xor(SB), $0-88 @@ -572,66 +637,59 @@ mulGFNI_1x2_64Xor_loop: mulGFNI_1x2_64Xor_end: RET -// func mulAvxTwo_1x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_1x2Xor(SB), NOSPLIT, $0-88 +// func mulAvxGFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x2Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 11 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_1x2Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), DX - MOVQ start+72(FP), SI + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI // Add start offset to output ADDQ SI, BX ADDQ SI, DX // Add start offset to input - ADDQ SI, CX - MOVQ $0x0000000f, SI - MOVQ SI, X6 - VPBROADCASTB X6, Y6 + ADDQ SI, CX + +mulAvxGFNI_1x2Xor_loop: + // Load 2 outputs + VMOVDQU (BX), Y2 + VMOVDQU (DX), Y3 -mulAvxTwo_1x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (CX), Y9 - ADDQ $0x20, CX - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU (BX), Y4 - VPSHUFB Y9, Y0, Y7 - VPSHUFB Y10, Y1, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - VMOVDQU (DX), Y5 - VPSHUFB Y9, Y2, Y7 - VPSHUFB Y10, Y3, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y5 + VXORPD Y2, Y5, Y2 + VGF2P8AFFINEQB $0x00, Y1, Y4, Y5 + VXORPD Y3, Y5, Y3 // Store 2 outputs - VMOVDQU Y4, (BX) + VMOVDQU Y2, (BX) ADDQ $0x20, BX - VMOVDQU Y5, (DX) + VMOVDQU Y3, (DX) ADDQ $0x20, DX // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x2Xor_loop + JNZ mulAvxGFNI_1x2Xor_loop VZEROUPPER -mulAvxTwo_1x2Xor_end: +mulAvxGFNI_1x2Xor_end: RET // func mulAvxTwo_1x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -648,7 +706,6 @@ TEXT ·mulAvxTwo_1x2_64Xor(SB), $0-88 MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX - MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), BX MOVQ start+72(FP), DI @@ -713,75 +770,6 @@ mulAvxTwo_1x2_64Xor_loop: mulAvxTwo_1x2_64Xor_end: RET -// func mulAvxTwo_1x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_1x3(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 14 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_1x3_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DX - MOVQ start+72(FP), DI - - // Add start offset to output - ADDQ DI, BX - ADDQ DI, SI - ADDQ DI, DX - - // Add start offset to input - ADDQ DI, CX - MOVQ $0x0000000f, DI - MOVQ DI, X9 - VPBROADCASTB X9, Y9 - -mulAvxTwo_1x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (CX), Y11 - ADDQ $0x20, CX - VPSRLQ $0x04, Y11, Y12 - VPAND Y9, Y11, Y11 - VPAND Y9, Y12, Y12 - VPSHUFB Y11, Y0, Y8 - VPSHUFB Y12, Y1, Y10 - VPXOR Y8, Y10, Y6 - VPSHUFB Y11, Y2, Y8 - VPSHUFB Y12, Y3, Y10 - VPXOR Y8, Y10, Y7 - VPSHUFB Y11, Y4, Y8 - VPSHUFB Y12, Y5, Y10 - VPXOR Y8, Y10, Y8 - - // Store 3 outputs - VMOVDQU Y6, (BX) - ADDQ $0x20, BX - VMOVDQU Y7, (SI) - ADDQ $0x20, SI - VMOVDQU Y8, (DX) - ADDQ $0x20, DX - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_1x3_loop - VZEROUPPER - -mulAvxTwo_1x3_end: - RET - // func mulAvxTwo_1x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x3_64(SB), $0-88 @@ -796,7 +784,6 @@ TEXT ·mulAvxTwo_1x3_64(SB), $0-88 MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX - MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), BX @@ -923,6 +910,61 @@ mulGFNI_1x3_64_loop: mulGFNI_1x3_64_end: RET +// func mulAvxGFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulAvxGFNI_1x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y3 + VGF2P8AFFINEQB $0x00, Y1, Y5, Y4 + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y3, (BX) + ADDQ $0x20, BX + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x3_loop + VZEROUPPER + +mulAvxGFNI_1x3_end: + RET + // func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x3_64Xor(SB), $0-88 @@ -986,30 +1028,28 @@ mulGFNI_1x3_64Xor_loop: mulGFNI_1x3_64Xor_end: RET -// func mulAvxTwo_1x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_1x3Xor(SB), NOSPLIT, $0-88 +// func mulAvxGFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x3Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 14 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_1x3Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DX - MOVQ start+72(FP), DI + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI // Add start offset to output ADDQ DI, BX @@ -1017,45 +1057,38 @@ TEXT ·mulAvxTwo_1x3Xor(SB), NOSPLIT, $0-88 ADDQ DI, DX // Add start offset to input - ADDQ DI, CX - MOVQ $0x0000000f, DI - MOVQ DI, X9 - VPBROADCASTB X9, Y9 + ADDQ DI, CX + +mulAvxGFNI_1x3Xor_loop: + // Load 3 outputs + VMOVDQU (BX), Y3 + VMOVDQU (SI), Y4 + VMOVDQU (DX), Y5 -mulAvxTwo_1x3Xor_loop: // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (CX), Y12 - ADDQ $0x20, CX - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU (BX), Y6 - VPSHUFB Y12, Y0, Y10 - VPSHUFB Y13, Y1, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) - VMOVDQU (SI), Y7 - VPSHUFB Y12, Y2, Y10 - VPSHUFB Y13, Y3, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) - VMOVDQU (DX), Y8 - VPSHUFB Y12, Y4, Y10 - VPSHUFB Y13, Y5, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y7 + VXORPD Y3, Y7, Y3 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y5, Y7, Y5 // Store 3 outputs - VMOVDQU Y6, (BX) + VMOVDQU Y3, (BX) ADDQ $0x20, BX - VMOVDQU Y7, (SI) + VMOVDQU Y4, (SI) ADDQ $0x20, SI - VMOVDQU Y8, (DX) + VMOVDQU Y5, (DX) ADDQ $0x20, DX // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x3Xor_loop + JNZ mulAvxGFNI_1x3Xor_loop VZEROUPPER -mulAvxTwo_1x3Xor_end: +mulAvxGFNI_1x3Xor_end: RET // func mulAvxTwo_1x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -1072,7 +1105,6 @@ TEXT ·mulAvxTwo_1x3_64Xor(SB), $0-88 MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX - MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), BX @@ -1291,6 +1323,67 @@ mulGFNI_1x4_64_loop: mulGFNI_1x4_64_end: RET +// func mulAvxGFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x4(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulAvxGFNI_1x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y7, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y7, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + + // Store 4 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x4_loop + VZEROUPPER + +mulAvxGFNI_1x4_end: + RET + // func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x4_64Xor(SB), $0-88 @@ -1362,6 +1455,77 @@ mulGFNI_1x4_64Xor_loop: mulGFNI_1x4_64Xor_end: RET +// func mulAvxGFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x4Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulAvxGFNI_1x4Xor_loop: + // Load 4 outputs + VMOVDQU (BX), Y4 + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU (DX), Y7 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y9 + VXORPD Y4, Y9, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y9 + VXORPD Y5, Y9, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 4 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x4Xor_loop + VZEROUPPER + +mulAvxGFNI_1x4Xor_end: + RET + // func mulAvxTwo_1x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x4Xor(SB), NOSPLIT, $0-88 @@ -1598,6 +1762,73 @@ mulGFNI_1x5_64_loop: mulGFNI_1x5_64_end: RET +// func mulAvxGFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x5(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulAvxGFNI_1x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y9, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + + // Store 5 outputs + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x5_loop + VZEROUPPER + +mulAvxGFNI_1x5_end: + RET + // func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x5_64Xor(SB), $0-88 @@ -1677,6 +1908,85 @@ mulGFNI_1x5_64Xor_loop: mulGFNI_1x5_64Xor_end: RET +// func mulAvxGFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x5Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulAvxGFNI_1x5Xor_loop: + // Load 5 outputs + VMOVDQU (BX), Y5 + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (DX), Y9 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y11 + VXORPD Y5, Y11, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y11 + VXORPD Y6, Y11, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y7, Y11, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 5 outputs + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x5Xor_loop + VZEROUPPER + +mulAvxGFNI_1x5Xor_end: + RET + // func mulAvxTwo_1x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x5Xor(SB), NOSPLIT, $0-88 @@ -1938,6 +2248,79 @@ mulGFNI_1x6_64_loop: mulGFNI_1x6_64_end: RET +// func mulAvxGFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x6(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulAvxGFNI_1x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y11, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y11, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + + // Store 6 outputs + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x6_loop + VZEROUPPER + +mulAvxGFNI_1x6_end: + RET + // func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x6_64Xor(SB), $0-88 @@ -2025,6 +2408,93 @@ mulGFNI_1x6_64Xor_loop: mulGFNI_1x6_64Xor_end: RET +// func mulAvxGFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x6Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulAvxGFNI_1x6Xor_loop: + // Load 6 outputs + VMOVDQU (BX), Y6 + VMOVDQU (SI), Y7 + VMOVDQU (DI), Y8 + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (DX), Y11 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y6, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y7, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 6 outputs + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x6Xor_loop + VZEROUPPER + +mulAvxGFNI_1x6Xor_end: + RET + // func mulAvxTwo_1x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x6Xor(SB), NOSPLIT, $0-88 @@ -2311,6 +2781,85 @@ mulGFNI_1x7_64_loop: mulGFNI_1x7_64_end: RET +// func mulAvxGFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x7(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulAvxGFNI_1x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (CX), Y13 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y13, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y13, Y13 + + // Store 7 outputs + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x7_loop + VZEROUPPER + +mulAvxGFNI_1x7_end: + RET + // func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x7_64Xor(SB), $0-88 @@ -2406,6 +2955,101 @@ mulGFNI_1x7_64Xor_loop: mulGFNI_1x7_64Xor_end: RET +// func mulAvxGFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x7Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulAvxGFNI_1x7Xor_loop: + // Load 7 outputs + VMOVDQU (BX), Y7 + VMOVDQU (SI), Y8 + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (DX), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x7Xor_loop + VZEROUPPER + +mulAvxGFNI_1x7Xor_end: + RET + // func mulAvxTwo_1x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x7Xor(SB), NOSPLIT, $0-88 @@ -2717,6 +3361,91 @@ mulGFNI_1x8_64_loop: mulGFNI_1x8_64_end: RET +// func mulAvxGFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x8(SB), $0-88 + // Loading 6 of 8 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + +mulAvxGFNI_1x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y13, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD 56(CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 8 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x8_loop + VZEROUPPER + +mulAvxGFNI_1x8_end: + RET + // func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x8_64Xor(SB), $0-88 @@ -2820,6 +3549,109 @@ mulGFNI_1x8_64Xor_loop: mulGFNI_1x8_64Xor_end: RET +// func mulAvxGFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x8Xor(SB), $0-88 + // Loading 6 of 8 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + +mulAvxGFNI_1x8Xor_loop: + // Load 8 outputs + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x8Xor_loop + VZEROUPPER + +mulAvxGFNI_1x8Xor_end: + RET + // func mulAvxTwo_1x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x8Xor(SB), NOSPLIT, $0-88 @@ -3156,6 +3988,97 @@ mulGFNI_1x9_64_loop: mulGFNI_1x9_64_end: RET +// func mulAvxGFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x9(SB), $0-88 + // Loading 5 of 9 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + +mulAvxGFNI_1x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y13, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y13, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD 64(CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 9 outputs + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x9_loop + VZEROUPPER + +mulAvxGFNI_1x9_end: + RET + // func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x9_64Xor(SB), $0-88 @@ -3267,6 +4190,117 @@ mulGFNI_1x9_64Xor_loop: mulGFNI_1x9_64Xor_end: RET +// func mulAvxGFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x9Xor(SB), $0-88 + // Loading 5 of 9 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + +mulAvxGFNI_1x9Xor_loop: + // Load 9 outputs + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x9Xor_loop + VZEROUPPER + +mulAvxGFNI_1x9Xor_end: + RET + // func mulAvxTwo_1x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x9Xor(SB), NOSPLIT, $0-88 @@ -3628,6 +4662,103 @@ mulGFNI_1x10_64_loop: mulGFNI_1x10_64_end: RET +// func mulAvxGFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x10(SB), $0-88 + // Loading 4 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + +mulAvxGFNI_1x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y13, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y13, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y13, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y13, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD 72(CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 10 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x10_loop + VZEROUPPER + +mulAvxGFNI_1x10_end: + RET + // func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x10_64Xor(SB), $0-88 @@ -3747,6 +4878,125 @@ mulGFNI_1x10_64Xor_loop: mulGFNI_1x10_64Xor_end: RET +// func mulAvxGFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x10Xor(SB), $0-88 + // Loading 4 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + +mulAvxGFNI_1x10Xor_loop: + // Load 10 outputs + VMOVDQU (SI), Y4 + VMOVDQU (DI), Y5 + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x10Xor_loop + VZEROUPPER + +mulAvxGFNI_1x10Xor_end: + RET + // func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x10Xor(SB), NOSPLIT, $0-88 @@ -3889,71 +5139,6 @@ mulAvxTwo_1x10Xor_loop: mulAvxTwo_1x10Xor_end: RET -// func mulAvxTwo_2x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_2x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 8 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_2x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), CX - MOVQ out_base+48(FP), BX - MOVQ (BX), BX - MOVQ start+72(FP), SI - - // Add start offset to output - ADDQ SI, BX - - // Add start offset to input - ADDQ SI, DX - ADDQ SI, CX - MOVQ $0x0000000f, SI - MOVQ SI, X5 - VPBROADCASTB X5, Y5 - -mulAvxTwo_2x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y5, Y6, Y6 - VPAND Y5, Y7, Y7 - VPSHUFB Y6, Y0, Y6 - VPSHUFB Y7, Y1, Y7 - VPXOR Y6, Y7, Y4 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (CX), Y6 - ADDQ $0x20, CX - VPSRLQ $0x04, Y6, Y7 - VPAND Y5, Y6, Y6 - VPAND Y5, Y7, Y7 - VPSHUFB Y6, Y2, Y6 - VPSHUFB Y7, Y3, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) - - // Store 1 outputs - VMOVDQU Y4, (BX) - ADDQ $0x20, BX - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_2x1_loop - VZEROUPPER - -mulAvxTwo_2x1_end: - RET - // func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x1_64(SB), $0-88 @@ -3973,7 +5158,6 @@ TEXT ·mulAvxTwo_2x1_64(SB), $0-88 MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX - MOVQ out_base+48(FP), BX MOVQ (BX), BX MOVQ start+72(FP), SI @@ -4087,6 +5271,58 @@ mulGFNI_2x1_64_loop: mulGFNI_2x1_64_end: RET +// func mulAvxGFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulAvxGFNI_2x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y3 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y2 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Store 1 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x1_loop + VZEROUPPER + +mulAvxGFNI_2x1_end: + RET + // func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x1_64Xor(SB), $0-88 @@ -4143,70 +5379,60 @@ mulGFNI_2x1_64Xor_loop: mulGFNI_2x1_64Xor_end: RET -// func mulAvxTwo_2x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_2x1Xor(SB), NOSPLIT, $0-88 +// func mulAvxGFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 8 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_2x1Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), CX - MOVQ out_base+48(FP), BX - MOVQ (BX), BX - MOVQ start+72(FP), SI + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI // Add start offset to output ADDQ SI, BX // Add start offset to input - ADDQ SI, DX - ADDQ SI, CX - MOVQ $0x0000000f, SI - MOVQ SI, X5 - VPBROADCASTB X5, Y5 + ADDQ SI, DX + ADDQ SI, CX + +mulAvxGFNI_2x1Xor_loop: + // Load 1 outputs + VMOVDQU (BX), Y2 -mulAvxTwo_2x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y5, Y6, Y6 - VPAND Y5, Y7, Y7 - VMOVDQU (BX), Y4 - VPSHUFB Y6, Y0, Y6 - VPSHUFB Y7, Y1, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VMOVDQU (DX), Y3 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y3 + VXORPD Y2, Y3, Y2 // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (CX), Y6 - ADDQ $0x20, CX - VPSRLQ $0x04, Y6, Y7 - VPAND Y5, Y6, Y6 - VPAND Y5, Y7, Y7 - VPSHUFB Y6, Y2, Y6 - VPSHUFB Y7, Y3, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + VXORPD Y2, Y3, Y2 // Store 1 outputs - VMOVDQU Y4, (BX) + VMOVDQU Y2, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX - JNZ mulAvxTwo_2x1Xor_loop + JNZ mulAvxGFNI_2x1Xor_loop VZEROUPPER -mulAvxTwo_2x1Xor_end: +mulAvxGFNI_2x1Xor_end: RET // func mulAvxTwo_2x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -4228,7 +5454,6 @@ TEXT ·mulAvxTwo_2x1_64Xor(SB), $0-88 MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX - MOVQ out_base+48(FP), BX MOVQ (BX), BX MOVQ start+72(FP), SI @@ -4294,85 +5519,6 @@ mulAvxTwo_2x1_64Xor_loop: mulAvxTwo_2x1_64Xor_end: RET -// func mulAvxTwo_2x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_2x2(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 15 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_2x2_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), CX - MOVQ out_base+48(FP), BX - MOVQ (BX), SI - MOVQ 24(BX), BX - MOVQ start+72(FP), DI - - // Add start offset to output - ADDQ DI, SI - ADDQ DI, BX - - // Add start offset to input - ADDQ DI, DX - ADDQ DI, CX - MOVQ $0x0000000f, DI - MOVQ DI, X10 - VPBROADCASTB X10, Y10 - -mulAvxTwo_2x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y13 - ADDQ $0x20, DX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VPSHUFB Y13, Y0, Y11 - VPSHUFB Y14, Y1, Y12 - VPXOR Y11, Y12, Y8 - VPSHUFB Y13, Y2, Y11 - VPSHUFB Y14, Y3, Y12 - VPXOR Y11, Y12, Y9 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (CX), Y13 - ADDQ $0x20, CX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VPSHUFB Y13, Y4, Y11 - VPSHUFB Y14, Y5, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) - VPSHUFB Y13, Y6, Y11 - VPSHUFB Y14, Y7, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) - - // Store 2 outputs - VMOVDQU Y8, (SI) - ADDQ $0x20, SI - VMOVDQU Y9, (BX) - ADDQ $0x20, BX - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_2x2_loop - VZEROUPPER - -mulAvxTwo_2x2_end: - RET - // func mulAvxTwo_2x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x2_64(SB), $0-88 @@ -4388,7 +5534,6 @@ TEXT ·mulAvxTwo_2x2_64(SB), $0-88 MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI - MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), SI MOVQ start+72(FP), R8 @@ -4536,6 +5681,67 @@ mulGFNI_2x2_64_loop: mulGFNI_2x2_64_end: RET +// func mulAvxGFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulAvxGFNI_2x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y5 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y3, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 2 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x2_loop + VZEROUPPER + +mulAvxGFNI_2x2_end: + RET + // func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x2_64Xor(SB), $0-88 @@ -4603,85 +5809,71 @@ mulGFNI_2x2_64Xor_loop: mulGFNI_2x2_64Xor_end: RET -// func mulAvxTwo_2x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_2x2Xor(SB), NOSPLIT, $0-88 +// func mulAvxGFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x2Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 15 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_2x2Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), CX - MOVQ out_base+48(FP), BX - MOVQ (BX), SI - MOVQ 24(BX), BX - MOVQ start+72(FP), DI + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI // Add start offset to output ADDQ DI, SI ADDQ DI, BX // Add start offset to input - ADDQ DI, DX - ADDQ DI, CX - MOVQ $0x0000000f, DI - MOVQ DI, X10 - VPBROADCASTB X10, Y10 + ADDQ DI, DX + ADDQ DI, CX + +mulAvxGFNI_2x2Xor_loop: + // Load 2 outputs + VMOVDQU (SI), Y4 + VMOVDQU (BX), Y5 -mulAvxTwo_2x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y13 - ADDQ $0x20, DX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU (SI), Y8 - VPSHUFB Y13, Y0, Y11 - VPSHUFB Y14, Y1, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) - VMOVDQU (BX), Y9 - VPSHUFB Y13, Y2, Y11 - VPSHUFB Y14, Y3, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y7 + VXORPD Y5, Y7, Y5 // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (CX), Y13 - ADDQ $0x20, CX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VPSHUFB Y13, Y4, Y11 - VPSHUFB Y14, Y5, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) - VPSHUFB Y13, Y6, Y11 - VPSHUFB Y14, Y7, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y3, Y6, Y7 + VXORPD Y5, Y7, Y5 // Store 2 outputs - VMOVDQU Y8, (SI) + VMOVDQU Y4, (SI) ADDQ $0x20, SI - VMOVDQU Y9, (BX) + VMOVDQU Y5, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX - JNZ mulAvxTwo_2x2Xor_loop + JNZ mulAvxGFNI_2x2Xor_loop VZEROUPPER -mulAvxTwo_2x2Xor_end: +mulAvxGFNI_2x2Xor_end: RET // func mulAvxTwo_2x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -4699,7 +5891,6 @@ TEXT ·mulAvxTwo_2x2_64Xor(SB), $0-88 MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI - MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), SI MOVQ start+72(FP), R8 @@ -4792,99 +5983,6 @@ mulAvxTwo_2x2_64Xor_loop: mulAvxTwo_2x2_64Xor_end: RET -// func mulAvxTwo_2x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_2x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 20 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_2x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), DX - MOVQ out_base+48(FP), SI - MOVQ (SI), DI - MOVQ 24(SI), R8 - MOVQ 48(SI), SI - MOVQ start+72(FP), R9 - - // Add start offset to output - ADDQ R9, DI - ADDQ R9, R8 - ADDQ R9, SI - - // Add start offset to input - ADDQ R9, BX - ADDQ R9, DX - MOVQ $0x0000000f, R9 - MOVQ R9, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_2x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (DI) - ADDQ $0x20, DI - VMOVDQU Y1, (R8) - ADDQ $0x20, R8 - VMOVDQU Y2, (SI) - ADDQ $0x20, SI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_2x3_loop - VZEROUPPER - -mulAvxTwo_2x3_end: - RET - // func mulAvxTwo_2x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x3_64(SB), $0-88 @@ -4900,7 +5998,6 @@ TEXT ·mulAvxTwo_2x3_64(SB), $0-88 MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI - MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), SI @@ -5078,6 +6175,76 @@ mulGFNI_2x3_64_loop: mulGFNI_2x3_64_end: RET +// func mulAvxGFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulAvxGFNI_2x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y8 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Store 3 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x3_loop + VZEROUPPER + +mulAvxGFNI_2x3_end: + RET + // func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x3_64Xor(SB), $0-88 @@ -5156,100 +6323,82 @@ mulGFNI_2x3_64Xor_loop: mulGFNI_2x3_64Xor_end: RET -// func mulAvxTwo_2x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_2x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x3Xor(SB), $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 20 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_2x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), DX - MOVQ out_base+48(FP), SI - MOVQ (SI), DI - MOVQ 24(SI), R8 - MOVQ 48(SI), SI - MOVQ start+72(FP), R9 + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 // Add start offset to output - ADDQ R9, DI - ADDQ R9, R8 - ADDQ R9, SI + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX // Add start offset to input - ADDQ R9, BX - ADDQ R9, DX - MOVQ $0x0000000f, R9 - MOVQ R9, X3 - VPBROADCASTB X3, Y3 + ADDQ R8, DX + ADDQ R8, CX + +mulAvxGFNI_2x3Xor_loop: + // Load 3 outputs + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (BX), Y8 -mulAvxTwo_2x3Xor_loop: // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (DI), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R8), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (SI), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y10 + VXORPD Y8, Y10, Y8 // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y10 + VXORPD Y8, Y10, Y8 // Store 3 outputs - VMOVDQU Y0, (DI) - ADDQ $0x20, DI - VMOVDQU Y1, (R8) - ADDQ $0x20, R8 - VMOVDQU Y2, (SI) + VMOVDQU Y6, (SI) ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (BX) + ADDQ $0x20, BX // Prepare for next loop DECQ AX - JNZ mulAvxTwo_2x3Xor_loop + JNZ mulAvxGFNI_2x3Xor_loop VZEROUPPER -mulAvxTwo_2x3Xor_end: +mulAvxGFNI_2x3Xor_end: RET // func mulAvxTwo_2x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -5267,7 +6416,6 @@ TEXT ·mulAvxTwo_2x3_64Xor(SB), $0-88 MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI - MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), SI @@ -5569,6 +6717,85 @@ mulGFNI_2x4_64_loop: mulGFNI_2x4_64_end: RET +// func mulAvxGFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x4(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulAvxGFNI_2x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y11 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 4 outputs + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x4_loop + VZEROUPPER + +mulAvxGFNI_2x4_end: + RET + // func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x4_64Xor(SB), $0-88 @@ -5658,6 +6885,95 @@ mulGFNI_2x4_64Xor_loop: mulGFNI_2x4_64Xor_end: RET +// func mulAvxGFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x4Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulAvxGFNI_2x4Xor_loop: + // Load 4 outputs + VMOVDQU (SI), Y8 + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (BX), Y11 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 4 outputs + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x4Xor_loop + VZEROUPPER + +mulAvxGFNI_2x4Xor_end: + RET + // func mulAvxTwo_2x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x4Xor(SB), NOSPLIT, $0-88 @@ -5978,6 +7294,94 @@ mulGFNI_2x5_64_loop: mulGFNI_2x5_64_end: RET +// func mulAvxGFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x5(SB), $0-88 + // Loading 9 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + +mulAvxGFNI_2x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x5_loop + VZEROUPPER + +mulAvxGFNI_2x5_end: + RET + // func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x5_64Xor(SB), $0-88 @@ -6078,6 +7482,106 @@ mulGFNI_2x5_64Xor_loop: mulGFNI_2x5_64Xor_end: RET +// func mulAvxGFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x5Xor(SB), $0-88 + // Loading 9 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + +mulAvxGFNI_2x5Xor_loop: + // Load 5 outputs + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x5Xor_loop + VZEROUPPER + +mulAvxGFNI_2x5Xor_end: + RET + // func mulAvxTwo_2x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x5Xor(SB), NOSPLIT, $0-88 @@ -6436,6 +7940,103 @@ mulGFNI_2x6_64_loop: mulGFNI_2x6_64_end: RET +// func mulAvxGFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x6(SB), $0-88 + // Loading 8 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + +mulAvxGFNI_2x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x6_loop + VZEROUPPER + +mulAvxGFNI_2x6_end: + RET + // func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x6_64Xor(SB), $0-88 @@ -6547,6 +8148,117 @@ mulGFNI_2x6_64Xor_loop: mulGFNI_2x6_64Xor_end: RET +// func mulAvxGFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x6Xor(SB), $0-88 + // Loading 8 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + +mulAvxGFNI_2x6Xor_loop: + // Load 6 outputs + VMOVDQU (DI), Y8 + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x6Xor_loop + VZEROUPPER + +mulAvxGFNI_2x6Xor_end: + RET + // func mulAvxTwo_2x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x6Xor(SB), NOSPLIT, $0-88 @@ -6943,6 +8655,112 @@ mulGFNI_2x7_64_loop: mulGFNI_2x7_64_end: RET +// func mulAvxGFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x7(SB), $0-88 + // Loading 7 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, DX + +mulAvxGFNI_2x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x7_loop + VZEROUPPER + +mulAvxGFNI_2x7_end: + RET + // func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x7_64Xor(SB), $0-88 @@ -7065,6 +8883,128 @@ mulGFNI_2x7_64Xor_loop: mulGFNI_2x7_64Xor_end: RET +// func mulAvxGFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x7Xor(SB), $0-88 + // Loading 7 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, DX + +mulAvxGFNI_2x7Xor_loop: + // Load 7 outputs + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x7Xor_loop + VZEROUPPER + +mulAvxGFNI_2x7Xor_end: + RET + // func mulAvxTwo_2x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x7Xor(SB), NOSPLIT, $0-88 @@ -7499,6 +9439,121 @@ mulGFNI_2x8_64_loop: mulGFNI_2x8_64_end: RET +// func mulAvxGFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x8(SB), $0-88 + // Loading 6 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + +mulAvxGFNI_2x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x8_loop + VZEROUPPER + +mulAvxGFNI_2x8_end: + RET + // func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x8_64Xor(SB), $0-88 @@ -7632,6 +9687,139 @@ mulGFNI_2x8_64Xor_loop: mulGFNI_2x8_64Xor_end: RET +// func mulAvxGFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x8Xor(SB), $0-88 + // Loading 6 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + +mulAvxGFNI_2x8Xor_loop: + // Load 8 outputs + VMOVDQU (DI), Y6 + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x8Xor_loop + VZEROUPPER + +mulAvxGFNI_2x8Xor_end: + RET + // func mulAvxTwo_2x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x8Xor(SB), NOSPLIT, $0-88 @@ -8104,6 +10292,130 @@ mulGFNI_2x9_64_loop: mulGFNI_2x9_64_end: RET +// func mulAvxGFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x9(SB), $0-88 + // Loading 5 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + +mulAvxGFNI_2x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x9_loop + VZEROUPPER + +mulAvxGFNI_2x9_end: + RET + // func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x9_64Xor(SB), $0-88 @@ -8248,6 +10560,150 @@ mulGFNI_2x9_64Xor_loop: mulGFNI_2x9_64Xor_end: RET +// func mulAvxGFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x9Xor(SB), $0-88 + // Loading 5 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + +mulAvxGFNI_2x9Xor_loop: + // Load 9 outputs + VMOVDQU (DI), Y5 + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x9Xor_loop + VZEROUPPER + +mulAvxGFNI_2x9Xor_end: + RET + // func mulAvxTwo_2x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x9Xor(SB), NOSPLIT, $0-88 @@ -8758,6 +11214,139 @@ mulGFNI_2x10_64_loop: mulGFNI_2x10_64_end: RET +// func mulAvxGFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x10(SB), $8-88 + // Loading 4 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + +mulAvxGFNI_2x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x10_loop + VZEROUPPER + +mulAvxGFNI_2x10_end: + RET + // func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x10_64Xor(SB), $0-88 @@ -8913,6 +11502,161 @@ mulGFNI_2x10_64Xor_loop: mulGFNI_2x10_64Xor_end: RET +// func mulAvxGFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x10Xor(SB), $8-88 + // Loading 4 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + +mulAvxGFNI_2x10Xor_loop: + // Load 10 outputs + VMOVDQU (DI), Y4 + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x10Xor_loop + VZEROUPPER + +mulAvxGFNI_2x10Xor_end: + RET + // func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x10Xor(SB), NOSPLIT, $8-88 @@ -9114,85 +11858,6 @@ mulAvxTwo_2x10Xor_loop: mulAvxTwo_2x10Xor_end: RET -// func mulAvxTwo_3x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_3x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 10 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_3x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), CX - MOVQ out_base+48(FP), SI - MOVQ (SI), SI - MOVQ start+72(FP), DI - - // Add start offset to output - ADDQ DI, SI - - // Add start offset to input - ADDQ DI, DX - ADDQ DI, BX - ADDQ DI, CX - MOVQ $0x0000000f, DI - MOVQ DI, X7 - VPBROADCASTB X7, Y7 - -mulAvxTwo_3x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y8 - ADDQ $0x20, DX - VPSRLQ $0x04, Y8, Y9 - VPAND Y7, Y8, Y8 - VPAND Y7, Y9, Y9 - VPSHUFB Y8, Y0, Y8 - VPSHUFB Y9, Y1, Y9 - VPXOR Y8, Y9, Y6 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y8 - ADDQ $0x20, BX - VPSRLQ $0x04, Y8, Y9 - VPAND Y7, Y8, Y8 - VPAND Y7, Y9, Y9 - VPSHUFB Y8, Y2, Y8 - VPSHUFB Y9, Y3, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (CX), Y8 - ADDQ $0x20, CX - VPSRLQ $0x04, Y8, Y9 - VPAND Y7, Y8, Y8 - VPAND Y7, Y9, Y9 - VPSHUFB Y8, Y4, Y8 - VPSHUFB Y9, Y5, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) - - // Store 1 outputs - VMOVDQU Y6, (SI) - ADDQ $0x20, SI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_3x1_loop - VZEROUPPER - -mulAvxTwo_3x1_end: - RET - // func mulAvxTwo_3x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x1_64(SB), $0-88 @@ -9209,7 +11874,6 @@ TEXT ·mulAvxTwo_3x1_64(SB), $0-88 MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI - MOVQ out_base+48(FP), DI MOVQ (DI), DI MOVQ start+72(FP), R8 @@ -9356,6 +12020,67 @@ mulGFNI_3x1_64_loop: mulGFNI_3x1_64_end: RET +// func mulAvxGFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulAvxGFNI_3x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y3 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Store 1 outputs + VMOVDQU Y3, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x1_loop + VZEROUPPER + +mulAvxGFNI_3x1_end: + RET + // func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x1_64Xor(SB), $0-88 @@ -9421,84 +12146,69 @@ mulGFNI_3x1_64Xor_loop: mulGFNI_3x1_64Xor_end: RET -// func mulAvxTwo_3x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_3x1Xor(SB), NOSPLIT, $0-88 +// func mulAvxGFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 10 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_3x1Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), CX - MOVQ out_base+48(FP), SI - MOVQ (SI), SI - MOVQ start+72(FP), DI + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI // Add start offset to output ADDQ DI, SI // Add start offset to input - ADDQ DI, DX - ADDQ DI, BX - ADDQ DI, CX - MOVQ $0x0000000f, DI - MOVQ DI, X7 - VPBROADCASTB X7, Y7 + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulAvxGFNI_3x1Xor_loop: + // Load 1 outputs + VMOVDQU (SI), Y3 -mulAvxTwo_3x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y8 - ADDQ $0x20, DX - VPSRLQ $0x04, Y8, Y9 - VPAND Y7, Y8, Y8 - VPAND Y7, Y9, Y9 - VMOVDQU (SI), Y6 - VPSHUFB Y8, Y0, Y8 - VPSHUFB Y9, Y1, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y4 + VXORPD Y3, Y4, Y3 // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y8 - ADDQ $0x20, BX - VPSRLQ $0x04, Y8, Y9 - VPAND Y7, Y8, Y8 - VPAND Y7, Y9, Y9 - VPSHUFB Y8, Y2, Y8 - VPSHUFB Y9, Y3, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y4, Y4 + VXORPD Y3, Y4, Y3 // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (CX), Y8 - ADDQ $0x20, CX - VPSRLQ $0x04, Y8, Y9 - VPAND Y7, Y8, Y8 - VPAND Y7, Y9, Y9 - VPSHUFB Y8, Y4, Y8 - VPSHUFB Y9, Y5, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y4, Y4 + VXORPD Y3, Y4, Y3 // Store 1 outputs - VMOVDQU Y6, (SI) + VMOVDQU Y3, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX - JNZ mulAvxTwo_3x1Xor_loop + JNZ mulAvxGFNI_3x1Xor_loop VZEROUPPER -mulAvxTwo_3x1Xor_end: +mulAvxGFNI_3x1Xor_end: RET // func mulAvxTwo_3x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -9517,7 +12227,6 @@ TEXT ·mulAvxTwo_3x1_64Xor(SB), $0-88 MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI - MOVQ out_base+48(FP), DI MOVQ (DI), DI MOVQ start+72(FP), R8 @@ -9607,104 +12316,6 @@ mulAvxTwo_3x1_64Xor_loop: mulAvxTwo_3x1_64Xor_end: RET -// func mulAvxTwo_3x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_3x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 19 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_3x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DX - MOVQ out_base+48(FP), DI - MOVQ (DI), R8 - MOVQ 24(DI), DI - MOVQ start+72(FP), R9 - - // Add start offset to output - ADDQ R9, R8 - ADDQ R9, DI - - // Add start offset to input - ADDQ R9, BX - ADDQ R9, SI - ADDQ R9, DX - MOVQ $0x0000000f, R9 - MOVQ R9, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_3x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R8) - ADDQ $0x20, R8 - VMOVDQU Y1, (DI) - ADDQ $0x20, DI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_3x2_loop - VZEROUPPER - -mulAvxTwo_3x2_end: - RET - // func mulAvxTwo_3x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x2_64(SB), $0-88 @@ -9721,7 +12332,6 @@ TEXT ·mulAvxTwo_3x2_64(SB), $0-88 MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI - MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), DI MOVQ start+72(FP), R9 @@ -9909,6 +12519,79 @@ mulGFNI_3x2_64_loop: mulGFNI_3x2_64_end: RET +// func mulAvxGFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulAvxGFNI_3x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y7 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 2 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x2_loop + VZEROUPPER + +mulAvxGFNI_3x2_end: + RET + // func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x2_64Xor(SB), $0-88 @@ -9988,104 +12671,83 @@ mulGFNI_3x2_64Xor_loop: mulGFNI_3x2_64Xor_end: RET -// func mulAvxTwo_3x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_3x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x2Xor(SB), $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 19 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_3x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DX - MOVQ out_base+48(FP), DI - MOVQ (DI), R8 - MOVQ 24(DI), DI - MOVQ start+72(FP), R9 + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 // Add start offset to output - ADDQ R9, R8 - ADDQ R9, DI + ADDQ R8, DI + ADDQ R8, SI // Add start offset to input - ADDQ R9, BX - ADDQ R9, SI - ADDQ R9, DX - MOVQ $0x0000000f, R9 - MOVQ R9, X2 - VPBROADCASTB X2, Y2 + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulAvxGFNI_3x2Xor_loop: + // Load 2 outputs + VMOVDQU (DI), Y6 + VMOVDQU (SI), Y7 -mulAvxTwo_3x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R8), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (DI), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y9 + VXORPD Y7, Y9, Y7 // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y9 + VXORPD Y7, Y9, Y7 // Store 2 outputs - VMOVDQU Y0, (R8) - ADDQ $0x20, R8 - VMOVDQU Y1, (DI) + VMOVDQU Y6, (DI) ADDQ $0x20, DI + VMOVDQU Y7, (SI) + ADDQ $0x20, SI // Prepare for next loop DECQ AX - JNZ mulAvxTwo_3x2Xor_loop + JNZ mulAvxGFNI_3x2Xor_loop VZEROUPPER -mulAvxTwo_3x2Xor_end: +mulAvxGFNI_3x2Xor_end: RET // func mulAvxTwo_3x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -10104,7 +12766,6 @@ TEXT ·mulAvxTwo_3x2_64Xor(SB), $0-88 MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI - MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), DI MOVQ start+72(FP), R9 @@ -10225,123 +12886,6 @@ mulAvxTwo_3x2_64Xor_loop: mulAvxTwo_3x2_64Xor_end: RET -// func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_3x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 26 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_3x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DX - MOVQ out_base+48(FP), DI - MOVQ (DI), R8 - MOVQ 24(DI), R9 - MOVQ 48(DI), DI - MOVQ start+72(FP), R10 - - // Add start offset to output - ADDQ R10, R8 - ADDQ R10, R9 - ADDQ R10, DI - - // Add start offset to input - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DX - MOVQ $0x0000000f, R10 - MOVQ R10, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_3x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R8) - ADDQ $0x20, R8 - VMOVDQU Y1, (R9) - ADDQ $0x20, R9 - VMOVDQU Y2, (DI) - ADDQ $0x20, DI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_3x3_loop - VZEROUPPER - -mulAvxTwo_3x3_end: - RET - // func mulAvxTwo_3x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x3_64(SB), $0-88 @@ -10358,7 +12902,6 @@ TEXT ·mulAvxTwo_3x3_64(SB), $0-88 MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI - MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), DI @@ -10587,6 +13130,91 @@ mulGFNI_3x3_64_loop: mulGFNI_3x3_64_end: RET +// func mulAvxGFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulAvxGFNI_3x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y11 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 3 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x3_loop + VZEROUPPER + +mulAvxGFNI_3x3_end: + RET + // func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x3_64Xor(SB), $0-88 @@ -10680,124 +13308,97 @@ mulGFNI_3x3_64Xor_loop: mulGFNI_3x3_64Xor_end: RET -// func mulAvxTwo_3x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_3x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x3Xor(SB), $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 26 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_3x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DX - MOVQ out_base+48(FP), DI - MOVQ (DI), R8 - MOVQ 24(DI), R9 - MOVQ 48(DI), DI - MOVQ start+72(FP), R10 + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 // Add start offset to output - ADDQ R10, R8 - ADDQ R10, R9 - ADDQ R10, DI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI // Add start offset to input - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DX - MOVQ $0x0000000f, R10 - MOVQ R10, X3 - VPBROADCASTB X3, Y3 + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulAvxGFNI_3x3Xor_loop: + // Load 3 outputs + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (SI), Y11 -mulAvxTwo_3x3Xor_loop: // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R8), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R9), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (DI), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y11, Y13, Y11 // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y11, Y13, Y11 // Store 3 outputs - VMOVDQU Y0, (R8) - ADDQ $0x20, R8 - VMOVDQU Y1, (R9) - ADDQ $0x20, R9 - VMOVDQU Y2, (DI) + VMOVDQU Y9, (DI) ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (SI) + ADDQ $0x20, SI // Prepare for next loop DECQ AX - JNZ mulAvxTwo_3x3Xor_loop + JNZ mulAvxGFNI_3x3Xor_loop VZEROUPPER -mulAvxTwo_3x3Xor_end: +mulAvxGFNI_3x3Xor_end: RET // func mulAvxTwo_3x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -10816,7 +13417,6 @@ TEXT ·mulAvxTwo_3x3_64Xor(SB), $0-88 MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI - MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), DI @@ -11201,6 +13801,103 @@ mulGFNI_3x4_64_loop: mulGFNI_3x4_64_end: RET +// func mulAvxGFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x4(SB), $0-88 + // Loading 10 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + +mulAvxGFNI_3x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x4_loop + VZEROUPPER + +mulAvxGFNI_3x4_end: + RET + // func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x4_64Xor(SB), $0-88 @@ -11308,6 +14005,113 @@ mulGFNI_3x4_64Xor_loop: mulGFNI_3x4_64Xor_end: RET +// func mulAvxGFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x4Xor(SB), $0-88 + // Loading 10 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + +mulAvxGFNI_3x4Xor_loop: + // Load 4 outputs + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x4Xor_loop + VZEROUPPER + +mulAvxGFNI_3x4Xor_end: + RET + // func mulAvxTwo_3x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x4Xor(SB), NOSPLIT, $0-88 @@ -11712,6 +14516,115 @@ mulGFNI_3x5_64_loop: mulGFNI_3x5_64_end: RET +// func mulAvxGFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x5(SB), $0-88 + // Loading 9 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + +mulAvxGFNI_3x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x5_loop + VZEROUPPER + +mulAvxGFNI_3x5_end: + RET + // func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x5_64Xor(SB), $0-88 @@ -11833,6 +14746,127 @@ mulGFNI_3x5_64Xor_loop: mulGFNI_3x5_64Xor_end: RET +// func mulAvxGFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x5Xor(SB), $0-88 + // Loading 9 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + +mulAvxGFNI_3x5Xor_loop: + // Load 5 outputs + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x5Xor_loop + VZEROUPPER + +mulAvxGFNI_3x5Xor_end: + RET + // func mulAvxTwo_3x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x5Xor(SB), NOSPLIT, $0-88 @@ -12288,6 +15322,127 @@ mulGFNI_3x6_64_loop: mulGFNI_3x6_64_end: RET +// func mulAvxGFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x6(SB), $0-88 + // Loading 8 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + +mulAvxGFNI_3x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x6_loop + VZEROUPPER + +mulAvxGFNI_3x6_end: + RET + // func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x6_64Xor(SB), $0-88 @@ -12423,6 +15578,141 @@ mulGFNI_3x6_64Xor_loop: mulGFNI_3x6_64Xor_end: RET +// func mulAvxGFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x6Xor(SB), $0-88 + // Loading 8 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + +mulAvxGFNI_3x6Xor_loop: + // Load 6 outputs + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x6Xor_loop + VZEROUPPER + +mulAvxGFNI_3x6Xor_end: + RET + // func mulAvxTwo_3x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x6Xor(SB), NOSPLIT, $0-88 @@ -12929,6 +16219,139 @@ mulGFNI_3x7_64_loop: mulGFNI_3x7_64_end: RET +// func mulAvxGFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x7(SB), $0-88 + // Loading 7 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + +mulAvxGFNI_3x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x7_loop + VZEROUPPER + +mulAvxGFNI_3x7_end: + RET + // func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x7_64Xor(SB), $0-88 @@ -13078,6 +16501,155 @@ mulGFNI_3x7_64Xor_loop: mulGFNI_3x7_64Xor_end: RET +// func mulAvxGFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x7Xor(SB), $0-88 + // Loading 7 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + +mulAvxGFNI_3x7Xor_loop: + // Load 7 outputs + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x7Xor_loop + VZEROUPPER + +mulAvxGFNI_3x7Xor_end: + RET + // func mulAvxTwo_3x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x7Xor(SB), NOSPLIT, $0-88 @@ -13633,6 +17205,151 @@ mulGFNI_3x8_64_loop: mulGFNI_3x8_64_end: RET +// func mulAvxGFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x8(SB), $0-88 + // Loading 6 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulAvxGFNI_3x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x8_loop + VZEROUPPER + +mulAvxGFNI_3x8_end: + RET + // func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x8_64Xor(SB), $0-88 @@ -13794,6 +17511,169 @@ mulGFNI_3x8_64Xor_loop: mulGFNI_3x8_64Xor_end: RET +// func mulAvxGFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x8Xor(SB), $0-88 + // Loading 6 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulAvxGFNI_3x8Xor_loop: + // Load 8 outputs + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x8Xor_loop + VZEROUPPER + +mulAvxGFNI_3x8Xor_end: + RET + // func mulAvxTwo_3x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x8Xor(SB), NOSPLIT, $0-88 @@ -14396,6 +18276,163 @@ mulGFNI_3x9_64_loop: mulGFNI_3x9_64_end: RET +// func mulAvxGFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x9(SB), $8-88 + // Loading 5 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulAvxGFNI_3x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x9_loop + VZEROUPPER + +mulAvxGFNI_3x9_end: + RET + // func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x9_64Xor(SB), $8-88 @@ -14567,6 +18604,183 @@ mulGFNI_3x9_64Xor_loop: mulGFNI_3x9_64Xor_end: RET +// func mulAvxGFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x9Xor(SB), $8-88 + // Loading 5 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulAvxGFNI_3x9Xor_loop: + // Load 9 outputs + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x9Xor_loop + VZEROUPPER + +mulAvxGFNI_3x9Xor_end: + RET + // func mulAvxTwo_3x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x9Xor(SB), NOSPLIT, $8-88 @@ -15222,6 +19436,179 @@ mulGFNI_3x10_64_loop: mulGFNI_3x10_64_end: RET +// func mulAvxGFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x10(SB), $8-88 + // Loading 4 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_3x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_3x10_loop + VZEROUPPER + +mulAvxGFNI_3x10_end: + RET + // func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x10_64Xor(SB), $8-88 @@ -15407,6 +19794,201 @@ mulGFNI_3x10_64Xor_loop: mulGFNI_3x10_64Xor_end: RET +// func mulAvxGFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x10Xor(SB), $8-88 + // Loading 4 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_3x10Xor_loop: + // Load 10 outputs + VMOVDQU (DI), Y4 + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_3x10Xor_loop + VZEROUPPER + +mulAvxGFNI_3x10Xor_end: + RET + // func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x10Xor(SB), NOSPLIT, $8-88 @@ -15669,99 +20251,6 @@ mulAvxTwo_3x10Xor_loop: mulAvxTwo_3x10Xor_end: RET -// func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_4x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 12 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_4x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), CX - MOVQ out_base+48(FP), DI - MOVQ (DI), DI - MOVQ start+72(FP), R8 - - // Add start offset to output - ADDQ R8, DI - - // Add start offset to input - ADDQ R8, DX - ADDQ R8, BX - ADDQ R8, SI - ADDQ R8, CX - MOVQ $0x0000000f, R8 - MOVQ R8, X9 - VPBROADCASTB X9, Y9 - -mulAvxTwo_4x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y10 - ADDQ $0x20, DX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y0, Y10 - VPSHUFB Y11, Y1, Y11 - VPXOR Y10, Y11, Y8 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y10 - ADDQ $0x20, BX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y2, Y10 - VPSHUFB Y11, Y3, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y10 - ADDQ $0x20, SI - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y4, Y10 - VPSHUFB Y11, Y5, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (CX), Y10 - ADDQ $0x20, CX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y6, Y10 - VPSHUFB Y11, Y7, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) - - // Store 1 outputs - VMOVDQU Y8, (DI) - ADDQ $0x20, DI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_4x1_loop - VZEROUPPER - -mulAvxTwo_4x1_end: - RET - // func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x1_64(SB), $0-88 @@ -15779,7 +20268,6 @@ TEXT ·mulAvxTwo_4x1_64(SB), $0-88 MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 - MOVQ out_base+48(FP), R8 MOVQ (R8), R8 MOVQ start+72(FP), R9 @@ -15955,6 +20443,76 @@ mulGFNI_4x1_64_loop: mulGFNI_4x1_64_end: RET +// func mulAvxGFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulAvxGFNI_4x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y4 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Store 1 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x1_loop + VZEROUPPER + +mulAvxGFNI_4x1_end: + RET + // func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x1_64Xor(SB), $0-88 @@ -16029,98 +20587,78 @@ mulGFNI_4x1_64Xor_loop: mulGFNI_4x1_64Xor_end: RET -// func mulAvxTwo_4x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_4x1Xor(SB), NOSPLIT, $0-88 +// func mulAvxGFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 12 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_4x1Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), CX - MOVQ out_base+48(FP), DI - MOVQ (DI), DI - MOVQ start+72(FP), R8 + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 // Add start offset to output ADDQ R8, DI // Add start offset to input - ADDQ R8, DX - ADDQ R8, BX - ADDQ R8, SI - ADDQ R8, CX - MOVQ $0x0000000f, R8 - MOVQ R8, X9 - VPBROADCASTB X9, Y9 + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulAvxGFNI_4x1Xor_loop: + // Load 1 outputs + VMOVDQU (DI), Y4 -mulAvxTwo_4x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y10 - ADDQ $0x20, DX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VMOVDQU (DI), Y8 - VPSHUFB Y10, Y0, Y10 - VPSHUFB Y11, Y1, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y5 + VXORPD Y4, Y5, Y4 // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y10 - ADDQ $0x20, BX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y2, Y10 - VPSHUFB Y11, Y3, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y5, Y5 + VXORPD Y4, Y5, Y4 // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y10 - ADDQ $0x20, SI - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y4, Y10 - VPSHUFB Y11, Y5, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + VXORPD Y4, Y5, Y4 // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (CX), Y10 - ADDQ $0x20, CX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y6, Y10 - VPSHUFB Y11, Y7, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y5, Y5 + VXORPD Y4, Y5, Y4 // Store 1 outputs - VMOVDQU Y8, (DI) + VMOVDQU Y4, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX - JNZ mulAvxTwo_4x1Xor_loop + JNZ mulAvxGFNI_4x1Xor_loop VZEROUPPER -mulAvxTwo_4x1Xor_end: +mulAvxGFNI_4x1Xor_end: RET // func mulAvxTwo_4x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -16140,7 +20678,6 @@ TEXT ·mulAvxTwo_4x1_64Xor(SB), $0-88 MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 - MOVQ out_base+48(FP), R8 MOVQ (R8), R8 MOVQ start+72(FP), R9 @@ -16250,123 +20787,6 @@ mulAvxTwo_4x1_64Xor_loop: mulAvxTwo_4x1_64Xor_end: RET -// func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_4x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 23 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_4x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), DX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R9 - MOVQ 24(R8), R8 - MOVQ start+72(FP), R10 - - // Add start offset to output - ADDQ R10, R9 - ADDQ R10, R8 - - // Add start offset to input - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DI - ADDQ R10, DX - MOVQ $0x0000000f, R10 - MOVQ R10, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_4x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R9) - ADDQ $0x20, R9 - VMOVDQU Y1, (R8) - ADDQ $0x20, R8 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_4x2_loop - VZEROUPPER - -mulAvxTwo_4x2_end: - RET - // func mulAvxTwo_4x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x2_64(SB), $0-88 @@ -16384,7 +20804,6 @@ TEXT ·mulAvxTwo_4x2_64(SB), $0-88 MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 - MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R8 MOVQ start+72(FP), R10 @@ -16612,6 +21031,91 @@ mulGFNI_4x2_64_loop: mulGFNI_4x2_64_end: RET +// func mulAvxGFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulAvxGFNI_4x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y9 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 2 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x2_loop + VZEROUPPER + +mulAvxGFNI_4x2_end: + RET + // func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x2_64Xor(SB), $0-88 @@ -16703,123 +21207,95 @@ mulGFNI_4x2_64Xor_loop: mulGFNI_4x2_64Xor_end: RET -// func mulAvxTwo_4x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_4x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x2Xor(SB), $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 23 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_4x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), DX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R9 - MOVQ 24(R8), R8 - MOVQ start+72(FP), R10 + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 // Add start offset to output - ADDQ R10, R9 - ADDQ R10, R8 + ADDQ R9, R8 + ADDQ R9, DI // Add start offset to input - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DI - ADDQ R10, DX - MOVQ $0x0000000f, R10 - MOVQ R10, X2 - VPBROADCASTB X2, Y2 + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulAvxGFNI_4x2Xor_loop: + // Load 2 outputs + VMOVDQU (R8), Y8 + VMOVDQU (DI), Y9 -mulAvxTwo_4x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R9), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R8), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y11 + VXORPD Y9, Y11, Y9 // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y9, Y11, Y9 // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y11 + VXORPD Y9, Y11, Y9 // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y11 + VXORPD Y9, Y11, Y9 // Store 2 outputs - VMOVDQU Y0, (R9) - ADDQ $0x20, R9 - VMOVDQU Y1, (R8) + VMOVDQU Y8, (R8) ADDQ $0x20, R8 + VMOVDQU Y9, (DI) + ADDQ $0x20, DI // Prepare for next loop DECQ AX - JNZ mulAvxTwo_4x2Xor_loop + JNZ mulAvxGFNI_4x2Xor_loop VZEROUPPER -mulAvxTwo_4x2Xor_end: +mulAvxGFNI_4x2Xor_end: RET // func mulAvxTwo_4x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -16839,7 +21315,6 @@ TEXT ·mulAvxTwo_4x2_64Xor(SB), $0-88 MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 - MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R8 MOVQ start+72(FP), R10 @@ -16988,147 +21463,6 @@ mulAvxTwo_4x2_64Xor_loop: mulAvxTwo_4x2_64Xor_end: RET -// func mulAvxTwo_4x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_4x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 32 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_4x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), DX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R9 - MOVQ 24(R8), R10 - MOVQ 48(R8), R8 - MOVQ start+72(FP), R11 - - // Add start offset to output - ADDQ R11, R9 - ADDQ R11, R10 - ADDQ R11, R8 - - // Add start offset to input - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, DX - MOVQ $0x0000000f, R11 - MOVQ R11, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_4x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R9) - ADDQ $0x20, R9 - VMOVDQU Y1, (R10) - ADDQ $0x20, R10 - VMOVDQU Y2, (R8) - ADDQ $0x20, R8 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_4x3_loop - VZEROUPPER - -mulAvxTwo_4x3_end: - RET - // func mulAvxTwo_4x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x3_64(SB), $0-88 @@ -17146,7 +21480,6 @@ TEXT ·mulAvxTwo_4x3_64(SB), $0-88 MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 - MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R8 @@ -17426,6 +21759,106 @@ mulGFNI_4x3_64_loop: mulGFNI_4x3_64_end: RET +// func mulAvxGFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x3(SB), $0-88 + // Loading 11 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + +mulAvxGFNI_4x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x3_loop + VZEROUPPER + +mulAvxGFNI_4x3_end: + RET + // func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x3_64Xor(SB), $0-88 @@ -17534,27 +21967,39 @@ mulGFNI_4x3_64Xor_loop: mulGFNI_4x3_64Xor_end: RET -// func mulAvxTwo_4x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_4x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x3Xor(SB), $0-88 + // Loading 11 of 12 tables to registers // Destination kept in GP registers - // Full registers estimated 32 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_4x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), DX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R9 - MOVQ 24(R8), R10 - MOVQ 48(R8), R8 - MOVQ start+72(FP), R11 + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R9 @@ -17562,120 +22007,72 @@ TEXT ·mulAvxTwo_4x3Xor(SB), NOSPLIT, $0-88 ADDQ R11, R8 // Add start offset to input - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, DX - MOVQ $0x0000000f, R11 - MOVQ R11, X3 - VPBROADCASTB X3, Y3 + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + +mulAvxGFNI_4x3Xor_loop: + // Load 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (R8), Y13 -mulAvxTwo_4x3Xor_loop: // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R9), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R10), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R8), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 3 outputs - VMOVDQU Y0, (R9) + VMOVDQU Y11, (R9) ADDQ $0x20, R9 - VMOVDQU Y1, (R10) + VMOVDQU Y12, (R10) ADDQ $0x20, R10 - VMOVDQU Y2, (R8) + VMOVDQU Y13, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_4x3Xor_loop + JNZ mulAvxGFNI_4x3Xor_loop VZEROUPPER -mulAvxTwo_4x3Xor_end: +mulAvxGFNI_4x3Xor_end: RET // func mulAvxTwo_4x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -17695,7 +22092,6 @@ TEXT ·mulAvxTwo_4x3_64Xor(SB), $0-88 MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 - MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R8 @@ -18163,6 +22559,121 @@ mulGFNI_4x4_64_loop: mulGFNI_4x4_64_end: RET +// func mulAvxGFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x4(SB), $0-88 + // Loading 10 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + +mulAvxGFNI_4x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x4_loop + VZEROUPPER + +mulAvxGFNI_4x4_end: + RET + // func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x4_64Xor(SB), $0-88 @@ -18288,6 +22799,131 @@ mulGFNI_4x4_64Xor_loop: mulGFNI_4x4_64Xor_end: RET +// func mulAvxGFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x4Xor(SB), $0-88 + // Loading 10 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + +mulAvxGFNI_4x4Xor_loop: + // Load 4 outputs + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x4Xor_loop + VZEROUPPER + +mulAvxGFNI_4x4Xor_end: + RET + // func mulAvxTwo_4x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x4Xor(SB), NOSPLIT, $0-88 @@ -18776,6 +23412,136 @@ mulGFNI_4x5_64_loop: mulGFNI_4x5_64_end: RET +// func mulAvxGFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x5(SB), $0-88 + // Loading 9 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + +mulAvxGFNI_4x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x5_loop + VZEROUPPER + +mulAvxGFNI_4x5_end: + RET + // func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x5_64Xor(SB), $0-88 @@ -18918,6 +23684,148 @@ mulGFNI_4x5_64Xor_loop: mulGFNI_4x5_64Xor_end: RET +// func mulAvxGFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x5Xor(SB), $0-88 + // Loading 9 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + +mulAvxGFNI_4x5Xor_loop: + // Load 5 outputs + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x5Xor_loop + VZEROUPPER + +mulAvxGFNI_4x5Xor_end: + RET + // func mulAvxTwo_4x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x5Xor(SB), NOSPLIT, $0-88 @@ -19470,6 +24378,151 @@ mulGFNI_4x6_64_loop: mulGFNI_4x6_64_end: RET +// func mulAvxGFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x6(SB), $0-88 + // Loading 8 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + +mulAvxGFNI_4x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x6_loop + VZEROUPPER + +mulAvxGFNI_4x6_end: + RET + // func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x6_64Xor(SB), $0-88 @@ -19629,6 +24682,165 @@ mulGFNI_4x6_64Xor_loop: mulGFNI_4x6_64Xor_end: RET +// func mulAvxGFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x6Xor(SB), $0-88 + // Loading 8 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + +mulAvxGFNI_4x6Xor_loop: + // Load 6 outputs + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x6Xor_loop + VZEROUPPER + +mulAvxGFNI_4x6Xor_end: + RET + // func mulAvxTwo_4x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x6Xor(SB), NOSPLIT, $0-88 @@ -20240,6 +25452,166 @@ mulGFNI_4x7_64_loop: mulGFNI_4x7_64_end: RET +// func mulAvxGFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x7(SB), $0-88 + // Loading 7 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulAvxGFNI_4x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x7_loop + VZEROUPPER + +mulAvxGFNI_4x7_end: + RET + // func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x7_64Xor(SB), $0-88 @@ -20411,6 +25783,182 @@ mulGFNI_4x7_64Xor_loop: mulGFNI_4x7_64Xor_end: RET +// func mulAvxGFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x7Xor(SB), $0-88 + // Loading 7 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulAvxGFNI_4x7Xor_loop: + // Load 7 outputs + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x7Xor_loop + VZEROUPPER + +mulAvxGFNI_4x7Xor_end: + RET + // func mulAvxTwo_4x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x7Xor(SB), NOSPLIT, $0-88 @@ -21081,6 +26629,181 @@ mulGFNI_4x8_64_loop: mulGFNI_4x8_64_end: RET +// func mulAvxGFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x8(SB), $8-88 + // Loading 6 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulAvxGFNI_4x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x8_loop + VZEROUPPER + +mulAvxGFNI_4x8_end: + RET + // func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x8_64Xor(SB), $8-88 @@ -21264,6 +26987,199 @@ mulGFNI_4x8_64Xor_loop: mulGFNI_4x8_64Xor_end: RET +// func mulAvxGFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x8Xor(SB), $8-88 + // Loading 6 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulAvxGFNI_4x8Xor_loop: + // Load 8 outputs + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x8Xor_loop + VZEROUPPER + +mulAvxGFNI_4x8Xor_end: + RET + // func mulAvxTwo_4x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x8Xor(SB), NOSPLIT, $8-88 @@ -21999,6 +27915,200 @@ mulGFNI_4x9_64_loop: mulGFNI_4x9_64_end: RET +// func mulAvxGFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x9(SB), $8-88 + // Loading 5 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_4x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_4x9_loop + VZEROUPPER + +mulAvxGFNI_4x9_end: + RET + // func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x9_64Xor(SB), $8-88 @@ -22198,6 +28308,220 @@ mulGFNI_4x9_64Xor_loop: mulGFNI_4x9_64Xor_end: RET +// func mulAvxGFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x9Xor(SB), $8-88 + // Loading 5 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_4x9Xor_loop: + // Load 9 outputs + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_4x9Xor_loop + VZEROUPPER + +mulAvxGFNI_4x9Xor_end: + RET + // func mulAvxTwo_4x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x9Xor(SB), NOSPLIT, $8-88 @@ -22946,6 +29270,190 @@ mulGFNI_4x10_64_loop: mulGFNI_4x10_64_end: RET +// func mulAvxGFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x10(SB), $0-88 + // Loading 4 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulAvxGFNI_4x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y10, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y11, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y12, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU Y13, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxGFNI_4x10_loop + VZEROUPPER + +mulAvxGFNI_4x10_end: + RET + // func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x10_64Xor(SB), $0-88 @@ -23142,6 +29650,222 @@ mulGFNI_4x10_64Xor_loop: mulGFNI_4x10_64Xor_end: RET +// func mulAvxGFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x10Xor(SB), $0-88 + // Loading 4 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulAvxGFNI_4x10Xor_loop: + // Load 10 outputs + MOVQ (R8), R10 + VMOVDQU (R10)(R9*1), Y4 + MOVQ 24(R8), R10 + VMOVDQU (R10)(R9*1), Y5 + MOVQ 48(R8), R10 + VMOVDQU (R10)(R9*1), Y6 + MOVQ 72(R8), R10 + VMOVDQU (R10)(R9*1), Y7 + MOVQ 96(R8), R10 + VMOVDQU (R10)(R9*1), Y8 + MOVQ 120(R8), R10 + VMOVDQU (R10)(R9*1), Y9 + MOVQ 144(R8), R10 + VMOVDQU (R10)(R9*1), Y10 + MOVQ 168(R8), R10 + VMOVDQU (R10)(R9*1), Y11 + MOVQ 192(R8), R10 + VMOVDQU (R10)(R9*1), Y12 + MOVQ 216(R8), R10 + VMOVDQU (R10)(R9*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y10, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y11, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y12, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU Y13, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxGFNI_4x10Xor_loop + VZEROUPPER + +mulAvxGFNI_4x10Xor_end: + RET + // func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x10Xor(SB), NOSPLIT, $0-88 @@ -23450,113 +30174,6 @@ mulAvxTwo_4x10Xor_loop: mulAvxTwo_4x10Xor_end: RET -// func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_5x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 14 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), DI - MOVQ 96(CX), CX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R8 - MOVQ start+72(FP), R9 - - // Add start offset to output - ADDQ R9, R8 - - // Add start offset to input - ADDQ R9, DX - ADDQ R9, BX - ADDQ R9, SI - ADDQ R9, DI - ADDQ R9, CX - MOVQ $0x0000000f, R9 - MOVQ R9, X11 - VPBROADCASTB X11, Y11 - -mulAvxTwo_5x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y12 - ADDQ $0x20, DX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y0, Y12 - VPSHUFB Y13, Y1, Y13 - VPXOR Y12, Y13, Y10 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y12 - ADDQ $0x20, BX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y2, Y12 - VPSHUFB Y13, Y3, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y12 - ADDQ $0x20, SI - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y4, Y12 - VPSHUFB Y13, Y5, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y12 - ADDQ $0x20, DI - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y6, Y12 - VPSHUFB Y13, Y7, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (CX), Y12 - ADDQ $0x20, CX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y8, Y12 - VPSHUFB Y13, Y9, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) - - // Store 1 outputs - VMOVDQU Y10, (R8) - ADDQ $0x20, R8 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_5x1_loop - VZEROUPPER - -mulAvxTwo_5x1_end: - RET - // func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x1_64(SB), $0-88 @@ -23575,7 +30192,6 @@ TEXT ·mulAvxTwo_5x1_64(SB), $0-88 MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 - MOVQ out_base+48(FP), R9 MOVQ (R9), R9 MOVQ start+72(FP), R10 @@ -23780,6 +30396,85 @@ mulGFNI_5x1_64_loop: mulGFNI_5x1_64_end: RET +// func mulAvxGFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulAvxGFNI_5x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y5 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Store 1 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x1_loop + VZEROUPPER + +mulAvxGFNI_5x1_end: + RET + // func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x1_64Xor(SB), $0-88 @@ -23863,112 +30558,87 @@ mulGFNI_5x1_64Xor_loop: mulGFNI_5x1_64Xor_end: RET -// func mulAvxTwo_5x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_5x1Xor(SB), NOSPLIT, $0-88 +// func mulAvxGFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 14 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x1Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), DI - MOVQ 96(CX), CX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R8 - MOVQ start+72(FP), R9 + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, R8 // Add start offset to input - ADDQ R9, DX - ADDQ R9, BX - ADDQ R9, SI - ADDQ R9, DI - ADDQ R9, CX - MOVQ $0x0000000f, R9 - MOVQ R9, X11 - VPBROADCASTB X11, Y11 + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulAvxGFNI_5x1Xor_loop: + // Load 1 outputs + VMOVDQU (R8), Y5 -mulAvxTwo_5x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y12 - ADDQ $0x20, DX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VMOVDQU (R8), Y10 - VPSHUFB Y12, Y0, Y12 - VPSHUFB Y13, Y1, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y6 + VXORPD Y5, Y6, Y5 // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y12 - ADDQ $0x20, BX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y2, Y12 - VPSHUFB Y13, Y3, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y6, Y6 + VXORPD Y5, Y6, Y5 // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y12 - ADDQ $0x20, SI - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y4, Y12 - VPSHUFB Y13, Y5, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y6, Y6 + VXORPD Y5, Y6, Y5 // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y12 - ADDQ $0x20, DI - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y6, Y12 - VPSHUFB Y13, Y7, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y6, Y6 + VXORPD Y5, Y6, Y5 // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (CX), Y12 - ADDQ $0x20, CX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y8, Y12 - VPSHUFB Y13, Y9, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y6, Y6 + VXORPD Y5, Y6, Y5 // Store 1 outputs - VMOVDQU Y10, (R8) + VMOVDQU Y5, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_5x1Xor_loop + JNZ mulAvxGFNI_5x1Xor_loop VZEROUPPER -mulAvxTwo_5x1Xor_end: +mulAvxGFNI_5x1Xor_end: RET // func mulAvxTwo_5x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -23989,7 +30659,6 @@ TEXT ·mulAvxTwo_5x1_64Xor(SB), $0-88 MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 - MOVQ out_base+48(FP), R9 MOVQ (R9), R9 MOVQ start+72(FP), R10 @@ -24119,142 +30788,6 @@ mulAvxTwo_5x1_64Xor_loop: mulAvxTwo_5x1_64Xor_end: RET -// func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_5x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 27 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), DX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R10 - MOVQ 24(R9), R9 - MOVQ start+72(FP), R11 - - // Add start offset to output - ADDQ R11, R10 - ADDQ R11, R9 - - // Add start offset to input - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, R8 - ADDQ R11, DX - MOVQ $0x0000000f, R11 - MOVQ R11, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_5x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R10) - ADDQ $0x20, R10 - VMOVDQU Y1, (R9) - ADDQ $0x20, R9 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_5x2_loop - VZEROUPPER - -mulAvxTwo_5x2_end: - RET - // func mulAvxTwo_5x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x2_64(SB), $0-88 @@ -24273,7 +30806,6 @@ TEXT ·mulAvxTwo_5x2_64(SB), $0-88 MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 - MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R9 MOVQ start+72(FP), R11 @@ -24541,6 +31073,103 @@ mulGFNI_5x2_64_loop: mulGFNI_5x2_64_end: RET +// func mulAvxGFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulAvxGFNI_5x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y11 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 2 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x2_loop + VZEROUPPER + +mulAvxGFNI_5x2_end: + RET + // func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x2_64Xor(SB), $0-88 @@ -24644,142 +31273,107 @@ mulGFNI_5x2_64Xor_loop: mulGFNI_5x2_64Xor_end: RET -// func mulAvxTwo_5x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_5x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x2Xor(SB), $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 27 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), DX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R10 - MOVQ 24(R9), R9 - MOVQ start+72(FP), R11 + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 // Add start offset to output - ADDQ R11, R10 - ADDQ R11, R9 + ADDQ R10, R9 + ADDQ R10, R8 // Add start offset to input - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, R8 - ADDQ R11, DX - MOVQ $0x0000000f, R11 - MOVQ R11, X2 - VPBROADCASTB X2, Y2 + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulAvxGFNI_5x2Xor_loop: + // Load 2 outputs + VMOVDQU (R9), Y10 + VMOVDQU (R8), Y11 -mulAvxTwo_5x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R10), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R9), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y11, Y13, Y11 // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y12, Y13 + VXORPD Y11, Y13, Y11 // Store 2 outputs - VMOVDQU Y0, (R10) - ADDQ $0x20, R10 - VMOVDQU Y1, (R9) + VMOVDQU Y10, (R9) ADDQ $0x20, R9 + VMOVDQU Y11, (R8) + ADDQ $0x20, R8 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_5x2Xor_loop + JNZ mulAvxGFNI_5x2Xor_loop VZEROUPPER -mulAvxTwo_5x2Xor_end: +mulAvxGFNI_5x2Xor_end: RET // func mulAvxTwo_5x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -24800,7 +31394,6 @@ TEXT ·mulAvxTwo_5x2_64Xor(SB), $0-88 MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 - MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R9 MOVQ start+72(FP), R11 @@ -24977,171 +31570,6 @@ mulAvxTwo_5x2_64Xor_loop: mulAvxTwo_5x2_64Xor_end: RET -// func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_5x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 38 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), DX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R10 - MOVQ 24(R9), R11 - MOVQ 48(R9), R9 - MOVQ start+72(FP), R12 - - // Add start offset to output - ADDQ R12, R10 - ADDQ R12, R11 - ADDQ R12, R9 - - // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_5x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R10) - ADDQ $0x20, R10 - VMOVDQU Y1, (R11) - ADDQ $0x20, R11 - VMOVDQU Y2, (R9) - ADDQ $0x20, R9 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_5x3_loop - VZEROUPPER - -mulAvxTwo_5x3_end: - RET - // func mulAvxTwo_5x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x3_64(SB), $0-88 @@ -25160,7 +31588,6 @@ TEXT ·mulAvxTwo_5x3_64(SB), $0-88 MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 - MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R9 @@ -25491,6 +31918,121 @@ mulGFNI_5x3_64_loop: mulGFNI_5x3_64_end: RET +// func mulAvxGFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x3(SB), $0-88 + // Loading 11 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + +mulAvxGFNI_5x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x3_loop + VZEROUPPER + +mulAvxGFNI_5x3_end: + RET + // func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x3_64Xor(SB), $0-88 @@ -25614,28 +32156,40 @@ mulGFNI_5x3_64Xor_loop: mulGFNI_5x3_64Xor_end: RET -// func mulAvxTwo_5x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_5x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x3Xor(SB), $0-88 + // Loading 11 of 15 tables to registers // Destination kept in GP registers - // Full registers estimated 38 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), DX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R10 - MOVQ 24(R9), R11 - MOVQ 48(R9), R9 - MOVQ start+72(FP), R12 + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R10 @@ -25643,143 +32197,86 @@ TEXT ·mulAvxTwo_5x3Xor(SB), NOSPLIT, $0-88 ADDQ R12, R9 // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X3 - VPBROADCASTB X3, Y3 + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + +mulAvxGFNI_5x3Xor_loop: + // Load 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (R9), Y13 -mulAvxTwo_5x3Xor_loop: // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R10), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R11), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R9), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 3 outputs - VMOVDQU Y0, (R10) + VMOVDQU Y11, (R10) ADDQ $0x20, R10 - VMOVDQU Y1, (R11) + VMOVDQU Y12, (R11) ADDQ $0x20, R11 - VMOVDQU Y2, (R9) + VMOVDQU Y13, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_5x3Xor_loop + JNZ mulAvxGFNI_5x3Xor_loop VZEROUPPER -mulAvxTwo_5x3Xor_end: +mulAvxGFNI_5x3Xor_end: RET // func mulAvxTwo_5x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -25800,7 +32297,6 @@ TEXT ·mulAvxTwo_5x3_64Xor(SB), $0-88 MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 - MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R9 @@ -26351,6 +32847,139 @@ mulGFNI_5x4_64_loop: mulGFNI_5x4_64_end: RET +// func mulAvxGFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x4(SB), $0-88 + // Loading 10 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, DX + +mulAvxGFNI_5x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x4_loop + VZEROUPPER + +mulAvxGFNI_5x4_end: + RET + // func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x4_64Xor(SB), $0-88 @@ -26494,6 +33123,149 @@ mulGFNI_5x4_64Xor_loop: mulGFNI_5x4_64Xor_end: RET +// func mulAvxGFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x4Xor(SB), $0-88 + // Loading 10 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, DX + +mulAvxGFNI_5x4Xor_loop: + // Load 4 outputs + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x4Xor_loop + VZEROUPPER + +mulAvxGFNI_5x4Xor_end: + RET + // func mulAvxTwo_5x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x4Xor(SB), NOSPLIT, $0-88 @@ -27066,6 +33838,157 @@ mulGFNI_5x5_64_loop: mulGFNI_5x5_64_end: RET +// func mulAvxGFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x5(SB), $0-88 + // Loading 9 of 25 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + +mulAvxGFNI_5x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x5_loop + VZEROUPPER + +mulAvxGFNI_5x5_end: + RET + // func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x5_64Xor(SB), $0-88 @@ -27229,6 +34152,169 @@ mulGFNI_5x5_64Xor_loop: mulGFNI_5x5_64Xor_end: RET +// func mulAvxGFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x5Xor(SB), $0-88 + // Loading 9 of 25 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + +mulAvxGFNI_5x5Xor_loop: + // Load 5 outputs + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x5Xor_loop + VZEROUPPER + +mulAvxGFNI_5x5Xor_end: + RET + // func mulAvxTwo_5x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x5Xor(SB), NOSPLIT, $0-88 @@ -27872,6 +34958,175 @@ mulGFNI_5x6_64_loop: mulGFNI_5x6_64_end: RET +// func mulAvxGFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x6(SB), $0-88 + // Loading 8 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulAvxGFNI_5x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x6_loop + VZEROUPPER + +mulAvxGFNI_5x6_end: + RET + // func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x6_64Xor(SB), $0-88 @@ -28049,6 +35304,189 @@ mulGFNI_5x6_64Xor_loop: mulGFNI_5x6_64Xor_end: RET +// func mulAvxGFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x6Xor(SB), $0-88 + // Loading 8 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulAvxGFNI_5x6Xor_loop: + // Load 6 outputs + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x6Xor_loop + VZEROUPPER + +mulAvxGFNI_5x6Xor_end: + RET + // func mulAvxTwo_5x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x6Xor(SB), NOSPLIT, $0-88 @@ -28763,6 +36201,193 @@ mulGFNI_5x7_64_loop: mulGFNI_5x7_64_end: RET +// func mulAvxGFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x7(SB), $8-88 + // Loading 7 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulAvxGFNI_5x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x7_loop + VZEROUPPER + +mulAvxGFNI_5x7_end: + RET + // func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x7_64Xor(SB), $8-88 @@ -28954,6 +36579,209 @@ mulGFNI_5x7_64Xor_loop: mulGFNI_5x7_64Xor_end: RET +// func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x7Xor(SB), $8-88 + // Loading 7 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulAvxGFNI_5x7Xor_loop: + // Load 7 outputs + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x7Xor_loop + VZEROUPPER + +mulAvxGFNI_5x7Xor_end: + RET + // func mulAvxTwo_5x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x7Xor(SB), NOSPLIT, $8-88 @@ -29745,6 +37573,215 @@ mulGFNI_5x8_64_loop: mulGFNI_5x8_64_end: RET +// func mulAvxGFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x8(SB), $8-88 + // Loading 6 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_5x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_5x8_loop + VZEROUPPER + +mulAvxGFNI_5x8_end: + RET + // func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x8_64Xor(SB), $8-88 @@ -29954,6 +37991,233 @@ mulGFNI_5x8_64Xor_loop: mulGFNI_5x8_64Xor_end: RET +// func mulAvxGFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x8Xor(SB), $8-88 + // Loading 6 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_5x8Xor_loop: + // Load 8 outputs + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_5x8Xor_loop + VZEROUPPER + +mulAvxGFNI_5x8Xor_end: + RET + // func mulAvxTwo_5x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x8Xor(SB), NOSPLIT, $8-88 @@ -30774,6 +39038,210 @@ mulGFNI_5x9_64_loop: mulGFNI_5x9_64_end: RET +// func mulAvxGFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x9(SB), $0-88 + // Loading 5 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x9_loop + VZEROUPPER + +mulAvxGFNI_5x9_end: + RET + // func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x9_64Xor(SB), $0-88 @@ -30983,6 +39451,239 @@ mulGFNI_5x9_64Xor_loop: mulGFNI_5x9_64Xor_end: RET +// func mulAvxGFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x9Xor(SB), $0-88 + // Loading 5 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x9Xor_loop: + // Load 9 outputs + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y5 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y9 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y10 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y11 + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y12 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x9Xor_loop + VZEROUPPER + +mulAvxGFNI_5x9Xor_end: + RET + // func mulAvxTwo_5x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x9Xor(SB), NOSPLIT, $0-88 @@ -31858,6 +40559,226 @@ mulGFNI_5x10_64_loop: mulGFNI_5x10_64_end: RET +// func mulAvxGFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x10(SB), $0-88 + // Loading 4 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x10_loop + VZEROUPPER + +mulAvxGFNI_5x10_end: + RET + // func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x10_64Xor(SB), $0-88 @@ -32080,6 +41001,258 @@ mulGFNI_5x10_64Xor_loop: mulGFNI_5x10_64Xor_end: RET +// func mulAvxGFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x10Xor(SB), $0-88 + // Loading 4 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x10Xor_loop: + // Load 10 outputs + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y4 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y5 + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y9 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y10 + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y11 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y12 + MOVQ 216(R9), R11 + VMOVDQU (R11)(R10*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x10Xor_loop + VZEROUPPER + +mulAvxGFNI_5x10Xor_end: + RET + // func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x10Xor(SB), NOSPLIT, $0-88 @@ -32447,127 +41620,6 @@ mulAvxTwo_5x10Xor_loop: mulAvxTwo_5x10Xor_end: RET -// func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_6x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 16 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 - VMOVDQU 320(CX), Y10 - VMOVDQU 352(CX), Y11 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), DI - MOVQ 96(CX), R8 - MOVQ 120(CX), CX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R9 - MOVQ start+72(FP), R10 - - // Add start offset to output - ADDQ R10, R9 - - // Add start offset to input - ADDQ R10, DX - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DI - ADDQ R10, R8 - ADDQ R10, CX - MOVQ $0x0000000f, R10 - MOVQ R10, X13 - VPBROADCASTB X13, Y13 - -mulAvxTwo_6x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y14 - ADDQ $0x20, DX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y0, Y14 - VPSHUFB Y15, Y1, Y15 - VPXOR Y14, Y15, Y12 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y14 - ADDQ $0x20, BX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y2, Y14 - VPSHUFB Y15, Y3, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y14 - ADDQ $0x20, SI - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y4, Y14 - VPSHUFB Y15, Y5, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y14 - ADDQ $0x20, DI - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y6, Y14 - VPSHUFB Y15, Y7, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R8), Y14 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y8, Y14 - VPSHUFB Y15, Y9, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (CX), Y14 - ADDQ $0x20, CX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y10, Y14 - VPSHUFB Y15, Y11, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Store 1 outputs - VMOVDQU Y12, (R9) - ADDQ $0x20, R9 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_6x1_loop - VZEROUPPER - -mulAvxTwo_6x1_end: - RET - // func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x1_64(SB), $0-88 @@ -32587,7 +41639,6 @@ TEXT ·mulAvxTwo_6x1_64(SB), $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 - MOVQ out_base+48(FP), R10 MOVQ (R10), R10 MOVQ start+72(FP), R11 @@ -32821,6 +41872,94 @@ mulGFNI_6x1_64_loop: mulGFNI_6x1_64_end: RET +// func mulAvxGFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulAvxGFNI_6x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y6 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y5, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Store 1 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x1_loop + VZEROUPPER + +mulAvxGFNI_6x1_end: + RET + // func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x1_64Xor(SB), $0-88 @@ -32913,126 +42052,96 @@ mulGFNI_6x1_64Xor_loop: mulGFNI_6x1_64Xor_end: RET -// func mulAvxTwo_6x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_6x1Xor(SB), NOSPLIT, $0-88 +// func mulAvxGFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 16 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x1Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 - VMOVDQU 320(CX), Y10 - VMOVDQU 352(CX), Y11 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), DI - MOVQ 96(CX), R8 - MOVQ 120(CX), CX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R9 - MOVQ start+72(FP), R10 + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, R9 // Add start offset to input - ADDQ R10, DX - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DI - ADDQ R10, R8 - ADDQ R10, CX - MOVQ $0x0000000f, R10 - MOVQ R10, X13 - VPBROADCASTB X13, Y13 + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulAvxGFNI_6x1Xor_loop: + // Load 1 outputs + VMOVDQU (R9), Y6 -mulAvxTwo_6x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y14 - ADDQ $0x20, DX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VMOVDQU (R9), Y12 - VPSHUFB Y14, Y0, Y14 - VPSHUFB Y15, Y1, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y7 + VXORPD Y6, Y7, Y6 // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y14 - ADDQ $0x20, BX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y2, Y14 - VPSHUFB Y15, Y3, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y7, Y7 + VXORPD Y6, Y7, Y6 // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y14 - ADDQ $0x20, SI - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y4, Y14 - VPSHUFB Y15, Y5, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y7, Y7 + VXORPD Y6, Y7, Y6 // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y14 - ADDQ $0x20, DI - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y6, Y14 - VPSHUFB Y15, Y7, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + VXORPD Y6, Y7, Y6 // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R8), Y14 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y8, Y14 - VPSHUFB Y15, Y9, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y7, Y7 + VXORPD Y6, Y7, Y6 // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (CX), Y14 - ADDQ $0x20, CX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y10, Y14 - VPSHUFB Y15, Y11, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y5, Y7, Y7 + VXORPD Y6, Y7, Y6 // Store 1 outputs - VMOVDQU Y12, (R9) + VMOVDQU Y6, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_6x1Xor_loop + JNZ mulAvxGFNI_6x1Xor_loop VZEROUPPER -mulAvxTwo_6x1Xor_end: +mulAvxGFNI_6x1Xor_end: RET // func mulAvxTwo_6x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -33054,7 +42163,6 @@ TEXT ·mulAvxTwo_6x1_64Xor(SB), $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 - MOVQ out_base+48(FP), R10 MOVQ (R10), R10 MOVQ start+72(FP), R11 @@ -33204,161 +42312,6 @@ mulAvxTwo_6x1_64Xor_loop: mulAvxTwo_6x1_64Xor_end: RET -// func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_6x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 31 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), DX - MOVQ out_base+48(FP), R10 - MOVQ (R10), R11 - MOVQ 24(R10), R10 - MOVQ start+72(FP), R12 - - // Add start offset to output - ADDQ R12, R11 - ADDQ R12, R10 - - // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_6x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R11) - ADDQ $0x20, R11 - VMOVDQU Y1, (R10) - ADDQ $0x20, R10 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_6x2_loop - VZEROUPPER - -mulAvxTwo_6x2_end: - RET - // func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x2_64(SB), $0-88 @@ -33378,7 +42331,6 @@ TEXT ·mulAvxTwo_6x2_64(SB), $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 - MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R10 MOVQ start+72(FP), R12 @@ -33686,6 +42638,115 @@ mulGFNI_6x2_64_loop: mulGFNI_6x2_64_end: RET +// func mulAvxGFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulAvxGFNI_6x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x2_loop + VZEROUPPER + +mulAvxGFNI_6x2_end: + RET + // func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x2_64Xor(SB), $0-88 @@ -33801,161 +42862,119 @@ mulGFNI_6x2_64Xor_loop: mulGFNI_6x2_64Xor_end: RET -// func mulAvxTwo_6x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_6x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x2Xor(SB), $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 31 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), DX - MOVQ out_base+48(FP), R10 - MOVQ (R10), R11 - MOVQ 24(R10), R10 - MOVQ start+72(FP), R12 + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 // Add start offset to output - ADDQ R12, R11 - ADDQ R12, R10 + ADDQ R11, R10 + ADDQ R11, R9 // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X2 - VPBROADCASTB X2, Y2 + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulAvxGFNI_6x2Xor_loop: + // Load 2 outputs + VMOVDQU (R10), Y12 + VMOVDQU (R9), Y13 -mulAvxTwo_6x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R11), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R10), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 2 outputs - VMOVDQU Y0, (R11) - ADDQ $0x20, R11 - VMOVDQU Y1, (R10) + VMOVDQU Y12, (R10) ADDQ $0x20, R10 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_6x2Xor_loop + JNZ mulAvxGFNI_6x2Xor_loop VZEROUPPER -mulAvxTwo_6x2Xor_end: +mulAvxGFNI_6x2Xor_end: RET // func mulAvxTwo_6x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -33977,7 +42996,6 @@ TEXT ·mulAvxTwo_6x2_64Xor(SB), $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 - MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R10 MOVQ start+72(FP), R12 @@ -34182,195 +43200,6 @@ mulAvxTwo_6x2_64Xor_loop: mulAvxTwo_6x2_64Xor_end: RET -// func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_6x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 44 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), DX - MOVQ out_base+48(FP), R10 - MOVQ (R10), R11 - MOVQ 24(R10), R12 - MOVQ 48(R10), R10 - MOVQ start+72(FP), R13 - - // Add start offset to output - ADDQ R13, R11 - ADDQ R13, R12 - ADDQ R13, R10 - - // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_6x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R11) - ADDQ $0x20, R11 - VMOVDQU Y1, (R12) - ADDQ $0x20, R12 - VMOVDQU Y2, (R10) - ADDQ $0x20, R10 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_6x3_loop - VZEROUPPER - -mulAvxTwo_6x3_end: - RET - // func mulAvxTwo_6x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x3_64(SB), $0-88 @@ -34390,7 +43219,6 @@ TEXT ·mulAvxTwo_6x3_64(SB), $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 - MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R10 @@ -34772,6 +43600,136 @@ mulGFNI_6x3_64_loop: mulGFNI_6x3_64_end: RET +// func mulAvxGFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x3(SB), $0-88 + // Loading 11 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + +mulAvxGFNI_6x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x3_loop + VZEROUPPER + +mulAvxGFNI_6x3_end: + RET + // func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x3_64Xor(SB), $0-88 @@ -34910,29 +43868,41 @@ mulGFNI_6x3_64Xor_loop: mulGFNI_6x3_64Xor_end: RET -// func mulAvxTwo_6x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_6x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x3Xor(SB), $0-88 + // Loading 11 of 18 tables to registers // Destination kept in GP registers - // Full registers estimated 44 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), DX - MOVQ out_base+48(FP), R10 - MOVQ (R10), R11 - MOVQ 24(R10), R12 - MOVQ 48(R10), R10 - MOVQ start+72(FP), R13 + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R11 @@ -34940,166 +43910,100 @@ TEXT ·mulAvxTwo_6x3Xor(SB), NOSPLIT, $0-88 ADDQ R13, R10 // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X3 - VPBROADCASTB X3, Y3 + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + +mulAvxGFNI_6x3Xor_loop: + // Load 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R10), Y13 -mulAvxTwo_6x3Xor_loop: // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R11), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R12), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R10), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 3 outputs - VMOVDQU Y0, (R11) + VMOVDQU Y11, (R11) ADDQ $0x20, R11 - VMOVDQU Y1, (R12) + VMOVDQU Y12, (R12) ADDQ $0x20, R12 - VMOVDQU Y2, (R10) + VMOVDQU Y13, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_6x3Xor_loop + JNZ mulAvxGFNI_6x3Xor_loop VZEROUPPER -mulAvxTwo_6x3Xor_end: +mulAvxGFNI_6x3Xor_end: RET // func mulAvxTwo_6x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -35121,7 +44025,6 @@ TEXT ·mulAvxTwo_6x3_64Xor(SB), $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 - MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R10 @@ -35755,6 +44658,157 @@ mulGFNI_6x4_64_loop: mulGFNI_6x4_64_end: RET +// func mulAvxGFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x4(SB), $0-88 + // Loading 10 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + +mulAvxGFNI_6x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x4_loop + VZEROUPPER + +mulAvxGFNI_6x4_end: + RET + // func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x4_64Xor(SB), $0-88 @@ -35916,6 +44970,167 @@ mulGFNI_6x4_64Xor_loop: mulGFNI_6x4_64Xor_end: RET +// func mulAvxGFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x4Xor(SB), $0-88 + // Loading 10 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + +mulAvxGFNI_6x4Xor_loop: + // Load 4 outputs + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x4Xor_loop + VZEROUPPER + +mulAvxGFNI_6x4Xor_end: + RET + // func mulAvxTwo_6x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x4Xor(SB), NOSPLIT, $0-88 @@ -36567,6 +45782,178 @@ mulGFNI_6x5_64_loop: mulGFNI_6x5_64_end: RET +// func mulAvxGFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x5(SB), $0-88 + // Loading 9 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulAvxGFNI_6x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x5_loop + VZEROUPPER + +mulAvxGFNI_6x5_end: + RET + // func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x5_64Xor(SB), $0-88 @@ -36746,6 +46133,190 @@ mulGFNI_6x5_64Xor_loop: mulGFNI_6x5_64Xor_end: RET +// func mulAvxGFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x5Xor(SB), $0-88 + // Loading 9 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulAvxGFNI_6x5Xor_loop: + // Load 5 outputs + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x5Xor_loop + VZEROUPPER + +mulAvxGFNI_6x5Xor_end: + RET + // func mulAvxTwo_6x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x5Xor(SB), NOSPLIT, $0-88 @@ -37480,6 +47051,199 @@ mulGFNI_6x6_64_loop: mulGFNI_6x6_64_end: RET +// func mulAvxGFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x6(SB), $8-88 + // Loading 8 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulAvxGFNI_6x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x6_loop + VZEROUPPER + +mulAvxGFNI_6x6_end: + RET + // func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x6_64Xor(SB), $8-88 @@ -37675,6 +47439,213 @@ mulGFNI_6x6_64Xor_loop: mulGFNI_6x6_64Xor_end: RET +// func mulAvxGFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x6Xor(SB), $8-88 + // Loading 8 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulAvxGFNI_6x6Xor_loop: + // Load 6 outputs + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x6Xor_loop + VZEROUPPER + +mulAvxGFNI_6x6Xor_end: + RET + // func mulAvxTwo_6x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x6Xor(SB), NOSPLIT, $8-88 @@ -38498,6 +48469,224 @@ mulGFNI_6x7_64_loop: mulGFNI_6x7_64_end: RET +// func mulAvxGFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x7(SB), $8-88 + // Loading 7 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_6x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_6x7_loop + VZEROUPPER + +mulAvxGFNI_6x7_end: + RET + // func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x7_64Xor(SB), $8-88 @@ -38713,6 +48902,240 @@ mulGFNI_6x7_64Xor_loop: mulGFNI_6x7_64Xor_end: RET +// func mulAvxGFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x7Xor(SB), $8-88 + // Loading 7 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_6x7Xor_loop: + // Load 7 outputs + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_6x7Xor_loop + VZEROUPPER + +mulAvxGFNI_6x7Xor_end: + RET + // func mulAvxTwo_6x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x7Xor(SB), NOSPLIT, $8-88 @@ -39581,6 +50004,224 @@ mulGFNI_6x8_64_loop: mulGFNI_6x8_64_end: RET +// func mulAvxGFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x8(SB), $0-88 + // Loading 6 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x8_loop + VZEROUPPER + +mulAvxGFNI_6x8_end: + RET + // func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x8_64Xor(SB), $0-88 @@ -39799,6 +50440,250 @@ mulGFNI_6x8_64Xor_loop: mulGFNI_6x8_64Xor_end: RET +// func mulAvxGFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x8Xor(SB), $0-88 + // Loading 6 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x8Xor_loop: + // Load 8 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x8Xor_loop + VZEROUPPER + +mulAvxGFNI_6x8Xor_end: + RET + // func mulAvxTwo_6x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x8Xor(SB), NOSPLIT, $0-88 @@ -40735,6 +51620,243 @@ mulGFNI_6x9_64_loop: mulGFNI_6x9_64_end: RET +// func mulAvxGFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x9(SB), $0-88 + // Loading 5 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x9_loop + VZEROUPPER + +mulAvxGFNI_6x9_end: + RET + // func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x9_64Xor(SB), $0-88 @@ -40968,6 +52090,272 @@ mulGFNI_6x9_64Xor_loop: mulGFNI_6x9_64Xor_end: RET +// func mulAvxGFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x9Xor(SB), $0-88 + // Loading 5 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x9Xor_loop: + // Load 9 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y5 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x9Xor_loop + VZEROUPPER + +mulAvxGFNI_6x9Xor_end: + RET + // func mulAvxTwo_6x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x9Xor(SB), NOSPLIT, $0-88 @@ -41982,6 +53370,262 @@ mulGFNI_6x10_64_loop: mulGFNI_6x10_64_end: RET +// func mulAvxGFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x10(SB), $0-88 + // Loading 4 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x10_loop + VZEROUPPER + +mulAvxGFNI_6x10_end: + RET + // func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x10_64Xor(SB), $0-88 @@ -42230,6 +53874,294 @@ mulGFNI_6x10_64Xor_loop: mulGFNI_6x10_64Xor_end: RET +// func mulAvxGFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x10Xor(SB), $0-88 + // Loading 4 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x10Xor_loop: + // Load 10 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y4 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y5 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 216(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x10Xor_loop + VZEROUPPER + +mulAvxGFNI_6x10Xor_end: + RET + // func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x10Xor(SB), NOSPLIT, $0-88 @@ -42656,141 +54588,6 @@ mulAvxTwo_6x10Xor_loop: mulAvxTwo_6x10Xor_end: RET -// func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x1(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 18 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_7x1_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R11 - MOVQ start+72(FP), R12 - - // Add start offset to output - ADDQ R12, R11 - - // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, R10 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_7x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y0 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Store 1 outputs - VMOVDQU Y0, (R11) - ADDQ $0x20, R11 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_7x1_loop - VZEROUPPER - -mulAvxTwo_7x1_end: - RET - // func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x1_64(SB), $0-88 @@ -42811,7 +54608,6 @@ TEXT ·mulAvxTwo_7x1_64(SB), $0-88 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 MOVQ (R11), R11 MOVQ start+72(FP), R12 @@ -43074,6 +54870,103 @@ mulGFNI_7x1_64_loop: mulGFNI_7x1_64_end: RET +// func mulAvxGFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulAvxGFNI_7x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y7 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Store 1 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x1_loop + VZEROUPPER + +mulAvxGFNI_7x1_end: + RET + // func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x1_64Xor(SB), $0-88 @@ -43175,17 +55068,118 @@ mulGFNI_7x1_64Xor_loop: mulGFNI_7x1_64Xor_end: RET -// func mulAvxTwo_7x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxGFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulAvxGFNI_7x1Xor_loop: + // Load 1 outputs + VMOVDQU (R10), Y7 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Store 1 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x1Xor_loop + VZEROUPPER + +mulAvxGFNI_7x1Xor_end: + RET + +// func mulAvxTwo_7x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x1Xor(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_7x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 18 YMM used + // Full registers estimated 34 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX + SHRQ $0x06, AX TESTQ AX, AX - JZ mulAvxTwo_7x1Xor_end + JZ mulAvxTwo_7x1_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -43201,143 +55195,6 @@ TEXT ·mulAvxTwo_7x1Xor(SB), NOSPLIT, $0-88 // Add start offset to output ADDQ R12, R11 - // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, R10 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_7x1Xor_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (R11), Y0 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Store 1 outputs - VMOVDQU Y0, (R11) - ADDQ $0x20, R11 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_7x1Xor_loop - VZEROUPPER - -mulAvxTwo_7x1Xor_end: - RET - -// func mulAvxTwo_7x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x1_64Xor(SB), $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 34 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulAvxTwo_7x1_64Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 - MOVQ (R11), R11 - MOVQ start+72(FP), R12 - - // Add start offset to output - ADDQ R12, R11 - // Add start offset to input ADDQ R12, BX ADDQ R12, SI @@ -43501,180 +55358,6 @@ mulAvxTwo_7x1_64Xor_loop: mulAvxTwo_7x1_64Xor_end: RET -// func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 35 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_7x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R12 - MOVQ 24(R11), R11 - MOVQ start+72(FP), R13 - - // Add start offset to output - ADDQ R13, R12 - ADDQ R13, R11 - - // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_7x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 - VMOVDQU Y1, (R11) - ADDQ $0x20, R11 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_7x2_loop - VZEROUPPER - -mulAvxTwo_7x2_end: - RET - // func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x2_64(SB), $0-88 @@ -43695,7 +55378,6 @@ TEXT ·mulAvxTwo_7x2_64(SB), $0-88 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R11 MOVQ start+72(FP), R13 @@ -44043,6 +55725,127 @@ mulGFNI_7x2_64_loop: mulGFNI_7x2_64_end: RET +// func mulAvxGFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x2(SB), $0-88 + // Loading 12 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + +mulAvxGFNI_7x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x2_loop + VZEROUPPER + +mulAvxGFNI_7x2_end: + RET + // func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x2_64Xor(SB), $0-88 @@ -44170,17 +55973,144 @@ mulGFNI_7x2_64Xor_loop: mulGFNI_7x2_64Xor_end: RET -// func mulAvxTwo_7x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxGFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x2Xor(SB), $0-88 + // Loading 12 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + +mulAvxGFNI_7x2Xor_loop: + // Load 2 outputs + VMOVDQU (R12), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x2Xor_loop + VZEROUPPER + +mulAvxGFNI_7x2Xor_end: + RET + +// func mulAvxTwo_7x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x2Xor(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_7x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 35 YMM used + // Full registers estimated 65 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX + SHRQ $0x06, AX TESTQ AX, AX - JZ mulAvxTwo_7x2Xor_end + JZ mulAvxTwo_7x2_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -44198,183 +56128,6 @@ TEXT ·mulAvxTwo_7x2Xor(SB), NOSPLIT, $0-88 ADDQ R13, R12 ADDQ R13, R11 - // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_7x2Xor_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R12), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R11), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 - VMOVDQU Y1, (R11) - ADDQ $0x20, R11 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_7x2Xor_loop - VZEROUPPER - -mulAvxTwo_7x2Xor_end: - RET - -// func mulAvxTwo_7x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x2_64Xor(SB), $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 65 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulAvxTwo_7x2_64Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 - MOVQ (R11), R12 - MOVQ 24(R11), R11 - MOVQ start+72(FP), R13 - - // Add start offset to output - ADDQ R13, R12 - ADDQ R13, R11 - // Add start offset to input ADDQ R13, BX ADDQ R13, SI @@ -44599,219 +56352,6 @@ mulAvxTwo_7x2_64Xor_loop: mulAvxTwo_7x2_64Xor_end: RET -// func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 50 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_7x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R12 - MOVQ 24(R11), R13 - MOVQ 48(R11), R11 - MOVQ start+72(FP), R14 - - // Add start offset to output - ADDQ R14, R12 - ADDQ R14, R13 - ADDQ R14, R11 - - // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_7x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 - VMOVDQU Y1, (R13) - ADDQ $0x20, R13 - VMOVDQU Y2, (R11) - ADDQ $0x20, R11 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_7x3_loop - VZEROUPPER - -mulAvxTwo_7x3_end: - RET - // func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x3_64(SB), $0-88 @@ -44832,7 +56372,6 @@ TEXT ·mulAvxTwo_7x3_64(SB), $0-88 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R11 @@ -45265,6 +56804,151 @@ mulGFNI_7x3_64_loop: mulGFNI_7x3_64_end: RET +// func mulAvxGFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x3(SB), $0-88 + // Loading 11 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + +mulAvxGFNI_7x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x3_loop + VZEROUPPER + +mulAvxGFNI_7x3_end: + RET + // func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x3_64Xor(SB), $0-88 @@ -45418,30 +57102,42 @@ mulGFNI_7x3_64Xor_loop: mulGFNI_7x3_64Xor_end: RET -// func mulAvxTwo_7x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x3Xor(SB), $0-88 + // Loading 11 of 21 tables to registers // Destination kept in GP registers - // Full registers estimated 50 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_7x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R12 - MOVQ 24(R11), R13 - MOVQ 48(R11), R11 - MOVQ start+72(FP), R14 + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R12 @@ -45449,189 +57145,114 @@ TEXT ·mulAvxTwo_7x3Xor(SB), NOSPLIT, $0-88 ADDQ R14, R11 // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X3 - VPBROADCASTB X3, Y3 + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + +mulAvxGFNI_7x3Xor_loop: + // Load 3 outputs + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R11), Y13 -mulAvxTwo_7x3Xor_loop: // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R12), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R13), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R11), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 3 outputs - VMOVDQU Y0, (R12) + VMOVDQU Y11, (R12) ADDQ $0x20, R12 - VMOVDQU Y1, (R13) + VMOVDQU Y12, (R13) ADDQ $0x20, R13 - VMOVDQU Y2, (R11) + VMOVDQU Y13, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_7x3Xor_loop + JNZ mulAvxGFNI_7x3Xor_loop VZEROUPPER -mulAvxTwo_7x3Xor_end: +mulAvxGFNI_7x3Xor_end: RET // func mulAvxTwo_7x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -45654,7 +57275,6 @@ TEXT ·mulAvxTwo_7x3_64Xor(SB), $0-88 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R11 @@ -46369,6 +57989,175 @@ mulGFNI_7x4_64_loop: mulGFNI_7x4_64_end: RET +// func mulAvxGFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x4(SB), $0-88 + // Loading 10 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulAvxGFNI_7x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x4_loop + VZEROUPPER + +mulAvxGFNI_7x4_end: + RET + // func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x4_64Xor(SB), $0-88 @@ -46546,6 +58335,185 @@ mulGFNI_7x4_64Xor_loop: mulGFNI_7x4_64Xor_end: RET +// func mulAvxGFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x4Xor(SB), $0-88 + // Loading 10 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulAvxGFNI_7x4Xor_loop: + // Load 4 outputs + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x4Xor_loop + VZEROUPPER + +mulAvxGFNI_7x4Xor_end: + RET + // func mulAvxTwo_7x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x4Xor(SB), NOSPLIT, $0-88 @@ -47276,6 +59244,199 @@ mulGFNI_7x5_64_loop: mulGFNI_7x5_64_end: RET +// func mulAvxGFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x5(SB), $8-88 + // Loading 9 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulAvxGFNI_7x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x5_loop + VZEROUPPER + +mulAvxGFNI_7x5_end: + RET + // func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x5_64Xor(SB), $8-88 @@ -47471,6 +59632,211 @@ mulGFNI_7x5_64Xor_loop: mulGFNI_7x5_64Xor_end: RET +// func mulAvxGFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x5Xor(SB), $8-88 + // Loading 9 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulAvxGFNI_7x5Xor_loop: + // Load 5 outputs + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x5Xor_loop + VZEROUPPER + +mulAvxGFNI_7x5Xor_end: + RET + // func mulAvxTwo_7x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x5Xor(SB), NOSPLIT, $8-88 @@ -48302,6 +60668,227 @@ mulGFNI_7x6_64_loop: mulGFNI_7x6_64_end: RET +// func mulAvxGFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x6(SB), $8-88 + // Loading 8 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_7x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_7x6_loop + VZEROUPPER + +mulAvxGFNI_7x6_end: + RET + // func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x6_64Xor(SB), $8-88 @@ -48519,6 +61106,241 @@ mulGFNI_7x6_64Xor_loop: mulGFNI_7x6_64Xor_end: RET +// func mulAvxGFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x6Xor(SB), $8-88 + // Loading 8 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_7x6Xor_loop: + // Load 6 outputs + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_7x6Xor_loop + VZEROUPPER + +mulAvxGFNI_7x6Xor_end: + RET + // func mulAvxTwo_7x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x6Xor(SB), NOSPLIT, $8-88 @@ -49411,6 +62233,232 @@ mulGFNI_7x7_64_loop: mulGFNI_7x7_64_end: RET +// func mulAvxGFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x7(SB), $0-88 + // Loading 7 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x7_loop + VZEROUPPER + +mulAvxGFNI_7x7_end: + RET + // func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x7_64Xor(SB), $0-88 @@ -49634,6 +62682,255 @@ mulGFNI_7x7_64Xor_loop: mulGFNI_7x7_64Xor_end: RET +// func mulAvxGFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x7Xor(SB), $0-88 + // Loading 7 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x7Xor_loop: + // Load 7 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x7Xor_loop + VZEROUPPER + +mulAvxGFNI_7x7Xor_end: + RET + // func mulAvxTwo_7x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x7Xor(SB), NOSPLIT, $0-88 @@ -50607,6 +63904,254 @@ mulGFNI_7x8_64_loop: mulGFNI_7x8_64_end: RET +// func mulAvxGFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x8(SB), $0-88 + // Loading 6 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x8_loop + VZEROUPPER + +mulAvxGFNI_7x8_end: + RET + // func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x8_64Xor(SB), $0-88 @@ -50847,6 +64392,280 @@ mulGFNI_7x8_64Xor_loop: mulGFNI_7x8_64Xor_end: RET +// func mulAvxGFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x8Xor(SB), $0-88 + // Loading 6 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x8Xor_loop: + // Load 8 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x8Xor_loop + VZEROUPPER + +mulAvxGFNI_7x8Xor_end: + RET + // func mulAvxTwo_7x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x8Xor(SB), NOSPLIT, $0-88 @@ -51910,6 +65729,276 @@ mulGFNI_7x9_64_loop: mulGFNI_7x9_64_end: RET +// func mulAvxGFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x9(SB), $0-88 + // Loading 5 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x9_loop + VZEROUPPER + +mulAvxGFNI_7x9_end: + RET + // func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x9_64Xor(SB), $0-88 @@ -52167,6 +66256,305 @@ mulGFNI_7x9_64Xor_loop: mulGFNI_7x9_64Xor_end: RET +// func mulAvxGFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x9Xor(SB), $0-88 + // Loading 5 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x9Xor_loop: + // Load 9 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y5 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x9Xor_loop + VZEROUPPER + +mulAvxGFNI_7x9Xor_end: + RET + // func mulAvxTwo_7x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x9Xor(SB), NOSPLIT, $0-88 @@ -53320,6 +67708,298 @@ mulGFNI_7x10_64_loop: mulGFNI_7x10_64_end: RET +// func mulAvxGFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x10(SB), $0-88 + // Loading 4 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x10_loop + VZEROUPPER + +mulAvxGFNI_7x10_end: + RET + // func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x10_64Xor(SB), $0-88 @@ -53594,6 +68274,330 @@ mulGFNI_7x10_64Xor_loop: mulGFNI_7x10_64Xor_end: RET +// func mulAvxGFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x10Xor(SB), $0-88 + // Loading 4 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x10Xor_loop: + // Load 10 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y4 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 216(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x10Xor_loop + VZEROUPPER + +mulAvxGFNI_7x10Xor_end: + RET + // func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x10Xor(SB), NOSPLIT, $0-88 @@ -54079,155 +69083,6 @@ mulAvxTwo_7x10Xor_loop: mulAvxTwo_7x10Xor_end: RET -// func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_8x1(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 20 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_8x1_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R12 - MOVQ start+72(FP), R13 - - // Add start offset to output - ADDQ R13, R12 - - // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_8x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y0 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Store 1 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_8x1_loop - VZEROUPPER - -mulAvxTwo_8x1_end: - RET - // func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x1_64(SB), $0-88 @@ -54249,7 +69104,6 @@ TEXT ·mulAvxTwo_8x1_64(SB), $0-88 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 MOVQ (R12), R12 MOVQ start+72(FP), R13 @@ -54541,6 +69395,112 @@ mulGFNI_8x1_64_loop: mulGFNI_8x1_64_end: RET +// func mulAvxGFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulAvxGFNI_8x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y8 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y7, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Store 1 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x1_loop + VZEROUPPER + +mulAvxGFNI_8x1_end: + RET + // func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x1_64Xor(SB), $0-88 @@ -54651,154 +69611,114 @@ mulGFNI_8x1_64Xor_loop: mulGFNI_8x1_64Xor_end: RET -// func mulAvxTwo_8x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_8x1Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x1Xor(SB), $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 20 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_8x1Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R12 - MOVQ start+72(FP), R13 + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 // Add start offset to output - ADDQ R13, R12 + ADDQ R12, R11 // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X1 - VPBROADCASTB X1, Y1 + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulAvxGFNI_8x1Xor_loop: + // Load 1 outputs + VMOVDQU (R11), Y8 -mulAvxTwo_8x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (R12), Y0 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y9 + VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y9, Y9 + VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y9, Y9 + VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y9, Y9 + VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y9 + VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y9, Y9 + VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y7, Y9, Y9 + VXORPD Y8, Y9, Y8 // Store 1 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_8x1Xor_loop + JNZ mulAvxGFNI_8x1Xor_loop VZEROUPPER -mulAvxTwo_8x1Xor_end: +mulAvxGFNI_8x1Xor_end: RET // func mulAvxTwo_8x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -54822,7 +69742,6 @@ TEXT ·mulAvxTwo_8x1_64Xor(SB), $0-88 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 MOVQ (R12), R12 MOVQ start+72(FP), R13 @@ -55012,199 +69931,6 @@ mulAvxTwo_8x1_64Xor_loop: mulAvxTwo_8x1_64Xor_end: RET -// func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_8x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 39 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_8x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R13 - MOVQ 24(R12), R12 - MOVQ start+72(FP), R14 - - // Add start offset to output - ADDQ R14, R13 - ADDQ R14, R12 - - // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_8x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (R11), Y5 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R13) - ADDQ $0x20, R13 - VMOVDQU Y1, (R12) - ADDQ $0x20, R12 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_8x2_loop - VZEROUPPER - -mulAvxTwo_8x2_end: - RET - // func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x2_64(SB), $0-88 @@ -55226,7 +69952,6 @@ TEXT ·mulAvxTwo_8x2_64(SB), $0-88 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R12 MOVQ start+72(FP), R14 @@ -55614,6 +70339,139 @@ mulGFNI_8x2_64_loop: mulGFNI_8x2_64_end: RET +// func mulAvxGFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x2(SB), $0-88 + // Loading 12 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + +mulAvxGFNI_8x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x2_loop + VZEROUPPER + +mulAvxGFNI_8x2_end: + RET + // func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x2_64Xor(SB), $0-88 @@ -55753,199 +70611,143 @@ mulGFNI_8x2_64Xor_loop: mulGFNI_8x2_64Xor_end: RET -// func mulAvxTwo_8x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_8x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x2Xor(SB), $0-88 + // Loading 12 of 16 tables to registers // Destination kept in GP registers - // Full registers estimated 39 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_8x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R13 - MOVQ 24(R12), R12 - MOVQ start+72(FP), R14 + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R13 ADDQ R14, R12 // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X2 - VPBROADCASTB X2, Y2 + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + +mulAvxGFNI_8x2Xor_loop: + // Load 2 outputs + VMOVDQU (R13), Y12 + VMOVDQU (R12), Y13 -mulAvxTwo_8x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R13), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R12), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (R11), Y5 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 2 outputs - VMOVDQU Y0, (R13) + VMOVDQU Y12, (R13) ADDQ $0x20, R13 - VMOVDQU Y1, (R12) + VMOVDQU Y13, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_8x2Xor_loop + JNZ mulAvxGFNI_8x2Xor_loop VZEROUPPER -mulAvxTwo_8x2Xor_end: +mulAvxGFNI_8x2Xor_end: RET // func mulAvxTwo_8x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -55969,7 +70771,6 @@ TEXT ·mulAvxTwo_8x2_64Xor(SB), $0-88 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R12 MOVQ start+72(FP), R14 @@ -56230,243 +71031,6 @@ mulAvxTwo_8x2_64Xor_loop: mulAvxTwo_8x2_64Xor_end: RET -// func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_8x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 56 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_8x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R13 - MOVQ 24(R12), R14 - MOVQ 48(R12), R12 - MOVQ start+72(FP), R15 - - // Add start offset to output - ADDQ R15, R13 - ADDQ R15, R14 - ADDQ R15, R12 - - // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_8x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R13) - ADDQ $0x20, R13 - VMOVDQU Y1, (R14) - ADDQ $0x20, R14 - VMOVDQU Y2, (R12) - ADDQ $0x20, R12 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_8x3_loop - VZEROUPPER - -mulAvxTwo_8x3_end: - RET - // func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x3_64(SB), $0-88 @@ -56488,7 +71052,6 @@ TEXT ·mulAvxTwo_8x3_64(SB), $0-88 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R12 @@ -56972,6 +71535,166 @@ mulGFNI_8x3_64_loop: mulGFNI_8x3_64_end: RET +// func mulAvxGFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x3(SB), $0-88 + // Loading 11 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + +mulAvxGFNI_8x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x3_loop + VZEROUPPER + +mulAvxGFNI_8x3_end: + RET + // func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x3_64Xor(SB), $0-88 @@ -57140,31 +71863,43 @@ mulGFNI_8x3_64Xor_loop: mulGFNI_8x3_64Xor_end: RET -// func mulAvxTwo_8x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_8x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x3Xor(SB), $0-88 + // Loading 11 of 24 tables to registers // Destination kept in GP registers - // Full registers estimated 56 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_8x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R13 - MOVQ 24(R12), R14 - MOVQ 48(R12), R12 - MOVQ start+72(FP), R15 + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R13 @@ -57172,212 +71907,128 @@ TEXT ·mulAvxTwo_8x3Xor(SB), NOSPLIT, $0-88 ADDQ R15, R12 // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X3 - VPBROADCASTB X3, Y3 + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + +mulAvxGFNI_8x3Xor_loop: + // Load 3 outputs + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R12), Y13 -mulAvxTwo_8x3Xor_loop: // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R13), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R14), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R12), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 3 outputs - VMOVDQU Y0, (R13) + VMOVDQU Y11, (R13) ADDQ $0x20, R13 - VMOVDQU Y1, (R14) + VMOVDQU Y12, (R14) ADDQ $0x20, R14 - VMOVDQU Y2, (R12) + VMOVDQU Y13, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_8x3Xor_loop + JNZ mulAvxGFNI_8x3Xor_loop VZEROUPPER -mulAvxTwo_8x3Xor_end: +mulAvxGFNI_8x3Xor_end: RET // func mulAvxTwo_8x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -57401,7 +72052,6 @@ TEXT ·mulAvxTwo_8x3_64Xor(SB), $0-88 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R12 @@ -58195,6 +72845,193 @@ mulGFNI_8x4_64_loop: mulGFNI_8x4_64_end: RET +// func mulAvxGFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x4(SB), $8-88 + // Loading 10 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulAvxGFNI_8x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x4_loop + VZEROUPPER + +mulAvxGFNI_8x4_end: + RET + // func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x4_64Xor(SB), $8-88 @@ -58386,6 +73223,203 @@ mulGFNI_8x4_64Xor_loop: mulGFNI_8x4_64Xor_end: RET +// func mulAvxGFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x4Xor(SB), $8-88 + // Loading 10 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulAvxGFNI_8x4Xor_loop: + // Load 4 outputs + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x4Xor_loop + VZEROUPPER + +mulAvxGFNI_8x4Xor_end: + RET + // func mulAvxTwo_8x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x4Xor(SB), NOSPLIT, $8-88 @@ -59201,6 +74235,224 @@ mulGFNI_8x5_64_loop: mulGFNI_8x5_64_end: RET +// func mulAvxGFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x5(SB), $8-88 + // Loading 9 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_8x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_8x5_loop + VZEROUPPER + +mulAvxGFNI_8x5_end: + RET + // func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x5_64Xor(SB), $8-88 @@ -59416,6 +74668,236 @@ mulGFNI_8x5_64Xor_loop: mulGFNI_8x5_64Xor_end: RET +// func mulAvxGFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x5Xor(SB), $8-88 + // Loading 9 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_8x5Xor_loop: + // Load 5 outputs + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_8x5Xor_loop + VZEROUPPER + +mulAvxGFNI_8x5Xor_end: + RET + // func mulAvxTwo_8x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x5Xor(SB), NOSPLIT, $8-88 @@ -60308,6 +75790,234 @@ mulGFNI_8x6_64_loop: mulGFNI_8x6_64_end: RET +// func mulAvxGFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x6(SB), $0-88 + // Loading 8 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x6_loop + VZEROUPPER + +mulAvxGFNI_8x6_end: + RET + // func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x6_64Xor(SB), $0-88 @@ -60532,6 +76242,254 @@ mulGFNI_8x6_64Xor_loop: mulGFNI_8x6_64Xor_end: RET +// func mulAvxGFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x6Xor(SB), $0-88 + // Loading 8 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x6Xor_loop: + // Load 6 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x6Xor_loop + VZEROUPPER + +mulAvxGFNI_8x6Xor_end: + RET + // func mulAvxTwo_8x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x6Xor(SB), NOSPLIT, $0-88 @@ -61518,6 +77476,259 @@ mulGFNI_8x7_64_loop: mulGFNI_8x7_64_end: RET +// func mulAvxGFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x7(SB), $0-88 + // Loading 7 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x7_loop + VZEROUPPER + +mulAvxGFNI_8x7_end: + RET + // func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x7_64Xor(SB), $0-88 @@ -61761,6 +77972,282 @@ mulGFNI_8x7_64Xor_loop: mulGFNI_8x7_64Xor_end: RET +// func mulAvxGFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x7Xor(SB), $0-88 + // Loading 7 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x7Xor_loop: + // Load 7 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x7Xor_loop + VZEROUPPER + +mulAvxGFNI_8x7Xor_end: + RET + // func mulAvxTwo_8x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x7Xor(SB), NOSPLIT, $0-88 @@ -62849,6 +79336,284 @@ mulGFNI_8x8_64_loop: mulGFNI_8x8_64_end: RET +// func mulAvxGFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x8(SB), $0-88 + // Loading 6 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x8_loop + VZEROUPPER + +mulAvxGFNI_8x8_end: + RET + // func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x8_64Xor(SB), $0-88 @@ -63111,6 +79876,310 @@ mulGFNI_8x8_64Xor_loop: mulGFNI_8x8_64Xor_end: RET +// func mulAvxGFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x8Xor(SB), $0-88 + // Loading 6 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x8Xor_loop: + // Load 8 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x8Xor_loop + VZEROUPPER + +mulAvxGFNI_8x8Xor_end: + RET + // func mulAvxTwo_8x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x8Xor(SB), NOSPLIT, $0-88 @@ -64301,6 +81370,309 @@ mulGFNI_8x9_64_loop: mulGFNI_8x9_64_end: RET +// func mulAvxGFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x9(SB), $0-88 + // Loading 5 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x9_loop + VZEROUPPER + +mulAvxGFNI_8x9_end: + RET + // func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x9_64Xor(SB), $0-88 @@ -64582,6 +81954,338 @@ mulGFNI_8x9_64Xor_loop: mulGFNI_8x9_64Xor_end: RET +// func mulAvxGFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x9Xor(SB), $0-88 + // Loading 5 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x9Xor_loop: + // Load 9 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y5 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x9Xor_loop + VZEROUPPER + +mulAvxGFNI_8x9Xor_end: + RET + // func mulAvxTwo_8x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x9Xor(SB), NOSPLIT, $0-88 @@ -65874,6 +83578,334 @@ mulGFNI_8x10_64_loop: mulGFNI_8x10_64_end: RET +// func mulAvxGFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x10(SB), $0-88 + // Loading 4 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x10_loop + VZEROUPPER + +mulAvxGFNI_8x10_end: + RET + // func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x10_64Xor(SB), $0-88 @@ -66174,6 +84206,366 @@ mulGFNI_8x10_64Xor_loop: mulGFNI_8x10_64Xor_end: RET +// func mulAvxGFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x10Xor(SB), $0-88 + // Loading 4 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x10Xor_loop: + // Load 10 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y4 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 216(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x10Xor_loop + VZEROUPPER + +mulAvxGFNI_8x10Xor_end: + RET + // func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x10Xor(SB), NOSPLIT, $0-88 @@ -66718,169 +85110,6 @@ mulAvxTwo_8x10Xor_loop: mulAvxTwo_8x10Xor_end: RET -// func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_9x1(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 22 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_9x1_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R13 - MOVQ start+72(FP), R14 - - // Add start offset to output - ADDQ R14, R13 - - // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_9x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y0 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (R12), Y4 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 8 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 512(CX), Y2 - VMOVDQU 544(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Store 1 outputs - VMOVDQU Y0, (R13) - ADDQ $0x20, R13 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_9x1_loop - VZEROUPPER - -mulAvxTwo_9x1_end: - RET - // func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x1_64(SB), $0-88 @@ -66903,7 +85132,6 @@ TEXT ·mulAvxTwo_9x1_64(SB), $0-88 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 MOVQ (R13), R13 MOVQ start+72(FP), R14 @@ -67224,6 +85452,121 @@ mulGFNI_9x1_64_loop: mulGFNI_9x1_64_end: RET +// func mulAvxGFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulAvxGFNI_9x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y9 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Store 1 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x1_loop + VZEROUPPER + +mulAvxGFNI_9x1_end: + RET + // func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x1_64Xor(SB), $0-88 @@ -67343,17 +85686,136 @@ mulGFNI_9x1_64Xor_loop: mulGFNI_9x1_64Xor_end: RET -// func mulAvxTwo_9x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxGFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulAvxGFNI_9x1Xor_loop: + // Load 1 outputs + VMOVDQU (R12), Y9 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Store 1 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x1Xor_loop + VZEROUPPER + +mulAvxGFNI_9x1Xor_end: + RET + +// func mulAvxTwo_9x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_9x1Xor(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_9x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 22 YMM used + // Full registers estimated 42 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX + SHRQ $0x06, AX TESTQ AX, AX - JZ mulAvxTwo_9x1Xor_end + JZ mulAvxTwo_9x1_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -67371,171 +85833,6 @@ TEXT ·mulAvxTwo_9x1Xor(SB), NOSPLIT, $0-88 // Add start offset to output ADDQ R14, R13 - // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_9x1Xor_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (R13), Y0 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (R12), Y4 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 8 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 512(CX), Y2 - VMOVDQU 544(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Store 1 outputs - VMOVDQU Y0, (R13) - ADDQ $0x20, R13 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_9x1Xor_loop - VZEROUPPER - -mulAvxTwo_9x1Xor_end: - RET - -// func mulAvxTwo_9x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_9x1_64Xor(SB), $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 42 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulAvxTwo_9x1_64Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 - MOVQ (R13), R13 - MOVQ start+72(FP), R14 - - // Add start offset to output - ADDQ R14, R13 - // Add start offset to input ADDQ R14, BX ADDQ R14, SI @@ -67739,218 +86036,6 @@ mulAvxTwo_9x1_64Xor_loop: mulAvxTwo_9x1_64Xor_end: RET -// func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_9x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 43 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_9x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R13 - MOVQ start+72(FP), R15 - - // Add start offset to output - ADDQ R15, R14 - ADDQ R15, R13 - - // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, R12 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_9x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (R11), Y5 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (R12), Y5 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 8 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1024(CX), Y3 - VMOVDQU 1056(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 1088(CX), Y3 - VMOVDQU 1120(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 - VMOVDQU Y1, (R13) - ADDQ $0x20, R13 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_9x2_loop - VZEROUPPER - -mulAvxTwo_9x2_end: - RET - // func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x2_64(SB), $0-88 @@ -67973,7 +86058,6 @@ TEXT ·mulAvxTwo_9x2_64(SB), $0-88 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R13 MOVQ start+72(FP), R15 @@ -68401,6 +86485,151 @@ mulGFNI_9x2_64_loop: mulGFNI_9x2_64_end: RET +// func mulAvxGFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x2(SB), $0-88 + // Loading 12 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + +mulAvxGFNI_9x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x2_loop + VZEROUPPER + +mulAvxGFNI_9x2_end: + RET + // func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x2_64Xor(SB), $0-88 @@ -68552,218 +86781,155 @@ mulGFNI_9x2_64Xor_loop: mulGFNI_9x2_64Xor_end: RET -// func mulAvxTwo_9x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_9x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x2Xor(SB), $0-88 + // Loading 12 of 18 tables to registers // Destination kept in GP registers - // Full registers estimated 43 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_9x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R13 - MOVQ start+72(FP), R15 + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R14 ADDQ R15, R13 // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, R12 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X2 - VPBROADCASTB X2, Y2 + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + +mulAvxGFNI_9x2Xor_loop: + // Load 2 outputs + VMOVDQU (R14), Y12 + VMOVDQU (R13), Y13 -mulAvxTwo_9x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R14), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R13), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (R11), Y5 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (R12), Y5 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1024(CX), Y3 - VMOVDQU 1056(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 1088(CX), Y3 - VMOVDQU 1120(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 2 outputs - VMOVDQU Y0, (R14) + VMOVDQU Y12, (R14) ADDQ $0x20, R14 - VMOVDQU Y1, (R13) + VMOVDQU Y13, (R13) ADDQ $0x20, R13 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_9x2Xor_loop + JNZ mulAvxGFNI_9x2Xor_loop VZEROUPPER -mulAvxTwo_9x2Xor_end: +mulAvxGFNI_9x2Xor_end: RET // func mulAvxTwo_9x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -68788,7 +86954,6 @@ TEXT ·mulAvxTwo_9x2_64Xor(SB), $0-88 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R13 MOVQ start+72(FP), R15 @@ -69077,267 +87242,6 @@ mulAvxTwo_9x2_64Xor_loop: mulAvxTwo_9x2_64Xor_end: RET -// func mulAvxTwo_9x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_9x3(SB), NOSPLIT, $8-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 62 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_9x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R15 - MOVQ 48(R13), R13 - MOVQ start+72(FP), BP - - // Add start offset to output - ADDQ BP, R14 - ADDQ BP, R15 - ADDQ BP, R13 - - // Add start offset to input - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, DX - MOVQ $0x0000000f, BP - MOVQ BP, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_9x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (R12), Y6 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 8 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1536(CX), Y4 - VMOVDQU 1568(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1600(CX), Y4 - VMOVDQU 1632(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1664(CX), Y4 - VMOVDQU 1696(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 - VMOVDQU Y1, (R15) - ADDQ $0x20, R15 - VMOVDQU Y2, (R13) - ADDQ $0x20, R13 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_9x3_loop - VZEROUPPER - -mulAvxTwo_9x3_end: - RET - // func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x3_64(SB), $8-88 @@ -69360,7 +87264,6 @@ TEXT ·mulAvxTwo_9x3_64(SB), $8-88 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 @@ -69895,6 +87798,181 @@ mulGFNI_9x3_64_loop: mulGFNI_9x3_64_end: RET +// func mulAvxGFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x3(SB), $8-88 + // Loading 11 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + +mulAvxGFNI_9x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x3_loop + VZEROUPPER + +mulAvxGFNI_9x3_end: + RET + // func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x3_64Xor(SB), $0-88 @@ -70078,32 +88156,44 @@ mulGFNI_9x3_64Xor_loop: mulGFNI_9x3_64Xor_end: RET -// func mulAvxTwo_9x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_9x3Xor(SB), NOSPLIT, $8-88 - // Loading no tables to registers +// func mulAvxGFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x3Xor(SB), $8-88 + // Loading 11 of 27 tables to registers // Destination kept in GP registers - // Full registers estimated 62 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_9x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R15 - MOVQ 48(R13), R13 - MOVQ start+72(FP), BP + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R14 @@ -70111,235 +88201,142 @@ TEXT ·mulAvxTwo_9x3Xor(SB), NOSPLIT, $8-88 ADDQ BP, R13 // Add start offset to input - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, DX - MOVQ $0x0000000f, BP - MOVQ BP, X3 - VPBROADCASTB X3, Y3 + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + +mulAvxGFNI_9x3Xor_loop: + // Load 3 outputs + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R13), Y13 -mulAvxTwo_9x3Xor_loop: // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R14), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R15), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R13), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (R12), Y6 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1536(CX), Y4 - VMOVDQU 1568(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1600(CX), Y4 - VMOVDQU 1632(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1664(CX), Y4 - VMOVDQU 1696(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 3 outputs - VMOVDQU Y0, (R14) + VMOVDQU Y11, (R14) ADDQ $0x20, R14 - VMOVDQU Y1, (R15) + VMOVDQU Y12, (R15) ADDQ $0x20, R15 - VMOVDQU Y2, (R13) + VMOVDQU Y13, (R13) ADDQ $0x20, R13 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_9x3Xor_loop + JNZ mulAvxGFNI_9x3Xor_loop VZEROUPPER -mulAvxTwo_9x3Xor_end: +mulAvxGFNI_9x3Xor_end: RET // func mulAvxTwo_9x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -70364,7 +88361,6 @@ TEXT ·mulAvxTwo_9x3_64Xor(SB), $8-88 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 @@ -71243,6 +89239,215 @@ mulGFNI_9x4_64_loop: mulGFNI_9x4_64_end: RET +// func mulAvxGFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x4(SB), $8-88 + // Loading 10 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_9x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_9x4_loop + VZEROUPPER + +mulAvxGFNI_9x4_end: + RET + // func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x4_64Xor(SB), $8-88 @@ -71452,6 +89657,225 @@ mulGFNI_9x4_64Xor_loop: mulGFNI_9x4_64Xor_end: RET +// func mulAvxGFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x4Xor(SB), $8-88 + // Loading 10 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_9x4Xor_loop: + // Load 4 outputs + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_9x4Xor_loop + VZEROUPPER + +mulAvxGFNI_9x4Xor_end: + RET + // func mulAvxTwo_9x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x4Xor(SB), NOSPLIT, $8-88 @@ -72320,6 +90744,230 @@ mulGFNI_9x5_64_loop: mulGFNI_9x5_64_end: RET +// func mulAvxGFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x5(SB), $0-88 + // Loading 9 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x5_loop + VZEROUPPER + +mulAvxGFNI_9x5_end: + RET + // func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x5_64Xor(SB), $0-88 @@ -72541,6 +91189,247 @@ mulGFNI_9x5_64Xor_loop: mulGFNI_9x5_64Xor_end: RET +// func mulAvxGFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x5Xor(SB), $0-88 + // Loading 9 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x5Xor_loop: + // Load 5 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x5Xor_loop + VZEROUPPER + +mulAvxGFNI_9x5Xor_end: + RET + // func mulAvxTwo_9x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x5Xor(SB), NOSPLIT, $0-88 @@ -73516,6 +92405,258 @@ mulGFNI_9x6_64_loop: mulGFNI_9x6_64_end: RET +// func mulAvxGFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x6(SB), $0-88 + // Loading 8 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x6_loop + VZEROUPPER + +mulAvxGFNI_9x6_end: + RET + // func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x6_64Xor(SB), $0-88 @@ -73758,6 +92899,278 @@ mulGFNI_9x6_64Xor_loop: mulGFNI_9x6_64Xor_end: RET +// func mulAvxGFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x6Xor(SB), $0-88 + // Loading 8 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x6Xor_loop: + // Load 6 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x6Xor_loop + VZEROUPPER + +mulAvxGFNI_9x6Xor_end: + RET + // func mulAvxTwo_9x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x6Xor(SB), NOSPLIT, $0-88 @@ -74847,6 +94260,286 @@ mulGFNI_9x7_64_loop: mulGFNI_9x7_64_end: RET +// func mulAvxGFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x7(SB), $0-88 + // Loading 7 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x7_loop + VZEROUPPER + +mulAvxGFNI_9x7_end: + RET + // func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x7_64Xor(SB), $0-88 @@ -75110,6 +94803,309 @@ mulGFNI_9x7_64Xor_loop: mulGFNI_9x7_64Xor_end: RET +// func mulAvxGFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x7Xor(SB), $0-88 + // Loading 7 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x7Xor_loop: + // Load 7 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x7Xor_loop + VZEROUPPER + +mulAvxGFNI_9x7Xor_end: + RET + // func mulAvxTwo_9x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x7Xor(SB), NOSPLIT, $0-88 @@ -76313,6 +96309,314 @@ mulGFNI_9x8_64_loop: mulGFNI_9x8_64_end: RET +// func mulAvxGFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x8(SB), $0-88 + // Loading 6 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x8_loop + VZEROUPPER + +mulAvxGFNI_9x8_end: + RET + // func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x8_64Xor(SB), $0-88 @@ -76597,6 +96901,340 @@ mulGFNI_9x8_64Xor_loop: mulGFNI_9x8_64Xor_end: RET +// func mulAvxGFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x8Xor(SB), $0-88 + // Loading 6 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x8Xor_loop: + // Load 8 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x8Xor_loop + VZEROUPPER + +mulAvxGFNI_9x8Xor_end: + RET + // func mulAvxTwo_9x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x8Xor(SB), NOSPLIT, $0-88 @@ -77914,6 +98552,342 @@ mulGFNI_9x9_64_loop: mulGFNI_9x9_64_end: RET +// func mulAvxGFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x9(SB), $0-88 + // Loading 5 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x9_loop + VZEROUPPER + +mulAvxGFNI_9x9_end: + RET + // func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x9_64Xor(SB), $0-88 @@ -78219,6 +99193,371 @@ mulGFNI_9x9_64Xor_loop: mulGFNI_9x9_64Xor_end: RET +// func mulAvxGFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x9Xor(SB), $0-88 + // Loading 5 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x9Xor_loop: + // Load 9 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y5 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x9Xor_loop + VZEROUPPER + +mulAvxGFNI_9x9Xor_end: + RET + // func mulAvxTwo_9x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x9Xor(SB), NOSPLIT, $0-88 @@ -79650,6 +100989,370 @@ mulGFNI_9x10_64_loop: mulGFNI_9x10_64_end: RET +// func mulAvxGFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x10(SB), $0-88 + // Loading 4 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x10_loop + VZEROUPPER + +mulAvxGFNI_9x10_end: + RET + // func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x10_64Xor(SB), $0-88 @@ -79976,6 +101679,402 @@ mulGFNI_9x10_64Xor_loop: mulGFNI_9x10_64Xor_end: RET +// func mulAvxGFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x10Xor(SB), $0-88 + // Loading 4 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x10Xor_loop: + // Load 10 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y4 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 216(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x10Xor_loop + VZEROUPPER + +mulAvxGFNI_9x10Xor_end: + RET + // func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x10Xor(SB), NOSPLIT, $0-88 @@ -80579,183 +102678,6 @@ mulAvxTwo_9x10Xor_loop: mulAvxTwo_9x10Xor_end: RET -// func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_10x1(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 24 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x1_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), R13 - MOVQ 216(DX), DX - MOVQ out_base+48(FP), R14 - MOVQ (R14), R14 - MOVQ start+72(FP), R15 - - // Add start offset to output - ADDQ R15, R14 - - // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, R12 - ADDQ R15, R13 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_10x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y0 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (R12), Y4 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 8 to 1 outputs - VMOVDQU (R13), Y4 - ADDQ $0x20, R13 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 512(CX), Y2 - VMOVDQU 544(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 9 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 576(CX), Y2 - VMOVDQU 608(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Store 1 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_10x1_loop - VZEROUPPER - -mulAvxTwo_10x1_end: - RET - // func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x1_64(SB), $0-88 @@ -80779,7 +102701,6 @@ TEXT ·mulAvxTwo_10x1_64(SB), $0-88 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 - MOVQ out_base+48(FP), R14 MOVQ (R14), R14 MOVQ start+72(FP), R15 @@ -81129,6 +103050,130 @@ mulGFNI_10x1_64_loop: mulGFNI_10x1_64_end: RET +// func mulAvxGFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulAvxGFNI_10x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y10 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VGF2P8AFFINEQB $0x00, Y8, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 9 to 1 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y9, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Store 1 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x1_loop + VZEROUPPER + +mulAvxGFNI_10x1_end: + RET + // func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x1_64Xor(SB), $0-88 @@ -81257,182 +103302,132 @@ mulGFNI_10x1_64Xor_loop: mulGFNI_10x1_64Xor_end: RET -// func mulAvxTwo_10x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_10x1Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x1Xor(SB), $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 24 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x1Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), R13 - MOVQ 216(DX), DX - MOVQ out_base+48(FP), R14 - MOVQ (R14), R14 - MOVQ start+72(FP), R15 + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 // Add start offset to output - ADDQ R15, R14 + ADDQ R14, R13 // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, R12 - ADDQ R15, R13 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X1 - VPBROADCASTB X1, Y1 + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulAvxGFNI_10x1Xor_loop: + // Load 1 outputs + VMOVDQU (R13), Y10 -mulAvxTwo_10x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (R14), Y0 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y11 + VXORPD Y10, Y11, Y10 // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y11, Y11 + VXORPD Y10, Y11, Y10 // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y11, Y11 + VXORPD Y10, Y11, Y10 // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y11, Y11 + VXORPD Y10, Y11, Y10 // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y11 + VXORPD Y10, Y11, Y10 // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + VXORPD Y10, Y11, Y10 // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y11, Y11 + VXORPD Y10, Y11, Y10 // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (R12), Y4 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y11, Y11 + VXORPD Y10, Y11, Y10 // Load and process 32 bytes from input 8 to 1 outputs - VMOVDQU (R13), Y4 - ADDQ $0x20, R13 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 512(CX), Y2 - VMOVDQU 544(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VGF2P8AFFINEQB $0x00, Y8, Y11, Y11 + VXORPD Y10, Y11, Y10 // Load and process 32 bytes from input 9 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 576(CX), Y2 - VMOVDQU 608(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y9, Y11, Y11 + VXORPD Y10, Y11, Y10 // Store 1 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_10x1Xor_loop + JNZ mulAvxGFNI_10x1Xor_loop VZEROUPPER -mulAvxTwo_10x1Xor_end: +mulAvxGFNI_10x1Xor_end: RET // func mulAvxTwo_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -81458,7 +103453,6 @@ TEXT ·mulAvxTwo_10x1_64Xor(SB), $0-88 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 - MOVQ out_base+48(FP), R14 MOVQ (R14), R14 MOVQ start+72(FP), R15 @@ -81688,237 +103682,6 @@ mulAvxTwo_10x1_64Xor_loop: mulAvxTwo_10x1_64Xor_end: RET -// func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_10x2(SB), NOSPLIT, $8-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 47 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), R13 - MOVQ 216(DX), DX - MOVQ out_base+48(FP), R14 - MOVQ (R14), R15 - MOVQ 24(R14), R14 - MOVQ start+72(FP), BP - - // Add start offset to output - ADDQ BP, R15 - ADDQ BP, R14 - - // Add start offset to input - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, R13 - ADDQ BP, DX - MOVQ $0x0000000f, BP - MOVQ BP, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_10x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (R11), Y5 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (R12), Y5 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 8 to 2 outputs - VMOVDQU (R13), Y5 - ADDQ $0x20, R13 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1024(CX), Y3 - VMOVDQU 1056(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 1088(CX), Y3 - VMOVDQU 1120(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 9 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1152(CX), Y3 - VMOVDQU 1184(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 1216(CX), Y3 - VMOVDQU 1248(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R15) - ADDQ $0x20, R15 - VMOVDQU Y1, (R14) - ADDQ $0x20, R14 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_10x2_loop - VZEROUPPER - -mulAvxTwo_10x2_end: - RET - // func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x2_64(SB), $8-88 @@ -81942,7 +103705,6 @@ TEXT ·mulAvxTwo_10x2_64(SB), $8-88 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 - MOVQ out_base+48(FP), R14 MOVQ (R14), R15 MOVQ 24(R14), R14 MOVQ start+72(FP), BP @@ -82410,6 +104172,163 @@ mulGFNI_10x2_64_loop: mulGFNI_10x2_64_end: RET +// func mulAvxGFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x2(SB), $8-88 + // Loading 12 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, R14 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + +mulAvxGFNI_10x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R14) + ADDQ $0x20, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x2_loop + VZEROUPPER + +mulAvxGFNI_10x2_end: + RET + // func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x2_64Xor(SB), $0-88 @@ -82573,237 +104492,167 @@ mulGFNI_10x2_64Xor_loop: mulGFNI_10x2_64Xor_end: RET -// func mulAvxTwo_10x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_10x2Xor(SB), NOSPLIT, $8-88 - // Loading no tables to registers +// func mulAvxGFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x2Xor(SB), $8-88 + // Loading 12 of 20 tables to registers // Destination kept in GP registers - // Full registers estimated 47 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), R13 - MOVQ 216(DX), DX - MOVQ out_base+48(FP), R14 - MOVQ (R14), R15 - MOVQ 24(R14), R14 - MOVQ start+72(FP), BP + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R15 ADDQ BP, R14 // Add start offset to input - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, R13 - ADDQ BP, DX - MOVQ $0x0000000f, BP - MOVQ BP, X2 - VPBROADCASTB X2, Y2 + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + +mulAvxGFNI_10x2Xor_loop: + // Load 2 outputs + VMOVDQU (R15), Y12 + VMOVDQU (R14), Y13 -mulAvxTwo_10x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R15), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R14), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (R11), Y5 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (R12), Y5 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 2 outputs - VMOVDQU (R13), Y5 - ADDQ $0x20, R13 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1024(CX), Y3 - VMOVDQU 1056(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 1088(CX), Y3 - VMOVDQU 1120(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1152(CX), Y3 - VMOVDQU 1184(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 1216(CX), Y3 - VMOVDQU 1248(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 2 outputs - VMOVDQU Y0, (R15) + VMOVDQU Y12, (R15) ADDQ $0x20, R15 - VMOVDQU Y1, (R14) + VMOVDQU Y13, (R14) ADDQ $0x20, R14 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_10x2Xor_loop + JNZ mulAvxGFNI_10x2Xor_loop VZEROUPPER -mulAvxTwo_10x2Xor_end: +mulAvxGFNI_10x2Xor_end: RET // func mulAvxTwo_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -82829,7 +104678,6 @@ TEXT ·mulAvxTwo_10x2_64Xor(SB), $8-88 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 - MOVQ out_base+48(FP), R14 MOVQ (R14), R15 MOVQ 24(R14), R14 MOVQ start+72(FP), BP @@ -83146,293 +104994,6 @@ mulAvxTwo_10x2_64Xor_loop: mulAvxTwo_10x2_64Xor_end: RET -// func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_10x3(SB), NOSPLIT, $8-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 68 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x3_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), R12 - MOVQ 216(AX), AX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R15 - MOVQ 48(R13), R13 - MOVQ start+72(FP), BP - - // Add start offset to output - ADDQ BP, R14 - ADDQ BP, R15 - ADDQ BP, R13 - - // Add start offset to input - ADDQ BP, DX - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, AX - MOVQ $0x0000000f, BP - MOVQ BP, X3 - VPBROADCASTB X3, Y3 - MOVQ n+80(FP), BP - SHRQ $0x05, BP - -mulAvxTwo_10x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 8 to 3 outputs - VMOVDQU (R12), Y6 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1536(CX), Y4 - VMOVDQU 1568(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1600(CX), Y4 - VMOVDQU 1632(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1664(CX), Y4 - VMOVDQU 1696(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 9 to 3 outputs - VMOVDQU (AX), Y6 - ADDQ $0x20, AX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1728(CX), Y4 - VMOVDQU 1760(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1792(CX), Y4 - VMOVDQU 1824(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1856(CX), Y4 - VMOVDQU 1888(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 - VMOVDQU Y1, (R15) - ADDQ $0x20, R15 - VMOVDQU Y2, (R13) - ADDQ $0x20, R13 - - // Prepare for next loop - DECQ BP - JNZ mulAvxTwo_10x3_loop - VZEROUPPER - -mulAvxTwo_10x3_end: - RET - // func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x3_64(SB), $8-88 @@ -83456,7 +105017,6 @@ TEXT ·mulAvxTwo_10x3_64(SB), $8-88 MOVQ 192(AX), R12 MOVQ 216(AX), AX MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 @@ -84047,6 +105607,200 @@ mulGFNI_10x3_64_loop: mulGFNI_10x3_64_end: RET +// func mulAvxGFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x3(SB), $8-88 + // Loading 11 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_10x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_10x3_loop + VZEROUPPER + +mulAvxGFNI_10x3_end: + RET + // func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x3_64Xor(SB), $8-88 @@ -84246,33 +106000,45 @@ mulGFNI_10x3_64Xor_loop: mulGFNI_10x3_64Xor_end: RET -// func mulAvxTwo_10x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_10x3Xor(SB), NOSPLIT, $8-88 - // Loading no tables to registers +// func mulAvxGFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x3Xor(SB), $8-88 + // Loading 11 of 30 tables to registers // Destination kept in GP registers - // Full registers estimated 68 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x3Xor_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), R12 - MOVQ 216(AX), AX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R15 - MOVQ 48(R13), R13 - MOVQ start+72(FP), BP + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R14 @@ -84280,260 +106046,160 @@ TEXT ·mulAvxTwo_10x3Xor(SB), NOSPLIT, $8-88 ADDQ BP, R13 // Add start offset to input - ADDQ BP, DX - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, AX - MOVQ $0x0000000f, BP - MOVQ BP, X3 - VPBROADCASTB X3, Y3 - MOVQ n+80(FP), BP - SHRQ $0x05, BP + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_10x3Xor_loop: + // Load 3 outputs + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R13), Y13 -mulAvxTwo_10x3Xor_loop: // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R14), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R15), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R13), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 3 outputs - VMOVDQU (R12), Y6 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1536(CX), Y4 - VMOVDQU 1568(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1600(CX), Y4 - VMOVDQU 1632(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1664(CX), Y4 - VMOVDQU 1696(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 3 outputs - VMOVDQU (AX), Y6 - ADDQ $0x20, AX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1728(CX), Y4 - VMOVDQU 1760(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1792(CX), Y4 - VMOVDQU 1824(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1856(CX), Y4 - VMOVDQU 1888(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 3 outputs - VMOVDQU Y0, (R14) + VMOVDQU Y11, (R14) ADDQ $0x20, R14 - VMOVDQU Y1, (R15) + VMOVDQU Y12, (R15) ADDQ $0x20, R15 - VMOVDQU Y2, (R13) + VMOVDQU Y13, (R13) ADDQ $0x20, R13 // Prepare for next loop DECQ BP - JNZ mulAvxTwo_10x3Xor_loop + JNZ mulAvxGFNI_10x3Xor_loop VZEROUPPER -mulAvxTwo_10x3Xor_end: +mulAvxGFNI_10x3Xor_end: RET // func mulAvxTwo_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -84559,7 +106225,6 @@ TEXT ·mulAvxTwo_10x3_64Xor(SB), $8-88 MOVQ 192(AX), R12 MOVQ 216(AX), AX MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 @@ -85497,6 +107162,220 @@ mulGFNI_10x4_64_loop: mulGFNI_10x4_64_end: RET +// func mulAvxGFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x4(SB), $8-88 + // Loading 10 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x4_loop + VZEROUPPER + +mulAvxGFNI_10x4_end: + RET + // func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x4_64Xor(SB), $8-88 @@ -85711,6 +107590,234 @@ mulGFNI_10x4_64Xor_loop: mulGFNI_10x4_64Xor_end: RET +// func mulAvxGFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x4Xor(SB), $8-88 + // Loading 10 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x4Xor_loop: + // Load 4 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x4Xor_loop + VZEROUPPER + +mulAvxGFNI_10x4Xor_end: + RET + // func mulAvxTwo_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x4Xor(SB), NOSPLIT, $8-88 @@ -86651,6 +108758,251 @@ mulGFNI_10x5_64_loop: mulGFNI_10x5_64_end: RET +// func mulAvxGFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x5(SB), $8-88 + // Loading 9 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x5_loop + VZEROUPPER + +mulAvxGFNI_10x5_end: + RET + // func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x5_64Xor(SB), $8-88 @@ -86888,6 +109240,268 @@ mulGFNI_10x5_64Xor_loop: mulGFNI_10x5_64Xor_end: RET +// func mulAvxGFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x5Xor(SB), $8-88 + // Loading 9 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x5Xor_loop: + // Load 5 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x5Xor_loop + VZEROUPPER + +mulAvxGFNI_10x5Xor_end: + RET + // func mulAvxTwo_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x5Xor(SB), NOSPLIT, $8-88 @@ -87954,6 +110568,282 @@ mulGFNI_10x6_64_loop: mulGFNI_10x6_64_end: RET +// func mulAvxGFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x6(SB), $8-88 + // Loading 8 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x6_loop + VZEROUPPER + +mulAvxGFNI_10x6_end: + RET + // func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x6_64Xor(SB), $8-88 @@ -88214,6 +111104,302 @@ mulGFNI_10x6_64Xor_loop: mulGFNI_10x6_64Xor_end: RET +// func mulAvxGFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x6Xor(SB), $8-88 + // Loading 8 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x6Xor_loop: + // Load 6 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x6Xor_loop + VZEROUPPER + +mulAvxGFNI_10x6Xor_end: + RET + // func mulAvxTwo_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x6Xor(SB), NOSPLIT, $8-88 @@ -89406,6 +112592,313 @@ mulGFNI_10x7_64_loop: mulGFNI_10x7_64_end: RET +// func mulAvxGFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x7(SB), $8-88 + // Loading 7 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x7_loop + VZEROUPPER + +mulAvxGFNI_10x7_end: + RET + // func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x7_64Xor(SB), $8-88 @@ -89689,6 +113182,336 @@ mulGFNI_10x7_64Xor_loop: mulGFNI_10x7_64Xor_end: RET +// func mulAvxGFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x7Xor(SB), $8-88 + // Loading 7 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x7Xor_loop: + // Load 7 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x7Xor_loop + VZEROUPPER + +mulAvxGFNI_10x7Xor_end: + RET + // func mulAvxTwo_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x7Xor(SB), NOSPLIT, $8-88 @@ -91007,6 +114830,344 @@ mulGFNI_10x8_64_loop: mulGFNI_10x8_64_end: RET +// func mulAvxGFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x8(SB), $8-88 + // Loading 6 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x8_loop + VZEROUPPER + +mulAvxGFNI_10x8_end: + RET + // func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x8_64Xor(SB), $8-88 @@ -91313,6 +115474,370 @@ mulGFNI_10x8_64Xor_loop: mulGFNI_10x8_64Xor_end: RET +// func mulAvxGFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x8Xor(SB), $8-88 + // Loading 6 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x8Xor_loop: + // Load 8 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x8Xor_loop + VZEROUPPER + +mulAvxGFNI_10x8Xor_end: + RET + // func mulAvxTwo_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x8Xor(SB), NOSPLIT, $8-88 @@ -92757,6 +117282,375 @@ mulGFNI_10x9_64_loop: mulGFNI_10x9_64_end: RET +// func mulAvxGFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x9(SB), $8-88 + // Loading 5 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x9_loop + VZEROUPPER + +mulAvxGFNI_10x9_end: + RET + // func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x9_64Xor(SB), $8-88 @@ -93086,6 +117980,404 @@ mulGFNI_10x9_64Xor_loop: mulGFNI_10x9_64Xor_end: RET +// func mulAvxGFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x9Xor(SB), $8-88 + // Loading 5 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x9Xor_loop: + // Load 9 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y5 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 192(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x9Xor_loop + VZEROUPPER + +mulAvxGFNI_10x9Xor_end: + RET + // func mulAvxTwo_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x9Xor(SB), NOSPLIT, $8-88 @@ -94656,6 +119948,406 @@ mulGFNI_10x10_64_loop: mulGFNI_10x10_64_end: RET +// func mulAvxGFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x10(SB), $8-88 + // Loading 4 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 720(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 728(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 736(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 744(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 752(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 760(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 768(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 776(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 784(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 792(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x10_loop + VZEROUPPER + +mulAvxGFNI_10x10_end: + RET + // func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x10_64Xor(SB), $8-88 @@ -95008,6 +120700,438 @@ mulGFNI_10x10_64Xor_loop: mulGFNI_10x10_64Xor_end: RET +// func mulAvxGFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x10Xor(SB), $8-88 + // Loading 4 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x10Xor_loop: + // Load 10 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y4 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y5 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 192(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 216(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 720(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 728(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 736(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 744(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 752(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 760(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 768(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 776(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 784(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 792(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x10Xor_loop + VZEROUPPER + +mulAvxGFNI_10x10Xor_end: + RET + // func mulAvxTwo_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x10Xor(SB), NOSPLIT, $8-88 diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.go new file mode 100644 index 0000000..2f87190 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.go @@ -0,0 +1,125 @@ +// Code generated by command: go generate gen.go. DO NOT EDIT. + +//go:build !noasm && !appengine && !gccgo && !nopshufb + +package reedsolomon + +//go:noescape +func mulSve_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.s b/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.s new file mode 100644 index 0000000..dd974c1 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.s @@ -0,0 +1,27052 @@ +// Code generated by command: go generate gen.go. DO NOT EDIT. + +//go:build !appengine && !noasm && !nogen && !nopshufb && gc + +#include "textflag.h" + +// func mulSve_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 46 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xd37ae400 // lsl x0, x0, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x1_64_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R14 + MOVD start+72(FP), R15 + + // Add start offset to output + WORD $0x8b0f01ce // add x14, x14, x15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd28001ef // mov x15, #15 + WORD $0x05e039e2 // mov z2.d, x15 + WORD $0x05212042 // dup z2.b, z2.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + WORD $0x85804026 // ldr z6, [x1] + WORD $0x85804425 // ldr z5, [x1, #1, MUL VL] + WORD $0x04215041 // addvl x1, x1, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85804043 // ldr z3, [x2] + WORD $0x85804444 // ldr z4, [x2, #1, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33080 // eor z0.d, z4.d, z3.d + WORD $0x04a530c1 // eor z1.d, z6.d, z5.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 1 to 1 outputs + WORD $0x85804086 // ldr z6, [x4] + WORD $0x85804485 // ldr z5, [x4, #1, MUL VL] + WORD $0x04245044 // addvl x4, x4, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85804843 // ldr z3, [x2, #2, MUL VL] + WORD $0x85804c44 // ldr z4, [x2, #3, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 2 to 1 outputs + WORD $0x858040a6 // ldr z6, [x5] + WORD $0x858044a5 // ldr z5, [x5, #1, MUL VL] + WORD $0x04255045 // addvl x5, x5, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85805043 // ldr z3, [x2, #4, MUL VL] + WORD $0x85805444 // ldr z4, [x2, #5, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 3 to 1 outputs + WORD $0x85804106 // ldr z6, [x8] + WORD $0x85804505 // ldr z5, [x8, #1, MUL VL] + WORD $0x04285048 // addvl x8, x8, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85805843 // ldr z3, [x2, #6, MUL VL] + WORD $0x85805c44 // ldr z4, [x2, #7, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 4 to 1 outputs + WORD $0x85804126 // ldr z6, [x9] + WORD $0x85804525 // ldr z5, [x9, #1, MUL VL] + WORD $0x04295049 // addvl x9, x9, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85814043 // ldr z3, [x2, #8, MUL VL] + WORD $0x85814444 // ldr z4, [x2, #9, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 5 to 1 outputs + WORD $0x85804146 // ldr z6, [x10] + WORD $0x85804545 // ldr z5, [x10, #1, MUL VL] + WORD $0x042a504a // addvl x10, x10, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85814843 // ldr z3, [x2, #10, MUL VL] + WORD $0x85814c44 // ldr z4, [x2, #11, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 6 to 1 outputs + WORD $0x85804166 // ldr z6, [x11] + WORD $0x85804565 // ldr z5, [x11, #1, MUL VL] + WORD $0x042b504b // addvl x11, x11, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85815043 // ldr z3, [x2, #12, MUL VL] + WORD $0x85815444 // ldr z4, [x2, #13, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 7 to 1 outputs + WORD $0x85804186 // ldr z6, [x12] + WORD $0x85804585 // ldr z5, [x12, #1, MUL VL] + WORD $0x042c504c // addvl x12, x12, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85815843 // ldr z3, [x2, #14, MUL VL] + WORD $0x85815c44 // ldr z4, [x2, #15, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 8 to 1 outputs + WORD $0x858041a6 // ldr z6, [x13] + WORD $0x858045a5 // ldr z5, [x13, #1, MUL VL] + WORD $0x042d504d // addvl x13, x13, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85824043 // ldr z3, [x2, #16, MUL VL] + WORD $0x85824444 // ldr z4, [x2, #17, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 9 to 1 outputs + WORD $0x85804066 // ldr z6, [x3] + WORD $0x85804465 // ldr z5, [x3, #1, MUL VL] + WORD $0x04235043 // addvl x3, x3, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85824843 // ldr z3, [x2, #18, MUL VL] + WORD $0x85824c44 // ldr z4, [x2, #19, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + +mulSve_10x1_64_store: + // Store 1 outputs + WORD $0xe58041c0 // str z0, [x14] + WORD $0xe58045c1 // str z1, [x14, #1, MUL VL] + WORD $0x042e504e // addvl x14, x14, #2 + + // Prepare for next loop + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x1_64_loop + +mulSve_10x1_64_end: + RET + +// func mulSve_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 46 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xd37ae400 // lsl x0, x0, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x1_64Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R14 + MOVD start+72(FP), R15 + + // Add start offset to output + WORD $0x8b0f01ce // add x14, x14, x15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd28001ef // mov x15, #15 + WORD $0x05e039e2 // mov z2.d, x15 + WORD $0x05212042 // dup z2.b, z2.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x1_64Xor_loop: + // Load 1 outputs + WORD $0x858041c0 // ldr z0, [x14] + WORD $0x858045c1 // ldr z1, [x14, #1, MUL VL] + + // Load and process 64 bytes from input 0 to 1 outputs + WORD $0x85804026 // ldr z6, [x1] + WORD $0x85804425 // ldr z5, [x1, #1, MUL VL] + WORD $0x04215041 // addvl x1, x1, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85804043 // ldr z3, [x2] + WORD $0x85804444 // ldr z4, [x2, #1, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 1 to 1 outputs + WORD $0x85804086 // ldr z6, [x4] + WORD $0x85804485 // ldr z5, [x4, #1, MUL VL] + WORD $0x04245044 // addvl x4, x4, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85804843 // ldr z3, [x2, #2, MUL VL] + WORD $0x85804c44 // ldr z4, [x2, #3, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 2 to 1 outputs + WORD $0x858040a6 // ldr z6, [x5] + WORD $0x858044a5 // ldr z5, [x5, #1, MUL VL] + WORD $0x04255045 // addvl x5, x5, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85805043 // ldr z3, [x2, #4, MUL VL] + WORD $0x85805444 // ldr z4, [x2, #5, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 3 to 1 outputs + WORD $0x85804106 // ldr z6, [x8] + WORD $0x85804505 // ldr z5, [x8, #1, MUL VL] + WORD $0x04285048 // addvl x8, x8, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85805843 // ldr z3, [x2, #6, MUL VL] + WORD $0x85805c44 // ldr z4, [x2, #7, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 4 to 1 outputs + WORD $0x85804126 // ldr z6, [x9] + WORD $0x85804525 // ldr z5, [x9, #1, MUL VL] + WORD $0x04295049 // addvl x9, x9, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85814043 // ldr z3, [x2, #8, MUL VL] + WORD $0x85814444 // ldr z4, [x2, #9, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 5 to 1 outputs + WORD $0x85804146 // ldr z6, [x10] + WORD $0x85804545 // ldr z5, [x10, #1, MUL VL] + WORD $0x042a504a // addvl x10, x10, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85814843 // ldr z3, [x2, #10, MUL VL] + WORD $0x85814c44 // ldr z4, [x2, #11, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 6 to 1 outputs + WORD $0x85804166 // ldr z6, [x11] + WORD $0x85804565 // ldr z5, [x11, #1, MUL VL] + WORD $0x042b504b // addvl x11, x11, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85815043 // ldr z3, [x2, #12, MUL VL] + WORD $0x85815444 // ldr z4, [x2, #13, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 7 to 1 outputs + WORD $0x85804186 // ldr z6, [x12] + WORD $0x85804585 // ldr z5, [x12, #1, MUL VL] + WORD $0x042c504c // addvl x12, x12, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85815843 // ldr z3, [x2, #14, MUL VL] + WORD $0x85815c44 // ldr z4, [x2, #15, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 8 to 1 outputs + WORD $0x858041a6 // ldr z6, [x13] + WORD $0x858045a5 // ldr z5, [x13, #1, MUL VL] + WORD $0x042d504d // addvl x13, x13, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85824043 // ldr z3, [x2, #16, MUL VL] + WORD $0x85824444 // ldr z4, [x2, #17, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 9 to 1 outputs + WORD $0x85804066 // ldr z6, [x3] + WORD $0x85804465 // ldr z5, [x3, #1, MUL VL] + WORD $0x04235043 // addvl x3, x3, #2 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85824843 // ldr z3, [x2, #18, MUL VL] + WORD $0x85824c44 // ldr z4, [x2, #19, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + +mulSve_10x1_64Xor_store: + // Store 1 outputs + WORD $0xe58041c0 // str z0, [x14] + WORD $0xe58045c1 // str z1, [x14, #1, MUL VL] + WORD $0x042e504e // addvl x14, x14, #2 + + // Prepare for next loop + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x1_64Xor_loop + +mulSve_10x1_64Xor_end: + RET + +// func mulSve_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x2_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 89 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xd37ae400 // lsl x0, x0, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x2_64_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R15 + MOVD 24(R14), R14 + MOVD start+72(FP), R6 + + // Add start offset to output + WORD $0x8b0601ef // add x15, x15, x6 + WORD $0x8b0601ce // add x14, x14, x6 + + // Add start offset to input + WORD $0x8b060021 // add x1, x1, x6 + WORD $0x8b060084 // add x4, x4, x6 + WORD $0x8b0600a5 // add x5, x5, x6 + WORD $0x8b060108 // add x8, x8, x6 + WORD $0x8b060129 // add x9, x9, x6 + WORD $0x8b06014a // add x10, x10, x6 + WORD $0x8b06016b // add x11, x11, x6 + WORD $0x8b06018c // add x12, x12, x6 + WORD $0x8b0601ad // add x13, x13, x6 + WORD $0x8b060063 // add x3, x3, x6 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c4 // mov z4.d, x6 + WORD $0x05212084 // dup z4.b, z4.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + WORD $0x85804029 // ldr z9, [x1] + WORD $0x8580442b // ldr z11, [x1, #1, MUL VL] + WORD $0x04215041 // addvl x1, x1, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85804045 // ldr z5, [x2] + WORD $0x85804446 // ldr z6, [x2, #1, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a530c0 // eor z0.d, z6.d, z5.d + WORD $0x04a73101 // eor z1.d, z8.d, z7.d + WORD $0x85804845 // ldr z5, [x2, #2, MUL VL] + WORD $0x85804c46 // ldr z6, [x2, #3, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a530c2 // eor z2.d, z6.d, z5.d + WORD $0x04a73103 // eor z3.d, z8.d, z7.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 1 to 2 outputs + WORD $0x85804089 // ldr z9, [x4] + WORD $0x8580448b // ldr z11, [x4, #1, MUL VL] + WORD $0x04245044 // addvl x4, x4, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85805045 // ldr z5, [x2, #4, MUL VL] + WORD $0x85805446 // ldr z6, [x2, #5, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85805845 // ldr z5, [x2, #6, MUL VL] + WORD $0x85805c46 // ldr z6, [x2, #7, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 2 to 2 outputs + WORD $0x858040a9 // ldr z9, [x5] + WORD $0x858044ab // ldr z11, [x5, #1, MUL VL] + WORD $0x04255045 // addvl x5, x5, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85814045 // ldr z5, [x2, #8, MUL VL] + WORD $0x85814446 // ldr z6, [x2, #9, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85814845 // ldr z5, [x2, #10, MUL VL] + WORD $0x85814c46 // ldr z6, [x2, #11, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 3 to 2 outputs + WORD $0x85804109 // ldr z9, [x8] + WORD $0x8580450b // ldr z11, [x8, #1, MUL VL] + WORD $0x04285048 // addvl x8, x8, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85815045 // ldr z5, [x2, #12, MUL VL] + WORD $0x85815446 // ldr z6, [x2, #13, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85815845 // ldr z5, [x2, #14, MUL VL] + WORD $0x85815c46 // ldr z6, [x2, #15, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 4 to 2 outputs + WORD $0x85804129 // ldr z9, [x9] + WORD $0x8580452b // ldr z11, [x9, #1, MUL VL] + WORD $0x04295049 // addvl x9, x9, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85824045 // ldr z5, [x2, #16, MUL VL] + WORD $0x85824446 // ldr z6, [x2, #17, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85824845 // ldr z5, [x2, #18, MUL VL] + WORD $0x85824c46 // ldr z6, [x2, #19, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 5 to 2 outputs + WORD $0x85804149 // ldr z9, [x10] + WORD $0x8580454b // ldr z11, [x10, #1, MUL VL] + WORD $0x042a504a // addvl x10, x10, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85825045 // ldr z5, [x2, #20, MUL VL] + WORD $0x85825446 // ldr z6, [x2, #21, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85825845 // ldr z5, [x2, #22, MUL VL] + WORD $0x85825c46 // ldr z6, [x2, #23, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 6 to 2 outputs + WORD $0x85804169 // ldr z9, [x11] + WORD $0x8580456b // ldr z11, [x11, #1, MUL VL] + WORD $0x042b504b // addvl x11, x11, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85834045 // ldr z5, [x2, #24, MUL VL] + WORD $0x85834446 // ldr z6, [x2, #25, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85834845 // ldr z5, [x2, #26, MUL VL] + WORD $0x85834c46 // ldr z6, [x2, #27, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 7 to 2 outputs + WORD $0x85804189 // ldr z9, [x12] + WORD $0x8580458b // ldr z11, [x12, #1, MUL VL] + WORD $0x042c504c // addvl x12, x12, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85835045 // ldr z5, [x2, #28, MUL VL] + WORD $0x85835446 // ldr z6, [x2, #29, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85835845 // ldr z5, [x2, #30, MUL VL] + WORD $0x85835c46 // ldr z6, [x2, #31, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 8 to 2 outputs + WORD $0x858041a9 // ldr z9, [x13] + WORD $0x858045ab // ldr z11, [x13, #1, MUL VL] + WORD $0x042d504d // addvl x13, x13, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85844045 // ldr z5, [x2, #32, MUL VL] + WORD $0x85844446 // ldr z6, [x2, #33, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85844845 // ldr z5, [x2, #34, MUL VL] + WORD $0x85844c46 // ldr z6, [x2, #35, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 9 to 2 outputs + WORD $0x85804069 // ldr z9, [x3] + WORD $0x8580446b // ldr z11, [x3, #1, MUL VL] + WORD $0x04235043 // addvl x3, x3, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85845045 // ldr z5, [x2, #36, MUL VL] + WORD $0x85845446 // ldr z6, [x2, #37, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85845845 // ldr z5, [x2, #38, MUL VL] + WORD $0x85845c46 // ldr z6, [x2, #39, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + +mulSve_10x2_64_store: + // Store 2 outputs + WORD $0xe58041e0 // str z0, [x15] + WORD $0xe58045e1 // str z1, [x15, #1, MUL VL] + WORD $0x042f504f // addvl x15, x15, #2 + WORD $0xe58041c2 // str z2, [x14] + WORD $0xe58045c3 // str z3, [x14, #1, MUL VL] + WORD $0x042e504e // addvl x14, x14, #2 + + // Prepare for next loop + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x2_64_loop + +mulSve_10x2_64_end: + RET + +// func mulSve_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x2_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 89 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xd37ae400 // lsl x0, x0, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x2_64Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R15 + MOVD 24(R14), R14 + MOVD start+72(FP), R6 + + // Add start offset to output + WORD $0x8b0601ef // add x15, x15, x6 + WORD $0x8b0601ce // add x14, x14, x6 + + // Add start offset to input + WORD $0x8b060021 // add x1, x1, x6 + WORD $0x8b060084 // add x4, x4, x6 + WORD $0x8b0600a5 // add x5, x5, x6 + WORD $0x8b060108 // add x8, x8, x6 + WORD $0x8b060129 // add x9, x9, x6 + WORD $0x8b06014a // add x10, x10, x6 + WORD $0x8b06016b // add x11, x11, x6 + WORD $0x8b06018c // add x12, x12, x6 + WORD $0x8b0601ad // add x13, x13, x6 + WORD $0x8b060063 // add x3, x3, x6 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c4 // mov z4.d, x6 + WORD $0x05212084 // dup z4.b, z4.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x2_64Xor_loop: + // Load 2 outputs + WORD $0x858041e0 // ldr z0, [x15] + WORD $0x858045e1 // ldr z1, [x15, #1, MUL VL] + WORD $0x858041c2 // ldr z2, [x14] + WORD $0x858045c3 // ldr z3, [x14, #1, MUL VL] + + // Load and process 64 bytes from input 0 to 2 outputs + WORD $0x85804029 // ldr z9, [x1] + WORD $0x8580442b // ldr z11, [x1, #1, MUL VL] + WORD $0x04215041 // addvl x1, x1, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85804045 // ldr z5, [x2] + WORD $0x85804446 // ldr z6, [x2, #1, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85804845 // ldr z5, [x2, #2, MUL VL] + WORD $0x85804c46 // ldr z6, [x2, #3, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 1 to 2 outputs + WORD $0x85804089 // ldr z9, [x4] + WORD $0x8580448b // ldr z11, [x4, #1, MUL VL] + WORD $0x04245044 // addvl x4, x4, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85805045 // ldr z5, [x2, #4, MUL VL] + WORD $0x85805446 // ldr z6, [x2, #5, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85805845 // ldr z5, [x2, #6, MUL VL] + WORD $0x85805c46 // ldr z6, [x2, #7, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 2 to 2 outputs + WORD $0x858040a9 // ldr z9, [x5] + WORD $0x858044ab // ldr z11, [x5, #1, MUL VL] + WORD $0x04255045 // addvl x5, x5, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85814045 // ldr z5, [x2, #8, MUL VL] + WORD $0x85814446 // ldr z6, [x2, #9, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85814845 // ldr z5, [x2, #10, MUL VL] + WORD $0x85814c46 // ldr z6, [x2, #11, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 3 to 2 outputs + WORD $0x85804109 // ldr z9, [x8] + WORD $0x8580450b // ldr z11, [x8, #1, MUL VL] + WORD $0x04285048 // addvl x8, x8, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85815045 // ldr z5, [x2, #12, MUL VL] + WORD $0x85815446 // ldr z6, [x2, #13, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85815845 // ldr z5, [x2, #14, MUL VL] + WORD $0x85815c46 // ldr z6, [x2, #15, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 4 to 2 outputs + WORD $0x85804129 // ldr z9, [x9] + WORD $0x8580452b // ldr z11, [x9, #1, MUL VL] + WORD $0x04295049 // addvl x9, x9, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85824045 // ldr z5, [x2, #16, MUL VL] + WORD $0x85824446 // ldr z6, [x2, #17, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85824845 // ldr z5, [x2, #18, MUL VL] + WORD $0x85824c46 // ldr z6, [x2, #19, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 5 to 2 outputs + WORD $0x85804149 // ldr z9, [x10] + WORD $0x8580454b // ldr z11, [x10, #1, MUL VL] + WORD $0x042a504a // addvl x10, x10, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85825045 // ldr z5, [x2, #20, MUL VL] + WORD $0x85825446 // ldr z6, [x2, #21, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85825845 // ldr z5, [x2, #22, MUL VL] + WORD $0x85825c46 // ldr z6, [x2, #23, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 6 to 2 outputs + WORD $0x85804169 // ldr z9, [x11] + WORD $0x8580456b // ldr z11, [x11, #1, MUL VL] + WORD $0x042b504b // addvl x11, x11, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85834045 // ldr z5, [x2, #24, MUL VL] + WORD $0x85834446 // ldr z6, [x2, #25, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85834845 // ldr z5, [x2, #26, MUL VL] + WORD $0x85834c46 // ldr z6, [x2, #27, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 7 to 2 outputs + WORD $0x85804189 // ldr z9, [x12] + WORD $0x8580458b // ldr z11, [x12, #1, MUL VL] + WORD $0x042c504c // addvl x12, x12, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85835045 // ldr z5, [x2, #28, MUL VL] + WORD $0x85835446 // ldr z6, [x2, #29, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85835845 // ldr z5, [x2, #30, MUL VL] + WORD $0x85835c46 // ldr z6, [x2, #31, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 8 to 2 outputs + WORD $0x858041a9 // ldr z9, [x13] + WORD $0x858045ab // ldr z11, [x13, #1, MUL VL] + WORD $0x042d504d // addvl x13, x13, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85844045 // ldr z5, [x2, #32, MUL VL] + WORD $0x85844446 // ldr z6, [x2, #33, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85844845 // ldr z5, [x2, #34, MUL VL] + WORD $0x85844c46 // ldr z6, [x2, #35, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 9 to 2 outputs + WORD $0x85804069 // ldr z9, [x3] + WORD $0x8580446b // ldr z11, [x3, #1, MUL VL] + WORD $0x04235043 // addvl x3, x3, #2 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85845045 // ldr z5, [x2, #36, MUL VL] + WORD $0x85845446 // ldr z6, [x2, #37, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85845845 // ldr z5, [x2, #38, MUL VL] + WORD $0x85845c46 // ldr z6, [x2, #39, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + +mulSve_10x2_64Xor_store: + // Store 2 outputs + WORD $0xe58041e0 // str z0, [x15] + WORD $0xe58045e1 // str z1, [x15, #1, MUL VL] + WORD $0x042f504f // addvl x15, x15, #2 + WORD $0xe58041c2 // str z2, [x14] + WORD $0xe58045c3 // str z3, [x14, #1, MUL VL] + WORD $0x042e504e // addvl x14, x14, #2 + + // Prepare for next loop + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x2_64Xor_loop + +mulSve_10x2_64Xor_end: + RET + +// func mulSve_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x3_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 130 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xd37ae400 // lsl x0, x0, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x3_64_end + MOVD in_base+24(FP), R0 + MOVD (R0), R3 + MOVD 24(R0), R1 + MOVD 48(R0), R4 + MOVD 72(R0), R5 + MOVD 96(R0), R8 + MOVD 120(R0), R9 + MOVD 144(R0), R10 + MOVD 168(R0), R11 + MOVD 192(R0), R12 + MOVD 216(R0), R0 + MOVD out_base+48(FP), R13 + MOVD (R13), R14 + MOVD 24(R13), R15 + MOVD 48(R13), R13 + MOVD start+72(FP), R6 + + // Add start offset to output + WORD $0x8b0601ce // add x14, x14, x6 + WORD $0x8b0601ef // add x15, x15, x6 + WORD $0x8b0601ad // add x13, x13, x6 + + // Add start offset to input + WORD $0x8b060063 // add x3, x3, x6 + WORD $0x8b060021 // add x1, x1, x6 + WORD $0x8b060084 // add x4, x4, x6 + WORD $0x8b0600a5 // add x5, x5, x6 + WORD $0x8b060108 // add x8, x8, x6 + WORD $0x8b060129 // add x9, x9, x6 + WORD $0x8b06014a // add x10, x10, x6 + WORD $0x8b06016b // add x11, x11, x6 + WORD $0x8b06018c // add x12, x12, x6 + WORD $0x8b060000 // add x0, x0, x6 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c6 // mov z6.d, x6 + WORD $0x052120c6 // dup z6.b, z6.b[0] + + // Reload length to save a register + MOVD n+80(FP), R6 + WORD $0xd346fcc6 // lsr x6, x6, #6 + WORD $0xd37ae4c6 // lsl x6, x6, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad008c6 // udiv x6, x6, x16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + WORD $0x8580406b // ldr z11, [x3] + WORD $0x8580446d // ldr z13, [x3, #1, MUL VL] + WORD $0x04235043 // addvl x3, x3, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85804047 // ldr z7, [x2] + WORD $0x85804448 // ldr z8, [x2, #1, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73100 // eor z0.d, z8.d, z7.d + WORD $0x04a93141 // eor z1.d, z10.d, z9.d + WORD $0x85804847 // ldr z7, [x2, #2, MUL VL] + WORD $0x85804c48 // ldr z8, [x2, #3, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73102 // eor z2.d, z8.d, z7.d + WORD $0x04a93143 // eor z3.d, z10.d, z9.d + WORD $0x85805047 // ldr z7, [x2, #4, MUL VL] + WORD $0x85805448 // ldr z8, [x2, #5, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73104 // eor z4.d, z8.d, z7.d + WORD $0x04a93145 // eor z5.d, z10.d, z9.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 1 to 3 outputs + WORD $0x8580402b // ldr z11, [x1] + WORD $0x8580442d // ldr z13, [x1, #1, MUL VL] + WORD $0x04215041 // addvl x1, x1, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85805847 // ldr z7, [x2, #6, MUL VL] + WORD $0x85805c48 // ldr z8, [x2, #7, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85814047 // ldr z7, [x2, #8, MUL VL] + WORD $0x85814448 // ldr z8, [x2, #9, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85814847 // ldr z7, [x2, #10, MUL VL] + WORD $0x85814c48 // ldr z8, [x2, #11, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 2 to 3 outputs + WORD $0x8580408b // ldr z11, [x4] + WORD $0x8580448d // ldr z13, [x4, #1, MUL VL] + WORD $0x04245044 // addvl x4, x4, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85815047 // ldr z7, [x2, #12, MUL VL] + WORD $0x85815448 // ldr z8, [x2, #13, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85815847 // ldr z7, [x2, #14, MUL VL] + WORD $0x85815c48 // ldr z8, [x2, #15, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85824047 // ldr z7, [x2, #16, MUL VL] + WORD $0x85824448 // ldr z8, [x2, #17, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 3 to 3 outputs + WORD $0x858040ab // ldr z11, [x5] + WORD $0x858044ad // ldr z13, [x5, #1, MUL VL] + WORD $0x04255045 // addvl x5, x5, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85824847 // ldr z7, [x2, #18, MUL VL] + WORD $0x85824c48 // ldr z8, [x2, #19, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85825047 // ldr z7, [x2, #20, MUL VL] + WORD $0x85825448 // ldr z8, [x2, #21, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85825847 // ldr z7, [x2, #22, MUL VL] + WORD $0x85825c48 // ldr z8, [x2, #23, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 4 to 3 outputs + WORD $0x8580410b // ldr z11, [x8] + WORD $0x8580450d // ldr z13, [x8, #1, MUL VL] + WORD $0x04285048 // addvl x8, x8, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85834047 // ldr z7, [x2, #24, MUL VL] + WORD $0x85834448 // ldr z8, [x2, #25, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85834847 // ldr z7, [x2, #26, MUL VL] + WORD $0x85834c48 // ldr z8, [x2, #27, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85835047 // ldr z7, [x2, #28, MUL VL] + WORD $0x85835448 // ldr z8, [x2, #29, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 5 to 3 outputs + WORD $0x8580412b // ldr z11, [x9] + WORD $0x8580452d // ldr z13, [x9, #1, MUL VL] + WORD $0x04295049 // addvl x9, x9, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85835847 // ldr z7, [x2, #30, MUL VL] + WORD $0x85835c48 // ldr z8, [x2, #31, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85844047 // ldr z7, [x2, #32, MUL VL] + WORD $0x85844448 // ldr z8, [x2, #33, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85844847 // ldr z7, [x2, #34, MUL VL] + WORD $0x85844c48 // ldr z8, [x2, #35, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 6 to 3 outputs + WORD $0x8580414b // ldr z11, [x10] + WORD $0x8580454d // ldr z13, [x10, #1, MUL VL] + WORD $0x042a504a // addvl x10, x10, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85845047 // ldr z7, [x2, #36, MUL VL] + WORD $0x85845448 // ldr z8, [x2, #37, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85845847 // ldr z7, [x2, #38, MUL VL] + WORD $0x85845c48 // ldr z8, [x2, #39, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85854047 // ldr z7, [x2, #40, MUL VL] + WORD $0x85854448 // ldr z8, [x2, #41, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 7 to 3 outputs + WORD $0x8580416b // ldr z11, [x11] + WORD $0x8580456d // ldr z13, [x11, #1, MUL VL] + WORD $0x042b504b // addvl x11, x11, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85854847 // ldr z7, [x2, #42, MUL VL] + WORD $0x85854c48 // ldr z8, [x2, #43, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85855047 // ldr z7, [x2, #44, MUL VL] + WORD $0x85855448 // ldr z8, [x2, #45, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85855847 // ldr z7, [x2, #46, MUL VL] + WORD $0x85855c48 // ldr z8, [x2, #47, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 8 to 3 outputs + WORD $0x8580418b // ldr z11, [x12] + WORD $0x8580458d // ldr z13, [x12, #1, MUL VL] + WORD $0x042c504c // addvl x12, x12, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85864047 // ldr z7, [x2, #48, MUL VL] + WORD $0x85864448 // ldr z8, [x2, #49, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85864847 // ldr z7, [x2, #50, MUL VL] + WORD $0x85864c48 // ldr z8, [x2, #51, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85865047 // ldr z7, [x2, #52, MUL VL] + WORD $0x85865448 // ldr z8, [x2, #53, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 9 to 3 outputs + WORD $0x8580400b // ldr z11, [x0] + WORD $0x8580440d // ldr z13, [x0, #1, MUL VL] + WORD $0x04205040 // addvl x0, x0, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85865847 // ldr z7, [x2, #54, MUL VL] + WORD $0x85865c48 // ldr z8, [x2, #55, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85874047 // ldr z7, [x2, #56, MUL VL] + WORD $0x85874448 // ldr z8, [x2, #57, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85874847 // ldr z7, [x2, #58, MUL VL] + WORD $0x85874c48 // ldr z8, [x2, #59, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + +mulSve_10x3_64_store: + // Store 3 outputs + WORD $0xe58041c0 // str z0, [x14] + WORD $0xe58045c1 // str z1, [x14, #1, MUL VL] + WORD $0x042e504e // addvl x14, x14, #2 + WORD $0xe58041e2 // str z2, [x15] + WORD $0xe58045e3 // str z3, [x15, #1, MUL VL] + WORD $0x042f504f // addvl x15, x15, #2 + WORD $0xe58041a4 // str z4, [x13] + WORD $0xe58045a5 // str z5, [x13, #1, MUL VL] + WORD $0x042d504d // addvl x13, x13, #2 + + // Prepare for next loop + WORD $0xf10004c6 // subs x6, x6, #1 + BNE mulSve_10x3_64_loop + +mulSve_10x3_64_end: + RET + +// func mulSve_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x3_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 130 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xd37ae400 // lsl x0, x0, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x3_64Xor_end + MOVD in_base+24(FP), R0 + MOVD (R0), R3 + MOVD 24(R0), R1 + MOVD 48(R0), R4 + MOVD 72(R0), R5 + MOVD 96(R0), R8 + MOVD 120(R0), R9 + MOVD 144(R0), R10 + MOVD 168(R0), R11 + MOVD 192(R0), R12 + MOVD 216(R0), R0 + MOVD out_base+48(FP), R13 + MOVD (R13), R14 + MOVD 24(R13), R15 + MOVD 48(R13), R13 + MOVD start+72(FP), R6 + + // Add start offset to output + WORD $0x8b0601ce // add x14, x14, x6 + WORD $0x8b0601ef // add x15, x15, x6 + WORD $0x8b0601ad // add x13, x13, x6 + + // Add start offset to input + WORD $0x8b060063 // add x3, x3, x6 + WORD $0x8b060021 // add x1, x1, x6 + WORD $0x8b060084 // add x4, x4, x6 + WORD $0x8b0600a5 // add x5, x5, x6 + WORD $0x8b060108 // add x8, x8, x6 + WORD $0x8b060129 // add x9, x9, x6 + WORD $0x8b06014a // add x10, x10, x6 + WORD $0x8b06016b // add x11, x11, x6 + WORD $0x8b06018c // add x12, x12, x6 + WORD $0x8b060000 // add x0, x0, x6 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c6 // mov z6.d, x6 + WORD $0x052120c6 // dup z6.b, z6.b[0] + + // Reload length to save a register + MOVD n+80(FP), R6 + WORD $0xd346fcc6 // lsr x6, x6, #6 + WORD $0xd37ae4c6 // lsl x6, x6, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad008c6 // udiv x6, x6, x16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x3_64Xor_loop: + // Load 3 outputs + WORD $0x858041c0 // ldr z0, [x14] + WORD $0x858045c1 // ldr z1, [x14, #1, MUL VL] + WORD $0x858041e2 // ldr z2, [x15] + WORD $0x858045e3 // ldr z3, [x15, #1, MUL VL] + WORD $0x858041a4 // ldr z4, [x13] + WORD $0x858045a5 // ldr z5, [x13, #1, MUL VL] + + // Load and process 64 bytes from input 0 to 3 outputs + WORD $0x8580406b // ldr z11, [x3] + WORD $0x8580446d // ldr z13, [x3, #1, MUL VL] + WORD $0x04235043 // addvl x3, x3, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85804047 // ldr z7, [x2] + WORD $0x85804448 // ldr z8, [x2, #1, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85804847 // ldr z7, [x2, #2, MUL VL] + WORD $0x85804c48 // ldr z8, [x2, #3, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85805047 // ldr z7, [x2, #4, MUL VL] + WORD $0x85805448 // ldr z8, [x2, #5, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 1 to 3 outputs + WORD $0x8580402b // ldr z11, [x1] + WORD $0x8580442d // ldr z13, [x1, #1, MUL VL] + WORD $0x04215041 // addvl x1, x1, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85805847 // ldr z7, [x2, #6, MUL VL] + WORD $0x85805c48 // ldr z8, [x2, #7, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85814047 // ldr z7, [x2, #8, MUL VL] + WORD $0x85814448 // ldr z8, [x2, #9, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85814847 // ldr z7, [x2, #10, MUL VL] + WORD $0x85814c48 // ldr z8, [x2, #11, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 2 to 3 outputs + WORD $0x8580408b // ldr z11, [x4] + WORD $0x8580448d // ldr z13, [x4, #1, MUL VL] + WORD $0x04245044 // addvl x4, x4, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85815047 // ldr z7, [x2, #12, MUL VL] + WORD $0x85815448 // ldr z8, [x2, #13, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85815847 // ldr z7, [x2, #14, MUL VL] + WORD $0x85815c48 // ldr z8, [x2, #15, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85824047 // ldr z7, [x2, #16, MUL VL] + WORD $0x85824448 // ldr z8, [x2, #17, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 3 to 3 outputs + WORD $0x858040ab // ldr z11, [x5] + WORD $0x858044ad // ldr z13, [x5, #1, MUL VL] + WORD $0x04255045 // addvl x5, x5, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85824847 // ldr z7, [x2, #18, MUL VL] + WORD $0x85824c48 // ldr z8, [x2, #19, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85825047 // ldr z7, [x2, #20, MUL VL] + WORD $0x85825448 // ldr z8, [x2, #21, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85825847 // ldr z7, [x2, #22, MUL VL] + WORD $0x85825c48 // ldr z8, [x2, #23, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 4 to 3 outputs + WORD $0x8580410b // ldr z11, [x8] + WORD $0x8580450d // ldr z13, [x8, #1, MUL VL] + WORD $0x04285048 // addvl x8, x8, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85834047 // ldr z7, [x2, #24, MUL VL] + WORD $0x85834448 // ldr z8, [x2, #25, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85834847 // ldr z7, [x2, #26, MUL VL] + WORD $0x85834c48 // ldr z8, [x2, #27, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85835047 // ldr z7, [x2, #28, MUL VL] + WORD $0x85835448 // ldr z8, [x2, #29, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 5 to 3 outputs + WORD $0x8580412b // ldr z11, [x9] + WORD $0x8580452d // ldr z13, [x9, #1, MUL VL] + WORD $0x04295049 // addvl x9, x9, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85835847 // ldr z7, [x2, #30, MUL VL] + WORD $0x85835c48 // ldr z8, [x2, #31, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85844047 // ldr z7, [x2, #32, MUL VL] + WORD $0x85844448 // ldr z8, [x2, #33, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85844847 // ldr z7, [x2, #34, MUL VL] + WORD $0x85844c48 // ldr z8, [x2, #35, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 6 to 3 outputs + WORD $0x8580414b // ldr z11, [x10] + WORD $0x8580454d // ldr z13, [x10, #1, MUL VL] + WORD $0x042a504a // addvl x10, x10, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85845047 // ldr z7, [x2, #36, MUL VL] + WORD $0x85845448 // ldr z8, [x2, #37, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85845847 // ldr z7, [x2, #38, MUL VL] + WORD $0x85845c48 // ldr z8, [x2, #39, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85854047 // ldr z7, [x2, #40, MUL VL] + WORD $0x85854448 // ldr z8, [x2, #41, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 7 to 3 outputs + WORD $0x8580416b // ldr z11, [x11] + WORD $0x8580456d // ldr z13, [x11, #1, MUL VL] + WORD $0x042b504b // addvl x11, x11, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85854847 // ldr z7, [x2, #42, MUL VL] + WORD $0x85854c48 // ldr z8, [x2, #43, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85855047 // ldr z7, [x2, #44, MUL VL] + WORD $0x85855448 // ldr z8, [x2, #45, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85855847 // ldr z7, [x2, #46, MUL VL] + WORD $0x85855c48 // ldr z8, [x2, #47, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 8 to 3 outputs + WORD $0x8580418b // ldr z11, [x12] + WORD $0x8580458d // ldr z13, [x12, #1, MUL VL] + WORD $0x042c504c // addvl x12, x12, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85864047 // ldr z7, [x2, #48, MUL VL] + WORD $0x85864448 // ldr z8, [x2, #49, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85864847 // ldr z7, [x2, #50, MUL VL] + WORD $0x85864c48 // ldr z8, [x2, #51, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85865047 // ldr z7, [x2, #52, MUL VL] + WORD $0x85865448 // ldr z8, [x2, #53, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 9 to 3 outputs + WORD $0x8580400b // ldr z11, [x0] + WORD $0x8580440d // ldr z13, [x0, #1, MUL VL] + WORD $0x04205040 // addvl x0, x0, #2 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85865847 // ldr z7, [x2, #54, MUL VL] + WORD $0x85865c48 // ldr z8, [x2, #55, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85874047 // ldr z7, [x2, #56, MUL VL] + WORD $0x85874448 // ldr z8, [x2, #57, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85874847 // ldr z7, [x2, #58, MUL VL] + WORD $0x85874c48 // ldr z8, [x2, #59, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + +mulSve_10x3_64Xor_store: + // Store 3 outputs + WORD $0xe58041c0 // str z0, [x14] + WORD $0xe58045c1 // str z1, [x14, #1, MUL VL] + WORD $0x042e504e // addvl x14, x14, #2 + WORD $0xe58041e2 // str z2, [x15] + WORD $0xe58045e3 // str z3, [x15, #1, MUL VL] + WORD $0x042f504f // addvl x15, x15, #2 + WORD $0xe58041a4 // str z4, [x13] + WORD $0xe58045a5 // str z5, [x13, #1, MUL VL] + WORD $0x042d504d // addvl x13, x13, #2 + + // Prepare for next loop + WORD $0xf10004c6 // subs x6, x6, #1 + BNE mulSve_10x3_64Xor_loop + +mulSve_10x3_64Xor_end: + RET + +// func mulSve_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x4(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 89 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x4_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c4 // mov z4.d, x6 + WORD $0x05212084 // dup z4.b, z4.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 + +mulSve_10x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + WORD $0x85804027 // ldr z7, [x1] + WORD $0x04215021 // addvl x1, x1, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85804045 // ldr z5, [x2] + WORD $0x85804446 // ldr z6, [x2, #1, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a530c0 // eor z0.d, z6.d, z5.d + WORD $0x85804845 // ldr z5, [x2, #2, MUL VL] + WORD $0x85804c46 // ldr z6, [x2, #3, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a530c1 // eor z1.d, z6.d, z5.d + WORD $0x85805045 // ldr z5, [x2, #4, MUL VL] + WORD $0x85805446 // ldr z6, [x2, #5, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a530c2 // eor z2.d, z6.d, z5.d + WORD $0x85805845 // ldr z5, [x2, #6, MUL VL] + WORD $0x85805c46 // ldr z6, [x2, #7, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a530c3 // eor z3.d, z6.d, z5.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x4_store + + // Load and process 32 bytes from input 1 to 4 outputs + WORD $0x85804087 // ldr z7, [x4] + WORD $0x04245024 // addvl x4, x4, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85814045 // ldr z5, [x2, #8, MUL VL] + WORD $0x85814446 // ldr z6, [x2, #9, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85814845 // ldr z5, [x2, #10, MUL VL] + WORD $0x85814c46 // ldr z6, [x2, #11, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85815045 // ldr z5, [x2, #12, MUL VL] + WORD $0x85815446 // ldr z6, [x2, #13, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85815845 // ldr z5, [x2, #14, MUL VL] + WORD $0x85815c46 // ldr z6, [x2, #15, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x4_store + + // Load and process 32 bytes from input 2 to 4 outputs + WORD $0x858040a7 // ldr z7, [x5] + WORD $0x04255025 // addvl x5, x5, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85824045 // ldr z5, [x2, #16, MUL VL] + WORD $0x85824446 // ldr z6, [x2, #17, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85824845 // ldr z5, [x2, #18, MUL VL] + WORD $0x85824c46 // ldr z6, [x2, #19, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85825045 // ldr z5, [x2, #20, MUL VL] + WORD $0x85825446 // ldr z6, [x2, #21, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85825845 // ldr z5, [x2, #22, MUL VL] + WORD $0x85825c46 // ldr z6, [x2, #23, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x4_store + + // Load and process 32 bytes from input 3 to 4 outputs + WORD $0x85804107 // ldr z7, [x8] + WORD $0x04285028 // addvl x8, x8, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85834045 // ldr z5, [x2, #24, MUL VL] + WORD $0x85834446 // ldr z6, [x2, #25, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85834845 // ldr z5, [x2, #26, MUL VL] + WORD $0x85834c46 // ldr z6, [x2, #27, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85835045 // ldr z5, [x2, #28, MUL VL] + WORD $0x85835446 // ldr z6, [x2, #29, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85835845 // ldr z5, [x2, #30, MUL VL] + WORD $0x85835c46 // ldr z6, [x2, #31, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x4_store + + // Load and process 32 bytes from input 4 to 4 outputs + WORD $0x85804127 // ldr z7, [x9] + WORD $0x04295029 // addvl x9, x9, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85844045 // ldr z5, [x2, #32, MUL VL] + WORD $0x85844446 // ldr z6, [x2, #33, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85844845 // ldr z5, [x2, #34, MUL VL] + WORD $0x85844c46 // ldr z6, [x2, #35, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85845045 // ldr z5, [x2, #36, MUL VL] + WORD $0x85845446 // ldr z6, [x2, #37, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85845845 // ldr z5, [x2, #38, MUL VL] + WORD $0x85845c46 // ldr z6, [x2, #39, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x4_store + + // Load and process 32 bytes from input 5 to 4 outputs + WORD $0x85804147 // ldr z7, [x10] + WORD $0x042a502a // addvl x10, x10, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85854045 // ldr z5, [x2, #40, MUL VL] + WORD $0x85854446 // ldr z6, [x2, #41, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85854845 // ldr z5, [x2, #42, MUL VL] + WORD $0x85854c46 // ldr z6, [x2, #43, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85855045 // ldr z5, [x2, #44, MUL VL] + WORD $0x85855446 // ldr z6, [x2, #45, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85855845 // ldr z5, [x2, #46, MUL VL] + WORD $0x85855c46 // ldr z6, [x2, #47, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x4_store + + // Load and process 32 bytes from input 6 to 4 outputs + WORD $0x85804167 // ldr z7, [x11] + WORD $0x042b502b // addvl x11, x11, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85864045 // ldr z5, [x2, #48, MUL VL] + WORD $0x85864446 // ldr z6, [x2, #49, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85864845 // ldr z5, [x2, #50, MUL VL] + WORD $0x85864c46 // ldr z6, [x2, #51, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85865045 // ldr z5, [x2, #52, MUL VL] + WORD $0x85865446 // ldr z6, [x2, #53, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85865845 // ldr z5, [x2, #54, MUL VL] + WORD $0x85865c46 // ldr z6, [x2, #55, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x4_store + + // Load and process 32 bytes from input 7 to 4 outputs + WORD $0x85804187 // ldr z7, [x12] + WORD $0x042c502c // addvl x12, x12, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85874045 // ldr z5, [x2, #56, MUL VL] + WORD $0x85874446 // ldr z6, [x2, #57, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85874845 // ldr z5, [x2, #58, MUL VL] + WORD $0x85874c46 // ldr z6, [x2, #59, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85875045 // ldr z5, [x2, #60, MUL VL] + WORD $0x85875446 // ldr z6, [x2, #61, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85875845 // ldr z5, [x2, #62, MUL VL] + WORD $0x85875c46 // ldr z6, [x2, #63, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x4_store + + // Load and process 32 bytes from input 8 to 4 outputs + WORD $0x858041a7 // ldr z7, [x13] + WORD $0x042d502d // addvl x13, x13, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85884045 // ldr z5, [x2, #64, MUL VL] + WORD $0x85884446 // ldr z6, [x2, #65, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85884845 // ldr z5, [x2, #66, MUL VL] + WORD $0x85884c46 // ldr z6, [x2, #67, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85885045 // ldr z5, [x2, #68, MUL VL] + WORD $0x85885446 // ldr z6, [x2, #69, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85885845 // ldr z5, [x2, #70, MUL VL] + WORD $0x85885c46 // ldr z6, [x2, #71, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x4_store + + // Load and process 32 bytes from input 9 to 4 outputs + WORD $0x85804067 // ldr z7, [x3] + WORD $0x04235023 // addvl x3, x3, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85894045 // ldr z5, [x2, #72, MUL VL] + WORD $0x85894446 // ldr z6, [x2, #73, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85894845 // ldr z5, [x2, #74, MUL VL] + WORD $0x85894c46 // ldr z6, [x2, #75, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85895045 // ldr z5, [x2, #76, MUL VL] + WORD $0x85895446 // ldr z6, [x2, #77, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85895845 // ldr z5, [x2, #78, MUL VL] + WORD $0x85895c46 // ldr z6, [x2, #79, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + +mulSve_10x4_store: + // Store 4 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x8b1101ef // add x15, x15, x17 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x4_loop + +mulSve_10x4_end: + RET + +// func mulSve_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x4Xor(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 89 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x4Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c4 // mov z4.d, x6 + WORD $0x05212084 // dup z4.b, z4.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 + +mulSve_10x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + WORD $0x85804027 // ldr z7, [x1] + WORD $0x04215021 // addvl x1, x1, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + MOVD (R14), R6 + WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804045 // ldr z5, [x2] + WORD $0x85804446 // ldr z6, [x2, #1, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + MOVD 24(R14), R6 + WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804845 // ldr z5, [x2, #2, MUL VL] + WORD $0x85804c46 // ldr z6, [x2, #3, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + MOVD 48(R14), R6 + WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805045 // ldr z5, [x2, #4, MUL VL] + WORD $0x85805446 // ldr z6, [x2, #5, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + MOVD 72(R14), R6 + WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805845 // ldr z5, [x2, #6, MUL VL] + WORD $0x85805c46 // ldr z6, [x2, #7, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 1 to 4 outputs + WORD $0x85804087 // ldr z7, [x4] + WORD $0x04245024 // addvl x4, x4, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85814045 // ldr z5, [x2, #8, MUL VL] + WORD $0x85814446 // ldr z6, [x2, #9, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85814845 // ldr z5, [x2, #10, MUL VL] + WORD $0x85814c46 // ldr z6, [x2, #11, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85815045 // ldr z5, [x2, #12, MUL VL] + WORD $0x85815446 // ldr z6, [x2, #13, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85815845 // ldr z5, [x2, #14, MUL VL] + WORD $0x85815c46 // ldr z6, [x2, #15, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 2 to 4 outputs + WORD $0x858040a7 // ldr z7, [x5] + WORD $0x04255025 // addvl x5, x5, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85824045 // ldr z5, [x2, #16, MUL VL] + WORD $0x85824446 // ldr z6, [x2, #17, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85824845 // ldr z5, [x2, #18, MUL VL] + WORD $0x85824c46 // ldr z6, [x2, #19, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85825045 // ldr z5, [x2, #20, MUL VL] + WORD $0x85825446 // ldr z6, [x2, #21, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85825845 // ldr z5, [x2, #22, MUL VL] + WORD $0x85825c46 // ldr z6, [x2, #23, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 3 to 4 outputs + WORD $0x85804107 // ldr z7, [x8] + WORD $0x04285028 // addvl x8, x8, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85834045 // ldr z5, [x2, #24, MUL VL] + WORD $0x85834446 // ldr z6, [x2, #25, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85834845 // ldr z5, [x2, #26, MUL VL] + WORD $0x85834c46 // ldr z6, [x2, #27, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85835045 // ldr z5, [x2, #28, MUL VL] + WORD $0x85835446 // ldr z6, [x2, #29, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85835845 // ldr z5, [x2, #30, MUL VL] + WORD $0x85835c46 // ldr z6, [x2, #31, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 4 to 4 outputs + WORD $0x85804127 // ldr z7, [x9] + WORD $0x04295029 // addvl x9, x9, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85844045 // ldr z5, [x2, #32, MUL VL] + WORD $0x85844446 // ldr z6, [x2, #33, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85844845 // ldr z5, [x2, #34, MUL VL] + WORD $0x85844c46 // ldr z6, [x2, #35, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85845045 // ldr z5, [x2, #36, MUL VL] + WORD $0x85845446 // ldr z6, [x2, #37, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85845845 // ldr z5, [x2, #38, MUL VL] + WORD $0x85845c46 // ldr z6, [x2, #39, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 5 to 4 outputs + WORD $0x85804147 // ldr z7, [x10] + WORD $0x042a502a // addvl x10, x10, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85854045 // ldr z5, [x2, #40, MUL VL] + WORD $0x85854446 // ldr z6, [x2, #41, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85854845 // ldr z5, [x2, #42, MUL VL] + WORD $0x85854c46 // ldr z6, [x2, #43, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85855045 // ldr z5, [x2, #44, MUL VL] + WORD $0x85855446 // ldr z6, [x2, #45, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85855845 // ldr z5, [x2, #46, MUL VL] + WORD $0x85855c46 // ldr z6, [x2, #47, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 6 to 4 outputs + WORD $0x85804167 // ldr z7, [x11] + WORD $0x042b502b // addvl x11, x11, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85864045 // ldr z5, [x2, #48, MUL VL] + WORD $0x85864446 // ldr z6, [x2, #49, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85864845 // ldr z5, [x2, #50, MUL VL] + WORD $0x85864c46 // ldr z6, [x2, #51, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85865045 // ldr z5, [x2, #52, MUL VL] + WORD $0x85865446 // ldr z6, [x2, #53, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85865845 // ldr z5, [x2, #54, MUL VL] + WORD $0x85865c46 // ldr z6, [x2, #55, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 7 to 4 outputs + WORD $0x85804187 // ldr z7, [x12] + WORD $0x042c502c // addvl x12, x12, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85874045 // ldr z5, [x2, #56, MUL VL] + WORD $0x85874446 // ldr z6, [x2, #57, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85874845 // ldr z5, [x2, #58, MUL VL] + WORD $0x85874c46 // ldr z6, [x2, #59, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85875045 // ldr z5, [x2, #60, MUL VL] + WORD $0x85875446 // ldr z6, [x2, #61, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85875845 // ldr z5, [x2, #62, MUL VL] + WORD $0x85875c46 // ldr z6, [x2, #63, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 8 to 4 outputs + WORD $0x858041a7 // ldr z7, [x13] + WORD $0x042d502d // addvl x13, x13, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85884045 // ldr z5, [x2, #64, MUL VL] + WORD $0x85884446 // ldr z6, [x2, #65, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85884845 // ldr z5, [x2, #66, MUL VL] + WORD $0x85884c46 // ldr z6, [x2, #67, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85885045 // ldr z5, [x2, #68, MUL VL] + WORD $0x85885446 // ldr z6, [x2, #69, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85885845 // ldr z5, [x2, #70, MUL VL] + WORD $0x85885c46 // ldr z6, [x2, #71, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 9 to 4 outputs + WORD $0x85804067 // ldr z7, [x3] + WORD $0x04235023 // addvl x3, x3, #1 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85894045 // ldr z5, [x2, #72, MUL VL] + WORD $0x85894446 // ldr z6, [x2, #73, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85894845 // ldr z5, [x2, #74, MUL VL] + WORD $0x85894c46 // ldr z6, [x2, #75, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85895045 // ldr z5, [x2, #76, MUL VL] + WORD $0x85895446 // ldr z6, [x2, #77, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85895845 // ldr z5, [x2, #78, MUL VL] + WORD $0x85895c46 // ldr z6, [x2, #79, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + +mulSve_10x4Xor_store: + // Store 4 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x8b1101ef // add x15, x15, x17 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x4Xor_loop + +mulSve_10x4Xor_end: + RET + +// func mulSve_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x5(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 110 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x5_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c5 // mov z5.d, x6 + WORD $0x052120a5 // dup z5.b, z5.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 + +mulSve_10x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + WORD $0x85804028 // ldr z8, [x1] + WORD $0x04215021 // addvl x1, x1, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85804046 // ldr z6, [x2] + WORD $0x85804447 // ldr z7, [x2, #1, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a630e0 // eor z0.d, z7.d, z6.d + WORD $0x85804846 // ldr z6, [x2, #2, MUL VL] + WORD $0x85804c47 // ldr z7, [x2, #3, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a630e1 // eor z1.d, z7.d, z6.d + WORD $0x85805046 // ldr z6, [x2, #4, MUL VL] + WORD $0x85805447 // ldr z7, [x2, #5, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a630e2 // eor z2.d, z7.d, z6.d + WORD $0x85805846 // ldr z6, [x2, #6, MUL VL] + WORD $0x85805c47 // ldr z7, [x2, #7, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a630e3 // eor z3.d, z7.d, z6.d + WORD $0x85814046 // ldr z6, [x2, #8, MUL VL] + WORD $0x85814447 // ldr z7, [x2, #9, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a630e4 // eor z4.d, z7.d, z6.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 1 to 5 outputs + WORD $0x85804088 // ldr z8, [x4] + WORD $0x04245024 // addvl x4, x4, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85814846 // ldr z6, [x2, #10, MUL VL] + WORD $0x85814c47 // ldr z7, [x2, #11, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85815046 // ldr z6, [x2, #12, MUL VL] + WORD $0x85815447 // ldr z7, [x2, #13, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85815846 // ldr z6, [x2, #14, MUL VL] + WORD $0x85815c47 // ldr z7, [x2, #15, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85824046 // ldr z6, [x2, #16, MUL VL] + WORD $0x85824447 // ldr z7, [x2, #17, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85824846 // ldr z6, [x2, #18, MUL VL] + WORD $0x85824c47 // ldr z7, [x2, #19, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 2 to 5 outputs + WORD $0x858040a8 // ldr z8, [x5] + WORD $0x04255025 // addvl x5, x5, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85825046 // ldr z6, [x2, #20, MUL VL] + WORD $0x85825447 // ldr z7, [x2, #21, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85825846 // ldr z6, [x2, #22, MUL VL] + WORD $0x85825c47 // ldr z7, [x2, #23, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85834046 // ldr z6, [x2, #24, MUL VL] + WORD $0x85834447 // ldr z7, [x2, #25, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85834846 // ldr z6, [x2, #26, MUL VL] + WORD $0x85834c47 // ldr z7, [x2, #27, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85835046 // ldr z6, [x2, #28, MUL VL] + WORD $0x85835447 // ldr z7, [x2, #29, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 3 to 5 outputs + WORD $0x85804108 // ldr z8, [x8] + WORD $0x04285028 // addvl x8, x8, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85835846 // ldr z6, [x2, #30, MUL VL] + WORD $0x85835c47 // ldr z7, [x2, #31, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85844046 // ldr z6, [x2, #32, MUL VL] + WORD $0x85844447 // ldr z7, [x2, #33, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85844846 // ldr z6, [x2, #34, MUL VL] + WORD $0x85844c47 // ldr z7, [x2, #35, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85845046 // ldr z6, [x2, #36, MUL VL] + WORD $0x85845447 // ldr z7, [x2, #37, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85845846 // ldr z6, [x2, #38, MUL VL] + WORD $0x85845c47 // ldr z7, [x2, #39, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 4 to 5 outputs + WORD $0x85804128 // ldr z8, [x9] + WORD $0x04295029 // addvl x9, x9, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85854046 // ldr z6, [x2, #40, MUL VL] + WORD $0x85854447 // ldr z7, [x2, #41, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85854846 // ldr z6, [x2, #42, MUL VL] + WORD $0x85854c47 // ldr z7, [x2, #43, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85855046 // ldr z6, [x2, #44, MUL VL] + WORD $0x85855447 // ldr z7, [x2, #45, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85855846 // ldr z6, [x2, #46, MUL VL] + WORD $0x85855c47 // ldr z7, [x2, #47, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85864046 // ldr z6, [x2, #48, MUL VL] + WORD $0x85864447 // ldr z7, [x2, #49, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 5 to 5 outputs + WORD $0x85804148 // ldr z8, [x10] + WORD $0x042a502a // addvl x10, x10, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85864846 // ldr z6, [x2, #50, MUL VL] + WORD $0x85864c47 // ldr z7, [x2, #51, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85865046 // ldr z6, [x2, #52, MUL VL] + WORD $0x85865447 // ldr z7, [x2, #53, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85865846 // ldr z6, [x2, #54, MUL VL] + WORD $0x85865c47 // ldr z7, [x2, #55, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85874046 // ldr z6, [x2, #56, MUL VL] + WORD $0x85874447 // ldr z7, [x2, #57, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85874846 // ldr z6, [x2, #58, MUL VL] + WORD $0x85874c47 // ldr z7, [x2, #59, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 6 to 5 outputs + WORD $0x85804168 // ldr z8, [x11] + WORD $0x042b502b // addvl x11, x11, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85875046 // ldr z6, [x2, #60, MUL VL] + WORD $0x85875447 // ldr z7, [x2, #61, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85875846 // ldr z6, [x2, #62, MUL VL] + WORD $0x85875c47 // ldr z7, [x2, #63, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85884046 // ldr z6, [x2, #64, MUL VL] + WORD $0x85884447 // ldr z7, [x2, #65, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85884846 // ldr z6, [x2, #66, MUL VL] + WORD $0x85884c47 // ldr z7, [x2, #67, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85885046 // ldr z6, [x2, #68, MUL VL] + WORD $0x85885447 // ldr z7, [x2, #69, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 7 to 5 outputs + WORD $0x85804188 // ldr z8, [x12] + WORD $0x042c502c // addvl x12, x12, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85885846 // ldr z6, [x2, #70, MUL VL] + WORD $0x85885c47 // ldr z7, [x2, #71, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85894046 // ldr z6, [x2, #72, MUL VL] + WORD $0x85894447 // ldr z7, [x2, #73, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85894846 // ldr z6, [x2, #74, MUL VL] + WORD $0x85894c47 // ldr z7, [x2, #75, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85895046 // ldr z6, [x2, #76, MUL VL] + WORD $0x85895447 // ldr z7, [x2, #77, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85895846 // ldr z6, [x2, #78, MUL VL] + WORD $0x85895c47 // ldr z7, [x2, #79, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 8 to 5 outputs + WORD $0x858041a8 // ldr z8, [x13] + WORD $0x042d502d // addvl x13, x13, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x858a4046 // ldr z6, [x2, #80, MUL VL] + WORD $0x858a4447 // ldr z7, [x2, #81, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x858a4846 // ldr z6, [x2, #82, MUL VL] + WORD $0x858a4c47 // ldr z7, [x2, #83, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x858a5046 // ldr z6, [x2, #84, MUL VL] + WORD $0x858a5447 // ldr z7, [x2, #85, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x858a5846 // ldr z6, [x2, #86, MUL VL] + WORD $0x858a5c47 // ldr z7, [x2, #87, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x858b4046 // ldr z6, [x2, #88, MUL VL] + WORD $0x858b4447 // ldr z7, [x2, #89, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 9 to 5 outputs + WORD $0x85804068 // ldr z8, [x3] + WORD $0x04235023 // addvl x3, x3, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x858b4846 // ldr z6, [x2, #90, MUL VL] + WORD $0x858b4c47 // ldr z7, [x2, #91, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x858b5046 // ldr z6, [x2, #92, MUL VL] + WORD $0x858b5447 // ldr z7, [x2, #93, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x858b5846 // ldr z6, [x2, #94, MUL VL] + WORD $0x858b5c47 // ldr z7, [x2, #95, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x858c4046 // ldr z6, [x2, #96, MUL VL] + WORD $0x858c4447 // ldr z7, [x2, #97, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x858c4846 // ldr z6, [x2, #98, MUL VL] + WORD $0x858c4c47 // ldr z7, [x2, #99, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + +mulSve_10x5_store: + // Store 5 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x8b1101ef // add x15, x15, x17 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x5_loop + +mulSve_10x5_end: + RET + +// func mulSve_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x5Xor(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 110 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x5Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c5 // mov z5.d, x6 + WORD $0x052120a5 // dup z5.b, z5.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 + +mulSve_10x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + WORD $0x85804028 // ldr z8, [x1] + WORD $0x04215021 // addvl x1, x1, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + MOVD (R14), R6 + WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804046 // ldr z6, [x2] + WORD $0x85804447 // ldr z7, [x2, #1, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + MOVD 24(R14), R6 + WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804846 // ldr z6, [x2, #2, MUL VL] + WORD $0x85804c47 // ldr z7, [x2, #3, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + MOVD 48(R14), R6 + WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805046 // ldr z6, [x2, #4, MUL VL] + WORD $0x85805447 // ldr z7, [x2, #5, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + MOVD 72(R14), R6 + WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805846 // ldr z6, [x2, #6, MUL VL] + WORD $0x85805c47 // ldr z7, [x2, #7, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + MOVD 96(R14), R6 + WORD $0xa5ef40c4 // ld1d { z4.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85814046 // ldr z6, [x2, #8, MUL VL] + WORD $0x85814447 // ldr z7, [x2, #9, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from input 1 to 5 outputs + WORD $0x85804088 // ldr z8, [x4] + WORD $0x04245024 // addvl x4, x4, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85814846 // ldr z6, [x2, #10, MUL VL] + WORD $0x85814c47 // ldr z7, [x2, #11, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85815046 // ldr z6, [x2, #12, MUL VL] + WORD $0x85815447 // ldr z7, [x2, #13, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85815846 // ldr z6, [x2, #14, MUL VL] + WORD $0x85815c47 // ldr z7, [x2, #15, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85824046 // ldr z6, [x2, #16, MUL VL] + WORD $0x85824447 // ldr z7, [x2, #17, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85824846 // ldr z6, [x2, #18, MUL VL] + WORD $0x85824c47 // ldr z7, [x2, #19, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from input 2 to 5 outputs + WORD $0x858040a8 // ldr z8, [x5] + WORD $0x04255025 // addvl x5, x5, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85825046 // ldr z6, [x2, #20, MUL VL] + WORD $0x85825447 // ldr z7, [x2, #21, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85825846 // ldr z6, [x2, #22, MUL VL] + WORD $0x85825c47 // ldr z7, [x2, #23, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85834046 // ldr z6, [x2, #24, MUL VL] + WORD $0x85834447 // ldr z7, [x2, #25, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85834846 // ldr z6, [x2, #26, MUL VL] + WORD $0x85834c47 // ldr z7, [x2, #27, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85835046 // ldr z6, [x2, #28, MUL VL] + WORD $0x85835447 // ldr z7, [x2, #29, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from input 3 to 5 outputs + WORD $0x85804108 // ldr z8, [x8] + WORD $0x04285028 // addvl x8, x8, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85835846 // ldr z6, [x2, #30, MUL VL] + WORD $0x85835c47 // ldr z7, [x2, #31, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85844046 // ldr z6, [x2, #32, MUL VL] + WORD $0x85844447 // ldr z7, [x2, #33, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85844846 // ldr z6, [x2, #34, MUL VL] + WORD $0x85844c47 // ldr z7, [x2, #35, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85845046 // ldr z6, [x2, #36, MUL VL] + WORD $0x85845447 // ldr z7, [x2, #37, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85845846 // ldr z6, [x2, #38, MUL VL] + WORD $0x85845c47 // ldr z7, [x2, #39, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from input 4 to 5 outputs + WORD $0x85804128 // ldr z8, [x9] + WORD $0x04295029 // addvl x9, x9, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85854046 // ldr z6, [x2, #40, MUL VL] + WORD $0x85854447 // ldr z7, [x2, #41, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85854846 // ldr z6, [x2, #42, MUL VL] + WORD $0x85854c47 // ldr z7, [x2, #43, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85855046 // ldr z6, [x2, #44, MUL VL] + WORD $0x85855447 // ldr z7, [x2, #45, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85855846 // ldr z6, [x2, #46, MUL VL] + WORD $0x85855c47 // ldr z7, [x2, #47, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85864046 // ldr z6, [x2, #48, MUL VL] + WORD $0x85864447 // ldr z7, [x2, #49, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from input 5 to 5 outputs + WORD $0x85804148 // ldr z8, [x10] + WORD $0x042a502a // addvl x10, x10, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85864846 // ldr z6, [x2, #50, MUL VL] + WORD $0x85864c47 // ldr z7, [x2, #51, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85865046 // ldr z6, [x2, #52, MUL VL] + WORD $0x85865447 // ldr z7, [x2, #53, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85865846 // ldr z6, [x2, #54, MUL VL] + WORD $0x85865c47 // ldr z7, [x2, #55, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85874046 // ldr z6, [x2, #56, MUL VL] + WORD $0x85874447 // ldr z7, [x2, #57, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85874846 // ldr z6, [x2, #58, MUL VL] + WORD $0x85874c47 // ldr z7, [x2, #59, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from input 6 to 5 outputs + WORD $0x85804168 // ldr z8, [x11] + WORD $0x042b502b // addvl x11, x11, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85875046 // ldr z6, [x2, #60, MUL VL] + WORD $0x85875447 // ldr z7, [x2, #61, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85875846 // ldr z6, [x2, #62, MUL VL] + WORD $0x85875c47 // ldr z7, [x2, #63, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85884046 // ldr z6, [x2, #64, MUL VL] + WORD $0x85884447 // ldr z7, [x2, #65, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85884846 // ldr z6, [x2, #66, MUL VL] + WORD $0x85884c47 // ldr z7, [x2, #67, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85885046 // ldr z6, [x2, #68, MUL VL] + WORD $0x85885447 // ldr z7, [x2, #69, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from input 7 to 5 outputs + WORD $0x85804188 // ldr z8, [x12] + WORD $0x042c502c // addvl x12, x12, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85885846 // ldr z6, [x2, #70, MUL VL] + WORD $0x85885c47 // ldr z7, [x2, #71, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85894046 // ldr z6, [x2, #72, MUL VL] + WORD $0x85894447 // ldr z7, [x2, #73, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85894846 // ldr z6, [x2, #74, MUL VL] + WORD $0x85894c47 // ldr z7, [x2, #75, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85895046 // ldr z6, [x2, #76, MUL VL] + WORD $0x85895447 // ldr z7, [x2, #77, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85895846 // ldr z6, [x2, #78, MUL VL] + WORD $0x85895c47 // ldr z7, [x2, #79, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from input 8 to 5 outputs + WORD $0x858041a8 // ldr z8, [x13] + WORD $0x042d502d // addvl x13, x13, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x858a4046 // ldr z6, [x2, #80, MUL VL] + WORD $0x858a4447 // ldr z7, [x2, #81, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x858a4846 // ldr z6, [x2, #82, MUL VL] + WORD $0x858a4c47 // ldr z7, [x2, #83, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x858a5046 // ldr z6, [x2, #84, MUL VL] + WORD $0x858a5447 // ldr z7, [x2, #85, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x858a5846 // ldr z6, [x2, #86, MUL VL] + WORD $0x858a5c47 // ldr z7, [x2, #87, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x858b4046 // ldr z6, [x2, #88, MUL VL] + WORD $0x858b4447 // ldr z7, [x2, #89, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from input 9 to 5 outputs + WORD $0x85804068 // ldr z8, [x3] + WORD $0x04235023 // addvl x3, x3, #1 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x858b4846 // ldr z6, [x2, #90, MUL VL] + WORD $0x858b4c47 // ldr z7, [x2, #91, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x858b5046 // ldr z6, [x2, #92, MUL VL] + WORD $0x858b5447 // ldr z7, [x2, #93, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x858b5846 // ldr z6, [x2, #94, MUL VL] + WORD $0x858b5c47 // ldr z7, [x2, #95, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x858c4046 // ldr z6, [x2, #96, MUL VL] + WORD $0x858c4447 // ldr z7, [x2, #97, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x858c4846 // ldr z6, [x2, #98, MUL VL] + WORD $0x858c4c47 // ldr z7, [x2, #99, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + +mulSve_10x5Xor_store: + // Store 5 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x8b1101ef // add x15, x15, x17 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x5Xor_loop + +mulSve_10x5Xor_end: + RET + +// func mulSve_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x6(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 131 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x6_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c6 // mov z6.d, x6 + WORD $0x052120c6 // dup z6.b, z6.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 + +mulSve_10x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + WORD $0x85804029 // ldr z9, [x1] + WORD $0x04215021 // addvl x1, x1, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85804047 // ldr z7, [x2] + WORD $0x85804448 // ldr z8, [x2, #1, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73100 // eor z0.d, z8.d, z7.d + WORD $0x85804847 // ldr z7, [x2, #2, MUL VL] + WORD $0x85804c48 // ldr z8, [x2, #3, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73101 // eor z1.d, z8.d, z7.d + WORD $0x85805047 // ldr z7, [x2, #4, MUL VL] + WORD $0x85805448 // ldr z8, [x2, #5, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73102 // eor z2.d, z8.d, z7.d + WORD $0x85805847 // ldr z7, [x2, #6, MUL VL] + WORD $0x85805c48 // ldr z8, [x2, #7, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73103 // eor z3.d, z8.d, z7.d + WORD $0x85814047 // ldr z7, [x2, #8, MUL VL] + WORD $0x85814448 // ldr z8, [x2, #9, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73104 // eor z4.d, z8.d, z7.d + WORD $0x85814847 // ldr z7, [x2, #10, MUL VL] + WORD $0x85814c48 // ldr z8, [x2, #11, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73105 // eor z5.d, z8.d, z7.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x6_store + + // Load and process 32 bytes from input 1 to 6 outputs + WORD $0x85804089 // ldr z9, [x4] + WORD $0x04245024 // addvl x4, x4, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85815047 // ldr z7, [x2, #12, MUL VL] + WORD $0x85815448 // ldr z8, [x2, #13, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85815847 // ldr z7, [x2, #14, MUL VL] + WORD $0x85815c48 // ldr z8, [x2, #15, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85824047 // ldr z7, [x2, #16, MUL VL] + WORD $0x85824448 // ldr z8, [x2, #17, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85824847 // ldr z7, [x2, #18, MUL VL] + WORD $0x85824c48 // ldr z8, [x2, #19, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85825047 // ldr z7, [x2, #20, MUL VL] + WORD $0x85825448 // ldr z8, [x2, #21, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x85825847 // ldr z7, [x2, #22, MUL VL] + WORD $0x85825c48 // ldr z8, [x2, #23, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x6_store + + // Load and process 32 bytes from input 2 to 6 outputs + WORD $0x858040a9 // ldr z9, [x5] + WORD $0x04255025 // addvl x5, x5, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85834047 // ldr z7, [x2, #24, MUL VL] + WORD $0x85834448 // ldr z8, [x2, #25, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85834847 // ldr z7, [x2, #26, MUL VL] + WORD $0x85834c48 // ldr z8, [x2, #27, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85835047 // ldr z7, [x2, #28, MUL VL] + WORD $0x85835448 // ldr z8, [x2, #29, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85835847 // ldr z7, [x2, #30, MUL VL] + WORD $0x85835c48 // ldr z8, [x2, #31, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85844047 // ldr z7, [x2, #32, MUL VL] + WORD $0x85844448 // ldr z8, [x2, #33, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x85844847 // ldr z7, [x2, #34, MUL VL] + WORD $0x85844c48 // ldr z8, [x2, #35, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x6_store + + // Load and process 32 bytes from input 3 to 6 outputs + WORD $0x85804109 // ldr z9, [x8] + WORD $0x04285028 // addvl x8, x8, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85845047 // ldr z7, [x2, #36, MUL VL] + WORD $0x85845448 // ldr z8, [x2, #37, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85845847 // ldr z7, [x2, #38, MUL VL] + WORD $0x85845c48 // ldr z8, [x2, #39, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85854047 // ldr z7, [x2, #40, MUL VL] + WORD $0x85854448 // ldr z8, [x2, #41, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85854847 // ldr z7, [x2, #42, MUL VL] + WORD $0x85854c48 // ldr z8, [x2, #43, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85855047 // ldr z7, [x2, #44, MUL VL] + WORD $0x85855448 // ldr z8, [x2, #45, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x85855847 // ldr z7, [x2, #46, MUL VL] + WORD $0x85855c48 // ldr z8, [x2, #47, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x6_store + + // Load and process 32 bytes from input 4 to 6 outputs + WORD $0x85804129 // ldr z9, [x9] + WORD $0x04295029 // addvl x9, x9, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85864047 // ldr z7, [x2, #48, MUL VL] + WORD $0x85864448 // ldr z8, [x2, #49, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85864847 // ldr z7, [x2, #50, MUL VL] + WORD $0x85864c48 // ldr z8, [x2, #51, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85865047 // ldr z7, [x2, #52, MUL VL] + WORD $0x85865448 // ldr z8, [x2, #53, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85865847 // ldr z7, [x2, #54, MUL VL] + WORD $0x85865c48 // ldr z8, [x2, #55, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85874047 // ldr z7, [x2, #56, MUL VL] + WORD $0x85874448 // ldr z8, [x2, #57, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x85874847 // ldr z7, [x2, #58, MUL VL] + WORD $0x85874c48 // ldr z8, [x2, #59, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x6_store + + // Load and process 32 bytes from input 5 to 6 outputs + WORD $0x85804149 // ldr z9, [x10] + WORD $0x042a502a // addvl x10, x10, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85875047 // ldr z7, [x2, #60, MUL VL] + WORD $0x85875448 // ldr z8, [x2, #61, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85875847 // ldr z7, [x2, #62, MUL VL] + WORD $0x85875c48 // ldr z8, [x2, #63, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85884047 // ldr z7, [x2, #64, MUL VL] + WORD $0x85884448 // ldr z8, [x2, #65, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85884847 // ldr z7, [x2, #66, MUL VL] + WORD $0x85884c48 // ldr z8, [x2, #67, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85885047 // ldr z7, [x2, #68, MUL VL] + WORD $0x85885448 // ldr z8, [x2, #69, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x85885847 // ldr z7, [x2, #70, MUL VL] + WORD $0x85885c48 // ldr z8, [x2, #71, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x6_store + + // Load and process 32 bytes from input 6 to 6 outputs + WORD $0x85804169 // ldr z9, [x11] + WORD $0x042b502b // addvl x11, x11, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85894047 // ldr z7, [x2, #72, MUL VL] + WORD $0x85894448 // ldr z8, [x2, #73, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85894847 // ldr z7, [x2, #74, MUL VL] + WORD $0x85894c48 // ldr z8, [x2, #75, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85895047 // ldr z7, [x2, #76, MUL VL] + WORD $0x85895448 // ldr z8, [x2, #77, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85895847 // ldr z7, [x2, #78, MUL VL] + WORD $0x85895c48 // ldr z8, [x2, #79, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x858a4047 // ldr z7, [x2, #80, MUL VL] + WORD $0x858a4448 // ldr z8, [x2, #81, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x858a4847 // ldr z7, [x2, #82, MUL VL] + WORD $0x858a4c48 // ldr z8, [x2, #83, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x6_store + + // Load and process 32 bytes from input 7 to 6 outputs + WORD $0x85804189 // ldr z9, [x12] + WORD $0x042c502c // addvl x12, x12, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x858a5047 // ldr z7, [x2, #84, MUL VL] + WORD $0x858a5448 // ldr z8, [x2, #85, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x858a5847 // ldr z7, [x2, #86, MUL VL] + WORD $0x858a5c48 // ldr z8, [x2, #87, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x858b4047 // ldr z7, [x2, #88, MUL VL] + WORD $0x858b4448 // ldr z8, [x2, #89, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x858b4847 // ldr z7, [x2, #90, MUL VL] + WORD $0x858b4c48 // ldr z8, [x2, #91, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x858b5047 // ldr z7, [x2, #92, MUL VL] + WORD $0x858b5448 // ldr z8, [x2, #93, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x858b5847 // ldr z7, [x2, #94, MUL VL] + WORD $0x858b5c48 // ldr z8, [x2, #95, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x6_store + + // Load and process 32 bytes from input 8 to 6 outputs + WORD $0x858041a9 // ldr z9, [x13] + WORD $0x042d502d // addvl x13, x13, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x858c4047 // ldr z7, [x2, #96, MUL VL] + WORD $0x858c4448 // ldr z8, [x2, #97, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x858c4847 // ldr z7, [x2, #98, MUL VL] + WORD $0x858c4c48 // ldr z8, [x2, #99, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x858c5047 // ldr z7, [x2, #100, MUL VL] + WORD $0x858c5448 // ldr z8, [x2, #101, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x858c5847 // ldr z7, [x2, #102, MUL VL] + WORD $0x858c5c48 // ldr z8, [x2, #103, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x858d4047 // ldr z7, [x2, #104, MUL VL] + WORD $0x858d4448 // ldr z8, [x2, #105, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x858d4847 // ldr z7, [x2, #106, MUL VL] + WORD $0x858d4c48 // ldr z8, [x2, #107, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x6_store + + // Load and process 32 bytes from input 9 to 6 outputs + WORD $0x85804069 // ldr z9, [x3] + WORD $0x04235023 // addvl x3, x3, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x858d5047 // ldr z7, [x2, #108, MUL VL] + WORD $0x858d5448 // ldr z8, [x2, #109, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x858d5847 // ldr z7, [x2, #110, MUL VL] + WORD $0x858d5c48 // ldr z8, [x2, #111, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x858e4047 // ldr z7, [x2, #112, MUL VL] + WORD $0x858e4448 // ldr z8, [x2, #113, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x858e4847 // ldr z7, [x2, #114, MUL VL] + WORD $0x858e4c48 // ldr z8, [x2, #115, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x858e5047 // ldr z7, [x2, #116, MUL VL] + WORD $0x858e5448 // ldr z8, [x2, #117, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x858e5847 // ldr z7, [x2, #118, MUL VL] + WORD $0x858e5c48 // ldr z8, [x2, #119, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + +mulSve_10x6_store: + // Store 6 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x8b1101ef // add x15, x15, x17 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x6_loop + +mulSve_10x6_end: + RET + +// func mulSve_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x6Xor(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 131 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x6Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c6 // mov z6.d, x6 + WORD $0x052120c6 // dup z6.b, z6.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 + +mulSve_10x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + WORD $0x85804029 // ldr z9, [x1] + WORD $0x04215021 // addvl x1, x1, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + MOVD (R14), R6 + WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804047 // ldr z7, [x2] + WORD $0x85804448 // ldr z8, [x2, #1, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + MOVD 24(R14), R6 + WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804847 // ldr z7, [x2, #2, MUL VL] + WORD $0x85804c48 // ldr z8, [x2, #3, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + MOVD 48(R14), R6 + WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805047 // ldr z7, [x2, #4, MUL VL] + WORD $0x85805448 // ldr z8, [x2, #5, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + MOVD 72(R14), R6 + WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805847 // ldr z7, [x2, #6, MUL VL] + WORD $0x85805c48 // ldr z8, [x2, #7, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + MOVD 96(R14), R6 + WORD $0xa5ef40c4 // ld1d { z4.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85814047 // ldr z7, [x2, #8, MUL VL] + WORD $0x85814448 // ldr z8, [x2, #9, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + MOVD 120(R14), R6 + WORD $0xa5ef40c5 // ld1d { z5.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85814847 // ldr z7, [x2, #10, MUL VL] + WORD $0x85814c48 // ldr z8, [x2, #11, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x6Xor_store + + // Load and process 32 bytes from input 1 to 6 outputs + WORD $0x85804089 // ldr z9, [x4] + WORD $0x04245024 // addvl x4, x4, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85815047 // ldr z7, [x2, #12, MUL VL] + WORD $0x85815448 // ldr z8, [x2, #13, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85815847 // ldr z7, [x2, #14, MUL VL] + WORD $0x85815c48 // ldr z8, [x2, #15, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85824047 // ldr z7, [x2, #16, MUL VL] + WORD $0x85824448 // ldr z8, [x2, #17, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85824847 // ldr z7, [x2, #18, MUL VL] + WORD $0x85824c48 // ldr z8, [x2, #19, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85825047 // ldr z7, [x2, #20, MUL VL] + WORD $0x85825448 // ldr z8, [x2, #21, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x85825847 // ldr z7, [x2, #22, MUL VL] + WORD $0x85825c48 // ldr z8, [x2, #23, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x6Xor_store + + // Load and process 32 bytes from input 2 to 6 outputs + WORD $0x858040a9 // ldr z9, [x5] + WORD $0x04255025 // addvl x5, x5, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85834047 // ldr z7, [x2, #24, MUL VL] + WORD $0x85834448 // ldr z8, [x2, #25, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85834847 // ldr z7, [x2, #26, MUL VL] + WORD $0x85834c48 // ldr z8, [x2, #27, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85835047 // ldr z7, [x2, #28, MUL VL] + WORD $0x85835448 // ldr z8, [x2, #29, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85835847 // ldr z7, [x2, #30, MUL VL] + WORD $0x85835c48 // ldr z8, [x2, #31, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85844047 // ldr z7, [x2, #32, MUL VL] + WORD $0x85844448 // ldr z8, [x2, #33, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x85844847 // ldr z7, [x2, #34, MUL VL] + WORD $0x85844c48 // ldr z8, [x2, #35, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x6Xor_store + + // Load and process 32 bytes from input 3 to 6 outputs + WORD $0x85804109 // ldr z9, [x8] + WORD $0x04285028 // addvl x8, x8, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85845047 // ldr z7, [x2, #36, MUL VL] + WORD $0x85845448 // ldr z8, [x2, #37, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85845847 // ldr z7, [x2, #38, MUL VL] + WORD $0x85845c48 // ldr z8, [x2, #39, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85854047 // ldr z7, [x2, #40, MUL VL] + WORD $0x85854448 // ldr z8, [x2, #41, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85854847 // ldr z7, [x2, #42, MUL VL] + WORD $0x85854c48 // ldr z8, [x2, #43, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85855047 // ldr z7, [x2, #44, MUL VL] + WORD $0x85855448 // ldr z8, [x2, #45, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x85855847 // ldr z7, [x2, #46, MUL VL] + WORD $0x85855c48 // ldr z8, [x2, #47, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x6Xor_store + + // Load and process 32 bytes from input 4 to 6 outputs + WORD $0x85804129 // ldr z9, [x9] + WORD $0x04295029 // addvl x9, x9, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85864047 // ldr z7, [x2, #48, MUL VL] + WORD $0x85864448 // ldr z8, [x2, #49, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85864847 // ldr z7, [x2, #50, MUL VL] + WORD $0x85864c48 // ldr z8, [x2, #51, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85865047 // ldr z7, [x2, #52, MUL VL] + WORD $0x85865448 // ldr z8, [x2, #53, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85865847 // ldr z7, [x2, #54, MUL VL] + WORD $0x85865c48 // ldr z8, [x2, #55, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85874047 // ldr z7, [x2, #56, MUL VL] + WORD $0x85874448 // ldr z8, [x2, #57, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x85874847 // ldr z7, [x2, #58, MUL VL] + WORD $0x85874c48 // ldr z8, [x2, #59, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x6Xor_store + + // Load and process 32 bytes from input 5 to 6 outputs + WORD $0x85804149 // ldr z9, [x10] + WORD $0x042a502a // addvl x10, x10, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85875047 // ldr z7, [x2, #60, MUL VL] + WORD $0x85875448 // ldr z8, [x2, #61, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85875847 // ldr z7, [x2, #62, MUL VL] + WORD $0x85875c48 // ldr z8, [x2, #63, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85884047 // ldr z7, [x2, #64, MUL VL] + WORD $0x85884448 // ldr z8, [x2, #65, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85884847 // ldr z7, [x2, #66, MUL VL] + WORD $0x85884c48 // ldr z8, [x2, #67, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85885047 // ldr z7, [x2, #68, MUL VL] + WORD $0x85885448 // ldr z8, [x2, #69, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x85885847 // ldr z7, [x2, #70, MUL VL] + WORD $0x85885c48 // ldr z8, [x2, #71, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x6Xor_store + + // Load and process 32 bytes from input 6 to 6 outputs + WORD $0x85804169 // ldr z9, [x11] + WORD $0x042b502b // addvl x11, x11, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85894047 // ldr z7, [x2, #72, MUL VL] + WORD $0x85894448 // ldr z8, [x2, #73, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85894847 // ldr z7, [x2, #74, MUL VL] + WORD $0x85894c48 // ldr z8, [x2, #75, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85895047 // ldr z7, [x2, #76, MUL VL] + WORD $0x85895448 // ldr z8, [x2, #77, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85895847 // ldr z7, [x2, #78, MUL VL] + WORD $0x85895c48 // ldr z8, [x2, #79, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x858a4047 // ldr z7, [x2, #80, MUL VL] + WORD $0x858a4448 // ldr z8, [x2, #81, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x858a4847 // ldr z7, [x2, #82, MUL VL] + WORD $0x858a4c48 // ldr z8, [x2, #83, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x6Xor_store + + // Load and process 32 bytes from input 7 to 6 outputs + WORD $0x85804189 // ldr z9, [x12] + WORD $0x042c502c // addvl x12, x12, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x858a5047 // ldr z7, [x2, #84, MUL VL] + WORD $0x858a5448 // ldr z8, [x2, #85, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x858a5847 // ldr z7, [x2, #86, MUL VL] + WORD $0x858a5c48 // ldr z8, [x2, #87, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x858b4047 // ldr z7, [x2, #88, MUL VL] + WORD $0x858b4448 // ldr z8, [x2, #89, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x858b4847 // ldr z7, [x2, #90, MUL VL] + WORD $0x858b4c48 // ldr z8, [x2, #91, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x858b5047 // ldr z7, [x2, #92, MUL VL] + WORD $0x858b5448 // ldr z8, [x2, #93, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x858b5847 // ldr z7, [x2, #94, MUL VL] + WORD $0x858b5c48 // ldr z8, [x2, #95, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x6Xor_store + + // Load and process 32 bytes from input 8 to 6 outputs + WORD $0x858041a9 // ldr z9, [x13] + WORD $0x042d502d // addvl x13, x13, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x858c4047 // ldr z7, [x2, #96, MUL VL] + WORD $0x858c4448 // ldr z8, [x2, #97, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x858c4847 // ldr z7, [x2, #98, MUL VL] + WORD $0x858c4c48 // ldr z8, [x2, #99, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x858c5047 // ldr z7, [x2, #100, MUL VL] + WORD $0x858c5448 // ldr z8, [x2, #101, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x858c5847 // ldr z7, [x2, #102, MUL VL] + WORD $0x858c5c48 // ldr z8, [x2, #103, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x858d4047 // ldr z7, [x2, #104, MUL VL] + WORD $0x858d4448 // ldr z8, [x2, #105, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x858d4847 // ldr z7, [x2, #106, MUL VL] + WORD $0x858d4c48 // ldr z8, [x2, #107, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x6Xor_store + + // Load and process 32 bytes from input 9 to 6 outputs + WORD $0x85804069 // ldr z9, [x3] + WORD $0x04235023 // addvl x3, x3, #1 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x858d5047 // ldr z7, [x2, #108, MUL VL] + WORD $0x858d5448 // ldr z8, [x2, #109, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x858d5847 // ldr z7, [x2, #110, MUL VL] + WORD $0x858d5c48 // ldr z8, [x2, #111, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x858e4047 // ldr z7, [x2, #112, MUL VL] + WORD $0x858e4448 // ldr z8, [x2, #113, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x858e4847 // ldr z7, [x2, #114, MUL VL] + WORD $0x858e4c48 // ldr z8, [x2, #115, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x858e5047 // ldr z7, [x2, #116, MUL VL] + WORD $0x858e5448 // ldr z8, [x2, #117, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x858e5847 // ldr z7, [x2, #118, MUL VL] + WORD $0x858e5c48 // ldr z8, [x2, #119, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + +mulSve_10x6Xor_store: + // Store 6 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x8b1101ef // add x15, x15, x17 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x6Xor_loop + +mulSve_10x6Xor_end: + RET + +// func mulSve_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x7(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 152 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x7_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c7 // mov z7.d, x6 + WORD $0x052120e7 // dup z7.b, z7.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 + +mulSve_10x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + WORD $0x8580402a // ldr z10, [x1] + WORD $0x04215021 // addvl x1, x1, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85804048 // ldr z8, [x2] + WORD $0x85804449 // ldr z9, [x2, #1, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83120 // eor z0.d, z9.d, z8.d + WORD $0x85804848 // ldr z8, [x2, #2, MUL VL] + WORD $0x85804c49 // ldr z9, [x2, #3, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83121 // eor z1.d, z9.d, z8.d + WORD $0x85805048 // ldr z8, [x2, #4, MUL VL] + WORD $0x85805449 // ldr z9, [x2, #5, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83122 // eor z2.d, z9.d, z8.d + WORD $0x85805848 // ldr z8, [x2, #6, MUL VL] + WORD $0x85805c49 // ldr z9, [x2, #7, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83123 // eor z3.d, z9.d, z8.d + WORD $0x85814048 // ldr z8, [x2, #8, MUL VL] + WORD $0x85814449 // ldr z9, [x2, #9, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83124 // eor z4.d, z9.d, z8.d + WORD $0x85814848 // ldr z8, [x2, #10, MUL VL] + WORD $0x85814c49 // ldr z9, [x2, #11, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83125 // eor z5.d, z9.d, z8.d + WORD $0x85815048 // ldr z8, [x2, #12, MUL VL] + WORD $0x85815449 // ldr z9, [x2, #13, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83126 // eor z6.d, z9.d, z8.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 1 to 7 outputs + WORD $0x8580408a // ldr z10, [x4] + WORD $0x04245024 // addvl x4, x4, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85815848 // ldr z8, [x2, #14, MUL VL] + WORD $0x85815c49 // ldr z9, [x2, #15, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85824048 // ldr z8, [x2, #16, MUL VL] + WORD $0x85824449 // ldr z9, [x2, #17, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85824848 // ldr z8, [x2, #18, MUL VL] + WORD $0x85824c49 // ldr z9, [x2, #19, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85825048 // ldr z8, [x2, #20, MUL VL] + WORD $0x85825449 // ldr z9, [x2, #21, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85825848 // ldr z8, [x2, #22, MUL VL] + WORD $0x85825c49 // ldr z9, [x2, #23, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85834048 // ldr z8, [x2, #24, MUL VL] + WORD $0x85834449 // ldr z9, [x2, #25, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85834848 // ldr z8, [x2, #26, MUL VL] + WORD $0x85834c49 // ldr z9, [x2, #27, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 2 to 7 outputs + WORD $0x858040aa // ldr z10, [x5] + WORD $0x04255025 // addvl x5, x5, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85835048 // ldr z8, [x2, #28, MUL VL] + WORD $0x85835449 // ldr z9, [x2, #29, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85835848 // ldr z8, [x2, #30, MUL VL] + WORD $0x85835c49 // ldr z9, [x2, #31, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85844048 // ldr z8, [x2, #32, MUL VL] + WORD $0x85844449 // ldr z9, [x2, #33, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85844848 // ldr z8, [x2, #34, MUL VL] + WORD $0x85844c49 // ldr z9, [x2, #35, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85845048 // ldr z8, [x2, #36, MUL VL] + WORD $0x85845449 // ldr z9, [x2, #37, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85845848 // ldr z8, [x2, #38, MUL VL] + WORD $0x85845c49 // ldr z9, [x2, #39, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85854048 // ldr z8, [x2, #40, MUL VL] + WORD $0x85854449 // ldr z9, [x2, #41, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 3 to 7 outputs + WORD $0x8580410a // ldr z10, [x8] + WORD $0x04285028 // addvl x8, x8, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85854848 // ldr z8, [x2, #42, MUL VL] + WORD $0x85854c49 // ldr z9, [x2, #43, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85855048 // ldr z8, [x2, #44, MUL VL] + WORD $0x85855449 // ldr z9, [x2, #45, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85855848 // ldr z8, [x2, #46, MUL VL] + WORD $0x85855c49 // ldr z9, [x2, #47, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85864048 // ldr z8, [x2, #48, MUL VL] + WORD $0x85864449 // ldr z9, [x2, #49, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85864848 // ldr z8, [x2, #50, MUL VL] + WORD $0x85864c49 // ldr z9, [x2, #51, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85865048 // ldr z8, [x2, #52, MUL VL] + WORD $0x85865449 // ldr z9, [x2, #53, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85865848 // ldr z8, [x2, #54, MUL VL] + WORD $0x85865c49 // ldr z9, [x2, #55, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 4 to 7 outputs + WORD $0x8580412a // ldr z10, [x9] + WORD $0x04295029 // addvl x9, x9, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85874048 // ldr z8, [x2, #56, MUL VL] + WORD $0x85874449 // ldr z9, [x2, #57, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85874848 // ldr z8, [x2, #58, MUL VL] + WORD $0x85874c49 // ldr z9, [x2, #59, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85875048 // ldr z8, [x2, #60, MUL VL] + WORD $0x85875449 // ldr z9, [x2, #61, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85875848 // ldr z8, [x2, #62, MUL VL] + WORD $0x85875c49 // ldr z9, [x2, #63, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85884048 // ldr z8, [x2, #64, MUL VL] + WORD $0x85884449 // ldr z9, [x2, #65, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85884848 // ldr z8, [x2, #66, MUL VL] + WORD $0x85884c49 // ldr z9, [x2, #67, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85885048 // ldr z8, [x2, #68, MUL VL] + WORD $0x85885449 // ldr z9, [x2, #69, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 5 to 7 outputs + WORD $0x8580414a // ldr z10, [x10] + WORD $0x042a502a // addvl x10, x10, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85885848 // ldr z8, [x2, #70, MUL VL] + WORD $0x85885c49 // ldr z9, [x2, #71, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85894048 // ldr z8, [x2, #72, MUL VL] + WORD $0x85894449 // ldr z9, [x2, #73, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85894848 // ldr z8, [x2, #74, MUL VL] + WORD $0x85894c49 // ldr z9, [x2, #75, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85895048 // ldr z8, [x2, #76, MUL VL] + WORD $0x85895449 // ldr z9, [x2, #77, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85895848 // ldr z8, [x2, #78, MUL VL] + WORD $0x85895c49 // ldr z9, [x2, #79, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858a4048 // ldr z8, [x2, #80, MUL VL] + WORD $0x858a4449 // ldr z9, [x2, #81, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858a4848 // ldr z8, [x2, #82, MUL VL] + WORD $0x858a4c49 // ldr z9, [x2, #83, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 6 to 7 outputs + WORD $0x8580416a // ldr z10, [x11] + WORD $0x042b502b // addvl x11, x11, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858a5048 // ldr z8, [x2, #84, MUL VL] + WORD $0x858a5449 // ldr z9, [x2, #85, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x858a5848 // ldr z8, [x2, #86, MUL VL] + WORD $0x858a5c49 // ldr z9, [x2, #87, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x858b4048 // ldr z8, [x2, #88, MUL VL] + WORD $0x858b4449 // ldr z9, [x2, #89, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x858b4848 // ldr z8, [x2, #90, MUL VL] + WORD $0x858b4c49 // ldr z9, [x2, #91, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x858b5048 // ldr z8, [x2, #92, MUL VL] + WORD $0x858b5449 // ldr z9, [x2, #93, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858b5848 // ldr z8, [x2, #94, MUL VL] + WORD $0x858b5c49 // ldr z9, [x2, #95, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858c4048 // ldr z8, [x2, #96, MUL VL] + WORD $0x858c4449 // ldr z9, [x2, #97, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 7 to 7 outputs + WORD $0x8580418a // ldr z10, [x12] + WORD $0x042c502c // addvl x12, x12, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858c4848 // ldr z8, [x2, #98, MUL VL] + WORD $0x858c4c49 // ldr z9, [x2, #99, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x858c5048 // ldr z8, [x2, #100, MUL VL] + WORD $0x858c5449 // ldr z9, [x2, #101, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x858c5848 // ldr z8, [x2, #102, MUL VL] + WORD $0x858c5c49 // ldr z9, [x2, #103, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x858d4048 // ldr z8, [x2, #104, MUL VL] + WORD $0x858d4449 // ldr z9, [x2, #105, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x858d4848 // ldr z8, [x2, #106, MUL VL] + WORD $0x858d4c49 // ldr z9, [x2, #107, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858d5048 // ldr z8, [x2, #108, MUL VL] + WORD $0x858d5449 // ldr z9, [x2, #109, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858d5848 // ldr z8, [x2, #110, MUL VL] + WORD $0x858d5c49 // ldr z9, [x2, #111, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 8 to 7 outputs + WORD $0x858041aa // ldr z10, [x13] + WORD $0x042d502d // addvl x13, x13, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858e4048 // ldr z8, [x2, #112, MUL VL] + WORD $0x858e4449 // ldr z9, [x2, #113, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x858e4848 // ldr z8, [x2, #114, MUL VL] + WORD $0x858e4c49 // ldr z9, [x2, #115, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x858e5048 // ldr z8, [x2, #116, MUL VL] + WORD $0x858e5449 // ldr z9, [x2, #117, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x858e5848 // ldr z8, [x2, #118, MUL VL] + WORD $0x858e5c49 // ldr z9, [x2, #119, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x858f4048 // ldr z8, [x2, #120, MUL VL] + WORD $0x858f4449 // ldr z9, [x2, #121, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858f4848 // ldr z8, [x2, #122, MUL VL] + WORD $0x858f4c49 // ldr z9, [x2, #123, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858f5048 // ldr z8, [x2, #124, MUL VL] + WORD $0x858f5449 // ldr z9, [x2, #125, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 9 to 7 outputs + WORD $0x8580406a // ldr z10, [x3] + WORD $0x04235023 // addvl x3, x3, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858f5848 // ldr z8, [x2, #126, MUL VL] + WORD $0x858f5c49 // ldr z9, [x2, #127, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85904048 // ldr z8, [x2, #128, MUL VL] + WORD $0x85904449 // ldr z9, [x2, #129, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85904848 // ldr z8, [x2, #130, MUL VL] + WORD $0x85904c49 // ldr z9, [x2, #131, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85905048 // ldr z8, [x2, #132, MUL VL] + WORD $0x85905449 // ldr z9, [x2, #133, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85905848 // ldr z8, [x2, #134, MUL VL] + WORD $0x85905c49 // ldr z9, [x2, #135, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85914048 // ldr z8, [x2, #136, MUL VL] + WORD $0x85914449 // ldr z9, [x2, #137, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85914848 // ldr z8, [x2, #138, MUL VL] + WORD $0x85914c49 // ldr z9, [x2, #139, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + +mulSve_10x7_store: + // Store 7 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x8b1101ef // add x15, x15, x17 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x7_loop + +mulSve_10x7_end: + RET + +// func mulSve_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x7Xor(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 152 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x7Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c7 // mov z7.d, x6 + WORD $0x052120e7 // dup z7.b, z7.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 + +mulSve_10x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + WORD $0x8580402a // ldr z10, [x1] + WORD $0x04215021 // addvl x1, x1, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + MOVD (R14), R6 + WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804048 // ldr z8, [x2] + WORD $0x85804449 // ldr z9, [x2, #1, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + MOVD 24(R14), R6 + WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804848 // ldr z8, [x2, #2, MUL VL] + WORD $0x85804c49 // ldr z9, [x2, #3, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + MOVD 48(R14), R6 + WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805048 // ldr z8, [x2, #4, MUL VL] + WORD $0x85805449 // ldr z9, [x2, #5, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + MOVD 72(R14), R6 + WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805848 // ldr z8, [x2, #6, MUL VL] + WORD $0x85805c49 // ldr z9, [x2, #7, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + MOVD 96(R14), R6 + WORD $0xa5ef40c4 // ld1d { z4.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85814048 // ldr z8, [x2, #8, MUL VL] + WORD $0x85814449 // ldr z9, [x2, #9, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + MOVD 120(R14), R6 + WORD $0xa5ef40c5 // ld1d { z5.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85814848 // ldr z8, [x2, #10, MUL VL] + WORD $0x85814c49 // ldr z9, [x2, #11, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + MOVD 144(R14), R6 + WORD $0xa5ef40c6 // ld1d { z6.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85815048 // ldr z8, [x2, #12, MUL VL] + WORD $0x85815449 // ldr z9, [x2, #13, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 1 to 7 outputs + WORD $0x8580408a // ldr z10, [x4] + WORD $0x04245024 // addvl x4, x4, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85815848 // ldr z8, [x2, #14, MUL VL] + WORD $0x85815c49 // ldr z9, [x2, #15, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85824048 // ldr z8, [x2, #16, MUL VL] + WORD $0x85824449 // ldr z9, [x2, #17, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85824848 // ldr z8, [x2, #18, MUL VL] + WORD $0x85824c49 // ldr z9, [x2, #19, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85825048 // ldr z8, [x2, #20, MUL VL] + WORD $0x85825449 // ldr z9, [x2, #21, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85825848 // ldr z8, [x2, #22, MUL VL] + WORD $0x85825c49 // ldr z9, [x2, #23, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85834048 // ldr z8, [x2, #24, MUL VL] + WORD $0x85834449 // ldr z9, [x2, #25, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85834848 // ldr z8, [x2, #26, MUL VL] + WORD $0x85834c49 // ldr z9, [x2, #27, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 2 to 7 outputs + WORD $0x858040aa // ldr z10, [x5] + WORD $0x04255025 // addvl x5, x5, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85835048 // ldr z8, [x2, #28, MUL VL] + WORD $0x85835449 // ldr z9, [x2, #29, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85835848 // ldr z8, [x2, #30, MUL VL] + WORD $0x85835c49 // ldr z9, [x2, #31, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85844048 // ldr z8, [x2, #32, MUL VL] + WORD $0x85844449 // ldr z9, [x2, #33, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85844848 // ldr z8, [x2, #34, MUL VL] + WORD $0x85844c49 // ldr z9, [x2, #35, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85845048 // ldr z8, [x2, #36, MUL VL] + WORD $0x85845449 // ldr z9, [x2, #37, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85845848 // ldr z8, [x2, #38, MUL VL] + WORD $0x85845c49 // ldr z9, [x2, #39, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85854048 // ldr z8, [x2, #40, MUL VL] + WORD $0x85854449 // ldr z9, [x2, #41, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 3 to 7 outputs + WORD $0x8580410a // ldr z10, [x8] + WORD $0x04285028 // addvl x8, x8, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85854848 // ldr z8, [x2, #42, MUL VL] + WORD $0x85854c49 // ldr z9, [x2, #43, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85855048 // ldr z8, [x2, #44, MUL VL] + WORD $0x85855449 // ldr z9, [x2, #45, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85855848 // ldr z8, [x2, #46, MUL VL] + WORD $0x85855c49 // ldr z9, [x2, #47, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85864048 // ldr z8, [x2, #48, MUL VL] + WORD $0x85864449 // ldr z9, [x2, #49, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85864848 // ldr z8, [x2, #50, MUL VL] + WORD $0x85864c49 // ldr z9, [x2, #51, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85865048 // ldr z8, [x2, #52, MUL VL] + WORD $0x85865449 // ldr z9, [x2, #53, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85865848 // ldr z8, [x2, #54, MUL VL] + WORD $0x85865c49 // ldr z9, [x2, #55, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 4 to 7 outputs + WORD $0x8580412a // ldr z10, [x9] + WORD $0x04295029 // addvl x9, x9, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85874048 // ldr z8, [x2, #56, MUL VL] + WORD $0x85874449 // ldr z9, [x2, #57, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85874848 // ldr z8, [x2, #58, MUL VL] + WORD $0x85874c49 // ldr z9, [x2, #59, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85875048 // ldr z8, [x2, #60, MUL VL] + WORD $0x85875449 // ldr z9, [x2, #61, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85875848 // ldr z8, [x2, #62, MUL VL] + WORD $0x85875c49 // ldr z9, [x2, #63, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85884048 // ldr z8, [x2, #64, MUL VL] + WORD $0x85884449 // ldr z9, [x2, #65, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85884848 // ldr z8, [x2, #66, MUL VL] + WORD $0x85884c49 // ldr z9, [x2, #67, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85885048 // ldr z8, [x2, #68, MUL VL] + WORD $0x85885449 // ldr z9, [x2, #69, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 5 to 7 outputs + WORD $0x8580414a // ldr z10, [x10] + WORD $0x042a502a // addvl x10, x10, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85885848 // ldr z8, [x2, #70, MUL VL] + WORD $0x85885c49 // ldr z9, [x2, #71, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85894048 // ldr z8, [x2, #72, MUL VL] + WORD $0x85894449 // ldr z9, [x2, #73, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85894848 // ldr z8, [x2, #74, MUL VL] + WORD $0x85894c49 // ldr z9, [x2, #75, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85895048 // ldr z8, [x2, #76, MUL VL] + WORD $0x85895449 // ldr z9, [x2, #77, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85895848 // ldr z8, [x2, #78, MUL VL] + WORD $0x85895c49 // ldr z9, [x2, #79, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858a4048 // ldr z8, [x2, #80, MUL VL] + WORD $0x858a4449 // ldr z9, [x2, #81, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858a4848 // ldr z8, [x2, #82, MUL VL] + WORD $0x858a4c49 // ldr z9, [x2, #83, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 6 to 7 outputs + WORD $0x8580416a // ldr z10, [x11] + WORD $0x042b502b // addvl x11, x11, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858a5048 // ldr z8, [x2, #84, MUL VL] + WORD $0x858a5449 // ldr z9, [x2, #85, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x858a5848 // ldr z8, [x2, #86, MUL VL] + WORD $0x858a5c49 // ldr z9, [x2, #87, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x858b4048 // ldr z8, [x2, #88, MUL VL] + WORD $0x858b4449 // ldr z9, [x2, #89, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x858b4848 // ldr z8, [x2, #90, MUL VL] + WORD $0x858b4c49 // ldr z9, [x2, #91, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x858b5048 // ldr z8, [x2, #92, MUL VL] + WORD $0x858b5449 // ldr z9, [x2, #93, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858b5848 // ldr z8, [x2, #94, MUL VL] + WORD $0x858b5c49 // ldr z9, [x2, #95, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858c4048 // ldr z8, [x2, #96, MUL VL] + WORD $0x858c4449 // ldr z9, [x2, #97, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 7 to 7 outputs + WORD $0x8580418a // ldr z10, [x12] + WORD $0x042c502c // addvl x12, x12, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858c4848 // ldr z8, [x2, #98, MUL VL] + WORD $0x858c4c49 // ldr z9, [x2, #99, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x858c5048 // ldr z8, [x2, #100, MUL VL] + WORD $0x858c5449 // ldr z9, [x2, #101, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x858c5848 // ldr z8, [x2, #102, MUL VL] + WORD $0x858c5c49 // ldr z9, [x2, #103, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x858d4048 // ldr z8, [x2, #104, MUL VL] + WORD $0x858d4449 // ldr z9, [x2, #105, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x858d4848 // ldr z8, [x2, #106, MUL VL] + WORD $0x858d4c49 // ldr z9, [x2, #107, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858d5048 // ldr z8, [x2, #108, MUL VL] + WORD $0x858d5449 // ldr z9, [x2, #109, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858d5848 // ldr z8, [x2, #110, MUL VL] + WORD $0x858d5c49 // ldr z9, [x2, #111, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 8 to 7 outputs + WORD $0x858041aa // ldr z10, [x13] + WORD $0x042d502d // addvl x13, x13, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858e4048 // ldr z8, [x2, #112, MUL VL] + WORD $0x858e4449 // ldr z9, [x2, #113, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x858e4848 // ldr z8, [x2, #114, MUL VL] + WORD $0x858e4c49 // ldr z9, [x2, #115, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x858e5048 // ldr z8, [x2, #116, MUL VL] + WORD $0x858e5449 // ldr z9, [x2, #117, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x858e5848 // ldr z8, [x2, #118, MUL VL] + WORD $0x858e5c49 // ldr z9, [x2, #119, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x858f4048 // ldr z8, [x2, #120, MUL VL] + WORD $0x858f4449 // ldr z9, [x2, #121, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858f4848 // ldr z8, [x2, #122, MUL VL] + WORD $0x858f4c49 // ldr z9, [x2, #123, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858f5048 // ldr z8, [x2, #124, MUL VL] + WORD $0x858f5449 // ldr z9, [x2, #125, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 9 to 7 outputs + WORD $0x8580406a // ldr z10, [x3] + WORD $0x04235023 // addvl x3, x3, #1 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858f5848 // ldr z8, [x2, #126, MUL VL] + WORD $0x858f5c49 // ldr z9, [x2, #127, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85904048 // ldr z8, [x2, #128, MUL VL] + WORD $0x85904449 // ldr z9, [x2, #129, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85904848 // ldr z8, [x2, #130, MUL VL] + WORD $0x85904c49 // ldr z9, [x2, #131, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85905048 // ldr z8, [x2, #132, MUL VL] + WORD $0x85905449 // ldr z9, [x2, #133, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85905848 // ldr z8, [x2, #134, MUL VL] + WORD $0x85905c49 // ldr z9, [x2, #135, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85914048 // ldr z8, [x2, #136, MUL VL] + WORD $0x85914449 // ldr z9, [x2, #137, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85914848 // ldr z8, [x2, #138, MUL VL] + WORD $0x85914c49 // ldr z9, [x2, #139, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + +mulSve_10x7Xor_store: + // Store 7 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x8b1101ef // add x15, x15, x17 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x7Xor_loop + +mulSve_10x7Xor_end: + RET + +// func mulSve_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x8(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 173 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x8_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c8 // mov z8.d, x6 + WORD $0x05212108 // dup z8.b, z8.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 + +mulSve_10x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + WORD $0x8580402b // ldr z11, [x1] + WORD $0x04215021 // addvl x1, x1, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85804049 // ldr z9, [x2] + WORD $0x8580444a // ldr z10, [x2, #1, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93140 // eor z0.d, z10.d, z9.d + WORD $0x85804849 // ldr z9, [x2, #2, MUL VL] + WORD $0x85804c4a // ldr z10, [x2, #3, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93141 // eor z1.d, z10.d, z9.d + WORD $0x85805049 // ldr z9, [x2, #4, MUL VL] + WORD $0x8580544a // ldr z10, [x2, #5, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93142 // eor z2.d, z10.d, z9.d + WORD $0x85805849 // ldr z9, [x2, #6, MUL VL] + WORD $0x85805c4a // ldr z10, [x2, #7, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93143 // eor z3.d, z10.d, z9.d + WORD $0x85814049 // ldr z9, [x2, #8, MUL VL] + WORD $0x8581444a // ldr z10, [x2, #9, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93144 // eor z4.d, z10.d, z9.d + WORD $0x85814849 // ldr z9, [x2, #10, MUL VL] + WORD $0x85814c4a // ldr z10, [x2, #11, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93145 // eor z5.d, z10.d, z9.d + WORD $0x85815049 // ldr z9, [x2, #12, MUL VL] + WORD $0x8581544a // ldr z10, [x2, #13, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93146 // eor z6.d, z10.d, z9.d + WORD $0x85815849 // ldr z9, [x2, #14, MUL VL] + WORD $0x85815c4a // ldr z10, [x2, #15, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93147 // eor z7.d, z10.d, z9.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 1 to 8 outputs + WORD $0x8580408b // ldr z11, [x4] + WORD $0x04245024 // addvl x4, x4, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85824049 // ldr z9, [x2, #16, MUL VL] + WORD $0x8582444a // ldr z10, [x2, #17, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85824849 // ldr z9, [x2, #18, MUL VL] + WORD $0x85824c4a // ldr z10, [x2, #19, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85825049 // ldr z9, [x2, #20, MUL VL] + WORD $0x8582544a // ldr z10, [x2, #21, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85825849 // ldr z9, [x2, #22, MUL VL] + WORD $0x85825c4a // ldr z10, [x2, #23, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85834049 // ldr z9, [x2, #24, MUL VL] + WORD $0x8583444a // ldr z10, [x2, #25, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85834849 // ldr z9, [x2, #26, MUL VL] + WORD $0x85834c4a // ldr z10, [x2, #27, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85835049 // ldr z9, [x2, #28, MUL VL] + WORD $0x8583544a // ldr z10, [x2, #29, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85835849 // ldr z9, [x2, #30, MUL VL] + WORD $0x85835c4a // ldr z10, [x2, #31, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 2 to 8 outputs + WORD $0x858040ab // ldr z11, [x5] + WORD $0x04255025 // addvl x5, x5, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85844049 // ldr z9, [x2, #32, MUL VL] + WORD $0x8584444a // ldr z10, [x2, #33, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85844849 // ldr z9, [x2, #34, MUL VL] + WORD $0x85844c4a // ldr z10, [x2, #35, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85845049 // ldr z9, [x2, #36, MUL VL] + WORD $0x8584544a // ldr z10, [x2, #37, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85845849 // ldr z9, [x2, #38, MUL VL] + WORD $0x85845c4a // ldr z10, [x2, #39, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85854049 // ldr z9, [x2, #40, MUL VL] + WORD $0x8585444a // ldr z10, [x2, #41, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85854849 // ldr z9, [x2, #42, MUL VL] + WORD $0x85854c4a // ldr z10, [x2, #43, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85855049 // ldr z9, [x2, #44, MUL VL] + WORD $0x8585544a // ldr z10, [x2, #45, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85855849 // ldr z9, [x2, #46, MUL VL] + WORD $0x85855c4a // ldr z10, [x2, #47, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 3 to 8 outputs + WORD $0x8580410b // ldr z11, [x8] + WORD $0x04285028 // addvl x8, x8, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85864049 // ldr z9, [x2, #48, MUL VL] + WORD $0x8586444a // ldr z10, [x2, #49, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85864849 // ldr z9, [x2, #50, MUL VL] + WORD $0x85864c4a // ldr z10, [x2, #51, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85865049 // ldr z9, [x2, #52, MUL VL] + WORD $0x8586544a // ldr z10, [x2, #53, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85865849 // ldr z9, [x2, #54, MUL VL] + WORD $0x85865c4a // ldr z10, [x2, #55, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85874049 // ldr z9, [x2, #56, MUL VL] + WORD $0x8587444a // ldr z10, [x2, #57, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85874849 // ldr z9, [x2, #58, MUL VL] + WORD $0x85874c4a // ldr z10, [x2, #59, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85875049 // ldr z9, [x2, #60, MUL VL] + WORD $0x8587544a // ldr z10, [x2, #61, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85875849 // ldr z9, [x2, #62, MUL VL] + WORD $0x85875c4a // ldr z10, [x2, #63, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 4 to 8 outputs + WORD $0x8580412b // ldr z11, [x9] + WORD $0x04295029 // addvl x9, x9, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85884049 // ldr z9, [x2, #64, MUL VL] + WORD $0x8588444a // ldr z10, [x2, #65, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85884849 // ldr z9, [x2, #66, MUL VL] + WORD $0x85884c4a // ldr z10, [x2, #67, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85885049 // ldr z9, [x2, #68, MUL VL] + WORD $0x8588544a // ldr z10, [x2, #69, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85885849 // ldr z9, [x2, #70, MUL VL] + WORD $0x85885c4a // ldr z10, [x2, #71, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85894049 // ldr z9, [x2, #72, MUL VL] + WORD $0x8589444a // ldr z10, [x2, #73, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85894849 // ldr z9, [x2, #74, MUL VL] + WORD $0x85894c4a // ldr z10, [x2, #75, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85895049 // ldr z9, [x2, #76, MUL VL] + WORD $0x8589544a // ldr z10, [x2, #77, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85895849 // ldr z9, [x2, #78, MUL VL] + WORD $0x85895c4a // ldr z10, [x2, #79, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 5 to 8 outputs + WORD $0x8580414b // ldr z11, [x10] + WORD $0x042a502a // addvl x10, x10, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x858a4049 // ldr z9, [x2, #80, MUL VL] + WORD $0x858a444a // ldr z10, [x2, #81, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x858a4849 // ldr z9, [x2, #82, MUL VL] + WORD $0x858a4c4a // ldr z10, [x2, #83, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x858a5049 // ldr z9, [x2, #84, MUL VL] + WORD $0x858a544a // ldr z10, [x2, #85, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x858a5849 // ldr z9, [x2, #86, MUL VL] + WORD $0x858a5c4a // ldr z10, [x2, #87, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x858b4049 // ldr z9, [x2, #88, MUL VL] + WORD $0x858b444a // ldr z10, [x2, #89, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x858b4849 // ldr z9, [x2, #90, MUL VL] + WORD $0x858b4c4a // ldr z10, [x2, #91, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x858b5049 // ldr z9, [x2, #92, MUL VL] + WORD $0x858b544a // ldr z10, [x2, #93, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x858b5849 // ldr z9, [x2, #94, MUL VL] + WORD $0x858b5c4a // ldr z10, [x2, #95, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 6 to 8 outputs + WORD $0x8580416b // ldr z11, [x11] + WORD $0x042b502b // addvl x11, x11, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x858c4049 // ldr z9, [x2, #96, MUL VL] + WORD $0x858c444a // ldr z10, [x2, #97, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x858c4849 // ldr z9, [x2, #98, MUL VL] + WORD $0x858c4c4a // ldr z10, [x2, #99, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x858c5049 // ldr z9, [x2, #100, MUL VL] + WORD $0x858c544a // ldr z10, [x2, #101, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x858c5849 // ldr z9, [x2, #102, MUL VL] + WORD $0x858c5c4a // ldr z10, [x2, #103, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x858d4049 // ldr z9, [x2, #104, MUL VL] + WORD $0x858d444a // ldr z10, [x2, #105, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x858d4849 // ldr z9, [x2, #106, MUL VL] + WORD $0x858d4c4a // ldr z10, [x2, #107, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x858d5049 // ldr z9, [x2, #108, MUL VL] + WORD $0x858d544a // ldr z10, [x2, #109, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x858d5849 // ldr z9, [x2, #110, MUL VL] + WORD $0x858d5c4a // ldr z10, [x2, #111, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 7 to 8 outputs + WORD $0x8580418b // ldr z11, [x12] + WORD $0x042c502c // addvl x12, x12, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x858e4049 // ldr z9, [x2, #112, MUL VL] + WORD $0x858e444a // ldr z10, [x2, #113, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x858e4849 // ldr z9, [x2, #114, MUL VL] + WORD $0x858e4c4a // ldr z10, [x2, #115, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x858e5049 // ldr z9, [x2, #116, MUL VL] + WORD $0x858e544a // ldr z10, [x2, #117, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x858e5849 // ldr z9, [x2, #118, MUL VL] + WORD $0x858e5c4a // ldr z10, [x2, #119, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x858f4049 // ldr z9, [x2, #120, MUL VL] + WORD $0x858f444a // ldr z10, [x2, #121, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x858f4849 // ldr z9, [x2, #122, MUL VL] + WORD $0x858f4c4a // ldr z10, [x2, #123, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x858f5049 // ldr z9, [x2, #124, MUL VL] + WORD $0x858f544a // ldr z10, [x2, #125, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x858f5849 // ldr z9, [x2, #126, MUL VL] + WORD $0x858f5c4a // ldr z10, [x2, #127, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 8 to 8 outputs + WORD $0x858041ab // ldr z11, [x13] + WORD $0x042d502d // addvl x13, x13, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85904049 // ldr z9, [x2, #128, MUL VL] + WORD $0x8590444a // ldr z10, [x2, #129, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85904849 // ldr z9, [x2, #130, MUL VL] + WORD $0x85904c4a // ldr z10, [x2, #131, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85905049 // ldr z9, [x2, #132, MUL VL] + WORD $0x8590544a // ldr z10, [x2, #133, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85905849 // ldr z9, [x2, #134, MUL VL] + WORD $0x85905c4a // ldr z10, [x2, #135, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85914049 // ldr z9, [x2, #136, MUL VL] + WORD $0x8591444a // ldr z10, [x2, #137, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85914849 // ldr z9, [x2, #138, MUL VL] + WORD $0x85914c4a // ldr z10, [x2, #139, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85915049 // ldr z9, [x2, #140, MUL VL] + WORD $0x8591544a // ldr z10, [x2, #141, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85915849 // ldr z9, [x2, #142, MUL VL] + WORD $0x85915c4a // ldr z10, [x2, #143, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 9 to 8 outputs + WORD $0x8580406b // ldr z11, [x3] + WORD $0x04235023 // addvl x3, x3, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85924049 // ldr z9, [x2, #144, MUL VL] + WORD $0x8592444a // ldr z10, [x2, #145, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85924849 // ldr z9, [x2, #146, MUL VL] + WORD $0x85924c4a // ldr z10, [x2, #147, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85925049 // ldr z9, [x2, #148, MUL VL] + WORD $0x8592544a // ldr z10, [x2, #149, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85925849 // ldr z9, [x2, #150, MUL VL] + WORD $0x85925c4a // ldr z10, [x2, #151, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85934049 // ldr z9, [x2, #152, MUL VL] + WORD $0x8593444a // ldr z10, [x2, #153, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85934849 // ldr z9, [x2, #154, MUL VL] + WORD $0x85934c4a // ldr z10, [x2, #155, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85935049 // ldr z9, [x2, #156, MUL VL] + WORD $0x8593544a // ldr z10, [x2, #157, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85935849 // ldr z9, [x2, #158, MUL VL] + WORD $0x85935c4a // ldr z10, [x2, #159, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + +mulSve_10x8_store: + // Store 8 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + MOVD 168(R14), R6 + WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x8b1101ef // add x15, x15, x17 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x8_loop + +mulSve_10x8_end: + RET + +// func mulSve_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x8Xor(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 173 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x8Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c8 // mov z8.d, x6 + WORD $0x05212108 // dup z8.b, z8.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 + +mulSve_10x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + WORD $0x8580402b // ldr z11, [x1] + WORD $0x04215021 // addvl x1, x1, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + MOVD (R14), R6 + WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804049 // ldr z9, [x2] + WORD $0x8580444a // ldr z10, [x2, #1, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + MOVD 24(R14), R6 + WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804849 // ldr z9, [x2, #2, MUL VL] + WORD $0x85804c4a // ldr z10, [x2, #3, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + MOVD 48(R14), R6 + WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805049 // ldr z9, [x2, #4, MUL VL] + WORD $0x8580544a // ldr z10, [x2, #5, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + MOVD 72(R14), R6 + WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805849 // ldr z9, [x2, #6, MUL VL] + WORD $0x85805c4a // ldr z10, [x2, #7, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + MOVD 96(R14), R6 + WORD $0xa5ef40c4 // ld1d { z4.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85814049 // ldr z9, [x2, #8, MUL VL] + WORD $0x8581444a // ldr z10, [x2, #9, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + MOVD 120(R14), R6 + WORD $0xa5ef40c5 // ld1d { z5.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85814849 // ldr z9, [x2, #10, MUL VL] + WORD $0x85814c4a // ldr z10, [x2, #11, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + MOVD 144(R14), R6 + WORD $0xa5ef40c6 // ld1d { z6.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85815049 // ldr z9, [x2, #12, MUL VL] + WORD $0x8581544a // ldr z10, [x2, #13, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + MOVD 168(R14), R6 + WORD $0xa5ef40c7 // ld1d { z7.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85815849 // ldr z9, [x2, #14, MUL VL] + WORD $0x85815c4a // ldr z10, [x2, #15, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x8Xor_store + + // Load and process 32 bytes from input 1 to 8 outputs + WORD $0x8580408b // ldr z11, [x4] + WORD $0x04245024 // addvl x4, x4, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85824049 // ldr z9, [x2, #16, MUL VL] + WORD $0x8582444a // ldr z10, [x2, #17, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85824849 // ldr z9, [x2, #18, MUL VL] + WORD $0x85824c4a // ldr z10, [x2, #19, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85825049 // ldr z9, [x2, #20, MUL VL] + WORD $0x8582544a // ldr z10, [x2, #21, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85825849 // ldr z9, [x2, #22, MUL VL] + WORD $0x85825c4a // ldr z10, [x2, #23, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85834049 // ldr z9, [x2, #24, MUL VL] + WORD $0x8583444a // ldr z10, [x2, #25, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85834849 // ldr z9, [x2, #26, MUL VL] + WORD $0x85834c4a // ldr z10, [x2, #27, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85835049 // ldr z9, [x2, #28, MUL VL] + WORD $0x8583544a // ldr z10, [x2, #29, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85835849 // ldr z9, [x2, #30, MUL VL] + WORD $0x85835c4a // ldr z10, [x2, #31, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x8Xor_store + + // Load and process 32 bytes from input 2 to 8 outputs + WORD $0x858040ab // ldr z11, [x5] + WORD $0x04255025 // addvl x5, x5, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85844049 // ldr z9, [x2, #32, MUL VL] + WORD $0x8584444a // ldr z10, [x2, #33, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85844849 // ldr z9, [x2, #34, MUL VL] + WORD $0x85844c4a // ldr z10, [x2, #35, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85845049 // ldr z9, [x2, #36, MUL VL] + WORD $0x8584544a // ldr z10, [x2, #37, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85845849 // ldr z9, [x2, #38, MUL VL] + WORD $0x85845c4a // ldr z10, [x2, #39, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85854049 // ldr z9, [x2, #40, MUL VL] + WORD $0x8585444a // ldr z10, [x2, #41, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85854849 // ldr z9, [x2, #42, MUL VL] + WORD $0x85854c4a // ldr z10, [x2, #43, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85855049 // ldr z9, [x2, #44, MUL VL] + WORD $0x8585544a // ldr z10, [x2, #45, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85855849 // ldr z9, [x2, #46, MUL VL] + WORD $0x85855c4a // ldr z10, [x2, #47, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x8Xor_store + + // Load and process 32 bytes from input 3 to 8 outputs + WORD $0x8580410b // ldr z11, [x8] + WORD $0x04285028 // addvl x8, x8, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85864049 // ldr z9, [x2, #48, MUL VL] + WORD $0x8586444a // ldr z10, [x2, #49, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85864849 // ldr z9, [x2, #50, MUL VL] + WORD $0x85864c4a // ldr z10, [x2, #51, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85865049 // ldr z9, [x2, #52, MUL VL] + WORD $0x8586544a // ldr z10, [x2, #53, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85865849 // ldr z9, [x2, #54, MUL VL] + WORD $0x85865c4a // ldr z10, [x2, #55, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85874049 // ldr z9, [x2, #56, MUL VL] + WORD $0x8587444a // ldr z10, [x2, #57, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85874849 // ldr z9, [x2, #58, MUL VL] + WORD $0x85874c4a // ldr z10, [x2, #59, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85875049 // ldr z9, [x2, #60, MUL VL] + WORD $0x8587544a // ldr z10, [x2, #61, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85875849 // ldr z9, [x2, #62, MUL VL] + WORD $0x85875c4a // ldr z10, [x2, #63, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x8Xor_store + + // Load and process 32 bytes from input 4 to 8 outputs + WORD $0x8580412b // ldr z11, [x9] + WORD $0x04295029 // addvl x9, x9, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85884049 // ldr z9, [x2, #64, MUL VL] + WORD $0x8588444a // ldr z10, [x2, #65, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85884849 // ldr z9, [x2, #66, MUL VL] + WORD $0x85884c4a // ldr z10, [x2, #67, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85885049 // ldr z9, [x2, #68, MUL VL] + WORD $0x8588544a // ldr z10, [x2, #69, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85885849 // ldr z9, [x2, #70, MUL VL] + WORD $0x85885c4a // ldr z10, [x2, #71, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85894049 // ldr z9, [x2, #72, MUL VL] + WORD $0x8589444a // ldr z10, [x2, #73, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85894849 // ldr z9, [x2, #74, MUL VL] + WORD $0x85894c4a // ldr z10, [x2, #75, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85895049 // ldr z9, [x2, #76, MUL VL] + WORD $0x8589544a // ldr z10, [x2, #77, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85895849 // ldr z9, [x2, #78, MUL VL] + WORD $0x85895c4a // ldr z10, [x2, #79, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x8Xor_store + + // Load and process 32 bytes from input 5 to 8 outputs + WORD $0x8580414b // ldr z11, [x10] + WORD $0x042a502a // addvl x10, x10, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x858a4049 // ldr z9, [x2, #80, MUL VL] + WORD $0x858a444a // ldr z10, [x2, #81, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x858a4849 // ldr z9, [x2, #82, MUL VL] + WORD $0x858a4c4a // ldr z10, [x2, #83, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x858a5049 // ldr z9, [x2, #84, MUL VL] + WORD $0x858a544a // ldr z10, [x2, #85, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x858a5849 // ldr z9, [x2, #86, MUL VL] + WORD $0x858a5c4a // ldr z10, [x2, #87, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x858b4049 // ldr z9, [x2, #88, MUL VL] + WORD $0x858b444a // ldr z10, [x2, #89, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x858b4849 // ldr z9, [x2, #90, MUL VL] + WORD $0x858b4c4a // ldr z10, [x2, #91, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x858b5049 // ldr z9, [x2, #92, MUL VL] + WORD $0x858b544a // ldr z10, [x2, #93, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x858b5849 // ldr z9, [x2, #94, MUL VL] + WORD $0x858b5c4a // ldr z10, [x2, #95, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x8Xor_store + + // Load and process 32 bytes from input 6 to 8 outputs + WORD $0x8580416b // ldr z11, [x11] + WORD $0x042b502b // addvl x11, x11, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x858c4049 // ldr z9, [x2, #96, MUL VL] + WORD $0x858c444a // ldr z10, [x2, #97, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x858c4849 // ldr z9, [x2, #98, MUL VL] + WORD $0x858c4c4a // ldr z10, [x2, #99, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x858c5049 // ldr z9, [x2, #100, MUL VL] + WORD $0x858c544a // ldr z10, [x2, #101, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x858c5849 // ldr z9, [x2, #102, MUL VL] + WORD $0x858c5c4a // ldr z10, [x2, #103, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x858d4049 // ldr z9, [x2, #104, MUL VL] + WORD $0x858d444a // ldr z10, [x2, #105, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x858d4849 // ldr z9, [x2, #106, MUL VL] + WORD $0x858d4c4a // ldr z10, [x2, #107, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x858d5049 // ldr z9, [x2, #108, MUL VL] + WORD $0x858d544a // ldr z10, [x2, #109, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x858d5849 // ldr z9, [x2, #110, MUL VL] + WORD $0x858d5c4a // ldr z10, [x2, #111, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x8Xor_store + + // Load and process 32 bytes from input 7 to 8 outputs + WORD $0x8580418b // ldr z11, [x12] + WORD $0x042c502c // addvl x12, x12, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x858e4049 // ldr z9, [x2, #112, MUL VL] + WORD $0x858e444a // ldr z10, [x2, #113, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x858e4849 // ldr z9, [x2, #114, MUL VL] + WORD $0x858e4c4a // ldr z10, [x2, #115, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x858e5049 // ldr z9, [x2, #116, MUL VL] + WORD $0x858e544a // ldr z10, [x2, #117, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x858e5849 // ldr z9, [x2, #118, MUL VL] + WORD $0x858e5c4a // ldr z10, [x2, #119, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x858f4049 // ldr z9, [x2, #120, MUL VL] + WORD $0x858f444a // ldr z10, [x2, #121, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x858f4849 // ldr z9, [x2, #122, MUL VL] + WORD $0x858f4c4a // ldr z10, [x2, #123, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x858f5049 // ldr z9, [x2, #124, MUL VL] + WORD $0x858f544a // ldr z10, [x2, #125, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x858f5849 // ldr z9, [x2, #126, MUL VL] + WORD $0x858f5c4a // ldr z10, [x2, #127, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x8Xor_store + + // Load and process 32 bytes from input 8 to 8 outputs + WORD $0x858041ab // ldr z11, [x13] + WORD $0x042d502d // addvl x13, x13, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85904049 // ldr z9, [x2, #128, MUL VL] + WORD $0x8590444a // ldr z10, [x2, #129, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85904849 // ldr z9, [x2, #130, MUL VL] + WORD $0x85904c4a // ldr z10, [x2, #131, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85905049 // ldr z9, [x2, #132, MUL VL] + WORD $0x8590544a // ldr z10, [x2, #133, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85905849 // ldr z9, [x2, #134, MUL VL] + WORD $0x85905c4a // ldr z10, [x2, #135, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85914049 // ldr z9, [x2, #136, MUL VL] + WORD $0x8591444a // ldr z10, [x2, #137, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85914849 // ldr z9, [x2, #138, MUL VL] + WORD $0x85914c4a // ldr z10, [x2, #139, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85915049 // ldr z9, [x2, #140, MUL VL] + WORD $0x8591544a // ldr z10, [x2, #141, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85915849 // ldr z9, [x2, #142, MUL VL] + WORD $0x85915c4a // ldr z10, [x2, #143, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x8Xor_store + + // Load and process 32 bytes from input 9 to 8 outputs + WORD $0x8580406b // ldr z11, [x3] + WORD $0x04235023 // addvl x3, x3, #1 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85924049 // ldr z9, [x2, #144, MUL VL] + WORD $0x8592444a // ldr z10, [x2, #145, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85924849 // ldr z9, [x2, #146, MUL VL] + WORD $0x85924c4a // ldr z10, [x2, #147, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85925049 // ldr z9, [x2, #148, MUL VL] + WORD $0x8592544a // ldr z10, [x2, #149, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85925849 // ldr z9, [x2, #150, MUL VL] + WORD $0x85925c4a // ldr z10, [x2, #151, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85934049 // ldr z9, [x2, #152, MUL VL] + WORD $0x8593444a // ldr z10, [x2, #153, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85934849 // ldr z9, [x2, #154, MUL VL] + WORD $0x85934c4a // ldr z10, [x2, #155, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85935049 // ldr z9, [x2, #156, MUL VL] + WORD $0x8593544a // ldr z10, [x2, #157, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85935849 // ldr z9, [x2, #158, MUL VL] + WORD $0x85935c4a // ldr z10, [x2, #159, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + +mulSve_10x8Xor_store: + // Store 8 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + MOVD 168(R14), R6 + WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x8b1101ef // add x15, x15, x17 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x8Xor_loop + +mulSve_10x8Xor_end: + RET + +// func mulSve_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x9(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 194 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x9_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c9 // mov z9.d, x6 + WORD $0x05212129 // dup z9.b, z9.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 + +mulSve_10x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + WORD $0x8580402c // ldr z12, [x1] + WORD $0x04215021 // addvl x1, x1, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8580404a // ldr z10, [x2] + WORD $0x8580444b // ldr z11, [x2, #1, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3160 // eor z0.d, z11.d, z10.d + WORD $0x8580484a // ldr z10, [x2, #2, MUL VL] + WORD $0x85804c4b // ldr z11, [x2, #3, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3161 // eor z1.d, z11.d, z10.d + WORD $0x8580504a // ldr z10, [x2, #4, MUL VL] + WORD $0x8580544b // ldr z11, [x2, #5, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3162 // eor z2.d, z11.d, z10.d + WORD $0x8580584a // ldr z10, [x2, #6, MUL VL] + WORD $0x85805c4b // ldr z11, [x2, #7, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3163 // eor z3.d, z11.d, z10.d + WORD $0x8581404a // ldr z10, [x2, #8, MUL VL] + WORD $0x8581444b // ldr z11, [x2, #9, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3164 // eor z4.d, z11.d, z10.d + WORD $0x8581484a // ldr z10, [x2, #10, MUL VL] + WORD $0x85814c4b // ldr z11, [x2, #11, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3165 // eor z5.d, z11.d, z10.d + WORD $0x8581504a // ldr z10, [x2, #12, MUL VL] + WORD $0x8581544b // ldr z11, [x2, #13, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3166 // eor z6.d, z11.d, z10.d + WORD $0x8581584a // ldr z10, [x2, #14, MUL VL] + WORD $0x85815c4b // ldr z11, [x2, #15, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3167 // eor z7.d, z11.d, z10.d + WORD $0x8582404a // ldr z10, [x2, #16, MUL VL] + WORD $0x8582444b // ldr z11, [x2, #17, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3168 // eor z8.d, z11.d, z10.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x9_store + + // Load and process 32 bytes from input 1 to 9 outputs + WORD $0x8580408c // ldr z12, [x4] + WORD $0x04245024 // addvl x4, x4, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8582484a // ldr z10, [x2, #18, MUL VL] + WORD $0x85824c4b // ldr z11, [x2, #19, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8582504a // ldr z10, [x2, #20, MUL VL] + WORD $0x8582544b // ldr z11, [x2, #21, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8582584a // ldr z10, [x2, #22, MUL VL] + WORD $0x85825c4b // ldr z11, [x2, #23, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8583404a // ldr z10, [x2, #24, MUL VL] + WORD $0x8583444b // ldr z11, [x2, #25, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8583484a // ldr z10, [x2, #26, MUL VL] + WORD $0x85834c4b // ldr z11, [x2, #27, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8583504a // ldr z10, [x2, #28, MUL VL] + WORD $0x8583544b // ldr z11, [x2, #29, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8583584a // ldr z10, [x2, #30, MUL VL] + WORD $0x85835c4b // ldr z11, [x2, #31, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8584404a // ldr z10, [x2, #32, MUL VL] + WORD $0x8584444b // ldr z11, [x2, #33, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8584484a // ldr z10, [x2, #34, MUL VL] + WORD $0x85844c4b // ldr z11, [x2, #35, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x9_store + + // Load and process 32 bytes from input 2 to 9 outputs + WORD $0x858040ac // ldr z12, [x5] + WORD $0x04255025 // addvl x5, x5, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8584504a // ldr z10, [x2, #36, MUL VL] + WORD $0x8584544b // ldr z11, [x2, #37, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8584584a // ldr z10, [x2, #38, MUL VL] + WORD $0x85845c4b // ldr z11, [x2, #39, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8585404a // ldr z10, [x2, #40, MUL VL] + WORD $0x8585444b // ldr z11, [x2, #41, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8585484a // ldr z10, [x2, #42, MUL VL] + WORD $0x85854c4b // ldr z11, [x2, #43, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8585504a // ldr z10, [x2, #44, MUL VL] + WORD $0x8585544b // ldr z11, [x2, #45, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8585584a // ldr z10, [x2, #46, MUL VL] + WORD $0x85855c4b // ldr z11, [x2, #47, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8586404a // ldr z10, [x2, #48, MUL VL] + WORD $0x8586444b // ldr z11, [x2, #49, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8586484a // ldr z10, [x2, #50, MUL VL] + WORD $0x85864c4b // ldr z11, [x2, #51, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8586504a // ldr z10, [x2, #52, MUL VL] + WORD $0x8586544b // ldr z11, [x2, #53, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x9_store + + // Load and process 32 bytes from input 3 to 9 outputs + WORD $0x8580410c // ldr z12, [x8] + WORD $0x04285028 // addvl x8, x8, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8586584a // ldr z10, [x2, #54, MUL VL] + WORD $0x85865c4b // ldr z11, [x2, #55, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8587404a // ldr z10, [x2, #56, MUL VL] + WORD $0x8587444b // ldr z11, [x2, #57, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8587484a // ldr z10, [x2, #58, MUL VL] + WORD $0x85874c4b // ldr z11, [x2, #59, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8587504a // ldr z10, [x2, #60, MUL VL] + WORD $0x8587544b // ldr z11, [x2, #61, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8587584a // ldr z10, [x2, #62, MUL VL] + WORD $0x85875c4b // ldr z11, [x2, #63, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8588404a // ldr z10, [x2, #64, MUL VL] + WORD $0x8588444b // ldr z11, [x2, #65, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8588484a // ldr z10, [x2, #66, MUL VL] + WORD $0x85884c4b // ldr z11, [x2, #67, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8588504a // ldr z10, [x2, #68, MUL VL] + WORD $0x8588544b // ldr z11, [x2, #69, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8588584a // ldr z10, [x2, #70, MUL VL] + WORD $0x85885c4b // ldr z11, [x2, #71, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x9_store + + // Load and process 32 bytes from input 4 to 9 outputs + WORD $0x8580412c // ldr z12, [x9] + WORD $0x04295029 // addvl x9, x9, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8589404a // ldr z10, [x2, #72, MUL VL] + WORD $0x8589444b // ldr z11, [x2, #73, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8589484a // ldr z10, [x2, #74, MUL VL] + WORD $0x85894c4b // ldr z11, [x2, #75, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8589504a // ldr z10, [x2, #76, MUL VL] + WORD $0x8589544b // ldr z11, [x2, #77, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8589584a // ldr z10, [x2, #78, MUL VL] + WORD $0x85895c4b // ldr z11, [x2, #79, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x858a404a // ldr z10, [x2, #80, MUL VL] + WORD $0x858a444b // ldr z11, [x2, #81, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x858a484a // ldr z10, [x2, #82, MUL VL] + WORD $0x858a4c4b // ldr z11, [x2, #83, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x858a504a // ldr z10, [x2, #84, MUL VL] + WORD $0x858a544b // ldr z11, [x2, #85, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x858a584a // ldr z10, [x2, #86, MUL VL] + WORD $0x858a5c4b // ldr z11, [x2, #87, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x858b404a // ldr z10, [x2, #88, MUL VL] + WORD $0x858b444b // ldr z11, [x2, #89, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x9_store + + // Load and process 32 bytes from input 5 to 9 outputs + WORD $0x8580414c // ldr z12, [x10] + WORD $0x042a502a // addvl x10, x10, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x858b484a // ldr z10, [x2, #90, MUL VL] + WORD $0x858b4c4b // ldr z11, [x2, #91, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x858b504a // ldr z10, [x2, #92, MUL VL] + WORD $0x858b544b // ldr z11, [x2, #93, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x858b584a // ldr z10, [x2, #94, MUL VL] + WORD $0x858b5c4b // ldr z11, [x2, #95, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x858c404a // ldr z10, [x2, #96, MUL VL] + WORD $0x858c444b // ldr z11, [x2, #97, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x858c484a // ldr z10, [x2, #98, MUL VL] + WORD $0x858c4c4b // ldr z11, [x2, #99, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x858c504a // ldr z10, [x2, #100, MUL VL] + WORD $0x858c544b // ldr z11, [x2, #101, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x858c584a // ldr z10, [x2, #102, MUL VL] + WORD $0x858c5c4b // ldr z11, [x2, #103, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x858d404a // ldr z10, [x2, #104, MUL VL] + WORD $0x858d444b // ldr z11, [x2, #105, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x858d484a // ldr z10, [x2, #106, MUL VL] + WORD $0x858d4c4b // ldr z11, [x2, #107, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x9_store + + // Load and process 32 bytes from input 6 to 9 outputs + WORD $0x8580416c // ldr z12, [x11] + WORD $0x042b502b // addvl x11, x11, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x858d504a // ldr z10, [x2, #108, MUL VL] + WORD $0x858d544b // ldr z11, [x2, #109, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x858d584a // ldr z10, [x2, #110, MUL VL] + WORD $0x858d5c4b // ldr z11, [x2, #111, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x858e404a // ldr z10, [x2, #112, MUL VL] + WORD $0x858e444b // ldr z11, [x2, #113, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x858e484a // ldr z10, [x2, #114, MUL VL] + WORD $0x858e4c4b // ldr z11, [x2, #115, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x858e504a // ldr z10, [x2, #116, MUL VL] + WORD $0x858e544b // ldr z11, [x2, #117, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x858e584a // ldr z10, [x2, #118, MUL VL] + WORD $0x858e5c4b // ldr z11, [x2, #119, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x858f404a // ldr z10, [x2, #120, MUL VL] + WORD $0x858f444b // ldr z11, [x2, #121, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x858f484a // ldr z10, [x2, #122, MUL VL] + WORD $0x858f4c4b // ldr z11, [x2, #123, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x858f504a // ldr z10, [x2, #124, MUL VL] + WORD $0x858f544b // ldr z11, [x2, #125, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x9_store + + // Load and process 32 bytes from input 7 to 9 outputs + WORD $0x8580418c // ldr z12, [x12] + WORD $0x042c502c // addvl x12, x12, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x858f584a // ldr z10, [x2, #126, MUL VL] + WORD $0x858f5c4b // ldr z11, [x2, #127, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8590404a // ldr z10, [x2, #128, MUL VL] + WORD $0x8590444b // ldr z11, [x2, #129, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8590484a // ldr z10, [x2, #130, MUL VL] + WORD $0x85904c4b // ldr z11, [x2, #131, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8590504a // ldr z10, [x2, #132, MUL VL] + WORD $0x8590544b // ldr z11, [x2, #133, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8590584a // ldr z10, [x2, #134, MUL VL] + WORD $0x85905c4b // ldr z11, [x2, #135, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8591404a // ldr z10, [x2, #136, MUL VL] + WORD $0x8591444b // ldr z11, [x2, #137, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8591484a // ldr z10, [x2, #138, MUL VL] + WORD $0x85914c4b // ldr z11, [x2, #139, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8591504a // ldr z10, [x2, #140, MUL VL] + WORD $0x8591544b // ldr z11, [x2, #141, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8591584a // ldr z10, [x2, #142, MUL VL] + WORD $0x85915c4b // ldr z11, [x2, #143, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x9_store + + // Load and process 32 bytes from input 8 to 9 outputs + WORD $0x858041ac // ldr z12, [x13] + WORD $0x042d502d // addvl x13, x13, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8592404a // ldr z10, [x2, #144, MUL VL] + WORD $0x8592444b // ldr z11, [x2, #145, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8592484a // ldr z10, [x2, #146, MUL VL] + WORD $0x85924c4b // ldr z11, [x2, #147, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8592504a // ldr z10, [x2, #148, MUL VL] + WORD $0x8592544b // ldr z11, [x2, #149, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8592584a // ldr z10, [x2, #150, MUL VL] + WORD $0x85925c4b // ldr z11, [x2, #151, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8593404a // ldr z10, [x2, #152, MUL VL] + WORD $0x8593444b // ldr z11, [x2, #153, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8593484a // ldr z10, [x2, #154, MUL VL] + WORD $0x85934c4b // ldr z11, [x2, #155, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8593504a // ldr z10, [x2, #156, MUL VL] + WORD $0x8593544b // ldr z11, [x2, #157, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8593584a // ldr z10, [x2, #158, MUL VL] + WORD $0x85935c4b // ldr z11, [x2, #159, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8594404a // ldr z10, [x2, #160, MUL VL] + WORD $0x8594444b // ldr z11, [x2, #161, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x9_store + + // Load and process 32 bytes from input 9 to 9 outputs + WORD $0x8580406c // ldr z12, [x3] + WORD $0x04235023 // addvl x3, x3, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8594484a // ldr z10, [x2, #162, MUL VL] + WORD $0x85944c4b // ldr z11, [x2, #163, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8594504a // ldr z10, [x2, #164, MUL VL] + WORD $0x8594544b // ldr z11, [x2, #165, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8594584a // ldr z10, [x2, #166, MUL VL] + WORD $0x85945c4b // ldr z11, [x2, #167, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8595404a // ldr z10, [x2, #168, MUL VL] + WORD $0x8595444b // ldr z11, [x2, #169, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8595484a // ldr z10, [x2, #170, MUL VL] + WORD $0x85954c4b // ldr z11, [x2, #171, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8595504a // ldr z10, [x2, #172, MUL VL] + WORD $0x8595544b // ldr z11, [x2, #173, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8595584a // ldr z10, [x2, #174, MUL VL] + WORD $0x85955c4b // ldr z11, [x2, #175, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8596404a // ldr z10, [x2, #176, MUL VL] + WORD $0x8596444b // ldr z11, [x2, #177, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8596484a // ldr z10, [x2, #178, MUL VL] + WORD $0x85964c4b // ldr z11, [x2, #179, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + +mulSve_10x9_store: + // Store 9 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + MOVD 168(R14), R6 + WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3] + MOVD 192(R14), R6 + WORD $0xe5ef40c8 // st1d { z8.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x8b1101ef // add x15, x15, x17 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x9_loop + +mulSve_10x9_end: + RET + +// func mulSve_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x9Xor(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 194 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x9Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c9 // mov z9.d, x6 + WORD $0x05212129 // dup z9.b, z9.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 + +mulSve_10x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + WORD $0x8580402c // ldr z12, [x1] + WORD $0x04215021 // addvl x1, x1, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + MOVD (R14), R6 + WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580404a // ldr z10, [x2] + WORD $0x8580444b // ldr z11, [x2, #1, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + MOVD 24(R14), R6 + WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580484a // ldr z10, [x2, #2, MUL VL] + WORD $0x85804c4b // ldr z11, [x2, #3, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + MOVD 48(R14), R6 + WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580504a // ldr z10, [x2, #4, MUL VL] + WORD $0x8580544b // ldr z11, [x2, #5, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + MOVD 72(R14), R6 + WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580584a // ldr z10, [x2, #6, MUL VL] + WORD $0x85805c4b // ldr z11, [x2, #7, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + MOVD 96(R14), R6 + WORD $0xa5ef40c4 // ld1d { z4.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581404a // ldr z10, [x2, #8, MUL VL] + WORD $0x8581444b // ldr z11, [x2, #9, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + MOVD 120(R14), R6 + WORD $0xa5ef40c5 // ld1d { z5.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581484a // ldr z10, [x2, #10, MUL VL] + WORD $0x85814c4b // ldr z11, [x2, #11, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + MOVD 144(R14), R6 + WORD $0xa5ef40c6 // ld1d { z6.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581504a // ldr z10, [x2, #12, MUL VL] + WORD $0x8581544b // ldr z11, [x2, #13, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + MOVD 168(R14), R6 + WORD $0xa5ef40c7 // ld1d { z7.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581584a // ldr z10, [x2, #14, MUL VL] + WORD $0x85815c4b // ldr z11, [x2, #15, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + MOVD 192(R14), R6 + WORD $0xa5ef40c8 // ld1d { z8.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8582404a // ldr z10, [x2, #16, MUL VL] + WORD $0x8582444b // ldr z11, [x2, #17, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x9Xor_store + + // Load and process 32 bytes from input 1 to 9 outputs + WORD $0x8580408c // ldr z12, [x4] + WORD $0x04245024 // addvl x4, x4, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8582484a // ldr z10, [x2, #18, MUL VL] + WORD $0x85824c4b // ldr z11, [x2, #19, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8582504a // ldr z10, [x2, #20, MUL VL] + WORD $0x8582544b // ldr z11, [x2, #21, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8582584a // ldr z10, [x2, #22, MUL VL] + WORD $0x85825c4b // ldr z11, [x2, #23, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8583404a // ldr z10, [x2, #24, MUL VL] + WORD $0x8583444b // ldr z11, [x2, #25, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8583484a // ldr z10, [x2, #26, MUL VL] + WORD $0x85834c4b // ldr z11, [x2, #27, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8583504a // ldr z10, [x2, #28, MUL VL] + WORD $0x8583544b // ldr z11, [x2, #29, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8583584a // ldr z10, [x2, #30, MUL VL] + WORD $0x85835c4b // ldr z11, [x2, #31, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8584404a // ldr z10, [x2, #32, MUL VL] + WORD $0x8584444b // ldr z11, [x2, #33, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8584484a // ldr z10, [x2, #34, MUL VL] + WORD $0x85844c4b // ldr z11, [x2, #35, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x9Xor_store + + // Load and process 32 bytes from input 2 to 9 outputs + WORD $0x858040ac // ldr z12, [x5] + WORD $0x04255025 // addvl x5, x5, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8584504a // ldr z10, [x2, #36, MUL VL] + WORD $0x8584544b // ldr z11, [x2, #37, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8584584a // ldr z10, [x2, #38, MUL VL] + WORD $0x85845c4b // ldr z11, [x2, #39, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8585404a // ldr z10, [x2, #40, MUL VL] + WORD $0x8585444b // ldr z11, [x2, #41, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8585484a // ldr z10, [x2, #42, MUL VL] + WORD $0x85854c4b // ldr z11, [x2, #43, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8585504a // ldr z10, [x2, #44, MUL VL] + WORD $0x8585544b // ldr z11, [x2, #45, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8585584a // ldr z10, [x2, #46, MUL VL] + WORD $0x85855c4b // ldr z11, [x2, #47, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8586404a // ldr z10, [x2, #48, MUL VL] + WORD $0x8586444b // ldr z11, [x2, #49, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8586484a // ldr z10, [x2, #50, MUL VL] + WORD $0x85864c4b // ldr z11, [x2, #51, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8586504a // ldr z10, [x2, #52, MUL VL] + WORD $0x8586544b // ldr z11, [x2, #53, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x9Xor_store + + // Load and process 32 bytes from input 3 to 9 outputs + WORD $0x8580410c // ldr z12, [x8] + WORD $0x04285028 // addvl x8, x8, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8586584a // ldr z10, [x2, #54, MUL VL] + WORD $0x85865c4b // ldr z11, [x2, #55, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8587404a // ldr z10, [x2, #56, MUL VL] + WORD $0x8587444b // ldr z11, [x2, #57, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8587484a // ldr z10, [x2, #58, MUL VL] + WORD $0x85874c4b // ldr z11, [x2, #59, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8587504a // ldr z10, [x2, #60, MUL VL] + WORD $0x8587544b // ldr z11, [x2, #61, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8587584a // ldr z10, [x2, #62, MUL VL] + WORD $0x85875c4b // ldr z11, [x2, #63, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8588404a // ldr z10, [x2, #64, MUL VL] + WORD $0x8588444b // ldr z11, [x2, #65, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8588484a // ldr z10, [x2, #66, MUL VL] + WORD $0x85884c4b // ldr z11, [x2, #67, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8588504a // ldr z10, [x2, #68, MUL VL] + WORD $0x8588544b // ldr z11, [x2, #69, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8588584a // ldr z10, [x2, #70, MUL VL] + WORD $0x85885c4b // ldr z11, [x2, #71, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x9Xor_store + + // Load and process 32 bytes from input 4 to 9 outputs + WORD $0x8580412c // ldr z12, [x9] + WORD $0x04295029 // addvl x9, x9, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8589404a // ldr z10, [x2, #72, MUL VL] + WORD $0x8589444b // ldr z11, [x2, #73, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8589484a // ldr z10, [x2, #74, MUL VL] + WORD $0x85894c4b // ldr z11, [x2, #75, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8589504a // ldr z10, [x2, #76, MUL VL] + WORD $0x8589544b // ldr z11, [x2, #77, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8589584a // ldr z10, [x2, #78, MUL VL] + WORD $0x85895c4b // ldr z11, [x2, #79, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x858a404a // ldr z10, [x2, #80, MUL VL] + WORD $0x858a444b // ldr z11, [x2, #81, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x858a484a // ldr z10, [x2, #82, MUL VL] + WORD $0x858a4c4b // ldr z11, [x2, #83, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x858a504a // ldr z10, [x2, #84, MUL VL] + WORD $0x858a544b // ldr z11, [x2, #85, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x858a584a // ldr z10, [x2, #86, MUL VL] + WORD $0x858a5c4b // ldr z11, [x2, #87, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x858b404a // ldr z10, [x2, #88, MUL VL] + WORD $0x858b444b // ldr z11, [x2, #89, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x9Xor_store + + // Load and process 32 bytes from input 5 to 9 outputs + WORD $0x8580414c // ldr z12, [x10] + WORD $0x042a502a // addvl x10, x10, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x858b484a // ldr z10, [x2, #90, MUL VL] + WORD $0x858b4c4b // ldr z11, [x2, #91, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x858b504a // ldr z10, [x2, #92, MUL VL] + WORD $0x858b544b // ldr z11, [x2, #93, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x858b584a // ldr z10, [x2, #94, MUL VL] + WORD $0x858b5c4b // ldr z11, [x2, #95, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x858c404a // ldr z10, [x2, #96, MUL VL] + WORD $0x858c444b // ldr z11, [x2, #97, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x858c484a // ldr z10, [x2, #98, MUL VL] + WORD $0x858c4c4b // ldr z11, [x2, #99, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x858c504a // ldr z10, [x2, #100, MUL VL] + WORD $0x858c544b // ldr z11, [x2, #101, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x858c584a // ldr z10, [x2, #102, MUL VL] + WORD $0x858c5c4b // ldr z11, [x2, #103, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x858d404a // ldr z10, [x2, #104, MUL VL] + WORD $0x858d444b // ldr z11, [x2, #105, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x858d484a // ldr z10, [x2, #106, MUL VL] + WORD $0x858d4c4b // ldr z11, [x2, #107, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x9Xor_store + + // Load and process 32 bytes from input 6 to 9 outputs + WORD $0x8580416c // ldr z12, [x11] + WORD $0x042b502b // addvl x11, x11, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x858d504a // ldr z10, [x2, #108, MUL VL] + WORD $0x858d544b // ldr z11, [x2, #109, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x858d584a // ldr z10, [x2, #110, MUL VL] + WORD $0x858d5c4b // ldr z11, [x2, #111, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x858e404a // ldr z10, [x2, #112, MUL VL] + WORD $0x858e444b // ldr z11, [x2, #113, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x858e484a // ldr z10, [x2, #114, MUL VL] + WORD $0x858e4c4b // ldr z11, [x2, #115, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x858e504a // ldr z10, [x2, #116, MUL VL] + WORD $0x858e544b // ldr z11, [x2, #117, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x858e584a // ldr z10, [x2, #118, MUL VL] + WORD $0x858e5c4b // ldr z11, [x2, #119, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x858f404a // ldr z10, [x2, #120, MUL VL] + WORD $0x858f444b // ldr z11, [x2, #121, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x858f484a // ldr z10, [x2, #122, MUL VL] + WORD $0x858f4c4b // ldr z11, [x2, #123, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x858f504a // ldr z10, [x2, #124, MUL VL] + WORD $0x858f544b // ldr z11, [x2, #125, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x9Xor_store + + // Load and process 32 bytes from input 7 to 9 outputs + WORD $0x8580418c // ldr z12, [x12] + WORD $0x042c502c // addvl x12, x12, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x858f584a // ldr z10, [x2, #126, MUL VL] + WORD $0x858f5c4b // ldr z11, [x2, #127, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8590404a // ldr z10, [x2, #128, MUL VL] + WORD $0x8590444b // ldr z11, [x2, #129, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8590484a // ldr z10, [x2, #130, MUL VL] + WORD $0x85904c4b // ldr z11, [x2, #131, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8590504a // ldr z10, [x2, #132, MUL VL] + WORD $0x8590544b // ldr z11, [x2, #133, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8590584a // ldr z10, [x2, #134, MUL VL] + WORD $0x85905c4b // ldr z11, [x2, #135, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8591404a // ldr z10, [x2, #136, MUL VL] + WORD $0x8591444b // ldr z11, [x2, #137, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8591484a // ldr z10, [x2, #138, MUL VL] + WORD $0x85914c4b // ldr z11, [x2, #139, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8591504a // ldr z10, [x2, #140, MUL VL] + WORD $0x8591544b // ldr z11, [x2, #141, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8591584a // ldr z10, [x2, #142, MUL VL] + WORD $0x85915c4b // ldr z11, [x2, #143, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x9Xor_store + + // Load and process 32 bytes from input 8 to 9 outputs + WORD $0x858041ac // ldr z12, [x13] + WORD $0x042d502d // addvl x13, x13, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8592404a // ldr z10, [x2, #144, MUL VL] + WORD $0x8592444b // ldr z11, [x2, #145, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8592484a // ldr z10, [x2, #146, MUL VL] + WORD $0x85924c4b // ldr z11, [x2, #147, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8592504a // ldr z10, [x2, #148, MUL VL] + WORD $0x8592544b // ldr z11, [x2, #149, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8592584a // ldr z10, [x2, #150, MUL VL] + WORD $0x85925c4b // ldr z11, [x2, #151, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8593404a // ldr z10, [x2, #152, MUL VL] + WORD $0x8593444b // ldr z11, [x2, #153, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8593484a // ldr z10, [x2, #154, MUL VL] + WORD $0x85934c4b // ldr z11, [x2, #155, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8593504a // ldr z10, [x2, #156, MUL VL] + WORD $0x8593544b // ldr z11, [x2, #157, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8593584a // ldr z10, [x2, #158, MUL VL] + WORD $0x85935c4b // ldr z11, [x2, #159, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8594404a // ldr z10, [x2, #160, MUL VL] + WORD $0x8594444b // ldr z11, [x2, #161, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x9Xor_store + + // Load and process 32 bytes from input 9 to 9 outputs + WORD $0x8580406c // ldr z12, [x3] + WORD $0x04235023 // addvl x3, x3, #1 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8594484a // ldr z10, [x2, #162, MUL VL] + WORD $0x85944c4b // ldr z11, [x2, #163, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8594504a // ldr z10, [x2, #164, MUL VL] + WORD $0x8594544b // ldr z11, [x2, #165, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8594584a // ldr z10, [x2, #166, MUL VL] + WORD $0x85945c4b // ldr z11, [x2, #167, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8595404a // ldr z10, [x2, #168, MUL VL] + WORD $0x8595444b // ldr z11, [x2, #169, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8595484a // ldr z10, [x2, #170, MUL VL] + WORD $0x85954c4b // ldr z11, [x2, #171, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8595504a // ldr z10, [x2, #172, MUL VL] + WORD $0x8595544b // ldr z11, [x2, #173, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8595584a // ldr z10, [x2, #174, MUL VL] + WORD $0x85955c4b // ldr z11, [x2, #175, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8596404a // ldr z10, [x2, #176, MUL VL] + WORD $0x8596444b // ldr z11, [x2, #177, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8596484a // ldr z10, [x2, #178, MUL VL] + WORD $0x85964c4b // ldr z11, [x2, #179, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + +mulSve_10x9Xor_store: + // Store 9 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + MOVD 168(R14), R6 + WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3] + MOVD 192(R14), R6 + WORD $0xe5ef40c8 // st1d { z8.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x8b1101ef // add x15, x15, x17 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x9Xor_loop + +mulSve_10x9Xor_end: + RET + +// func mulSve_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x10(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 215 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x10_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038ca // mov z10.d, x6 + WORD $0x0521214a // dup z10.b, z10.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 + +mulSve_10x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + WORD $0x8580402d // ldr z13, [x1] + WORD $0x04215021 // addvl x1, x1, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8580404b // ldr z11, [x2] + WORD $0x8580444c // ldr z12, [x2, #1, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3180 // eor z0.d, z12.d, z11.d + WORD $0x8580484b // ldr z11, [x2, #2, MUL VL] + WORD $0x85804c4c // ldr z12, [x2, #3, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3181 // eor z1.d, z12.d, z11.d + WORD $0x8580504b // ldr z11, [x2, #4, MUL VL] + WORD $0x8580544c // ldr z12, [x2, #5, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3182 // eor z2.d, z12.d, z11.d + WORD $0x8580584b // ldr z11, [x2, #6, MUL VL] + WORD $0x85805c4c // ldr z12, [x2, #7, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3183 // eor z3.d, z12.d, z11.d + WORD $0x8581404b // ldr z11, [x2, #8, MUL VL] + WORD $0x8581444c // ldr z12, [x2, #9, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3184 // eor z4.d, z12.d, z11.d + WORD $0x8581484b // ldr z11, [x2, #10, MUL VL] + WORD $0x85814c4c // ldr z12, [x2, #11, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3185 // eor z5.d, z12.d, z11.d + WORD $0x8581504b // ldr z11, [x2, #12, MUL VL] + WORD $0x8581544c // ldr z12, [x2, #13, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3186 // eor z6.d, z12.d, z11.d + WORD $0x8581584b // ldr z11, [x2, #14, MUL VL] + WORD $0x85815c4c // ldr z12, [x2, #15, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3187 // eor z7.d, z12.d, z11.d + WORD $0x8582404b // ldr z11, [x2, #16, MUL VL] + WORD $0x8582444c // ldr z12, [x2, #17, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3188 // eor z8.d, z12.d, z11.d + WORD $0x8582484b // ldr z11, [x2, #18, MUL VL] + WORD $0x85824c4c // ldr z12, [x2, #19, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3189 // eor z9.d, z12.d, z11.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 1 to 10 outputs + WORD $0x8580408d // ldr z13, [x4] + WORD $0x04245024 // addvl x4, x4, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8582504b // ldr z11, [x2, #20, MUL VL] + WORD $0x8582544c // ldr z12, [x2, #21, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8582584b // ldr z11, [x2, #22, MUL VL] + WORD $0x85825c4c // ldr z12, [x2, #23, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8583404b // ldr z11, [x2, #24, MUL VL] + WORD $0x8583444c // ldr z12, [x2, #25, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8583484b // ldr z11, [x2, #26, MUL VL] + WORD $0x85834c4c // ldr z12, [x2, #27, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8583504b // ldr z11, [x2, #28, MUL VL] + WORD $0x8583544c // ldr z12, [x2, #29, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8583584b // ldr z11, [x2, #30, MUL VL] + WORD $0x85835c4c // ldr z12, [x2, #31, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8584404b // ldr z11, [x2, #32, MUL VL] + WORD $0x8584444c // ldr z12, [x2, #33, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8584484b // ldr z11, [x2, #34, MUL VL] + WORD $0x85844c4c // ldr z12, [x2, #35, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8584504b // ldr z11, [x2, #36, MUL VL] + WORD $0x8584544c // ldr z12, [x2, #37, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8584584b // ldr z11, [x2, #38, MUL VL] + WORD $0x85845c4c // ldr z12, [x2, #39, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 2 to 10 outputs + WORD $0x858040ad // ldr z13, [x5] + WORD $0x04255025 // addvl x5, x5, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8585404b // ldr z11, [x2, #40, MUL VL] + WORD $0x8585444c // ldr z12, [x2, #41, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8585484b // ldr z11, [x2, #42, MUL VL] + WORD $0x85854c4c // ldr z12, [x2, #43, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8585504b // ldr z11, [x2, #44, MUL VL] + WORD $0x8585544c // ldr z12, [x2, #45, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8585584b // ldr z11, [x2, #46, MUL VL] + WORD $0x85855c4c // ldr z12, [x2, #47, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8586404b // ldr z11, [x2, #48, MUL VL] + WORD $0x8586444c // ldr z12, [x2, #49, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8586484b // ldr z11, [x2, #50, MUL VL] + WORD $0x85864c4c // ldr z12, [x2, #51, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8586504b // ldr z11, [x2, #52, MUL VL] + WORD $0x8586544c // ldr z12, [x2, #53, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8586584b // ldr z11, [x2, #54, MUL VL] + WORD $0x85865c4c // ldr z12, [x2, #55, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8587404b // ldr z11, [x2, #56, MUL VL] + WORD $0x8587444c // ldr z12, [x2, #57, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8587484b // ldr z11, [x2, #58, MUL VL] + WORD $0x85874c4c // ldr z12, [x2, #59, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 3 to 10 outputs + WORD $0x8580410d // ldr z13, [x8] + WORD $0x04285028 // addvl x8, x8, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8587504b // ldr z11, [x2, #60, MUL VL] + WORD $0x8587544c // ldr z12, [x2, #61, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8587584b // ldr z11, [x2, #62, MUL VL] + WORD $0x85875c4c // ldr z12, [x2, #63, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8588404b // ldr z11, [x2, #64, MUL VL] + WORD $0x8588444c // ldr z12, [x2, #65, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8588484b // ldr z11, [x2, #66, MUL VL] + WORD $0x85884c4c // ldr z12, [x2, #67, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8588504b // ldr z11, [x2, #68, MUL VL] + WORD $0x8588544c // ldr z12, [x2, #69, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8588584b // ldr z11, [x2, #70, MUL VL] + WORD $0x85885c4c // ldr z12, [x2, #71, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8589404b // ldr z11, [x2, #72, MUL VL] + WORD $0x8589444c // ldr z12, [x2, #73, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8589484b // ldr z11, [x2, #74, MUL VL] + WORD $0x85894c4c // ldr z12, [x2, #75, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8589504b // ldr z11, [x2, #76, MUL VL] + WORD $0x8589544c // ldr z12, [x2, #77, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8589584b // ldr z11, [x2, #78, MUL VL] + WORD $0x85895c4c // ldr z12, [x2, #79, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 4 to 10 outputs + WORD $0x8580412d // ldr z13, [x9] + WORD $0x04295029 // addvl x9, x9, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x858a404b // ldr z11, [x2, #80, MUL VL] + WORD $0x858a444c // ldr z12, [x2, #81, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x858a484b // ldr z11, [x2, #82, MUL VL] + WORD $0x858a4c4c // ldr z12, [x2, #83, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x858a504b // ldr z11, [x2, #84, MUL VL] + WORD $0x858a544c // ldr z12, [x2, #85, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x858a584b // ldr z11, [x2, #86, MUL VL] + WORD $0x858a5c4c // ldr z12, [x2, #87, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x858b404b // ldr z11, [x2, #88, MUL VL] + WORD $0x858b444c // ldr z12, [x2, #89, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x858b484b // ldr z11, [x2, #90, MUL VL] + WORD $0x858b4c4c // ldr z12, [x2, #91, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x858b504b // ldr z11, [x2, #92, MUL VL] + WORD $0x858b544c // ldr z12, [x2, #93, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x858b584b // ldr z11, [x2, #94, MUL VL] + WORD $0x858b5c4c // ldr z12, [x2, #95, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x858c404b // ldr z11, [x2, #96, MUL VL] + WORD $0x858c444c // ldr z12, [x2, #97, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x858c484b // ldr z11, [x2, #98, MUL VL] + WORD $0x858c4c4c // ldr z12, [x2, #99, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 5 to 10 outputs + WORD $0x8580414d // ldr z13, [x10] + WORD $0x042a502a // addvl x10, x10, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x858c504b // ldr z11, [x2, #100, MUL VL] + WORD $0x858c544c // ldr z12, [x2, #101, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x858c584b // ldr z11, [x2, #102, MUL VL] + WORD $0x858c5c4c // ldr z12, [x2, #103, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x858d404b // ldr z11, [x2, #104, MUL VL] + WORD $0x858d444c // ldr z12, [x2, #105, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x858d484b // ldr z11, [x2, #106, MUL VL] + WORD $0x858d4c4c // ldr z12, [x2, #107, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x858d504b // ldr z11, [x2, #108, MUL VL] + WORD $0x858d544c // ldr z12, [x2, #109, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x858d584b // ldr z11, [x2, #110, MUL VL] + WORD $0x858d5c4c // ldr z12, [x2, #111, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x858e404b // ldr z11, [x2, #112, MUL VL] + WORD $0x858e444c // ldr z12, [x2, #113, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x858e484b // ldr z11, [x2, #114, MUL VL] + WORD $0x858e4c4c // ldr z12, [x2, #115, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x858e504b // ldr z11, [x2, #116, MUL VL] + WORD $0x858e544c // ldr z12, [x2, #117, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x858e584b // ldr z11, [x2, #118, MUL VL] + WORD $0x858e5c4c // ldr z12, [x2, #119, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 6 to 10 outputs + WORD $0x8580416d // ldr z13, [x11] + WORD $0x042b502b // addvl x11, x11, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x858f404b // ldr z11, [x2, #120, MUL VL] + WORD $0x858f444c // ldr z12, [x2, #121, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x858f484b // ldr z11, [x2, #122, MUL VL] + WORD $0x858f4c4c // ldr z12, [x2, #123, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x858f504b // ldr z11, [x2, #124, MUL VL] + WORD $0x858f544c // ldr z12, [x2, #125, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x858f584b // ldr z11, [x2, #126, MUL VL] + WORD $0x858f5c4c // ldr z12, [x2, #127, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8590404b // ldr z11, [x2, #128, MUL VL] + WORD $0x8590444c // ldr z12, [x2, #129, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8590484b // ldr z11, [x2, #130, MUL VL] + WORD $0x85904c4c // ldr z12, [x2, #131, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8590504b // ldr z11, [x2, #132, MUL VL] + WORD $0x8590544c // ldr z12, [x2, #133, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8590584b // ldr z11, [x2, #134, MUL VL] + WORD $0x85905c4c // ldr z12, [x2, #135, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8591404b // ldr z11, [x2, #136, MUL VL] + WORD $0x8591444c // ldr z12, [x2, #137, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8591484b // ldr z11, [x2, #138, MUL VL] + WORD $0x85914c4c // ldr z12, [x2, #139, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 7 to 10 outputs + WORD $0x8580418d // ldr z13, [x12] + WORD $0x042c502c // addvl x12, x12, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8591504b // ldr z11, [x2, #140, MUL VL] + WORD $0x8591544c // ldr z12, [x2, #141, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8591584b // ldr z11, [x2, #142, MUL VL] + WORD $0x85915c4c // ldr z12, [x2, #143, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8592404b // ldr z11, [x2, #144, MUL VL] + WORD $0x8592444c // ldr z12, [x2, #145, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8592484b // ldr z11, [x2, #146, MUL VL] + WORD $0x85924c4c // ldr z12, [x2, #147, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8592504b // ldr z11, [x2, #148, MUL VL] + WORD $0x8592544c // ldr z12, [x2, #149, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8592584b // ldr z11, [x2, #150, MUL VL] + WORD $0x85925c4c // ldr z12, [x2, #151, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8593404b // ldr z11, [x2, #152, MUL VL] + WORD $0x8593444c // ldr z12, [x2, #153, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8593484b // ldr z11, [x2, #154, MUL VL] + WORD $0x85934c4c // ldr z12, [x2, #155, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8593504b // ldr z11, [x2, #156, MUL VL] + WORD $0x8593544c // ldr z12, [x2, #157, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8593584b // ldr z11, [x2, #158, MUL VL] + WORD $0x85935c4c // ldr z12, [x2, #159, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 8 to 10 outputs + WORD $0x858041ad // ldr z13, [x13] + WORD $0x042d502d // addvl x13, x13, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8594404b // ldr z11, [x2, #160, MUL VL] + WORD $0x8594444c // ldr z12, [x2, #161, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8594484b // ldr z11, [x2, #162, MUL VL] + WORD $0x85944c4c // ldr z12, [x2, #163, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8594504b // ldr z11, [x2, #164, MUL VL] + WORD $0x8594544c // ldr z12, [x2, #165, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8594584b // ldr z11, [x2, #166, MUL VL] + WORD $0x85945c4c // ldr z12, [x2, #167, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8595404b // ldr z11, [x2, #168, MUL VL] + WORD $0x8595444c // ldr z12, [x2, #169, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8595484b // ldr z11, [x2, #170, MUL VL] + WORD $0x85954c4c // ldr z12, [x2, #171, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8595504b // ldr z11, [x2, #172, MUL VL] + WORD $0x8595544c // ldr z12, [x2, #173, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8595584b // ldr z11, [x2, #174, MUL VL] + WORD $0x85955c4c // ldr z12, [x2, #175, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8596404b // ldr z11, [x2, #176, MUL VL] + WORD $0x8596444c // ldr z12, [x2, #177, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8596484b // ldr z11, [x2, #178, MUL VL] + WORD $0x85964c4c // ldr z12, [x2, #179, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 9 to 10 outputs + WORD $0x8580406d // ldr z13, [x3] + WORD $0x04235023 // addvl x3, x3, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8596504b // ldr z11, [x2, #180, MUL VL] + WORD $0x8596544c // ldr z12, [x2, #181, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8596584b // ldr z11, [x2, #182, MUL VL] + WORD $0x85965c4c // ldr z12, [x2, #183, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8597404b // ldr z11, [x2, #184, MUL VL] + WORD $0x8597444c // ldr z12, [x2, #185, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8597484b // ldr z11, [x2, #186, MUL VL] + WORD $0x85974c4c // ldr z12, [x2, #187, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8597504b // ldr z11, [x2, #188, MUL VL] + WORD $0x8597544c // ldr z12, [x2, #189, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8597584b // ldr z11, [x2, #190, MUL VL] + WORD $0x85975c4c // ldr z12, [x2, #191, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8598404b // ldr z11, [x2, #192, MUL VL] + WORD $0x8598444c // ldr z12, [x2, #193, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8598484b // ldr z11, [x2, #194, MUL VL] + WORD $0x85984c4c // ldr z12, [x2, #195, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8598504b // ldr z11, [x2, #196, MUL VL] + WORD $0x8598544c // ldr z12, [x2, #197, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8598584b // ldr z11, [x2, #198, MUL VL] + WORD $0x85985c4c // ldr z12, [x2, #199, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + +mulSve_10x10_store: + // Store 10 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + MOVD 168(R14), R6 + WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3] + MOVD 192(R14), R6 + WORD $0xe5ef40c8 // st1d { z8.d }, p0, [x6, x15, lsl #3] + MOVD 216(R14), R6 + WORD $0xe5ef40c9 // st1d { z9.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x8b1101ef // add x15, x15, x17 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x10_loop + +mulSve_10x10_end: + RET + +// func mulSve_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x10Xor(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 215 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x10Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038ca // mov z10.d, x6 + WORD $0x0521214a // dup z10.b, z10.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 + +mulSve_10x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + WORD $0x8580402d // ldr z13, [x1] + WORD $0x04215021 // addvl x1, x1, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + MOVD (R14), R6 + WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580404b // ldr z11, [x2] + WORD $0x8580444c // ldr z12, [x2, #1, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + MOVD 24(R14), R6 + WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580484b // ldr z11, [x2, #2, MUL VL] + WORD $0x85804c4c // ldr z12, [x2, #3, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + MOVD 48(R14), R6 + WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580504b // ldr z11, [x2, #4, MUL VL] + WORD $0x8580544c // ldr z12, [x2, #5, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + MOVD 72(R14), R6 + WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580584b // ldr z11, [x2, #6, MUL VL] + WORD $0x85805c4c // ldr z12, [x2, #7, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + MOVD 96(R14), R6 + WORD $0xa5ef40c4 // ld1d { z4.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581404b // ldr z11, [x2, #8, MUL VL] + WORD $0x8581444c // ldr z12, [x2, #9, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + MOVD 120(R14), R6 + WORD $0xa5ef40c5 // ld1d { z5.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581484b // ldr z11, [x2, #10, MUL VL] + WORD $0x85814c4c // ldr z12, [x2, #11, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + MOVD 144(R14), R6 + WORD $0xa5ef40c6 // ld1d { z6.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581504b // ldr z11, [x2, #12, MUL VL] + WORD $0x8581544c // ldr z12, [x2, #13, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + MOVD 168(R14), R6 + WORD $0xa5ef40c7 // ld1d { z7.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581584b // ldr z11, [x2, #14, MUL VL] + WORD $0x85815c4c // ldr z12, [x2, #15, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + MOVD 192(R14), R6 + WORD $0xa5ef40c8 // ld1d { z8.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8582404b // ldr z11, [x2, #16, MUL VL] + WORD $0x8582444c // ldr z12, [x2, #17, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + MOVD 216(R14), R6 + WORD $0xa5ef40c9 // ld1d { z9.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8582484b // ldr z11, [x2, #18, MUL VL] + WORD $0x85824c4c // ldr z12, [x2, #19, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 1 to 10 outputs + WORD $0x8580408d // ldr z13, [x4] + WORD $0x04245024 // addvl x4, x4, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8582504b // ldr z11, [x2, #20, MUL VL] + WORD $0x8582544c // ldr z12, [x2, #21, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8582584b // ldr z11, [x2, #22, MUL VL] + WORD $0x85825c4c // ldr z12, [x2, #23, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8583404b // ldr z11, [x2, #24, MUL VL] + WORD $0x8583444c // ldr z12, [x2, #25, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8583484b // ldr z11, [x2, #26, MUL VL] + WORD $0x85834c4c // ldr z12, [x2, #27, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8583504b // ldr z11, [x2, #28, MUL VL] + WORD $0x8583544c // ldr z12, [x2, #29, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8583584b // ldr z11, [x2, #30, MUL VL] + WORD $0x85835c4c // ldr z12, [x2, #31, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8584404b // ldr z11, [x2, #32, MUL VL] + WORD $0x8584444c // ldr z12, [x2, #33, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8584484b // ldr z11, [x2, #34, MUL VL] + WORD $0x85844c4c // ldr z12, [x2, #35, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8584504b // ldr z11, [x2, #36, MUL VL] + WORD $0x8584544c // ldr z12, [x2, #37, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8584584b // ldr z11, [x2, #38, MUL VL] + WORD $0x85845c4c // ldr z12, [x2, #39, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 2 to 10 outputs + WORD $0x858040ad // ldr z13, [x5] + WORD $0x04255025 // addvl x5, x5, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8585404b // ldr z11, [x2, #40, MUL VL] + WORD $0x8585444c // ldr z12, [x2, #41, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8585484b // ldr z11, [x2, #42, MUL VL] + WORD $0x85854c4c // ldr z12, [x2, #43, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8585504b // ldr z11, [x2, #44, MUL VL] + WORD $0x8585544c // ldr z12, [x2, #45, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8585584b // ldr z11, [x2, #46, MUL VL] + WORD $0x85855c4c // ldr z12, [x2, #47, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8586404b // ldr z11, [x2, #48, MUL VL] + WORD $0x8586444c // ldr z12, [x2, #49, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8586484b // ldr z11, [x2, #50, MUL VL] + WORD $0x85864c4c // ldr z12, [x2, #51, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8586504b // ldr z11, [x2, #52, MUL VL] + WORD $0x8586544c // ldr z12, [x2, #53, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8586584b // ldr z11, [x2, #54, MUL VL] + WORD $0x85865c4c // ldr z12, [x2, #55, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8587404b // ldr z11, [x2, #56, MUL VL] + WORD $0x8587444c // ldr z12, [x2, #57, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8587484b // ldr z11, [x2, #58, MUL VL] + WORD $0x85874c4c // ldr z12, [x2, #59, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 3 to 10 outputs + WORD $0x8580410d // ldr z13, [x8] + WORD $0x04285028 // addvl x8, x8, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8587504b // ldr z11, [x2, #60, MUL VL] + WORD $0x8587544c // ldr z12, [x2, #61, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8587584b // ldr z11, [x2, #62, MUL VL] + WORD $0x85875c4c // ldr z12, [x2, #63, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8588404b // ldr z11, [x2, #64, MUL VL] + WORD $0x8588444c // ldr z12, [x2, #65, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8588484b // ldr z11, [x2, #66, MUL VL] + WORD $0x85884c4c // ldr z12, [x2, #67, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8588504b // ldr z11, [x2, #68, MUL VL] + WORD $0x8588544c // ldr z12, [x2, #69, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8588584b // ldr z11, [x2, #70, MUL VL] + WORD $0x85885c4c // ldr z12, [x2, #71, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8589404b // ldr z11, [x2, #72, MUL VL] + WORD $0x8589444c // ldr z12, [x2, #73, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8589484b // ldr z11, [x2, #74, MUL VL] + WORD $0x85894c4c // ldr z12, [x2, #75, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8589504b // ldr z11, [x2, #76, MUL VL] + WORD $0x8589544c // ldr z12, [x2, #77, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8589584b // ldr z11, [x2, #78, MUL VL] + WORD $0x85895c4c // ldr z12, [x2, #79, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 4 to 10 outputs + WORD $0x8580412d // ldr z13, [x9] + WORD $0x04295029 // addvl x9, x9, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x858a404b // ldr z11, [x2, #80, MUL VL] + WORD $0x858a444c // ldr z12, [x2, #81, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x858a484b // ldr z11, [x2, #82, MUL VL] + WORD $0x858a4c4c // ldr z12, [x2, #83, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x858a504b // ldr z11, [x2, #84, MUL VL] + WORD $0x858a544c // ldr z12, [x2, #85, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x858a584b // ldr z11, [x2, #86, MUL VL] + WORD $0x858a5c4c // ldr z12, [x2, #87, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x858b404b // ldr z11, [x2, #88, MUL VL] + WORD $0x858b444c // ldr z12, [x2, #89, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x858b484b // ldr z11, [x2, #90, MUL VL] + WORD $0x858b4c4c // ldr z12, [x2, #91, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x858b504b // ldr z11, [x2, #92, MUL VL] + WORD $0x858b544c // ldr z12, [x2, #93, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x858b584b // ldr z11, [x2, #94, MUL VL] + WORD $0x858b5c4c // ldr z12, [x2, #95, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x858c404b // ldr z11, [x2, #96, MUL VL] + WORD $0x858c444c // ldr z12, [x2, #97, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x858c484b // ldr z11, [x2, #98, MUL VL] + WORD $0x858c4c4c // ldr z12, [x2, #99, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 5 to 10 outputs + WORD $0x8580414d // ldr z13, [x10] + WORD $0x042a502a // addvl x10, x10, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x858c504b // ldr z11, [x2, #100, MUL VL] + WORD $0x858c544c // ldr z12, [x2, #101, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x858c584b // ldr z11, [x2, #102, MUL VL] + WORD $0x858c5c4c // ldr z12, [x2, #103, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x858d404b // ldr z11, [x2, #104, MUL VL] + WORD $0x858d444c // ldr z12, [x2, #105, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x858d484b // ldr z11, [x2, #106, MUL VL] + WORD $0x858d4c4c // ldr z12, [x2, #107, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x858d504b // ldr z11, [x2, #108, MUL VL] + WORD $0x858d544c // ldr z12, [x2, #109, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x858d584b // ldr z11, [x2, #110, MUL VL] + WORD $0x858d5c4c // ldr z12, [x2, #111, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x858e404b // ldr z11, [x2, #112, MUL VL] + WORD $0x858e444c // ldr z12, [x2, #113, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x858e484b // ldr z11, [x2, #114, MUL VL] + WORD $0x858e4c4c // ldr z12, [x2, #115, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x858e504b // ldr z11, [x2, #116, MUL VL] + WORD $0x858e544c // ldr z12, [x2, #117, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x858e584b // ldr z11, [x2, #118, MUL VL] + WORD $0x858e5c4c // ldr z12, [x2, #119, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 6 to 10 outputs + WORD $0x8580416d // ldr z13, [x11] + WORD $0x042b502b // addvl x11, x11, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x858f404b // ldr z11, [x2, #120, MUL VL] + WORD $0x858f444c // ldr z12, [x2, #121, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x858f484b // ldr z11, [x2, #122, MUL VL] + WORD $0x858f4c4c // ldr z12, [x2, #123, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x858f504b // ldr z11, [x2, #124, MUL VL] + WORD $0x858f544c // ldr z12, [x2, #125, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x858f584b // ldr z11, [x2, #126, MUL VL] + WORD $0x858f5c4c // ldr z12, [x2, #127, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8590404b // ldr z11, [x2, #128, MUL VL] + WORD $0x8590444c // ldr z12, [x2, #129, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8590484b // ldr z11, [x2, #130, MUL VL] + WORD $0x85904c4c // ldr z12, [x2, #131, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8590504b // ldr z11, [x2, #132, MUL VL] + WORD $0x8590544c // ldr z12, [x2, #133, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8590584b // ldr z11, [x2, #134, MUL VL] + WORD $0x85905c4c // ldr z12, [x2, #135, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8591404b // ldr z11, [x2, #136, MUL VL] + WORD $0x8591444c // ldr z12, [x2, #137, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8591484b // ldr z11, [x2, #138, MUL VL] + WORD $0x85914c4c // ldr z12, [x2, #139, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 7 to 10 outputs + WORD $0x8580418d // ldr z13, [x12] + WORD $0x042c502c // addvl x12, x12, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8591504b // ldr z11, [x2, #140, MUL VL] + WORD $0x8591544c // ldr z12, [x2, #141, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8591584b // ldr z11, [x2, #142, MUL VL] + WORD $0x85915c4c // ldr z12, [x2, #143, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8592404b // ldr z11, [x2, #144, MUL VL] + WORD $0x8592444c // ldr z12, [x2, #145, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8592484b // ldr z11, [x2, #146, MUL VL] + WORD $0x85924c4c // ldr z12, [x2, #147, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8592504b // ldr z11, [x2, #148, MUL VL] + WORD $0x8592544c // ldr z12, [x2, #149, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8592584b // ldr z11, [x2, #150, MUL VL] + WORD $0x85925c4c // ldr z12, [x2, #151, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8593404b // ldr z11, [x2, #152, MUL VL] + WORD $0x8593444c // ldr z12, [x2, #153, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8593484b // ldr z11, [x2, #154, MUL VL] + WORD $0x85934c4c // ldr z12, [x2, #155, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8593504b // ldr z11, [x2, #156, MUL VL] + WORD $0x8593544c // ldr z12, [x2, #157, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8593584b // ldr z11, [x2, #158, MUL VL] + WORD $0x85935c4c // ldr z12, [x2, #159, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 8 to 10 outputs + WORD $0x858041ad // ldr z13, [x13] + WORD $0x042d502d // addvl x13, x13, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8594404b // ldr z11, [x2, #160, MUL VL] + WORD $0x8594444c // ldr z12, [x2, #161, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8594484b // ldr z11, [x2, #162, MUL VL] + WORD $0x85944c4c // ldr z12, [x2, #163, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8594504b // ldr z11, [x2, #164, MUL VL] + WORD $0x8594544c // ldr z12, [x2, #165, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8594584b // ldr z11, [x2, #166, MUL VL] + WORD $0x85945c4c // ldr z12, [x2, #167, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8595404b // ldr z11, [x2, #168, MUL VL] + WORD $0x8595444c // ldr z12, [x2, #169, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8595484b // ldr z11, [x2, #170, MUL VL] + WORD $0x85954c4c // ldr z12, [x2, #171, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8595504b // ldr z11, [x2, #172, MUL VL] + WORD $0x8595544c // ldr z12, [x2, #173, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8595584b // ldr z11, [x2, #174, MUL VL] + WORD $0x85955c4c // ldr z12, [x2, #175, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8596404b // ldr z11, [x2, #176, MUL VL] + WORD $0x8596444c // ldr z12, [x2, #177, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8596484b // ldr z11, [x2, #178, MUL VL] + WORD $0x85964c4c // ldr z12, [x2, #179, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 9 to 10 outputs + WORD $0x8580406d // ldr z13, [x3] + WORD $0x04235023 // addvl x3, x3, #1 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8596504b // ldr z11, [x2, #180, MUL VL] + WORD $0x8596544c // ldr z12, [x2, #181, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8596584b // ldr z11, [x2, #182, MUL VL] + WORD $0x85965c4c // ldr z12, [x2, #183, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8597404b // ldr z11, [x2, #184, MUL VL] + WORD $0x8597444c // ldr z12, [x2, #185, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8597484b // ldr z11, [x2, #186, MUL VL] + WORD $0x85974c4c // ldr z12, [x2, #187, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8597504b // ldr z11, [x2, #188, MUL VL] + WORD $0x8597544c // ldr z12, [x2, #189, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8597584b // ldr z11, [x2, #190, MUL VL] + WORD $0x85975c4c // ldr z12, [x2, #191, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8598404b // ldr z11, [x2, #192, MUL VL] + WORD $0x8598444c // ldr z12, [x2, #193, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8598484b // ldr z11, [x2, #194, MUL VL] + WORD $0x85984c4c // ldr z12, [x2, #195, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8598504b // ldr z11, [x2, #196, MUL VL] + WORD $0x8598544c // ldr z12, [x2, #197, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8598584b // ldr z11, [x2, #198, MUL VL] + WORD $0x85985c4c // ldr z12, [x2, #199, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + +mulSve_10x10Xor_store: + // Store 10 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + MOVD 168(R14), R6 + WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3] + MOVD 192(R14), R6 + WORD $0xe5ef40c8 // st1d { z8.d }, p0, [x6, x15, lsl #3] + MOVD 216(R14), R6 + WORD $0xe5ef40c9 // st1d { z9.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x8b1101ef // add x15, x15, x17 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x10Xor_loop + +mulSve_10x10Xor_end: + RET + +// func mulNeon_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 46 YMM used + MOVD n+80(FP), R0 + LSR $6, R0 + TST R0, R0 + BEQ mulNeon_10x1_64_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R14 + MOVD start+72(FP), R15 + + // Add start offset to output + ADD R15, R14 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + MOVD $15, R15 + VMOV R15, V4.B[0] + VDUP V4.B[0], V4.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x1_64_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 64 bytes from input 0 to 1 outputs + VLD1.P 32(R1), [V12.B16, V13.B16] + VLD1.P 32(R1), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V8.B16, V0.B16 + VEOR V7.B16, V9.B16, V1.B16 + VEOR V10.B16, V12.B16, V2.B16 + VEOR V11.B16, V13.B16, V3.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 1 to 1 outputs + VLD1.P 32(R4), [V12.B16, V13.B16] + VLD1.P 32(R4), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 2 to 1 outputs + VLD1.P 32(R5), [V12.B16, V13.B16] + VLD1.P 32(R5), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 3 to 1 outputs + VLD1.P 32(R8), [V12.B16, V13.B16] + VLD1.P 32(R8), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 4 to 1 outputs + VLD1.P 32(R9), [V12.B16, V13.B16] + VLD1.P 32(R9), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 5 to 1 outputs + VLD1.P 32(R10), [V12.B16, V13.B16] + VLD1.P 32(R10), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 6 to 1 outputs + VLD1.P 32(R11), [V12.B16, V13.B16] + VLD1.P 32(R11), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 7 to 1 outputs + VLD1.P 32(R12), [V12.B16, V13.B16] + VLD1.P 32(R12), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 8 to 1 outputs + VLD1.P 32(R13), [V12.B16, V13.B16] + VLD1.P 32(R13), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 9 to 1 outputs + VLD1.P 32(R3), [V12.B16, V13.B16] + VLD1.P 32(R3), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + +mulNeon_10x1_64_store: + // Store 1 outputs + VST1.P [V0.D2, V1.D2], 32(R14) + VST1.P [V2.D2, V3.D2], 32(R14) + + // Prepare for next loop + SUBS $1, R0 + BNE mulNeon_10x1_64_loop + +mulNeon_10x1_64_end: + RET + +// func mulNeon_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 46 YMM used + MOVD n+80(FP), R0 + LSR $6, R0 + TST R0, R0 + BEQ mulNeon_10x1_64Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R14 + MOVD start+72(FP), R15 + + // Add start offset to output + ADD R15, R14 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + MOVD $15, R15 + VMOV R15, V4.B[0] + VDUP V4.B[0], V4.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x1_64Xor_loop: + MOVD matrix_base+0(FP), R2 + // Load 1 outputs + VLD1.P 32(R14), [V0.B16, V1.B16] + VLD1.P 32(R14), [V2.B16, V3.B16] + + // Load and process 64 bytes from input 0 to 1 outputs + VLD1.P 32(R1), [V12.B16, V13.B16] + VLD1.P 32(R1), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x1_64Xor_store + + // Load and process 64 bytes from input 1 to 1 outputs + VLD1.P 32(R4), [V12.B16, V13.B16] + VLD1.P 32(R4), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x1_64Xor_store + + // Load and process 64 bytes from input 2 to 1 outputs + VLD1.P 32(R5), [V12.B16, V13.B16] + VLD1.P 32(R5), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x1_64Xor_store + + // Load and process 64 bytes from input 3 to 1 outputs + VLD1.P 32(R8), [V12.B16, V13.B16] + VLD1.P 32(R8), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x1_64Xor_store + + // Load and process 64 bytes from input 4 to 1 outputs + VLD1.P 32(R9), [V12.B16, V13.B16] + VLD1.P 32(R9), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x1_64Xor_store + + // Load and process 64 bytes from input 5 to 1 outputs + VLD1.P 32(R10), [V12.B16, V13.B16] + VLD1.P 32(R10), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x1_64Xor_store + + // Load and process 64 bytes from input 6 to 1 outputs + VLD1.P 32(R11), [V12.B16, V13.B16] + VLD1.P 32(R11), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x1_64Xor_store + + // Load and process 64 bytes from input 7 to 1 outputs + VLD1.P 32(R12), [V12.B16, V13.B16] + VLD1.P 32(R12), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x1_64Xor_store + + // Load and process 64 bytes from input 8 to 1 outputs + VLD1.P 32(R13), [V12.B16, V13.B16] + VLD1.P 32(R13), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x1_64Xor_store + + // Load and process 64 bytes from input 9 to 1 outputs + VLD1.P 32(R3), [V12.B16, V13.B16] + VLD1.P 32(R3), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + +mulNeon_10x1_64Xor_store: + // Store 1 outputs + SUB $64, R14 + VST1.P [V0.D2, V1.D2], 32(R14) + VST1.P [V2.D2, V3.D2], 32(R14) + + // Prepare for next loop + SUBS $1, R0 + BNE mulNeon_10x1_64Xor_loop + +mulNeon_10x1_64Xor_end: + RET + +// func mulNeon_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x2_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 89 YMM used + MOVD n+80(FP), R0 + LSR $6, R0 + TST R0, R0 + BEQ mulNeon_10x2_64_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R15 + MOVD 24(R14), R14 + MOVD start+72(FP), R6 + + // Add start offset to output + ADD R6, R15 + ADD R6, R14 + + // Add start offset to input + ADD R6, R1 + ADD R6, R4 + ADD R6, R5 + ADD R6, R8 + ADD R6, R9 + ADD R6, R10 + ADD R6, R11 + ADD R6, R12 + ADD R6, R13 + ADD R6, R3 + MOVD $15, R6 + VMOV R6, V8.B[0] + VDUP V8.B[0], V8.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x2_64_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 64 bytes from input 0 to 2 outputs + VLD1.P 32(R1), [V18.B16, V19.B16] + VLD1.P 32(R1), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V12.B16, V0.B16 + VEOR V11.B16, V13.B16, V1.B16 + VEOR V14.B16, V16.B16, V2.B16 + VEOR V15.B16, V17.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V12.B16, V4.B16 + VEOR V11.B16, V13.B16, V5.B16 + VEOR V14.B16, V16.B16, V6.B16 + VEOR V15.B16, V17.B16, V7.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x2_64_store + + // Load and process 64 bytes from input 1 to 2 outputs + VLD1.P 32(R4), [V18.B16, V19.B16] + VLD1.P 32(R4), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x2_64_store + + // Load and process 64 bytes from input 2 to 2 outputs + VLD1.P 32(R5), [V18.B16, V19.B16] + VLD1.P 32(R5), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x2_64_store + + // Load and process 64 bytes from input 3 to 2 outputs + VLD1.P 32(R8), [V18.B16, V19.B16] + VLD1.P 32(R8), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x2_64_store + + // Load and process 64 bytes from input 4 to 2 outputs + VLD1.P 32(R9), [V18.B16, V19.B16] + VLD1.P 32(R9), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x2_64_store + + // Load and process 64 bytes from input 5 to 2 outputs + VLD1.P 32(R10), [V18.B16, V19.B16] + VLD1.P 32(R10), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x2_64_store + + // Load and process 64 bytes from input 6 to 2 outputs + VLD1.P 32(R11), [V18.B16, V19.B16] + VLD1.P 32(R11), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x2_64_store + + // Load and process 64 bytes from input 7 to 2 outputs + VLD1.P 32(R12), [V18.B16, V19.B16] + VLD1.P 32(R12), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x2_64_store + + // Load and process 64 bytes from input 8 to 2 outputs + VLD1.P 32(R13), [V18.B16, V19.B16] + VLD1.P 32(R13), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x2_64_store + + // Load and process 64 bytes from input 9 to 2 outputs + VLD1.P 32(R3), [V18.B16, V19.B16] + VLD1.P 32(R3), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + +mulNeon_10x2_64_store: + // Store 2 outputs + VST1.P [V0.D2, V1.D2], 32(R15) + VST1.P [V2.D2, V3.D2], 32(R15) + VST1.P [V4.D2, V5.D2], 32(R14) + VST1.P [V6.D2, V7.D2], 32(R14) + + // Prepare for next loop + SUBS $1, R0 + BNE mulNeon_10x2_64_loop + +mulNeon_10x2_64_end: + RET + +// func mulNeon_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x2_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 89 YMM used + MOVD n+80(FP), R0 + LSR $6, R0 + TST R0, R0 + BEQ mulNeon_10x2_64Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R15 + MOVD 24(R14), R14 + MOVD start+72(FP), R6 + + // Add start offset to output + ADD R6, R15 + ADD R6, R14 + + // Add start offset to input + ADD R6, R1 + ADD R6, R4 + ADD R6, R5 + ADD R6, R8 + ADD R6, R9 + ADD R6, R10 + ADD R6, R11 + ADD R6, R12 + ADD R6, R13 + ADD R6, R3 + MOVD $15, R6 + VMOV R6, V8.B[0] + VDUP V8.B[0], V8.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x2_64Xor_loop: + MOVD matrix_base+0(FP), R2 + // Load 2 outputs + VLD1.P 32(R15), [V0.B16, V1.B16] + VLD1.P 32(R15), [V2.B16, V3.B16] + VLD1.P 32(R14), [V4.B16, V5.B16] + VLD1.P 32(R14), [V6.B16, V7.B16] + + // Load and process 64 bytes from input 0 to 2 outputs + VLD1.P 32(R1), [V18.B16, V19.B16] + VLD1.P 32(R1), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x2_64Xor_store + + // Load and process 64 bytes from input 1 to 2 outputs + VLD1.P 32(R4), [V18.B16, V19.B16] + VLD1.P 32(R4), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x2_64Xor_store + + // Load and process 64 bytes from input 2 to 2 outputs + VLD1.P 32(R5), [V18.B16, V19.B16] + VLD1.P 32(R5), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x2_64Xor_store + + // Load and process 64 bytes from input 3 to 2 outputs + VLD1.P 32(R8), [V18.B16, V19.B16] + VLD1.P 32(R8), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x2_64Xor_store + + // Load and process 64 bytes from input 4 to 2 outputs + VLD1.P 32(R9), [V18.B16, V19.B16] + VLD1.P 32(R9), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x2_64Xor_store + + // Load and process 64 bytes from input 5 to 2 outputs + VLD1.P 32(R10), [V18.B16, V19.B16] + VLD1.P 32(R10), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x2_64Xor_store + + // Load and process 64 bytes from input 6 to 2 outputs + VLD1.P 32(R11), [V18.B16, V19.B16] + VLD1.P 32(R11), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x2_64Xor_store + + // Load and process 64 bytes from input 7 to 2 outputs + VLD1.P 32(R12), [V18.B16, V19.B16] + VLD1.P 32(R12), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x2_64Xor_store + + // Load and process 64 bytes from input 8 to 2 outputs + VLD1.P 32(R13), [V18.B16, V19.B16] + VLD1.P 32(R13), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x2_64Xor_store + + // Load and process 64 bytes from input 9 to 2 outputs + VLD1.P 32(R3), [V18.B16, V19.B16] + VLD1.P 32(R3), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + +mulNeon_10x2_64Xor_store: + // Store 2 outputs + SUB $64, R15 + VST1.P [V0.D2, V1.D2], 32(R15) + VST1.P [V2.D2, V3.D2], 32(R15) + SUB $64, R14 + VST1.P [V4.D2, V5.D2], 32(R14) + VST1.P [V6.D2, V7.D2], 32(R14) + + // Prepare for next loop + SUBS $1, R0 + BNE mulNeon_10x2_64Xor_loop + +mulNeon_10x2_64Xor_end: + RET + +// func mulNeon_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x3_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 130 YMM used + MOVD n+80(FP), R0 + LSR $6, R0 + TST R0, R0 + BEQ mulNeon_10x3_64_end + MOVD in_base+24(FP), R0 + MOVD (R0), R3 + MOVD 24(R0), R1 + MOVD 48(R0), R4 + MOVD 72(R0), R5 + MOVD 96(R0), R8 + MOVD 120(R0), R9 + MOVD 144(R0), R10 + MOVD 168(R0), R11 + MOVD 192(R0), R12 + MOVD 216(R0), R0 + MOVD out_base+48(FP), R13 + MOVD (R13), R14 + MOVD 24(R13), R15 + MOVD 48(R13), R13 + MOVD start+72(FP), R6 + + // Add start offset to output + ADD R6, R14 + ADD R6, R15 + ADD R6, R13 + + // Add start offset to input + ADD R6, R3 + ADD R6, R1 + ADD R6, R4 + ADD R6, R5 + ADD R6, R8 + ADD R6, R9 + ADD R6, R10 + ADD R6, R11 + ADD R6, R12 + ADD R6, R0 + MOVD $15, R6 + VMOV R6, V12.B[0] + VDUP V12.B[0], V12.B16 + + // Reload length to save a register + MOVD n+80(FP), R6 + LSR $6, R6 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x3_64_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 64 bytes from input 0 to 3 outputs + VLD1.P 32(R3), [V22.B16, V23.B16] + VLD1.P 32(R3), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V16.B16, V0.B16 + VEOR V15.B16, V17.B16, V1.B16 + VEOR V18.B16, V20.B16, V2.B16 + VEOR V19.B16, V21.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V16.B16, V4.B16 + VEOR V15.B16, V17.B16, V5.B16 + VEOR V18.B16, V20.B16, V6.B16 + VEOR V19.B16, V21.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V16.B16, V8.B16 + VEOR V15.B16, V17.B16, V9.B16 + VEOR V18.B16, V20.B16, V10.B16 + VEOR V19.B16, V21.B16, V11.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 1 to 3 outputs + VLD1.P 32(R1), [V22.B16, V23.B16] + VLD1.P 32(R1), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 2 to 3 outputs + VLD1.P 32(R4), [V22.B16, V23.B16] + VLD1.P 32(R4), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 3 to 3 outputs + VLD1.P 32(R5), [V22.B16, V23.B16] + VLD1.P 32(R5), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 4 to 3 outputs + VLD1.P 32(R8), [V22.B16, V23.B16] + VLD1.P 32(R8), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 5 to 3 outputs + VLD1.P 32(R9), [V22.B16, V23.B16] + VLD1.P 32(R9), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 6 to 3 outputs + VLD1.P 32(R10), [V22.B16, V23.B16] + VLD1.P 32(R10), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 7 to 3 outputs + VLD1.P 32(R11), [V22.B16, V23.B16] + VLD1.P 32(R11), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 8 to 3 outputs + VLD1.P 32(R12), [V22.B16, V23.B16] + VLD1.P 32(R12), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 9 to 3 outputs + VLD1.P 32(R0), [V22.B16, V23.B16] + VLD1.P 32(R0), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + +mulNeon_10x3_64_store: + // Store 3 outputs + VST1.P [V0.D2, V1.D2], 32(R14) + VST1.P [V2.D2, V3.D2], 32(R14) + VST1.P [V4.D2, V5.D2], 32(R15) + VST1.P [V6.D2, V7.D2], 32(R15) + VST1.P [V8.D2, V9.D2], 32(R13) + VST1.P [V10.D2, V11.D2], 32(R13) + + // Prepare for next loop + SUBS $1, R6 + BNE mulNeon_10x3_64_loop + +mulNeon_10x3_64_end: + RET + +// func mulNeon_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x3_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 130 YMM used + MOVD n+80(FP), R0 + LSR $6, R0 + TST R0, R0 + BEQ mulNeon_10x3_64Xor_end + MOVD in_base+24(FP), R0 + MOVD (R0), R3 + MOVD 24(R0), R1 + MOVD 48(R0), R4 + MOVD 72(R0), R5 + MOVD 96(R0), R8 + MOVD 120(R0), R9 + MOVD 144(R0), R10 + MOVD 168(R0), R11 + MOVD 192(R0), R12 + MOVD 216(R0), R0 + MOVD out_base+48(FP), R13 + MOVD (R13), R14 + MOVD 24(R13), R15 + MOVD 48(R13), R13 + MOVD start+72(FP), R6 + + // Add start offset to output + ADD R6, R14 + ADD R6, R15 + ADD R6, R13 + + // Add start offset to input + ADD R6, R3 + ADD R6, R1 + ADD R6, R4 + ADD R6, R5 + ADD R6, R8 + ADD R6, R9 + ADD R6, R10 + ADD R6, R11 + ADD R6, R12 + ADD R6, R0 + MOVD $15, R6 + VMOV R6, V12.B[0] + VDUP V12.B[0], V12.B16 + + // Reload length to save a register + MOVD n+80(FP), R6 + LSR $6, R6 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x3_64Xor_loop: + MOVD matrix_base+0(FP), R2 + // Load 3 outputs + VLD1.P 32(R14), [V0.B16, V1.B16] + VLD1.P 32(R14), [V2.B16, V3.B16] + VLD1.P 32(R15), [V4.B16, V5.B16] + VLD1.P 32(R15), [V6.B16, V7.B16] + VLD1.P 32(R13), [V8.B16, V9.B16] + VLD1.P 32(R13), [V10.B16, V11.B16] + + // Load and process 64 bytes from input 0 to 3 outputs + VLD1.P 32(R3), [V22.B16, V23.B16] + VLD1.P 32(R3), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x3_64Xor_store + + // Load and process 64 bytes from input 1 to 3 outputs + VLD1.P 32(R1), [V22.B16, V23.B16] + VLD1.P 32(R1), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x3_64Xor_store + + // Load and process 64 bytes from input 2 to 3 outputs + VLD1.P 32(R4), [V22.B16, V23.B16] + VLD1.P 32(R4), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x3_64Xor_store + + // Load and process 64 bytes from input 3 to 3 outputs + VLD1.P 32(R5), [V22.B16, V23.B16] + VLD1.P 32(R5), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x3_64Xor_store + + // Load and process 64 bytes from input 4 to 3 outputs + VLD1.P 32(R8), [V22.B16, V23.B16] + VLD1.P 32(R8), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x3_64Xor_store + + // Load and process 64 bytes from input 5 to 3 outputs + VLD1.P 32(R9), [V22.B16, V23.B16] + VLD1.P 32(R9), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x3_64Xor_store + + // Load and process 64 bytes from input 6 to 3 outputs + VLD1.P 32(R10), [V22.B16, V23.B16] + VLD1.P 32(R10), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x3_64Xor_store + + // Load and process 64 bytes from input 7 to 3 outputs + VLD1.P 32(R11), [V22.B16, V23.B16] + VLD1.P 32(R11), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x3_64Xor_store + + // Load and process 64 bytes from input 8 to 3 outputs + VLD1.P 32(R12), [V22.B16, V23.B16] + VLD1.P 32(R12), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x3_64Xor_store + + // Load and process 64 bytes from input 9 to 3 outputs + VLD1.P 32(R0), [V22.B16, V23.B16] + VLD1.P 32(R0), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + +mulNeon_10x3_64Xor_store: + // Store 3 outputs + SUB $64, R14 + VST1.P [V0.D2, V1.D2], 32(R14) + VST1.P [V2.D2, V3.D2], 32(R14) + SUB $64, R15 + VST1.P [V4.D2, V5.D2], 32(R15) + VST1.P [V6.D2, V7.D2], 32(R15) + SUB $64, R13 + VST1.P [V8.D2, V9.D2], 32(R13) + VST1.P [V10.D2, V11.D2], 32(R13) + + // Prepare for next loop + SUBS $1, R6 + BNE mulNeon_10x3_64Xor_loop + +mulNeon_10x3_64Xor_end: + RET + +// func mulNeon_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x4(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 89 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x4_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V8.B[0] + VDUP V8.B[0], V8.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x4_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 4 outputs + VLD1.P 32(R1), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V12.B16, V0.B16 + VEOR V11.B16, V13.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V12.B16, V2.B16 + VEOR V11.B16, V13.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V12.B16, V4.B16 + VEOR V11.B16, V13.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V12.B16, V6.B16 + VEOR V11.B16, V13.B16, V7.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x4_store + + // Load and process 32 bytes from input 1 to 4 outputs + VLD1.P 32(R4), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x4_store + + // Load and process 32 bytes from input 2 to 4 outputs + VLD1.P 32(R5), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x4_store + + // Load and process 32 bytes from input 3 to 4 outputs + VLD1.P 32(R8), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x4_store + + // Load and process 32 bytes from input 4 to 4 outputs + VLD1.P 32(R9), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x4_store + + // Load and process 32 bytes from input 5 to 4 outputs + VLD1.P 32(R10), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x4_store + + // Load and process 32 bytes from input 6 to 4 outputs + VLD1.P 32(R11), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x4_store + + // Load and process 32 bytes from input 7 to 4 outputs + VLD1.P 32(R12), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x4_store + + // Load and process 32 bytes from input 8 to 4 outputs + VLD1.P 32(R13), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x4_store + + // Load and process 32 bytes from input 9 to 4 outputs + VLD1.P 32(R3), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + +mulNeon_10x4_store: + // Store 4 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x4_loop + +mulNeon_10x4_end: + RET + +// func mulNeon_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x4Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 89 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x4Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V8.B[0] + VDUP V8.B[0], V8.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x4Xor_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 4 outputs + VLD1.P 32(R1), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + MOVD (R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V0.B16, V1.B16] + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + MOVD 24(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V2.B16, V3.B16] + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + MOVD 48(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V4.B16, V5.B16] + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + MOVD 72(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V6.B16, V7.B16] + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x4Xor_store + + // Load and process 32 bytes from input 1 to 4 outputs + VLD1.P 32(R4), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x4Xor_store + + // Load and process 32 bytes from input 2 to 4 outputs + VLD1.P 32(R5), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x4Xor_store + + // Load and process 32 bytes from input 3 to 4 outputs + VLD1.P 32(R8), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x4Xor_store + + // Load and process 32 bytes from input 4 to 4 outputs + VLD1.P 32(R9), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x4Xor_store + + // Load and process 32 bytes from input 5 to 4 outputs + VLD1.P 32(R10), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x4Xor_store + + // Load and process 32 bytes from input 6 to 4 outputs + VLD1.P 32(R11), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x4Xor_store + + // Load and process 32 bytes from input 7 to 4 outputs + VLD1.P 32(R12), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x4Xor_store + + // Load and process 32 bytes from input 8 to 4 outputs + VLD1.P 32(R13), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x4Xor_store + + // Load and process 32 bytes from input 9 to 4 outputs + VLD1.P 32(R3), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + +mulNeon_10x4Xor_store: + // Store 4 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x4Xor_loop + +mulNeon_10x4Xor_end: + RET + +// func mulNeon_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x5(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 110 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x5_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V10.B[0] + VDUP V10.B[0], V10.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x5_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 5 outputs + VLD1.P 32(R1), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V14.B16, V0.B16 + VEOR V13.B16, V15.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V14.B16, V2.B16 + VEOR V13.B16, V15.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V14.B16, V4.B16 + VEOR V13.B16, V15.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V14.B16, V6.B16 + VEOR V13.B16, V15.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V14.B16, V8.B16 + VEOR V13.B16, V15.B16, V9.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x5_store + + // Load and process 32 bytes from input 1 to 5 outputs + VLD1.P 32(R4), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x5_store + + // Load and process 32 bytes from input 2 to 5 outputs + VLD1.P 32(R5), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x5_store + + // Load and process 32 bytes from input 3 to 5 outputs + VLD1.P 32(R8), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x5_store + + // Load and process 32 bytes from input 4 to 5 outputs + VLD1.P 32(R9), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x5_store + + // Load and process 32 bytes from input 5 to 5 outputs + VLD1.P 32(R10), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x5_store + + // Load and process 32 bytes from input 6 to 5 outputs + VLD1.P 32(R11), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x5_store + + // Load and process 32 bytes from input 7 to 5 outputs + VLD1.P 32(R12), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x5_store + + // Load and process 32 bytes from input 8 to 5 outputs + VLD1.P 32(R13), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x5_store + + // Load and process 32 bytes from input 9 to 5 outputs + VLD1.P 32(R3), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + +mulNeon_10x5_store: + // Store 5 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x5_loop + +mulNeon_10x5_end: + RET + +// func mulNeon_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x5Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 110 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x5Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V10.B[0] + VDUP V10.B[0], V10.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x5Xor_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 5 outputs + VLD1.P 32(R1), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + MOVD (R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V0.B16, V1.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + MOVD 24(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V2.B16, V3.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + MOVD 48(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V4.B16, V5.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + MOVD 72(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V6.B16, V7.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + MOVD 96(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V8.B16, V9.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 1 to 5 outputs + VLD1.P 32(R4), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 2 to 5 outputs + VLD1.P 32(R5), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 3 to 5 outputs + VLD1.P 32(R8), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 4 to 5 outputs + VLD1.P 32(R9), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 5 to 5 outputs + VLD1.P 32(R10), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 6 to 5 outputs + VLD1.P 32(R11), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 7 to 5 outputs + VLD1.P 32(R12), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 8 to 5 outputs + VLD1.P 32(R13), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 9 to 5 outputs + VLD1.P 32(R3), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + +mulNeon_10x5Xor_store: + // Store 5 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x5Xor_loop + +mulNeon_10x5Xor_end: + RET + +// func mulNeon_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x6(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 131 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x6_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V12.B[0] + VDUP V12.B[0], V12.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x6_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 6 outputs + VLD1.P 32(R1), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V16.B16, V0.B16 + VEOR V15.B16, V17.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V16.B16, V2.B16 + VEOR V15.B16, V17.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V16.B16, V4.B16 + VEOR V15.B16, V17.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V16.B16, V6.B16 + VEOR V15.B16, V17.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V16.B16, V8.B16 + VEOR V15.B16, V17.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V16.B16, V10.B16 + VEOR V15.B16, V17.B16, V11.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x6_store + + // Load and process 32 bytes from input 1 to 6 outputs + VLD1.P 32(R4), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x6_store + + // Load and process 32 bytes from input 2 to 6 outputs + VLD1.P 32(R5), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x6_store + + // Load and process 32 bytes from input 3 to 6 outputs + VLD1.P 32(R8), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x6_store + + // Load and process 32 bytes from input 4 to 6 outputs + VLD1.P 32(R9), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x6_store + + // Load and process 32 bytes from input 5 to 6 outputs + VLD1.P 32(R10), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x6_store + + // Load and process 32 bytes from input 6 to 6 outputs + VLD1.P 32(R11), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x6_store + + // Load and process 32 bytes from input 7 to 6 outputs + VLD1.P 32(R12), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x6_store + + // Load and process 32 bytes from input 8 to 6 outputs + VLD1.P 32(R13), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x6_store + + // Load and process 32 bytes from input 9 to 6 outputs + VLD1.P 32(R3), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + +mulNeon_10x6_store: + // Store 6 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + MOVD 120(R14), R6 + ADD R15<<3, R6 + VST1 [V10.D2, V11.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x6_loop + +mulNeon_10x6_end: + RET + +// func mulNeon_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x6Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 131 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x6Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V12.B[0] + VDUP V12.B[0], V12.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x6Xor_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 6 outputs + VLD1.P 32(R1), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + MOVD (R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V0.B16, V1.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + MOVD 24(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V2.B16, V3.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + MOVD 48(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V4.B16, V5.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + MOVD 72(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V6.B16, V7.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + MOVD 96(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V8.B16, V9.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + MOVD 120(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V10.B16, V11.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x6Xor_store + + // Load and process 32 bytes from input 1 to 6 outputs + VLD1.P 32(R4), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x6Xor_store + + // Load and process 32 bytes from input 2 to 6 outputs + VLD1.P 32(R5), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x6Xor_store + + // Load and process 32 bytes from input 3 to 6 outputs + VLD1.P 32(R8), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x6Xor_store + + // Load and process 32 bytes from input 4 to 6 outputs + VLD1.P 32(R9), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x6Xor_store + + // Load and process 32 bytes from input 5 to 6 outputs + VLD1.P 32(R10), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x6Xor_store + + // Load and process 32 bytes from input 6 to 6 outputs + VLD1.P 32(R11), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x6Xor_store + + // Load and process 32 bytes from input 7 to 6 outputs + VLD1.P 32(R12), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x6Xor_store + + // Load and process 32 bytes from input 8 to 6 outputs + VLD1.P 32(R13), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x6Xor_store + + // Load and process 32 bytes from input 9 to 6 outputs + VLD1.P 32(R3), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + +mulNeon_10x6Xor_store: + // Store 6 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + MOVD 120(R14), R6 + ADD R15<<3, R6 + VST1 [V10.D2, V11.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x6Xor_loop + +mulNeon_10x6Xor_end: + RET + +// func mulNeon_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x7(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 152 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x7_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V14.B[0] + VDUP V14.B[0], V14.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x7_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 7 outputs + VLD1.P 32(R1), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V18.B16, V0.B16 + VEOR V17.B16, V19.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V18.B16, V2.B16 + VEOR V17.B16, V19.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V18.B16, V4.B16 + VEOR V17.B16, V19.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V18.B16, V6.B16 + VEOR V17.B16, V19.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V18.B16, V8.B16 + VEOR V17.B16, V19.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V18.B16, V10.B16 + VEOR V17.B16, V19.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V18.B16, V12.B16 + VEOR V17.B16, V19.B16, V13.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x7_store + + // Load and process 32 bytes from input 1 to 7 outputs + VLD1.P 32(R4), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x7_store + + // Load and process 32 bytes from input 2 to 7 outputs + VLD1.P 32(R5), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x7_store + + // Load and process 32 bytes from input 3 to 7 outputs + VLD1.P 32(R8), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x7_store + + // Load and process 32 bytes from input 4 to 7 outputs + VLD1.P 32(R9), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x7_store + + // Load and process 32 bytes from input 5 to 7 outputs + VLD1.P 32(R10), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x7_store + + // Load and process 32 bytes from input 6 to 7 outputs + VLD1.P 32(R11), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x7_store + + // Load and process 32 bytes from input 7 to 7 outputs + VLD1.P 32(R12), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x7_store + + // Load and process 32 bytes from input 8 to 7 outputs + VLD1.P 32(R13), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x7_store + + // Load and process 32 bytes from input 9 to 7 outputs + VLD1.P 32(R3), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + +mulNeon_10x7_store: + // Store 7 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + MOVD 120(R14), R6 + ADD R15<<3, R6 + VST1 [V10.D2, V11.D2], (R6) + MOVD 144(R14), R6 + ADD R15<<3, R6 + VST1 [V12.D2, V13.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x7_loop + +mulNeon_10x7_end: + RET + +// func mulNeon_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x7Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 152 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x7Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V14.B[0] + VDUP V14.B[0], V14.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x7Xor_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 7 outputs + VLD1.P 32(R1), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + MOVD (R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V0.B16, V1.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + MOVD 24(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V2.B16, V3.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + MOVD 48(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V4.B16, V5.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + MOVD 72(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V6.B16, V7.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + MOVD 96(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V8.B16, V9.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + MOVD 120(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V10.B16, V11.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + MOVD 144(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V12.B16, V13.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x7Xor_store + + // Load and process 32 bytes from input 1 to 7 outputs + VLD1.P 32(R4), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x7Xor_store + + // Load and process 32 bytes from input 2 to 7 outputs + VLD1.P 32(R5), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x7Xor_store + + // Load and process 32 bytes from input 3 to 7 outputs + VLD1.P 32(R8), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x7Xor_store + + // Load and process 32 bytes from input 4 to 7 outputs + VLD1.P 32(R9), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x7Xor_store + + // Load and process 32 bytes from input 5 to 7 outputs + VLD1.P 32(R10), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x7Xor_store + + // Load and process 32 bytes from input 6 to 7 outputs + VLD1.P 32(R11), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x7Xor_store + + // Load and process 32 bytes from input 7 to 7 outputs + VLD1.P 32(R12), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x7Xor_store + + // Load and process 32 bytes from input 8 to 7 outputs + VLD1.P 32(R13), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x7Xor_store + + // Load and process 32 bytes from input 9 to 7 outputs + VLD1.P 32(R3), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + +mulNeon_10x7Xor_store: + // Store 7 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + MOVD 120(R14), R6 + ADD R15<<3, R6 + VST1 [V10.D2, V11.D2], (R6) + MOVD 144(R14), R6 + ADD R15<<3, R6 + VST1 [V12.D2, V13.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x7Xor_loop + +mulNeon_10x7Xor_end: + RET + +// func mulNeon_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x8(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 173 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x8_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V16.B[0] + VDUP V16.B[0], V16.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x8_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 8 outputs + VLD1.P 32(R1), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V0.B16 + VEOR V19.B16, V21.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V2.B16 + VEOR V19.B16, V21.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V4.B16 + VEOR V19.B16, V21.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V6.B16 + VEOR V19.B16, V21.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V8.B16 + VEOR V19.B16, V21.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V10.B16 + VEOR V19.B16, V21.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V12.B16 + VEOR V19.B16, V21.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V14.B16 + VEOR V19.B16, V21.B16, V15.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 1 to 8 outputs + VLD1.P 32(R4), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 2 to 8 outputs + VLD1.P 32(R5), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 3 to 8 outputs + VLD1.P 32(R8), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 4 to 8 outputs + VLD1.P 32(R9), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 5 to 8 outputs + VLD1.P 32(R10), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 6 to 8 outputs + VLD1.P 32(R11), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 7 to 8 outputs + VLD1.P 32(R12), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 8 to 8 outputs + VLD1.P 32(R13), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 9 to 8 outputs + VLD1.P 32(R3), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + +mulNeon_10x8_store: + // Store 8 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + MOVD 120(R14), R6 + ADD R15<<3, R6 + VST1 [V10.D2, V11.D2], (R6) + MOVD 144(R14), R6 + ADD R15<<3, R6 + VST1 [V12.D2, V13.D2], (R6) + MOVD 168(R14), R6 + ADD R15<<3, R6 + VST1 [V14.D2, V15.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x8_loop + +mulNeon_10x8_end: + RET + +// func mulNeon_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x8Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 173 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x8Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V16.B[0] + VDUP V16.B[0], V16.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x8Xor_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 8 outputs + VLD1.P 32(R1), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + MOVD (R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V0.B16, V1.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + MOVD 24(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V2.B16, V3.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + MOVD 48(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V4.B16, V5.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + MOVD 72(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V6.B16, V7.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + MOVD 96(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V8.B16, V9.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + MOVD 120(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V10.B16, V11.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + MOVD 144(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V12.B16, V13.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + MOVD 168(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V14.B16, V15.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 1 to 8 outputs + VLD1.P 32(R4), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 2 to 8 outputs + VLD1.P 32(R5), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 3 to 8 outputs + VLD1.P 32(R8), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 4 to 8 outputs + VLD1.P 32(R9), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 5 to 8 outputs + VLD1.P 32(R10), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 6 to 8 outputs + VLD1.P 32(R11), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 7 to 8 outputs + VLD1.P 32(R12), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 8 to 8 outputs + VLD1.P 32(R13), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 9 to 8 outputs + VLD1.P 32(R3), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + +mulNeon_10x8Xor_store: + // Store 8 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + MOVD 120(R14), R6 + ADD R15<<3, R6 + VST1 [V10.D2, V11.D2], (R6) + MOVD 144(R14), R6 + ADD R15<<3, R6 + VST1 [V12.D2, V13.D2], (R6) + MOVD 168(R14), R6 + ADD R15<<3, R6 + VST1 [V14.D2, V15.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x8Xor_loop + +mulNeon_10x8Xor_end: + RET + +// func mulNeon_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x9(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 194 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x9_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V18.B[0] + VDUP V18.B[0], V18.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x9_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 9 outputs + VLD1.P 32(R1), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V22.B16, V0.B16 + VEOR V21.B16, V23.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V22.B16, V2.B16 + VEOR V21.B16, V23.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V22.B16, V4.B16 + VEOR V21.B16, V23.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V22.B16, V6.B16 + VEOR V21.B16, V23.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V22.B16, V8.B16 + VEOR V21.B16, V23.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V22.B16, V10.B16 + VEOR V21.B16, V23.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V22.B16, V12.B16 + VEOR V21.B16, V23.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V22.B16, V14.B16 + VEOR V21.B16, V23.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V22.B16, V16.B16 + VEOR V21.B16, V23.B16, V17.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x9_store + + // Load and process 32 bytes from input 1 to 9 outputs + VLD1.P 32(R4), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x9_store + + // Load and process 32 bytes from input 2 to 9 outputs + VLD1.P 32(R5), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x9_store + + // Load and process 32 bytes from input 3 to 9 outputs + VLD1.P 32(R8), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x9_store + + // Load and process 32 bytes from input 4 to 9 outputs + VLD1.P 32(R9), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x9_store + + // Load and process 32 bytes from input 5 to 9 outputs + VLD1.P 32(R10), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x9_store + + // Load and process 32 bytes from input 6 to 9 outputs + VLD1.P 32(R11), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x9_store + + // Load and process 32 bytes from input 7 to 9 outputs + VLD1.P 32(R12), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x9_store + + // Load and process 32 bytes from input 8 to 9 outputs + VLD1.P 32(R13), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x9_store + + // Load and process 32 bytes from input 9 to 9 outputs + VLD1.P 32(R3), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + +mulNeon_10x9_store: + // Store 9 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + MOVD 120(R14), R6 + ADD R15<<3, R6 + VST1 [V10.D2, V11.D2], (R6) + MOVD 144(R14), R6 + ADD R15<<3, R6 + VST1 [V12.D2, V13.D2], (R6) + MOVD 168(R14), R6 + ADD R15<<3, R6 + VST1 [V14.D2, V15.D2], (R6) + MOVD 192(R14), R6 + ADD R15<<3, R6 + VST1 [V16.D2, V17.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x9_loop + +mulNeon_10x9_end: + RET + +// func mulNeon_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x9Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 194 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x9Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V18.B[0] + VDUP V18.B[0], V18.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x9Xor_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 9 outputs + VLD1.P 32(R1), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + MOVD (R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V0.B16, V1.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + MOVD 24(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V2.B16, V3.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + MOVD 48(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V4.B16, V5.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + MOVD 72(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V6.B16, V7.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + MOVD 96(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V8.B16, V9.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + MOVD 120(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V10.B16, V11.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + MOVD 144(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V12.B16, V13.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + MOVD 168(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V14.B16, V15.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + MOVD 192(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V16.B16, V17.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x9Xor_store + + // Load and process 32 bytes from input 1 to 9 outputs + VLD1.P 32(R4), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x9Xor_store + + // Load and process 32 bytes from input 2 to 9 outputs + VLD1.P 32(R5), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x9Xor_store + + // Load and process 32 bytes from input 3 to 9 outputs + VLD1.P 32(R8), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x9Xor_store + + // Load and process 32 bytes from input 4 to 9 outputs + VLD1.P 32(R9), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x9Xor_store + + // Load and process 32 bytes from input 5 to 9 outputs + VLD1.P 32(R10), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x9Xor_store + + // Load and process 32 bytes from input 6 to 9 outputs + VLD1.P 32(R11), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x9Xor_store + + // Load and process 32 bytes from input 7 to 9 outputs + VLD1.P 32(R12), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x9Xor_store + + // Load and process 32 bytes from input 8 to 9 outputs + VLD1.P 32(R13), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x9Xor_store + + // Load and process 32 bytes from input 9 to 9 outputs + VLD1.P 32(R3), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + +mulNeon_10x9Xor_store: + // Store 9 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + MOVD 120(R14), R6 + ADD R15<<3, R6 + VST1 [V10.D2, V11.D2], (R6) + MOVD 144(R14), R6 + ADD R15<<3, R6 + VST1 [V12.D2, V13.D2], (R6) + MOVD 168(R14), R6 + ADD R15<<3, R6 + VST1 [V14.D2, V15.D2], (R6) + MOVD 192(R14), R6 + ADD R15<<3, R6 + VST1 [V16.D2, V17.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x9Xor_loop + +mulNeon_10x9Xor_end: + RET + +// func mulNeon_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x10(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 215 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x10_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V20.B[0] + VDUP V20.B[0], V20.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x10_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 10 outputs + VLD1.P 32(R1), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V0.B16 + VEOR V23.B16, V25.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V2.B16 + VEOR V23.B16, V25.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V4.B16 + VEOR V23.B16, V25.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V6.B16 + VEOR V23.B16, V25.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V8.B16 + VEOR V23.B16, V25.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V10.B16 + VEOR V23.B16, V25.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V12.B16 + VEOR V23.B16, V25.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V14.B16 + VEOR V23.B16, V25.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V16.B16 + VEOR V23.B16, V25.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V18.B16 + VEOR V23.B16, V25.B16, V19.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 1 to 10 outputs + VLD1.P 32(R4), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 2 to 10 outputs + VLD1.P 32(R5), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 3 to 10 outputs + VLD1.P 32(R8), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 4 to 10 outputs + VLD1.P 32(R9), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 5 to 10 outputs + VLD1.P 32(R10), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 6 to 10 outputs + VLD1.P 32(R11), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 7 to 10 outputs + VLD1.P 32(R12), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 8 to 10 outputs + VLD1.P 32(R13), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 9 to 10 outputs + VLD1.P 32(R3), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + +mulNeon_10x10_store: + // Store 10 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + MOVD 120(R14), R6 + ADD R15<<3, R6 + VST1 [V10.D2, V11.D2], (R6) + MOVD 144(R14), R6 + ADD R15<<3, R6 + VST1 [V12.D2, V13.D2], (R6) + MOVD 168(R14), R6 + ADD R15<<3, R6 + VST1 [V14.D2, V15.D2], (R6) + MOVD 192(R14), R6 + ADD R15<<3, R6 + VST1 [V16.D2, V17.D2], (R6) + MOVD 216(R14), R6 + ADD R15<<3, R6 + VST1 [V18.D2, V19.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x10_loop + +mulNeon_10x10_end: + RET + +// func mulNeon_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x10Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 215 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x10Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V20.B[0] + VDUP V20.B[0], V20.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x10Xor_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 10 outputs + VLD1.P 32(R1), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + MOVD (R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V0.B16, V1.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + MOVD 24(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V2.B16, V3.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + MOVD 48(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V4.B16, V5.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + MOVD 72(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V6.B16, V7.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + MOVD 96(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V8.B16, V9.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + MOVD 120(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V10.B16, V11.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + MOVD 144(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V12.B16, V13.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + MOVD 168(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V14.B16, V15.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + MOVD 192(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V16.B16, V17.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + MOVD 216(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V18.B16, V19.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 1 to 10 outputs + VLD1.P 32(R4), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 2 to 10 outputs + VLD1.P 32(R5), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 3 to 10 outputs + VLD1.P 32(R8), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 4 to 10 outputs + VLD1.P 32(R9), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 5 to 10 outputs + VLD1.P 32(R10), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 6 to 10 outputs + VLD1.P 32(R11), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 7 to 10 outputs + VLD1.P 32(R12), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 8 to 10 outputs + VLD1.P 32(R13), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 9 to 10 outputs + VLD1.P 32(R3), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + +mulNeon_10x10Xor_store: + // Store 10 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + MOVD 120(R14), R6 + ADD R15<<3, R6 + VST1 [V10.D2, V11.D2], (R6) + MOVD 144(R14), R6 + ADD R15<<3, R6 + VST1 [V12.D2, V13.D2], (R6) + MOVD 168(R14), R6 + ADD R15<<3, R6 + VST1 [V14.D2, V15.D2], (R6) + MOVD 192(R14), R6 + ADD R15<<3, R6 + VST1 [V16.D2, V17.D2], (R6) + MOVD 216(R14), R6 + ADD R15<<3, R6 + VST1 [V18.D2, V19.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x10Xor_loop + +mulNeon_10x10Xor_end: + RET + diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_none.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_none.go index 11929e6..3e25898 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_gen_none.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_none.go @@ -1,26 +1,19 @@ -//go:build !amd64 || noasm || appengine || gccgo || nogen -// +build !amd64 noasm appengine gccgo nogen +//go:build !(amd64 || arm64) || noasm || appengine || gccgo || nogen package reedsolomon -const maxAvx2Inputs = 1 -const maxAvx2Outputs = 1 -const minAvx2Size = 1 -const avxSizeMask = 0 -const avx2CodeGen = false +const ( + codeGen = false + codeGenMaxGoroutines = 8 + codeGenMaxInputs = 1 + codeGenMaxOutputs = 1 + minCodeGenSize = 1 +) -func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { - panic("codegen not available") +func (r *reedSolomon) hasCodeGen(int, int, int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false } -func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} - -func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} - -func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { - panic("codegen not available") +func (r *reedSolomon) canGFNI(int, int, int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false } diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_nopshufb_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_nopshufb_amd64.go new file mode 100644 index 0000000..298bf50 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_nopshufb_amd64.go @@ -0,0 +1,2264 @@ +// Code generated by command: go run gen.go -out ../galois_gen_nopshufb_amd64.s -stubs ../galois_gen_nopshufb_amd64.go -pkg=reedsolomon. DO NOT EDIT. + +//go:build !appengine && !noasm && !nogen && nopshufb && gc + +package reedsolomon + +func _dummy_() + +//go:noescape +func sSE2XorSlice(in []byte, out []byte) + +//go:noescape +func sSE2XorSlice_64(in []byte, out []byte) + +//go:noescape +func avx2XorSlice_64(in []byte, out []byte) + +// mulGFNI_1x1_64 takes 1 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x1 takes 1 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x1_64Xor takes 1 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x1Xor takes 1 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x2_64 takes 1 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x2 takes 1 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x2_64Xor takes 1 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x2Xor takes 1 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x3_64 takes 1 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x3 takes 1 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x3_64Xor takes 1 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x3Xor takes 1 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x4_64 takes 1 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x4 takes 1 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x4_64Xor takes 1 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x4Xor takes 1 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x5_64 takes 1 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x5 takes 1 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x5_64Xor takes 1 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x5Xor takes 1 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x6_64 takes 1 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x6 takes 1 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x6_64Xor takes 1 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x6Xor takes 1 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x7_64 takes 1 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x7 takes 1 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x7_64Xor takes 1 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x7Xor takes 1 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x8_64 takes 1 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x8 takes 1 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x8_64Xor takes 1 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x8Xor takes 1 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x9_64 takes 1 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x9 takes 1 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x9_64Xor takes 1 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x9Xor takes 1 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x10_64 takes 1 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x10 takes 1 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x10_64Xor takes 1 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x10Xor takes 1 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x1_64 takes 2 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x1 takes 2 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x1_64Xor takes 2 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x1Xor takes 2 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x2_64 takes 2 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x2 takes 2 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x2_64Xor takes 2 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x2Xor takes 2 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x3_64 takes 2 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x3 takes 2 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x3_64Xor takes 2 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x3Xor takes 2 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x4_64 takes 2 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x4 takes 2 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x4_64Xor takes 2 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x4Xor takes 2 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x5_64 takes 2 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x5 takes 2 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x5_64Xor takes 2 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x5Xor takes 2 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x6_64 takes 2 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x6 takes 2 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x6_64Xor takes 2 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x6Xor takes 2 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x7_64 takes 2 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x7 takes 2 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x7_64Xor takes 2 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x7Xor takes 2 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x8_64 takes 2 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x8 takes 2 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x8_64Xor takes 2 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x8Xor takes 2 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x9_64 takes 2 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x9 takes 2 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x9_64Xor takes 2 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x9Xor takes 2 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x10_64 takes 2 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x10 takes 2 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x10_64Xor takes 2 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x10Xor takes 2 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x1_64 takes 3 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x1 takes 3 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x1_64Xor takes 3 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x1Xor takes 3 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x2_64 takes 3 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x2 takes 3 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x2_64Xor takes 3 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x2Xor takes 3 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x3_64 takes 3 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x3 takes 3 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x3_64Xor takes 3 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x3Xor takes 3 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x4_64 takes 3 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x4 takes 3 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x4_64Xor takes 3 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x4Xor takes 3 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x5_64 takes 3 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x5 takes 3 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x5_64Xor takes 3 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x5Xor takes 3 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x6_64 takes 3 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x6 takes 3 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x6_64Xor takes 3 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x6Xor takes 3 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x7_64 takes 3 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x7 takes 3 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x7_64Xor takes 3 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x7Xor takes 3 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x8_64 takes 3 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x8 takes 3 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x8_64Xor takes 3 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x8Xor takes 3 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x9_64 takes 3 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x9 takes 3 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x9_64Xor takes 3 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x9Xor takes 3 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x10_64 takes 3 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x10 takes 3 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x10_64Xor takes 3 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x10Xor takes 3 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x1_64 takes 4 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x1 takes 4 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x1_64Xor takes 4 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x1Xor takes 4 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x2_64 takes 4 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x2 takes 4 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x2_64Xor takes 4 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x2Xor takes 4 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x3_64 takes 4 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x3 takes 4 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x3_64Xor takes 4 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x3Xor takes 4 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x4_64 takes 4 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x4 takes 4 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x4_64Xor takes 4 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x4Xor takes 4 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x5_64 takes 4 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x5 takes 4 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x5_64Xor takes 4 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x5Xor takes 4 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x6_64 takes 4 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x6 takes 4 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x6_64Xor takes 4 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x6Xor takes 4 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x7_64 takes 4 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x7 takes 4 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x7_64Xor takes 4 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x7Xor takes 4 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x8_64 takes 4 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x8 takes 4 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x8_64Xor takes 4 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x8Xor takes 4 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x9_64 takes 4 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x9 takes 4 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x9_64Xor takes 4 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x9Xor takes 4 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x10_64 takes 4 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x10 takes 4 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x10_64Xor takes 4 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x10Xor takes 4 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x1_64 takes 5 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x1 takes 5 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x1_64Xor takes 5 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x1Xor takes 5 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x2_64 takes 5 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x2 takes 5 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x2_64Xor takes 5 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x2Xor takes 5 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x3_64 takes 5 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x3 takes 5 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x3_64Xor takes 5 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x3Xor takes 5 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x4_64 takes 5 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x4 takes 5 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x4_64Xor takes 5 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x4Xor takes 5 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x5_64 takes 5 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x5 takes 5 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x5_64Xor takes 5 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x5Xor takes 5 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x6_64 takes 5 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x6 takes 5 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x6_64Xor takes 5 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x6Xor takes 5 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x7_64 takes 5 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x7 takes 5 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x7_64Xor takes 5 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x7Xor takes 5 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x8_64 takes 5 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x8 takes 5 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x8_64Xor takes 5 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x8Xor takes 5 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x9_64 takes 5 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x9 takes 5 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x9_64Xor takes 5 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x9Xor takes 5 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x10_64 takes 5 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x10 takes 5 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x10_64Xor takes 5 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x10Xor takes 5 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x1_64 takes 6 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x1 takes 6 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x1_64Xor takes 6 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x1Xor takes 6 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x2_64 takes 6 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x2 takes 6 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x2_64Xor takes 6 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x2Xor takes 6 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x3_64 takes 6 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x3 takes 6 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x3_64Xor takes 6 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x3Xor takes 6 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x4_64 takes 6 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x4 takes 6 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x4_64Xor takes 6 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x4Xor takes 6 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x5_64 takes 6 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x5 takes 6 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x5_64Xor takes 6 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x5Xor takes 6 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x6_64 takes 6 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x6 takes 6 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x6_64Xor takes 6 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x6Xor takes 6 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x7_64 takes 6 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x7 takes 6 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x7_64Xor takes 6 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x7Xor takes 6 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x8_64 takes 6 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x8 takes 6 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x8_64Xor takes 6 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x8Xor takes 6 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x9_64 takes 6 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x9 takes 6 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x9_64Xor takes 6 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x9Xor takes 6 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x10_64 takes 6 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x10 takes 6 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x10_64Xor takes 6 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x10Xor takes 6 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x1_64 takes 7 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x1 takes 7 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x1_64Xor takes 7 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x1Xor takes 7 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x2_64 takes 7 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x2 takes 7 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x2_64Xor takes 7 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x2Xor takes 7 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x3_64 takes 7 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x3 takes 7 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x3_64Xor takes 7 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x3Xor takes 7 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x4_64 takes 7 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x4 takes 7 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x4_64Xor takes 7 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x4Xor takes 7 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x5_64 takes 7 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x5 takes 7 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x5_64Xor takes 7 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x5Xor takes 7 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x6_64 takes 7 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x6 takes 7 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x6_64Xor takes 7 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x6Xor takes 7 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x7_64 takes 7 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x7 takes 7 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x7_64Xor takes 7 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x7Xor takes 7 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x8_64 takes 7 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x8 takes 7 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x8_64Xor takes 7 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x8Xor takes 7 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x9_64 takes 7 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x9 takes 7 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x9_64Xor takes 7 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x9Xor takes 7 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x10_64 takes 7 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x10 takes 7 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x10_64Xor takes 7 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x10Xor takes 7 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x1_64 takes 8 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x1 takes 8 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x1_64Xor takes 8 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x1Xor takes 8 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x2_64 takes 8 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x2 takes 8 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x2_64Xor takes 8 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x2Xor takes 8 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x3_64 takes 8 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x3 takes 8 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x3_64Xor takes 8 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x3Xor takes 8 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x4_64 takes 8 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x4 takes 8 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x4_64Xor takes 8 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x4Xor takes 8 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x5_64 takes 8 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x5 takes 8 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x5_64Xor takes 8 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x5Xor takes 8 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x6_64 takes 8 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x6 takes 8 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x6_64Xor takes 8 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x6Xor takes 8 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x7_64 takes 8 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x7 takes 8 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x7_64Xor takes 8 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x7Xor takes 8 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x8_64 takes 8 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x8 takes 8 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x8_64Xor takes 8 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x8Xor takes 8 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x9_64 takes 8 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x9 takes 8 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x9_64Xor takes 8 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x9Xor takes 8 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x10_64 takes 8 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x10 takes 8 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x10_64Xor takes 8 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x10Xor takes 8 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x1_64 takes 9 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x1 takes 9 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x1_64Xor takes 9 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x1Xor takes 9 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x2_64 takes 9 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x2 takes 9 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x2_64Xor takes 9 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x2Xor takes 9 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x3_64 takes 9 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x3 takes 9 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x3_64Xor takes 9 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x3Xor takes 9 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x4_64 takes 9 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x4 takes 9 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x4_64Xor takes 9 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x4Xor takes 9 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x5_64 takes 9 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x5 takes 9 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x5_64Xor takes 9 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x5Xor takes 9 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x6_64 takes 9 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x6 takes 9 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x6_64Xor takes 9 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x6Xor takes 9 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x7_64 takes 9 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x7 takes 9 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x7_64Xor takes 9 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x7Xor takes 9 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x8_64 takes 9 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x8 takes 9 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x8_64Xor takes 9 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x8Xor takes 9 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x9_64 takes 9 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x9 takes 9 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x9_64Xor takes 9 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x9Xor takes 9 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x10_64 takes 9 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x10 takes 9 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x10_64Xor takes 9 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x10Xor takes 9 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x1_64 takes 10 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x1 takes 10 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x1_64Xor takes 10 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x1Xor takes 10 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x2_64 takes 10 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x2 takes 10 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x2_64Xor takes 10 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x2Xor takes 10 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x3_64 takes 10 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x3 takes 10 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x3_64Xor takes 10 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x3Xor takes 10 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x4_64 takes 10 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x4 takes 10 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x4_64Xor takes 10 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x4Xor takes 10 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x5_64 takes 10 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x5 takes 10 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x5_64Xor takes 10 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x5Xor takes 10 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x6_64 takes 10 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x6 takes 10 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x6_64Xor takes 10 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x6Xor takes 10 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x7_64 takes 10 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x7 takes 10 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x7_64Xor takes 10 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x7Xor takes 10 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x8_64 takes 10 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x8 takes 10 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x8_64Xor takes 10 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x8Xor takes 10 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x9_64 takes 10 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x9 takes 10 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x9_64Xor takes 10 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x9Xor takes 10 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x10_64 takes 10 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x10 takes 10 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x10_64Xor takes 10 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x10Xor takes 10 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_nopshufb_amd64.s b/vendor/github.com/klauspost/reedsolomon/galois_gen_nopshufb_amd64.s new file mode 100644 index 0000000..5782759 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_nopshufb_amd64.s @@ -0,0 +1,67987 @@ +// Code generated by command: go run gen.go -out ../galois_gen_nopshufb_amd64.s -stubs ../galois_gen_nopshufb_amd64.go -pkg=reedsolomon. DO NOT EDIT. + +//go:build !appengine && !noasm && !nogen && nopshufb && gc + +#include "textflag.h" + +// func _dummy_() +TEXT ·_dummy_(SB), $0 +#ifdef GOAMD64_v4 +#define XOR3WAY(ignore, a, b, dst) \ + VPTERNLOGD $0x96, a, b, dst + +#else +#define XOR3WAY(ignore, a, b, dst) \ + VPXOR a, dst, dst \ + VPXOR b, dst, dst + +#endif + RET + +// sSE2XorSlice will XOR in with out and store in out. +// Processes 16 bytes/loop. + +// func sSE2XorSlice(in []byte, out []byte) +// Requires: SSE2 +TEXT ·sSE2XorSlice(SB), $0-48 + MOVQ in_base+0(FP), AX + MOVQ out_base+24(FP), CX + MOVQ in_len+8(FP), DX + SHRQ $0x04, DX + JZ end + +loop: + MOVOU (AX), X0 + MOVOU (CX), X1 + PXOR X0, X1 + MOVOU X1, (CX) + ADDQ $0x10, AX + ADDQ $0x10, CX + DECQ DX + JNZ loop + +end: + RET + +// sSE2XorSlice_64 will XOR in with out and store in out. +// Processes 64 bytes/loop. + +// func sSE2XorSlice_64(in []byte, out []byte) +// Requires: SSE2 +TEXT ·sSE2XorSlice_64(SB), $0-48 + MOVQ in_base+0(FP), AX + MOVQ out_base+24(FP), CX + MOVQ in_len+8(FP), DX + SHRQ $0x06, DX + JZ end + +loop: + MOVOU (AX), X0 + MOVOU 16(AX), X2 + MOVOU 32(AX), X4 + MOVOU 48(AX), X6 + MOVOU (CX), X1 + MOVOU 16(CX), X3 + MOVOU 32(CX), X5 + MOVOU 48(CX), X7 + PXOR X0, X1 + PXOR X2, X3 + PXOR X4, X5 + PXOR X6, X7 + MOVOU X1, (CX) + MOVOU X3, 16(CX) + MOVOU X5, 32(CX) + MOVOU X7, 48(CX) + ADDQ $0x40, AX + ADDQ $0x40, CX + DECQ DX + JNZ loop + +end: + RET + +// avx2XorSlice_64 will XOR in with out and store in out. +// Processes 64 bytes/loop. + +// func avx2XorSlice_64(in []byte, out []byte) +// Requires: AVX, AVX2 +TEXT ·avx2XorSlice_64(SB), $0-48 + MOVQ in_base+0(FP), AX + MOVQ out_base+24(FP), CX + MOVQ in_len+8(FP), DX + SHRQ $0x06, DX + JZ end + +loop: + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y2 + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y3 + VPXOR Y0, Y1, Y1 + VPXOR Y2, Y3, Y3 + VMOVDQU Y1, (CX) + VMOVDQU Y3, 32(CX) + ADDQ $0x40, AX + ADDQ $0x40, CX + DECQ DX + JNZ loop + +end: + VZEROUPPER + RET + +// func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x1_64_end + VBROADCASTF32X2 (CX), Z0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulGFNI_1x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (CX), Z1 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z1, Z1 + + // Store 1 outputs + VMOVDQU64 Z1, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x1_64_loop + VZEROUPPER + +mulGFNI_1x1_64_end: + RET + +// func mulAvxGFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x1_end + VBROADCASTSD (CX), Y0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulAvxGFNI_1x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y1 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y1, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x1_loop + VZEROUPPER + +mulAvxGFNI_1x1_end: + RET + +// func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulGFNI_1x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (DX), Z1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (CX), Z2 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z2, Z2 + VXORPD Z1, Z2, Z1 + + // Store 1 outputs + VMOVDQU64 Z1, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x1_64Xor_loop + VZEROUPPER + +mulGFNI_1x1_64Xor_end: + RET + +// func mulAvxGFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x1Xor_end + VBROADCASTSD (CX), Y0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulAvxGFNI_1x1Xor_loop: + // Load 1 outputs + VMOVDQU (DX), Y1 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y2 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y2, Y2 + VXORPD Y1, Y2, Y1 + + // Store 1 outputs + VMOVDQU Y1, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x1Xor_loop + VZEROUPPER + +mulAvxGFNI_1x1Xor_end: + RET + +// func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulGFNI_1x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (CX), Z3 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z3, Z2 + VGF2P8AFFINEQB $0x00, Z1, Z3, Z3 + + // Store 2 outputs + VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z3, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x2_64_loop + VZEROUPPER + +mulGFNI_1x2_64_end: + RET + +// func mulAvxGFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulAvxGFNI_1x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y2 + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + VMOVDQU Y3, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x2_loop + VZEROUPPER + +mulAvxGFNI_1x2_end: + RET + +// func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulGFNI_1x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (BX), Z2 + VMOVDQU64 (DX), Z3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (CX), Z4 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VXORPD Z2, Z5, Z2 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z5 + VXORPD Z3, Z5, Z3 + + // Store 2 outputs + VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z3, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x2_64Xor_loop + VZEROUPPER + +mulGFNI_1x2_64Xor_end: + RET + +// func mulAvxGFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulAvxGFNI_1x2Xor_loop: + // Load 2 outputs + VMOVDQU (BX), Y2 + VMOVDQU (DX), Y3 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y5 + VXORPD Y2, Y5, Y2 + VGF2P8AFFINEQB $0x00, Y1, Y4, Y5 + VXORPD Y3, Y5, Y3 + + // Store 2 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + VMOVDQU Y3, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x2Xor_loop + VZEROUPPER + +mulAvxGFNI_1x2Xor_end: + RET + +// func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulGFNI_1x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (CX), Z5 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z5, Z3 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z4 + VGF2P8AFFINEQB $0x00, Z2, Z5, Z5 + + // Store 3 outputs + VMOVDQU64 Z3, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x3_64_loop + VZEROUPPER + +mulGFNI_1x3_64_end: + RET + +// func mulAvxGFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulAvxGFNI_1x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y3 + VGF2P8AFFINEQB $0x00, Y1, Y5, Y4 + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y3, (BX) + ADDQ $0x20, BX + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x3_loop + VZEROUPPER + +mulAvxGFNI_1x3_end: + RET + +// func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulGFNI_1x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (BX), Z3 + VMOVDQU64 (SI), Z4 + VMOVDQU64 (DX), Z5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z7 + VXORPD Z3, Z7, Z3 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z5, Z7, Z5 + + // Store 3 outputs + VMOVDQU64 Z3, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x3_64Xor_loop + VZEROUPPER + +mulGFNI_1x3_64Xor_end: + RET + +// func mulAvxGFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x3Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulAvxGFNI_1x3Xor_loop: + // Load 3 outputs + VMOVDQU (BX), Y3 + VMOVDQU (SI), Y4 + VMOVDQU (DX), Y5 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y7 + VXORPD Y3, Y7, Y3 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 3 outputs + VMOVDQU Y3, (BX) + ADDQ $0x20, BX + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x3Xor_loop + VZEROUPPER + +mulAvxGFNI_1x3Xor_end: + RET + +// func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulGFNI_1x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (CX), Z7 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z1, Z7, Z5 + VGF2P8AFFINEQB $0x00, Z2, Z7, Z6 + VGF2P8AFFINEQB $0x00, Z3, Z7, Z7 + + // Store 4 outputs + VMOVDQU64 Z4, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z5, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x4_64_loop + VZEROUPPER + +mulGFNI_1x4_64_end: + RET + +// func mulAvxGFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x4(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulAvxGFNI_1x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y7, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y7, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + + // Store 4 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x4_loop + VZEROUPPER + +mulAvxGFNI_1x4_end: + RET + +// func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulGFNI_1x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (BX), Z4 + VMOVDQU64 (SI), Z5 + VMOVDQU64 (DI), Z6 + VMOVDQU64 (DX), Z7 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z9 + VXORPD Z4, Z9, Z4 + VGF2P8AFFINEQB $0x00, Z1, Z8, Z9 + VXORPD Z5, Z9, Z5 + VGF2P8AFFINEQB $0x00, Z2, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z3, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Store 4 outputs + VMOVDQU64 Z4, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z5, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x4_64Xor_loop + VZEROUPPER + +mulGFNI_1x4_64Xor_end: + RET + +// func mulAvxGFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x4Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulAvxGFNI_1x4Xor_loop: + // Load 4 outputs + VMOVDQU (BX), Y4 + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU (DX), Y7 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y9 + VXORPD Y4, Y9, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y9 + VXORPD Y5, Y9, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 4 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x4Xor_loop + VZEROUPPER + +mulAvxGFNI_1x4Xor_end: + RET + +// func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulGFNI_1x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z5 + VGF2P8AFFINEQB $0x00, Z1, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z2, Z9, Z7 + VGF2P8AFFINEQB $0x00, Z3, Z9, Z8 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z9 + + // Store 5 outputs + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z9, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x5_64_loop + VZEROUPPER + +mulGFNI_1x5_64_end: + RET + +// func mulAvxGFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x5(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulAvxGFNI_1x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y9, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + + // Store 5 outputs + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x5_loop + VZEROUPPER + +mulAvxGFNI_1x5_end: + RET + +// func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulGFNI_1x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (BX), Z5 + VMOVDQU64 (SI), Z6 + VMOVDQU64 (DI), Z7 + VMOVDQU64 (R8), Z8 + VMOVDQU64 (DX), Z9 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z10, Z11 + VXORPD Z5, Z11, Z5 + VGF2P8AFFINEQB $0x00, Z1, Z10, Z11 + VXORPD Z6, Z11, Z6 + VGF2P8AFFINEQB $0x00, Z2, Z10, Z11 + VXORPD Z7, Z11, Z7 + VGF2P8AFFINEQB $0x00, Z3, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z4, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Store 5 outputs + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z9, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x5_64Xor_loop + VZEROUPPER + +mulGFNI_1x5_64Xor_end: + RET + +// func mulAvxGFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x5Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulAvxGFNI_1x5Xor_loop: + // Load 5 outputs + VMOVDQU (BX), Y5 + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (DX), Y9 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y11 + VXORPD Y5, Y11, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y11 + VXORPD Y6, Y11, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y7, Y11, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 5 outputs + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x5Xor_loop + VZEROUPPER + +mulAvxGFNI_1x5Xor_end: + RET + +// func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulGFNI_1x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (CX), Z11 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z11, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z11, Z7 + VGF2P8AFFINEQB $0x00, Z2, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z3, Z11, Z9 + VGF2P8AFFINEQB $0x00, Z4, Z11, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z11, Z11 + + // Store 6 outputs + VMOVDQU64 Z6, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z7, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z8, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z9, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x6_64_loop + VZEROUPPER + +mulGFNI_1x6_64_end: + RET + +// func mulAvxGFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x6(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulAvxGFNI_1x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y11, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y11, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + + // Store 6 outputs + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x6_loop + VZEROUPPER + +mulAvxGFNI_1x6_end: + RET + +// func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulGFNI_1x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (BX), Z6 + VMOVDQU64 (SI), Z7 + VMOVDQU64 (DI), Z8 + VMOVDQU64 (R8), Z9 + VMOVDQU64 (R9), Z10 + VMOVDQU64 (DX), Z11 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z13 + VXORPD Z6, Z13, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z13 + VXORPD Z7, Z13, Z7 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z8, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 6 outputs + VMOVDQU64 Z6, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z7, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z8, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z9, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x6_64Xor_loop + VZEROUPPER + +mulGFNI_1x6_64Xor_end: + RET + +// func mulAvxGFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x6Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulAvxGFNI_1x6Xor_loop: + // Load 6 outputs + VMOVDQU (BX), Y6 + VMOVDQU (SI), Y7 + VMOVDQU (DI), Y8 + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (DX), Y11 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y6, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y7, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 6 outputs + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x6Xor_loop + VZEROUPPER + +mulAvxGFNI_1x6Xor_end: + RET + +// func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x7_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulGFNI_1x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (CX), Z13 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z13, Z7 + VGF2P8AFFINEQB $0x00, Z1, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z2, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z3, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z4, Z13, Z11 + VGF2P8AFFINEQB $0x00, Z5, Z13, Z12 + VGF2P8AFFINEQB $0x00, Z6, Z13, Z13 + + // Store 7 outputs + VMOVDQU64 Z7, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x7_64_loop + VZEROUPPER + +mulGFNI_1x7_64_end: + RET + +// func mulAvxGFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x7(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulAvxGFNI_1x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (CX), Y13 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y13, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y13, Y13 + + // Store 7 outputs + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x7_loop + VZEROUPPER + +mulAvxGFNI_1x7_end: + RET + +// func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x7_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulGFNI_1x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (BX), Z7 + VMOVDQU64 (SI), Z8 + VMOVDQU64 (DI), Z9 + VMOVDQU64 (R8), Z10 + VMOVDQU64 (R9), Z11 + VMOVDQU64 (R10), Z12 + VMOVDQU64 (DX), Z13 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (CX), Z14 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z14, Z15 + VXORPD Z7, Z15, Z7 + VGF2P8AFFINEQB $0x00, Z1, Z14, Z15 + VXORPD Z8, Z15, Z8 + VGF2P8AFFINEQB $0x00, Z2, Z14, Z15 + VXORPD Z9, Z15, Z9 + VGF2P8AFFINEQB $0x00, Z3, Z14, Z15 + VXORPD Z10, Z15, Z10 + VGF2P8AFFINEQB $0x00, Z4, Z14, Z15 + VXORPD Z11, Z15, Z11 + VGF2P8AFFINEQB $0x00, Z5, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z6, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Store 7 outputs + VMOVDQU64 Z7, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x7_64Xor_loop + VZEROUPPER + +mulGFNI_1x7_64Xor_end: + RET + +// func mulAvxGFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x7Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulAvxGFNI_1x7Xor_loop: + // Load 7 outputs + VMOVDQU (BX), Y7 + VMOVDQU (SI), Y8 + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (DX), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x7Xor_loop + VZEROUPPER + +mulAvxGFNI_1x7Xor_end: + RET + +// func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x8_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DX + + // Add start offset to input + ADDQ R12, CX + +mulGFNI_1x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z15, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z15, Z9 + VGF2P8AFFINEQB $0x00, Z2, Z15, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z15, Z11 + VGF2P8AFFINEQB $0x00, Z4, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z15, Z13 + VGF2P8AFFINEQB $0x00, Z6, Z15, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z15, Z15 + + // Store 8 outputs + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z9, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z10, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z12, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z13, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z14, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z15, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x8_64_loop + VZEROUPPER + +mulGFNI_1x8_64_end: + RET + +// func mulAvxGFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x8(SB), $0-88 + // Loading 6 of 8 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + +mulAvxGFNI_1x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y13, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD 56(CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 8 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x8_loop + VZEROUPPER + +mulAvxGFNI_1x8_end: + RET + +// func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x8_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DX + + // Add start offset to input + ADDQ R12, CX + +mulGFNI_1x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (BX), Z8 + VMOVDQU64 (SI), Z9 + VMOVDQU64 (DI), Z10 + VMOVDQU64 (R8), Z11 + VMOVDQU64 (R9), Z12 + VMOVDQU64 (R10), Z13 + VMOVDQU64 (R11), Z14 + VMOVDQU64 (DX), Z15 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z17 + VXORPD Z8, Z17, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z17 + VXORPD Z9, Z17, Z9 + VGF2P8AFFINEQB $0x00, Z2, Z16, Z17 + VXORPD Z10, Z17, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z17 + VXORPD Z11, Z17, Z11 + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 8 outputs + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z9, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z10, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z12, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z13, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z14, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z15, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x8_64Xor_loop + VZEROUPPER + +mulGFNI_1x8_64Xor_end: + RET + +// func mulAvxGFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x8Xor(SB), $0-88 + // Loading 6 of 8 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + +mulAvxGFNI_1x8Xor_loop: + // Load 8 outputs + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x8Xor_loop + VZEROUPPER + +mulAvxGFNI_1x8Xor_end: + RET + +// func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x9_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DX + + // Add start offset to input + ADDQ R13, CX + +mulGFNI_1x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (CX), Z17 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z17, Z9 + VGF2P8AFFINEQB $0x00, Z1, Z17, Z10 + VGF2P8AFFINEQB $0x00, Z2, Z17, Z11 + VGF2P8AFFINEQB $0x00, Z3, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z4, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z5, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z6, Z17, Z15 + VGF2P8AFFINEQB $0x00, Z7, Z17, Z16 + VGF2P8AFFINEQB $0x00, Z8, Z17, Z17 + + // Store 9 outputs + VMOVDQU64 Z9, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z15, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x9_64_loop + VZEROUPPER + +mulGFNI_1x9_64_end: + RET + +// func mulAvxGFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x9(SB), $0-88 + // Loading 5 of 9 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + +mulAvxGFNI_1x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y13, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y13, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD 64(CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 9 outputs + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x9_loop + VZEROUPPER + +mulAvxGFNI_1x9_end: + RET + +// func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x9_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DX + + // Add start offset to input + ADDQ R13, CX + +mulGFNI_1x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (BX), Z9 + VMOVDQU64 (SI), Z10 + VMOVDQU64 (DI), Z11 + VMOVDQU64 (R8), Z12 + VMOVDQU64 (R9), Z13 + VMOVDQU64 (R10), Z14 + VMOVDQU64 (R11), Z15 + VMOVDQU64 (R12), Z16 + VMOVDQU64 (DX), Z17 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z19 + VXORPD Z9, Z19, Z9 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z19 + VXORPD Z10, Z19, Z10 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z11, Z19, Z11 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z13, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 9 outputs + VMOVDQU64 Z9, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z15, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x9_64Xor_loop + VZEROUPPER + +mulGFNI_1x9_64Xor_end: + RET + +// func mulAvxGFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x9Xor(SB), $0-88 + // Loading 5 of 9 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + +mulAvxGFNI_1x9Xor_loop: + // Load 9 outputs + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x9Xor_loop + VZEROUPPER + +mulAvxGFNI_1x9Xor_end: + RET + +// func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x10_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DX + + // Add start offset to input + ADDQ R14, CX + +mulGFNI_1x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (CX), Z19 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z19, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z19, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z5, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z19, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z19, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z19, Z19 + + // Store 10 outputs + VMOVDQU64 Z10, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z16, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z17, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x10_64_loop + VZEROUPPER + +mulGFNI_1x10_64_end: + RET + +// func mulAvxGFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x10(SB), $0-88 + // Loading 4 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + +mulAvxGFNI_1x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y13, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y13, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y13, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y13, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD 72(CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 10 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x10_loop + VZEROUPPER + +mulAvxGFNI_1x10_end: + RET + +// func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x10_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DX + + // Add start offset to input + ADDQ R14, CX + +mulGFNI_1x10_64Xor_loop: + // Load 10 outputs + VMOVDQU64 (BX), Z10 + VMOVDQU64 (SI), Z11 + VMOVDQU64 (DI), Z12 + VMOVDQU64 (R8), Z13 + VMOVDQU64 (R9), Z14 + VMOVDQU64 (R10), Z15 + VMOVDQU64 (R11), Z16 + VMOVDQU64 (R12), Z17 + VMOVDQU64 (R13), Z18 + VMOVDQU64 (DX), Z19 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 + VXORPD Z10, Z21, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 + VXORPD Z11, Z21, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z12, Z21, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z13, Z21, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z14, Z21, Z14 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 10 outputs + VMOVDQU64 Z10, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z16, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z17, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x10_64Xor_loop + VZEROUPPER + +mulGFNI_1x10_64Xor_end: + RET + +// func mulAvxGFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x10Xor(SB), $0-88 + // Loading 4 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + +mulAvxGFNI_1x10Xor_loop: + // Load 10 outputs + VMOVDQU (SI), Y4 + VMOVDQU (DI), Y5 + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x10Xor_loop + VZEROUPPER + +mulAvxGFNI_1x10Xor_end: + RET + +// func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulGFNI_2x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z3 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z3, Z2 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (CX), Z3 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z1, Z3, Z3 + VXORPD Z2, Z3, Z2 + + // Store 1 outputs + VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x1_64_loop + VZEROUPPER + +mulGFNI_2x1_64_end: + RET + +// func mulAvxGFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulAvxGFNI_2x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y3 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y2 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Store 1 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x1_loop + VZEROUPPER + +mulAvxGFNI_2x1_end: + RET + +// func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulGFNI_2x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (BX), Z2 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z3 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z3, Z3 + VXORPD Z2, Z3, Z2 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (CX), Z3 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z1, Z3, Z3 + VXORPD Z2, Z3, Z2 + + // Store 1 outputs + VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x1_64Xor_loop + VZEROUPPER + +mulGFNI_2x1_64Xor_end: + RET + +// func mulAvxGFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulAvxGFNI_2x1Xor_loop: + // Load 1 outputs + VMOVDQU (BX), Y2 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y3 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Store 1 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x1Xor_loop + VZEROUPPER + +mulAvxGFNI_2x1Xor_end: + RET + +// func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulGFNI_2x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z4 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z5 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z3, Z6, Z7 + VXORPD Z5, Z7, Z5 + + // Store 2 outputs + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x2_64_loop + VZEROUPPER + +mulGFNI_2x2_64_end: + RET + +// func mulAvxGFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulAvxGFNI_2x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y5 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y3, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 2 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x2_loop + VZEROUPPER + +mulAvxGFNI_2x2_end: + RET + +// func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulGFNI_2x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (SI), Z4 + VMOVDQU64 (BX), Z5 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z3, Z6, Z7 + VXORPD Z5, Z7, Z5 + + // Store 2 outputs + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x2_64Xor_loop + VZEROUPPER + +mulGFNI_2x2_64Xor_end: + RET + +// func mulAvxGFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulAvxGFNI_2x2Xor_loop: + // Load 2 outputs + VMOVDQU (SI), Y4 + VMOVDQU (BX), Y5 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y3, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 2 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x2Xor_loop + VZEROUPPER + +mulAvxGFNI_2x2Xor_end: + RET + +// func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulGFNI_2x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z9, Z7 + VGF2P8AFFINEQB $0x00, Z2, Z9, Z8 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z3, Z9, Z10 + VXORPD Z6, Z10, Z6 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z10 + VXORPD Z7, Z10, Z7 + VGF2P8AFFINEQB $0x00, Z5, Z9, Z10 + VXORPD Z8, Z10, Z8 + + // Store 3 outputs + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x3_64_loop + VZEROUPPER + +mulGFNI_2x3_64_end: + RET + +// func mulAvxGFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulAvxGFNI_2x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y8 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Store 3 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x3_loop + VZEROUPPER + +mulAvxGFNI_2x3_end: + RET + +// func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulGFNI_2x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (SI), Z6 + VMOVDQU64 (DI), Z7 + VMOVDQU64 (BX), Z8 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z10 + VXORPD Z6, Z10, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z9, Z10 + VXORPD Z7, Z10, Z7 + VGF2P8AFFINEQB $0x00, Z2, Z9, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z3, Z9, Z10 + VXORPD Z6, Z10, Z6 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z10 + VXORPD Z7, Z10, Z7 + VGF2P8AFFINEQB $0x00, Z5, Z9, Z10 + VXORPD Z8, Z10, Z8 + + // Store 3 outputs + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x3_64Xor_loop + VZEROUPPER + +mulGFNI_2x3_64Xor_end: + RET + +// func mulAvxGFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x3Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulAvxGFNI_2x3Xor_loop: + // Load 3 outputs + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (BX), Y8 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Store 3 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x3Xor_loop + VZEROUPPER + +mulAvxGFNI_2x3Xor_end: + RET + +// func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulGFNI_2x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z9 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z12, Z11 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z8, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 4 outputs + VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x4_64_loop + VZEROUPPER + +mulGFNI_2x4_64_end: + RET + +// func mulAvxGFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x4(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulAvxGFNI_2x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y11 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 4 outputs + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x4_loop + VZEROUPPER + +mulAvxGFNI_2x4_end: + RET + +// func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulGFNI_2x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (SI), Z8 + VMOVDQU64 (DI), Z9 + VMOVDQU64 (R8), Z10 + VMOVDQU64 (BX), Z11 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z13 + VXORPD Z8, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z8, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 4 outputs + VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x4_64Xor_loop + VZEROUPPER + +mulGFNI_2x4_64Xor_end: + RET + +// func mulAvxGFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x4Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulAvxGFNI_2x4Xor_loop: + // Load 4 outputs + VMOVDQU (SI), Y8 + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (BX), Y11 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 4 outputs + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x4Xor_loop + VZEROUPPER + +mulAvxGFNI_2x4Xor_end: + RET + +// func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), BX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, BX + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, CX + +mulGFNI_2x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z15 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z15, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z15, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z15, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z15, Z14 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z5, Z15, Z16 + VXORPD Z10, Z16, Z10 + VGF2P8AFFINEQB $0x00, Z6, Z15, Z16 + VXORPD Z11, Z16, Z11 + VGF2P8AFFINEQB $0x00, Z7, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z8, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z9, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Store 5 outputs + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x5_64_loop + VZEROUPPER + +mulGFNI_2x5_64_end: + RET + +// func mulAvxGFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x5(SB), $0-88 + // Loading 9 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + +mulAvxGFNI_2x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x5_loop + VZEROUPPER + +mulAvxGFNI_2x5_end: + RET + +// func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), BX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, BX + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, CX + +mulGFNI_2x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (SI), Z10 + VMOVDQU64 (DI), Z11 + VMOVDQU64 (R8), Z12 + VMOVDQU64 (R9), Z13 + VMOVDQU64 (BX), Z14 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z15 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z15, Z16 + VXORPD Z10, Z16, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z15, Z16 + VXORPD Z11, Z16, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z5, Z15, Z16 + VXORPD Z10, Z16, Z10 + VGF2P8AFFINEQB $0x00, Z6, Z15, Z16 + VXORPD Z11, Z16, Z11 + VGF2P8AFFINEQB $0x00, Z7, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z8, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z9, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Store 5 outputs + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x5_64Xor_loop + VZEROUPPER + +mulGFNI_2x5_64Xor_end: + RET + +// func mulAvxGFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x5Xor(SB), $0-88 + // Loading 9 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + +mulAvxGFNI_2x5Xor_loop: + // Load 5 outputs + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x5Xor_loop + VZEROUPPER + +mulAvxGFNI_2x5Xor_end: + RET + +// func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), BX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, BX + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, CX + +mulGFNI_2x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z15 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z17 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z13, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 6 outputs + VMOVDQU64 Z12, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z13, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z14, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x6_64_loop + VZEROUPPER + +mulGFNI_2x6_64_end: + RET + +// func mulAvxGFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x6(SB), $0-88 + // Loading 8 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + +mulAvxGFNI_2x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x6_loop + VZEROUPPER + +mulAvxGFNI_2x6_end: + RET + +// func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), BX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, BX + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, CX + +mulGFNI_2x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (SI), Z12 + VMOVDQU64 (DI), Z13 + VMOVDQU64 (R8), Z14 + VMOVDQU64 (R9), Z15 + VMOVDQU64 (R10), Z16 + VMOVDQU64 (BX), Z17 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z19 + VXORPD Z13, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z13, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 6 outputs + VMOVDQU64 Z12, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z13, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z14, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x6_64Xor_loop + VZEROUPPER + +mulGFNI_2x6_64Xor_end: + RET + +// func mulAvxGFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x6Xor(SB), $0-88 + // Loading 8 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + +mulAvxGFNI_2x6Xor_loop: + // Load 6 outputs + VMOVDQU (DI), Y8 + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x6Xor_loop + VZEROUPPER + +mulAvxGFNI_2x6Xor_end: + RET + +// func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x7_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), BX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, BX + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, CX + +mulGFNI_2x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z21, Z14 + VGF2P8AFFINEQB $0x00, Z1, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z2, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z3, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z4, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z5, Z21, Z19 + VGF2P8AFFINEQB $0x00, Z6, Z21, Z20 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 + VXORPD Z14, Z22, Z14 + VGF2P8AFFINEQB $0x00, Z8, Z21, Z22 + VXORPD Z15, Z22, Z15 + VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 + VXORPD Z16, Z22, Z16 + VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 + VXORPD Z17, Z22, Z17 + VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z13, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Store 7 outputs + VMOVDQU64 Z14, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z20, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x7_64_loop + VZEROUPPER + +mulGFNI_2x7_64_end: + RET + +// func mulAvxGFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x7(SB), $0-88 + // Loading 7 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, DX + +mulAvxGFNI_2x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x7_loop + VZEROUPPER + +mulAvxGFNI_2x7_end: + RET + +// func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x7_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), BX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, BX + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, CX + +mulGFNI_2x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (SI), Z14 + VMOVDQU64 (DI), Z15 + VMOVDQU64 (R8), Z16 + VMOVDQU64 (R9), Z17 + VMOVDQU64 (R10), Z18 + VMOVDQU64 (R11), Z19 + VMOVDQU64 (BX), Z20 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z21, Z22 + VXORPD Z14, Z22, Z14 + VGF2P8AFFINEQB $0x00, Z1, Z21, Z22 + VXORPD Z15, Z22, Z15 + VGF2P8AFFINEQB $0x00, Z2, Z21, Z22 + VXORPD Z16, Z22, Z16 + VGF2P8AFFINEQB $0x00, Z3, Z21, Z22 + VXORPD Z17, Z22, Z17 + VGF2P8AFFINEQB $0x00, Z4, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z5, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z6, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 + VXORPD Z14, Z22, Z14 + VGF2P8AFFINEQB $0x00, Z8, Z21, Z22 + VXORPD Z15, Z22, Z15 + VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 + VXORPD Z16, Z22, Z16 + VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 + VXORPD Z17, Z22, Z17 + VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z13, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Store 7 outputs + VMOVDQU64 Z14, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z20, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x7_64Xor_loop + VZEROUPPER + +mulGFNI_2x7_64Xor_end: + RET + +// func mulAvxGFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x7Xor(SB), $0-88 + // Loading 7 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, DX + +mulAvxGFNI_2x7Xor_loop: + // Load 7 outputs + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x7Xor_loop + VZEROUPPER + +mulAvxGFNI_2x7Xor_end: + RET + +// func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x8_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, CX + +mulGFNI_2x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z17 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z19 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z21 + VGF2P8AFFINEQB $0x00, Z6, Z24, Z22 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z23 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z16, Z25, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z17, Z25, Z17 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 8 outputs + VMOVDQU64 Z16, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z17, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z18, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z19, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z20, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x8_64_loop + VZEROUPPER + +mulGFNI_2x8_64_end: + RET + +// func mulAvxGFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x8(SB), $0-88 + // Loading 6 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + +mulAvxGFNI_2x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x8_loop + VZEROUPPER + +mulAvxGFNI_2x8_end: + RET + +// func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x8_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, CX + +mulGFNI_2x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (SI), Z16 + VMOVDQU64 (DI), Z17 + VMOVDQU64 (R8), Z18 + VMOVDQU64 (R9), Z19 + VMOVDQU64 (R10), Z20 + VMOVDQU64 (R11), Z21 + VMOVDQU64 (R12), Z22 + VMOVDQU64 (BX), Z23 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z25 + VXORPD Z16, Z25, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z25 + VXORPD Z17, Z25, Z17 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z16, Z25, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z17, Z25, Z17 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 8 outputs + VMOVDQU64 Z16, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z17, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z18, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z19, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z20, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x8_64Xor_loop + VZEROUPPER + +mulGFNI_2x8_64Xor_end: + RET + +// func mulAvxGFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x8Xor(SB), $0-88 + // Loading 6 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + +mulAvxGFNI_2x8Xor_loop: + // Load 8 outputs + VMOVDQU (DI), Y6 + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x8Xor_loop + VZEROUPPER + +mulAvxGFNI_2x8Xor_end: + RET + +// func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x9_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, CX + +mulGFNI_2x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z27, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z27, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z27, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z27, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z27, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z27, Z23 + VGF2P8AFFINEQB $0x00, Z6, Z27, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z27, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z27, Z26 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 + VXORPD Z18, Z28, Z18 + VGF2P8AFFINEQB $0x00, Z10, Z27, Z28 + VXORPD Z19, Z28, Z19 + VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 + VXORPD Z20, Z28, Z20 + VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 + VXORPD Z21, Z28, Z21 + VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 + VXORPD Z22, Z28, Z22 + VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 + VXORPD Z23, Z28, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Store 9 outputs + VMOVDQU64 Z18, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z26, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x9_64_loop + VZEROUPPER + +mulGFNI_2x9_64_end: + RET + +// func mulAvxGFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x9(SB), $0-88 + // Loading 5 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + +mulAvxGFNI_2x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x9_loop + VZEROUPPER + +mulAvxGFNI_2x9_end: + RET + +// func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x9_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, CX + +mulGFNI_2x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (SI), Z18 + VMOVDQU64 (DI), Z19 + VMOVDQU64 (R8), Z20 + VMOVDQU64 (R9), Z21 + VMOVDQU64 (R10), Z22 + VMOVDQU64 (R11), Z23 + VMOVDQU64 (R12), Z24 + VMOVDQU64 (R13), Z25 + VMOVDQU64 (BX), Z26 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z27, Z28 + VXORPD Z18, Z28, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z27, Z28 + VXORPD Z19, Z28, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z27, Z28 + VXORPD Z20, Z28, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z27, Z28 + VXORPD Z21, Z28, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z27, Z28 + VXORPD Z22, Z28, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z27, Z28 + VXORPD Z23, Z28, Z23 + VGF2P8AFFINEQB $0x00, Z6, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 + VXORPD Z18, Z28, Z18 + VGF2P8AFFINEQB $0x00, Z10, Z27, Z28 + VXORPD Z19, Z28, Z19 + VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 + VXORPD Z20, Z28, Z20 + VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 + VXORPD Z21, Z28, Z21 + VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 + VXORPD Z22, Z28, Z22 + VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 + VXORPD Z23, Z28, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Store 9 outputs + VMOVDQU64 Z18, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z26, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x9_64Xor_loop + VZEROUPPER + +mulGFNI_2x9_64Xor_end: + RET + +// func mulAvxGFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x9Xor(SB), $0-88 + // Loading 5 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + +mulAvxGFNI_2x9Xor_loop: + // Load 9 outputs + VMOVDQU (DI), Y5 + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x9Xor_loop + VZEROUPPER + +mulAvxGFNI_2x9Xor_end: + RET + +// func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x10_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, CX + +mulGFNI_2x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + VMOVDQU64 Z20, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x10_64_loop + VZEROUPPER + +mulGFNI_2x10_64_end: + RET + +// func mulAvxGFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x10(SB), $8-88 + // Loading 4 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + +mulAvxGFNI_2x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x10_loop + VZEROUPPER + +mulAvxGFNI_2x10_end: + RET + +// func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x10_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, CX + +mulGFNI_2x10_64Xor_loop: + // Load 10 outputs + VMOVDQU64 (SI), Z20 + VMOVDQU64 (DI), Z21 + VMOVDQU64 (R8), Z22 + VMOVDQU64 (R9), Z23 + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (BX), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + VMOVDQU64 Z20, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x10_64Xor_loop + VZEROUPPER + +mulGFNI_2x10_64Xor_end: + RET + +// func mulAvxGFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x10Xor(SB), $8-88 + // Loading 4 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + +mulAvxGFNI_2x10Xor_loop: + // Load 10 outputs + VMOVDQU (DI), Y4 + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x10Xor_loop + VZEROUPPER + +mulAvxGFNI_2x10Xor_end: + RET + +// func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulGFNI_3x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z4 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z4, Z3 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z4 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (CX), Z4 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z2, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Store 1 outputs + VMOVDQU64 Z3, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x1_64_loop + VZEROUPPER + +mulGFNI_3x1_64_end: + RET + +// func mulAvxGFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulAvxGFNI_3x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y3 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Store 1 outputs + VMOVDQU Y3, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x1_loop + VZEROUPPER + +mulAvxGFNI_3x1_end: + RET + +// func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulGFNI_3x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (SI), Z3 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z4 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z4 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (CX), Z4 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z2, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Store 1 outputs + VMOVDQU64 Z3, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x1_64Xor_loop + VZEROUPPER + +mulGFNI_3x1_64Xor_end: + RET + +// func mulAvxGFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulAvxGFNI_3x1Xor_loop: + // Load 1 outputs + VMOVDQU (SI), Y3 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Store 1 outputs + VMOVDQU Y3, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x1Xor_loop + VZEROUPPER + +mulAvxGFNI_3x1Xor_end: + RET + +// func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulGFNI_3x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z8, Z7 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z3, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z5, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Store 2 outputs + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x2_64_loop + VZEROUPPER + +mulGFNI_3x2_64_end: + RET + +// func mulAvxGFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulAvxGFNI_3x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y7 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 2 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x2_loop + VZEROUPPER + +mulAvxGFNI_3x2_end: + RET + +// func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulGFNI_3x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (DI), Z6 + VMOVDQU64 (SI), Z7 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z3, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z5, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Store 2 outputs + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x2_64Xor_loop + VZEROUPPER + +mulGFNI_3x2_64Xor_end: + RET + +// func mulAvxGFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulAvxGFNI_3x2Xor_loop: + // Load 2 outputs + VMOVDQU (DI), Y6 + VMOVDQU (SI), Y7 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 2 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x2Xor_loop + VZEROUPPER + +mulAvxGFNI_3x2Xor_end: + RET + +// func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulGFNI_3x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z9 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z10 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z11 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z8, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 3 outputs + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x3_64_loop + VZEROUPPER + +mulGFNI_3x3_64_end: + RET + +// func mulAvxGFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulAvxGFNI_3x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y11 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 3 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x3_loop + VZEROUPPER + +mulAvxGFNI_3x3_end: + RET + +// func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulGFNI_3x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (DI), Z9 + VMOVDQU64 (R8), Z10 + VMOVDQU64 (SI), Z11 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z8, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 3 outputs + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x3_64Xor_loop + VZEROUPPER + +mulGFNI_3x3_64Xor_end: + RET + +// func mulAvxGFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x3Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulAvxGFNI_3x3Xor_loop: + // Load 3 outputs + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (SI), Y11 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 3 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x3Xor_loop + VZEROUPPER + +mulAvxGFNI_3x3Xor_end: + RET + +// func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), SI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, SI + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, CX + +mulGFNI_3x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z16 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z16, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z15 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z16 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z9, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z10, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z11, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 4 outputs + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x4_64_loop + VZEROUPPER + +mulGFNI_3x4_64_end: + RET + +// func mulAvxGFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x4(SB), $0-88 + // Loading 10 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + +mulAvxGFNI_3x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x4_loop + VZEROUPPER + +mulAvxGFNI_3x4_end: + RET + +// func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), SI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, SI + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, CX + +mulGFNI_3x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (DI), Z12 + VMOVDQU64 (R8), Z13 + VMOVDQU64 (R9), Z14 + VMOVDQU64 (SI), Z15 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z16 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z16 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z9, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z10, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z11, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 4 outputs + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x4_64Xor_loop + VZEROUPPER + +mulGFNI_3x4_64Xor_end: + RET + +// func mulAvxGFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x4Xor(SB), $0-88 + // Loading 10 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + +mulAvxGFNI_3x4Xor_loop: + // Load 4 outputs + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x4Xor_loop + VZEROUPPER + +mulAvxGFNI_3x4Xor_end: + RET + +// func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, CX + +mulGFNI_3x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z15 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z16 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z17 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z18 + VGF2P8AFFINEQB $0x00, Z4, Z20, Z19 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 5 outputs + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x5_64_loop + VZEROUPPER + +mulGFNI_3x5_64_end: + RET + +// func mulAvxGFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x5(SB), $0-88 + // Loading 9 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + +mulAvxGFNI_3x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x5_loop + VZEROUPPER + +mulAvxGFNI_3x5_end: + RET + +// func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, CX + +mulGFNI_3x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (DI), Z15 + VMOVDQU64 (R8), Z16 + VMOVDQU64 (R9), Z17 + VMOVDQU64 (R10), Z18 + VMOVDQU64 (SI), Z19 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 5 outputs + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x5_64Xor_loop + VZEROUPPER + +mulGFNI_3x5_64Xor_end: + RET + +// func mulAvxGFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x5Xor(SB), $0-88 + // Loading 9 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + +mulAvxGFNI_3x5Xor_loop: + // Load 5 outputs + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x5Xor_loop + VZEROUPPER + +mulAvxGFNI_3x5Xor_end: + RET + +// func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, CX + +mulGFNI_3x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z23 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 6 outputs + VMOVDQU64 Z18, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z19, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z21, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z22, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z23, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x6_64_loop + VZEROUPPER + +mulGFNI_3x6_64_end: + RET + +// func mulAvxGFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x6(SB), $0-88 + // Loading 8 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + +mulAvxGFNI_3x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x6_loop + VZEROUPPER + +mulAvxGFNI_3x6_end: + RET + +// func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, CX + +mulGFNI_3x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (DI), Z18 + VMOVDQU64 (R8), Z19 + VMOVDQU64 (R9), Z20 + VMOVDQU64 (R10), Z21 + VMOVDQU64 (R11), Z22 + VMOVDQU64 (SI), Z23 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 6 outputs + VMOVDQU64 Z18, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z19, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z21, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z22, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z23, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x6_64Xor_loop + VZEROUPPER + +mulGFNI_3x6_64Xor_end: + RET + +// func mulAvxGFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x6Xor(SB), $0-88 + // Loading 8 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + +mulAvxGFNI_3x6Xor_loop: + // Load 6 outputs + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x6Xor_loop + VZEROUPPER + +mulAvxGFNI_3x6Xor_end: + RET + +// func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x7_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, CX + +mulGFNI_3x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z28, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z28, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z28, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z28, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z28, Z27 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z7, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z16, Z28, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z18, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Store 7 outputs + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x7_64_loop + VZEROUPPER + +mulGFNI_3x7_64_end: + RET + +// func mulAvxGFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x7(SB), $0-88 + // Loading 7 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + +mulAvxGFNI_3x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x7_loop + VZEROUPPER + +mulAvxGFNI_3x7_end: + RET + +// func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x7_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, CX + +mulGFNI_3x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (DI), Z21 + VMOVDQU64 (R8), Z22 + VMOVDQU64 (R9), Z23 + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (SI), Z27 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z28, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z7, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z16, Z28, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z18, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Store 7 outputs + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x7_64Xor_loop + VZEROUPPER + +mulGFNI_3x7_64Xor_end: + RET + +// func mulAvxGFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x7Xor(SB), $0-88 + // Loading 7 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + +mulAvxGFNI_3x7Xor_loop: + // Load 7 outputs + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x7Xor_loop + VZEROUPPER + +mulAvxGFNI_3x7Xor_end: + RET + +// func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x8_64(SB), $0-88 + // Loading 22 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulGFNI_3x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x8_64_loop + VZEROUPPER + +mulGFNI_3x8_64_end: + RET + +// func mulAvxGFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x8(SB), $0-88 + // Loading 6 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulAvxGFNI_3x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x8_loop + VZEROUPPER + +mulAvxGFNI_3x8_end: + RET + +// func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x8_64Xor(SB), $0-88 + // Loading 22 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulGFNI_3x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (R8), Z22 + VMOVDQU64 (R9), Z23 + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (DI), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x8_64Xor_loop + VZEROUPPER + +mulGFNI_3x8_64Xor_end: + RET + +// func mulAvxGFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x8Xor(SB), $0-88 + // Loading 6 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulAvxGFNI_3x8Xor_loop: + // Load 8 outputs + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x8Xor_loop + VZEROUPPER + +mulAvxGFNI_3x8Xor_end: + RET + +// func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x9_64(SB), $8-88 + // Loading 21 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulGFNI_3x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x9_64_loop + VZEROUPPER + +mulGFNI_3x9_64_end: + RET + +// func mulAvxGFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x9(SB), $8-88 + // Loading 5 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulAvxGFNI_3x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x9_loop + VZEROUPPER + +mulAvxGFNI_3x9_end: + RET + +// func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x9_64Xor(SB), $8-88 + // Loading 21 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulGFNI_3x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (R8), Z21 + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (DI), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x9_64Xor_loop + VZEROUPPER + +mulGFNI_3x9_64Xor_end: + RET + +// func mulAvxGFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x9Xor(SB), $8-88 + // Loading 5 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulAvxGFNI_3x9Xor_loop: + // Load 9 outputs + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x9Xor_loop + VZEROUPPER + +mulAvxGFNI_3x9Xor_end: + RET + +// func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x10_64(SB), $8-88 + // Loading 20 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_3x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + VMOVDQU64 Z20, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_3x10_64_loop + VZEROUPPER + +mulGFNI_3x10_64_end: + RET + +// func mulAvxGFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x10(SB), $8-88 + // Loading 4 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_3x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_3x10_loop + VZEROUPPER + +mulAvxGFNI_3x10_end: + RET + +// func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x10_64Xor(SB), $8-88 + // Loading 20 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_3x10_64Xor_loop: + // Load 10 outputs + VMOVDQU64 (DI), Z20 + VMOVDQU64 (R8), Z21 + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (SI), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + VMOVDQU64 Z20, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_3x10_64Xor_loop + VZEROUPPER + +mulGFNI_3x10_64Xor_end: + RET + +// func mulAvxGFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x10Xor(SB), $8-88 + // Loading 4 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_3x10Xor_loop: + // Load 10 outputs + VMOVDQU (DI), Y4 + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_3x10Xor_loop + VZEROUPPER + +mulAvxGFNI_3x10Xor_end: + RET + +// func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulGFNI_4x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z5, Z4 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z5 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (CX), Z5 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z3, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Store 1 outputs + VMOVDQU64 Z4, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x1_64_loop + VZEROUPPER + +mulGFNI_4x1_64_end: + RET + +// func mulAvxGFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulAvxGFNI_4x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y4 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Store 1 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x1_loop + VZEROUPPER + +mulAvxGFNI_4x1_end: + RET + +// func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulGFNI_4x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (DI), Z4 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z5 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (CX), Z5 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z3, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Store 1 outputs + VMOVDQU64 Z4, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x1_64Xor_loop + VZEROUPPER + +mulGFNI_4x1_64Xor_end: + RET + +// func mulAvxGFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulAvxGFNI_4x1Xor_loop: + // Load 1 outputs + VMOVDQU (DI), Y4 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Store 1 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x1Xor_loop + VZEROUPPER + +mulAvxGFNI_4x1Xor_end: + RET + +// func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulGFNI_4x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z10, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z10, Z9 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z3, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z5, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z7, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Store 2 outputs + VMOVDQU64 Z8, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x2_64_loop + VZEROUPPER + +mulGFNI_4x2_64_end: + RET + +// func mulAvxGFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulAvxGFNI_4x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y9 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 2 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x2_loop + VZEROUPPER + +mulAvxGFNI_4x2_end: + RET + +// func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulGFNI_4x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R8), Z8 + VMOVDQU64 (DI), Z9 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z3, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z5, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z7, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Store 2 outputs + VMOVDQU64 Z8, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x2_64Xor_loop + VZEROUPPER + +mulGFNI_4x2_64Xor_end: + RET + +// func mulAvxGFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulAvxGFNI_4x2Xor_loop: + // Load 2 outputs + VMOVDQU (R8), Y8 + VMOVDQU (DI), Y9 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 2 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x2Xor_loop + VZEROUPPER + +mulAvxGFNI_4x2Xor_end: + RET + +// func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), DI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DI + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, CX + +mulGFNI_4x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z15 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z15, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z15, Z14 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z15 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z4, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z5, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z15 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z8, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z10, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z11, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Store 3 outputs + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x3_64_loop + VZEROUPPER + +mulGFNI_4x3_64_end: + RET + +// func mulAvxGFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x3(SB), $0-88 + // Loading 11 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + +mulAvxGFNI_4x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x3_loop + VZEROUPPER + +mulAvxGFNI_4x3_end: + RET + +// func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), DI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DI + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, CX + +mulGFNI_4x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R8), Z12 + VMOVDQU64 (R9), Z13 + VMOVDQU64 (DI), Z14 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z15 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z15 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z4, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z5, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z15 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z8, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z10, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z11, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Store 3 outputs + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x3_64Xor_loop + VZEROUPPER + +mulGFNI_4x3_64Xor_end: + RET + +// func mulAvxGFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x3Xor(SB), $0-88 + // Loading 11 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + +mulAvxGFNI_4x3Xor_loop: + // Load 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x3Xor_loop + VZEROUPPER + +mulAvxGFNI_4x3Xor_end: + RET + +// func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, CX + +mulGFNI_4x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z17 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z19 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 4 outputs + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x4_64_loop + VZEROUPPER + +mulGFNI_4x4_64_end: + RET + +// func mulAvxGFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x4(SB), $0-88 + // Loading 10 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + +mulAvxGFNI_4x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x4_loop + VZEROUPPER + +mulAvxGFNI_4x4_end: + RET + +// func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, CX + +mulGFNI_4x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R8), Z16 + VMOVDQU64 (R9), Z17 + VMOVDQU64 (R10), Z18 + VMOVDQU64 (DI), Z19 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 4 outputs + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x4_64Xor_loop + VZEROUPPER + +mulGFNI_4x4_64Xor_end: + RET + +// func mulAvxGFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x4Xor(SB), $0-88 + // Loading 10 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + +mulAvxGFNI_4x4Xor_loop: + // Load 4 outputs + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x4Xor_loop + VZEROUPPER + +mulAvxGFNI_4x4Xor_end: + RET + +// func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, CX + +mulGFNI_4x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z25 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z25, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z25, Z24 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z25 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z6, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z7, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z8, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z9, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z25 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (CX), Z25 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z16, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z17, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z18, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z19, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Store 5 outputs + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x5_64_loop + VZEROUPPER + +mulGFNI_4x5_64_end: + RET + +// func mulAvxGFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x5(SB), $0-88 + // Loading 9 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + +mulAvxGFNI_4x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x5_loop + VZEROUPPER + +mulAvxGFNI_4x5_end: + RET + +// func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, CX + +mulGFNI_4x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R8), Z20 + VMOVDQU64 (R9), Z21 + VMOVDQU64 (R10), Z22 + VMOVDQU64 (R11), Z23 + VMOVDQU64 (DI), Z24 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z25 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z25 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z6, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z7, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z8, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z9, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z25 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (CX), Z25 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z16, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z17, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z18, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z19, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Store 5 outputs + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x5_64Xor_loop + VZEROUPPER + +mulGFNI_4x5_64Xor_end: + RET + +// func mulAvxGFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x5Xor(SB), $0-88 + // Loading 9 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + +mulAvxGFNI_4x5Xor_loop: + // Load 5 outputs + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x5Xor_loop + VZEROUPPER + +mulAvxGFNI_4x5Xor_end: + RET + +// func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, CX + +mulGFNI_4x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x6_64_loop + VZEROUPPER + +mulGFNI_4x6_64_end: + RET + +// func mulAvxGFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x6(SB), $0-88 + // Loading 8 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + +mulAvxGFNI_4x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x6_loop + VZEROUPPER + +mulAvxGFNI_4x6_end: + RET + +// func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, CX + +mulGFNI_4x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R8), Z24 + VMOVDQU64 (R9), Z25 + VMOVDQU64 (R10), Z26 + VMOVDQU64 (R11), Z27 + VMOVDQU64 (R12), Z28 + VMOVDQU64 (DI), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x6_64Xor_loop + VZEROUPPER + +mulGFNI_4x6_64Xor_end: + RET + +// func mulAvxGFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x6Xor(SB), $0-88 + // Loading 8 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + +mulAvxGFNI_4x6Xor_loop: + // Load 6 outputs + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x6Xor_loop + VZEROUPPER + +mulAvxGFNI_4x6Xor_end: + RET + +// func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x7_64(SB), $0-88 + // Loading 23 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulGFNI_4x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x7_64_loop + VZEROUPPER + +mulGFNI_4x7_64_end: + RET + +// func mulAvxGFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x7(SB), $0-88 + // Loading 7 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulAvxGFNI_4x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x7_loop + VZEROUPPER + +mulAvxGFNI_4x7_end: + RET + +// func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x7_64Xor(SB), $0-88 + // Loading 23 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulGFNI_4x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (R9), Z23 + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R8), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x7_64Xor_loop + VZEROUPPER + +mulGFNI_4x7_64Xor_end: + RET + +// func mulAvxGFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x7Xor(SB), $0-88 + // Loading 7 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulAvxGFNI_4x7Xor_loop: + // Load 7 outputs + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x7Xor_loop + VZEROUPPER + +mulAvxGFNI_4x7Xor_end: + RET + +// func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x8_64(SB), $8-88 + // Loading 22 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulGFNI_4x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x8_64_loop + VZEROUPPER + +mulGFNI_4x8_64_end: + RET + +// func mulAvxGFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x8(SB), $8-88 + // Loading 6 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulAvxGFNI_4x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x8_loop + VZEROUPPER + +mulAvxGFNI_4x8_end: + RET + +// func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x8_64Xor(SB), $8-88 + // Loading 22 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulGFNI_4x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R8), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x8_64Xor_loop + VZEROUPPER + +mulGFNI_4x8_64Xor_end: + RET + +// func mulAvxGFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x8Xor(SB), $8-88 + // Loading 6 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulAvxGFNI_4x8Xor_loop: + // Load 8 outputs + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x8Xor_loop + VZEROUPPER + +mulAvxGFNI_4x8Xor_end: + RET + +// func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x9_64(SB), $8-88 + // Loading 21 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_4x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_4x9_64_loop + VZEROUPPER + +mulGFNI_4x9_64_end: + RET + +// func mulAvxGFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x9(SB), $8-88 + // Loading 5 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_4x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_4x9_loop + VZEROUPPER + +mulAvxGFNI_4x9_end: + RET + +// func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x9_64Xor(SB), $8-88 + // Loading 21 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_4x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (R8), Z21 + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (DI), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_4x9_64Xor_loop + VZEROUPPER + +mulGFNI_4x9_64Xor_end: + RET + +// func mulAvxGFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x9Xor(SB), $8-88 + // Loading 5 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_4x9Xor_loop: + // Load 9 outputs + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_4x9Xor_loop + VZEROUPPER + +mulAvxGFNI_4x9Xor_end: + RET + +// func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x10_64(SB), $0-88 + // Loading 20 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulGFNI_4x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU64 Z20, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU64 Z21, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU64 Z22, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU64 Z23, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU64 Z24, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU64 Z25, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU64 Z26, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU64 Z27, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU64 Z28, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU64 Z29, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x40, R9 + DECQ AX + JNZ mulGFNI_4x10_64_loop + VZEROUPPER + +mulGFNI_4x10_64_end: + RET + +// func mulAvxGFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x10(SB), $0-88 + // Loading 4 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulAvxGFNI_4x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y10, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y11, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y12, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU Y13, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxGFNI_4x10_loop + VZEROUPPER + +mulAvxGFNI_4x10_end: + RET + +// func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x10_64Xor(SB), $0-88 + // Loading 20 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulGFNI_4x10_64Xor_loop: + // Load 10 outputs + MOVQ (R8), R10 + VMOVDQU64 (R10)(R9*1), Z20 + MOVQ 24(R8), R10 + VMOVDQU64 (R10)(R9*1), Z21 + MOVQ 48(R8), R10 + VMOVDQU64 (R10)(R9*1), Z22 + MOVQ 72(R8), R10 + VMOVDQU64 (R10)(R9*1), Z23 + MOVQ 96(R8), R10 + VMOVDQU64 (R10)(R9*1), Z24 + MOVQ 120(R8), R10 + VMOVDQU64 (R10)(R9*1), Z25 + MOVQ 144(R8), R10 + VMOVDQU64 (R10)(R9*1), Z26 + MOVQ 168(R8), R10 + VMOVDQU64 (R10)(R9*1), Z27 + MOVQ 192(R8), R10 + VMOVDQU64 (R10)(R9*1), Z28 + MOVQ 216(R8), R10 + VMOVDQU64 (R10)(R9*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU64 Z20, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU64 Z21, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU64 Z22, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU64 Z23, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU64 Z24, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU64 Z25, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU64 Z26, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU64 Z27, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU64 Z28, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU64 Z29, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x40, R9 + DECQ AX + JNZ mulGFNI_4x10_64Xor_loop + VZEROUPPER + +mulGFNI_4x10_64Xor_end: + RET + +// func mulAvxGFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x10Xor(SB), $0-88 + // Loading 4 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulAvxGFNI_4x10Xor_loop: + // Load 10 outputs + MOVQ (R8), R10 + VMOVDQU (R10)(R9*1), Y4 + MOVQ 24(R8), R10 + VMOVDQU (R10)(R9*1), Y5 + MOVQ 48(R8), R10 + VMOVDQU (R10)(R9*1), Y6 + MOVQ 72(R8), R10 + VMOVDQU (R10)(R9*1), Y7 + MOVQ 96(R8), R10 + VMOVDQU (R10)(R9*1), Y8 + MOVQ 120(R8), R10 + VMOVDQU (R10)(R9*1), Y9 + MOVQ 144(R8), R10 + VMOVDQU (R10)(R9*1), Y10 + MOVQ 168(R8), R10 + VMOVDQU (R10)(R9*1), Y11 + MOVQ 192(R8), R10 + VMOVDQU (R10)(R9*1), Y12 + MOVQ 216(R8), R10 + VMOVDQU (R10)(R9*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y10, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y11, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y12, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU Y13, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxGFNI_4x10Xor_loop + VZEROUPPER + +mulAvxGFNI_4x10Xor_end: + RET + +// func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulGFNI_5x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z5 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Store 1 outputs + VMOVDQU64 Z5, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x1_64_loop + VZEROUPPER + +mulGFNI_5x1_64_end: + RET + +// func mulAvxGFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulAvxGFNI_5x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y5 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Store 1 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x1_loop + VZEROUPPER + +mulAvxGFNI_5x1_end: + RET + +// func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulGFNI_5x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R8), Z5 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Store 1 outputs + VMOVDQU64 Z5, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x1_64Xor_loop + VZEROUPPER + +mulGFNI_5x1_64Xor_end: + RET + +// func mulAvxGFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulAvxGFNI_5x1Xor_loop: + // Load 1 outputs + VMOVDQU (R8), Y5 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Store 1 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x1Xor_loop + VZEROUPPER + +mulAvxGFNI_5x1Xor_end: + RET + +// func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulGFNI_5x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z11 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z12 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z12 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z9, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 2 outputs + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x2_64_loop + VZEROUPPER + +mulGFNI_5x2_64_end: + RET + +// func mulAvxGFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulAvxGFNI_5x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y11 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 2 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x2_loop + VZEROUPPER + +mulAvxGFNI_5x2_end: + RET + +// func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulGFNI_5x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R9), Z10 + VMOVDQU64 (R8), Z11 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z12 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z12 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z9, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 2 outputs + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x2_64Xor_loop + VZEROUPPER + +mulGFNI_5x2_64Xor_end: + RET + +// func mulAvxGFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulAvxGFNI_5x2Xor_loop: + // Load 2 outputs + VMOVDQU (R9), Y10 + VMOVDQU (R8), Y11 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 2 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x2Xor_loop + VZEROUPPER + +mulAvxGFNI_5x2Xor_end: + RET + +// func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, CX + +mulGFNI_5x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z15 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z16 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z17 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z13, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z14, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 3 outputs + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x3_64_loop + VZEROUPPER + +mulGFNI_5x3_64_end: + RET + +// func mulAvxGFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x3(SB), $0-88 + // Loading 11 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + +mulAvxGFNI_5x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x3_loop + VZEROUPPER + +mulAvxGFNI_5x3_end: + RET + +// func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, CX + +mulGFNI_5x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R9), Z15 + VMOVDQU64 (R10), Z16 + VMOVDQU64 (R8), Z17 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z13, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z14, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 3 outputs + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x3_64Xor_loop + VZEROUPPER + +mulGFNI_5x3_64Xor_end: + RET + +// func mulAvxGFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x3Xor(SB), $0-88 + // Loading 11 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + +mulAvxGFNI_5x3Xor_loop: + // Load 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x3Xor_loop + VZEROUPPER + +mulAvxGFNI_5x3Xor_end: + RET + +// func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, CX + +mulGFNI_5x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z23 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z24 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z24 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z18, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z19, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 4 outputs + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z21, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z22, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z23, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x4_64_loop + VZEROUPPER + +mulGFNI_5x4_64_end: + RET + +// func mulAvxGFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x4(SB), $0-88 + // Loading 10 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, DX + +mulAvxGFNI_5x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x4_loop + VZEROUPPER + +mulAvxGFNI_5x4_end: + RET + +// func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, CX + +mulGFNI_5x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R9), Z20 + VMOVDQU64 (R10), Z21 + VMOVDQU64 (R11), Z22 + VMOVDQU64 (R8), Z23 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z24 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z24 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z18, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z19, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 4 outputs + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z21, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z22, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z23, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x4_64Xor_loop + VZEROUPPER + +mulGFNI_5x4_64Xor_end: + RET + +// func mulAvxGFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x4Xor(SB), $0-88 + // Loading 10 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, DX + +mulAvxGFNI_5x4Xor_loop: + // Load 4 outputs + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x4Xor_loop + VZEROUPPER + +mulAvxGFNI_5x4Xor_end: + RET + +// func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, CX + +mulGFNI_5x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x5_64_loop + VZEROUPPER + +mulGFNI_5x5_64_end: + RET + +// func mulAvxGFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x5(SB), $0-88 + // Loading 9 of 25 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + +mulAvxGFNI_5x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x5_loop + VZEROUPPER + +mulAvxGFNI_5x5_end: + RET + +// func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, CX + +mulGFNI_5x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R9), Z25 + VMOVDQU64 (R10), Z26 + VMOVDQU64 (R11), Z27 + VMOVDQU64 (R12), Z28 + VMOVDQU64 (R8), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x5_64Xor_loop + VZEROUPPER + +mulGFNI_5x5_64Xor_end: + RET + +// func mulAvxGFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x5Xor(SB), $0-88 + // Loading 9 of 25 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + +mulAvxGFNI_5x5Xor_loop: + // Load 5 outputs + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x5Xor_loop + VZEROUPPER + +mulAvxGFNI_5x5Xor_end: + RET + +// func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x6_64(SB), $0-88 + // Loading 24 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulGFNI_5x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x6_64_loop + VZEROUPPER + +mulGFNI_5x6_64_end: + RET + +// func mulAvxGFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x6(SB), $0-88 + // Loading 8 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulAvxGFNI_5x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x6_loop + VZEROUPPER + +mulAvxGFNI_5x6_end: + RET + +// func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x6_64Xor(SB), $0-88 + // Loading 24 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulGFNI_5x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R9), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x6_64Xor_loop + VZEROUPPER + +mulGFNI_5x6_64Xor_end: + RET + +// func mulAvxGFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x6Xor(SB), $0-88 + // Loading 8 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulAvxGFNI_5x6Xor_loop: + // Load 6 outputs + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x6Xor_loop + VZEROUPPER + +mulAvxGFNI_5x6Xor_end: + RET + +// func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x7_64(SB), $8-88 + // Loading 23 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulGFNI_5x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x7_64_loop + VZEROUPPER + +mulGFNI_5x7_64_end: + RET + +// func mulAvxGFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x7(SB), $8-88 + // Loading 7 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulAvxGFNI_5x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x7_loop + VZEROUPPER + +mulAvxGFNI_5x7_end: + RET + +// func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x7_64Xor(SB), $8-88 + // Loading 23 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulGFNI_5x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R9), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x7_64Xor_loop + VZEROUPPER + +mulGFNI_5x7_64Xor_end: + RET + +// func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x7Xor(SB), $8-88 + // Loading 7 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulAvxGFNI_5x7Xor_loop: + // Load 7 outputs + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x7Xor_loop + VZEROUPPER + +mulAvxGFNI_5x7Xor_end: + RET + +// func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x8_64(SB), $8-88 + // Loading 22 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_5x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_5x8_64_loop + VZEROUPPER + +mulGFNI_5x8_64_end: + RET + +// func mulAvxGFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x8(SB), $8-88 + // Loading 6 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_5x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_5x8_loop + VZEROUPPER + +mulAvxGFNI_5x8_end: + RET + +// func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x8_64Xor(SB), $8-88 + // Loading 22 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_5x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R8), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_5x8_64Xor_loop + VZEROUPPER + +mulGFNI_5x8_64Xor_end: + RET + +// func mulAvxGFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x8Xor(SB), $8-88 + // Loading 6 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_5x8Xor_loop: + // Load 8 outputs + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_5x8Xor_loop + VZEROUPPER + +mulAvxGFNI_5x8Xor_end: + RET + +// func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x9_64(SB), $0-88 + // Loading 21 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulGFNI_5x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU64 Z21, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU64 Z22, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU64 Z23, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU64 Z24, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU64 Z25, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU64 Z26, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU64 Z27, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU64 Z28, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU64 Z29, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x40, R10 + DECQ AX + JNZ mulGFNI_5x9_64_loop + VZEROUPPER + +mulGFNI_5x9_64_end: + RET + +// func mulAvxGFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x9(SB), $0-88 + // Loading 5 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x9_loop + VZEROUPPER + +mulAvxGFNI_5x9_end: + RET + +// func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x9_64Xor(SB), $0-88 + // Loading 21 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulGFNI_5x9_64Xor_loop: + // Load 9 outputs + MOVQ (R9), R11 + VMOVDQU64 (R11)(R10*1), Z21 + MOVQ 24(R9), R11 + VMOVDQU64 (R11)(R10*1), Z22 + MOVQ 48(R9), R11 + VMOVDQU64 (R11)(R10*1), Z23 + MOVQ 72(R9), R11 + VMOVDQU64 (R11)(R10*1), Z24 + MOVQ 96(R9), R11 + VMOVDQU64 (R11)(R10*1), Z25 + MOVQ 120(R9), R11 + VMOVDQU64 (R11)(R10*1), Z26 + MOVQ 144(R9), R11 + VMOVDQU64 (R11)(R10*1), Z27 + MOVQ 168(R9), R11 + VMOVDQU64 (R11)(R10*1), Z28 + MOVQ 192(R9), R11 + VMOVDQU64 (R11)(R10*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU64 Z21, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU64 Z22, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU64 Z23, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU64 Z24, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU64 Z25, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU64 Z26, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU64 Z27, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU64 Z28, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU64 Z29, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x40, R10 + DECQ AX + JNZ mulGFNI_5x9_64Xor_loop + VZEROUPPER + +mulGFNI_5x9_64Xor_end: + RET + +// func mulAvxGFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x9Xor(SB), $0-88 + // Loading 5 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x9Xor_loop: + // Load 9 outputs + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y5 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y9 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y10 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y11 + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y12 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x9Xor_loop + VZEROUPPER + +mulAvxGFNI_5x9Xor_end: + RET + +// func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x10_64(SB), $0-88 + // Loading 20 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulGFNI_5x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU64 Z20, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU64 Z21, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU64 Z22, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU64 Z23, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU64 Z24, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU64 Z25, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU64 Z26, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU64 Z27, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU64 Z28, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU64 Z29, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x40, R10 + DECQ AX + JNZ mulGFNI_5x10_64_loop + VZEROUPPER + +mulGFNI_5x10_64_end: + RET + +// func mulAvxGFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x10(SB), $0-88 + // Loading 4 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x10_loop + VZEROUPPER + +mulAvxGFNI_5x10_end: + RET + +// func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x10_64Xor(SB), $0-88 + // Loading 20 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulGFNI_5x10_64Xor_loop: + // Load 10 outputs + MOVQ (R9), R11 + VMOVDQU64 (R11)(R10*1), Z20 + MOVQ 24(R9), R11 + VMOVDQU64 (R11)(R10*1), Z21 + MOVQ 48(R9), R11 + VMOVDQU64 (R11)(R10*1), Z22 + MOVQ 72(R9), R11 + VMOVDQU64 (R11)(R10*1), Z23 + MOVQ 96(R9), R11 + VMOVDQU64 (R11)(R10*1), Z24 + MOVQ 120(R9), R11 + VMOVDQU64 (R11)(R10*1), Z25 + MOVQ 144(R9), R11 + VMOVDQU64 (R11)(R10*1), Z26 + MOVQ 168(R9), R11 + VMOVDQU64 (R11)(R10*1), Z27 + MOVQ 192(R9), R11 + VMOVDQU64 (R11)(R10*1), Z28 + MOVQ 216(R9), R11 + VMOVDQU64 (R11)(R10*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU64 Z20, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU64 Z21, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU64 Z22, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU64 Z23, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU64 Z24, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU64 Z25, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU64 Z26, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU64 Z27, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU64 Z28, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU64 Z29, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x40, R10 + DECQ AX + JNZ mulGFNI_5x10_64Xor_loop + VZEROUPPER + +mulGFNI_5x10_64Xor_end: + RET + +// func mulAvxGFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x10Xor(SB), $0-88 + // Loading 4 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x10Xor_loop: + // Load 10 outputs + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y4 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y5 + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y9 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y10 + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y11 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y12 + MOVQ 216(R9), R11 + VMOVDQU (R11)(R10*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x10Xor_loop + VZEROUPPER + +mulAvxGFNI_5x10Xor_end: + RET + +// func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulGFNI_6x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z7, Z6 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (CX), Z7 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z5, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Store 1 outputs + VMOVDQU64 Z6, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x1_64_loop + VZEROUPPER + +mulGFNI_6x1_64_end: + RET + +// func mulAvxGFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulAvxGFNI_6x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y6 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y5, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Store 1 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x1_loop + VZEROUPPER + +mulAvxGFNI_6x1_end: + RET + +// func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulGFNI_6x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R9), Z6 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (CX), Z7 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z5, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Store 1 outputs + VMOVDQU64 Z6, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x1_64Xor_loop + VZEROUPPER + +mulGFNI_6x1_64Xor_end: + RET + +// func mulAvxGFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulAvxGFNI_6x1Xor_loop: + // Load 1 outputs + VMOVDQU (R9), Y6 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y5, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Store 1 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x1Xor_loop + VZEROUPPER + +mulAvxGFNI_6x1Xor_end: + RET + +// func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulGFNI_6x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z14 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z14, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z14, Z13 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z14 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z14 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z14 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z14 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z9, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (CX), Z14 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z11, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Store 2 outputs + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x2_64_loop + VZEROUPPER + +mulGFNI_6x2_64_end: + RET + +// func mulAvxGFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulAvxGFNI_6x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x2_loop + VZEROUPPER + +mulAvxGFNI_6x2_end: + RET + +// func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulGFNI_6x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R10), Z12 + VMOVDQU64 (R9), Z13 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z14 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z14 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z14 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z14 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z14 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z9, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (CX), Z14 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z11, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Store 2 outputs + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x2_64Xor_loop + VZEROUPPER + +mulGFNI_6x2_64Xor_end: + RET + +// func mulAvxGFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulAvxGFNI_6x2Xor_loop: + // Load 2 outputs + VMOVDQU (R10), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x2Xor_loop + VZEROUPPER + +mulAvxGFNI_6x2Xor_end: + RET + +// func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, CX + +mulGFNI_6x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z21, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z21, Z20 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z21 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z4, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z5, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z21 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z8, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z21 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z21 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z14, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z16, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z17, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Store 3 outputs + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x3_64_loop + VZEROUPPER + +mulGFNI_6x3_64_end: + RET + +// func mulAvxGFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x3(SB), $0-88 + // Loading 11 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + +mulAvxGFNI_6x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x3_loop + VZEROUPPER + +mulAvxGFNI_6x3_end: + RET + +// func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, CX + +mulGFNI_6x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R10), Z18 + VMOVDQU64 (R11), Z19 + VMOVDQU64 (R9), Z20 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z21 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z4, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z5, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z21 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z8, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z21 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z21 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z14, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z16, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z17, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Store 3 outputs + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x3_64Xor_loop + VZEROUPPER + +mulGFNI_6x3_64Xor_end: + RET + +// func mulAvxGFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x3Xor(SB), $0-88 + // Loading 11 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + +mulAvxGFNI_6x3Xor_loop: + // Load 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x3Xor_loop + VZEROUPPER + +mulAvxGFNI_6x3Xor_end: + RET + +// func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, CX + +mulGFNI_6x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z28, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z28, Z27 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z28 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z28 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z28 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z16, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z18, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z21, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z22, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z23, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Store 4 outputs + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x4_64_loop + VZEROUPPER + +mulGFNI_6x4_64_end: + RET + +// func mulAvxGFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x4(SB), $0-88 + // Loading 10 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + +mulAvxGFNI_6x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x4_loop + VZEROUPPER + +mulAvxGFNI_6x4_end: + RET + +// func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, CX + +mulGFNI_6x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R9), Z27 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z28 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z28 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z28 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z16, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z18, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z21, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z22, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z23, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Store 4 outputs + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x4_64Xor_loop + VZEROUPPER + +mulGFNI_6x4_64Xor_end: + RET + +// func mulAvxGFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x4Xor(SB), $0-88 + // Loading 10 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + +mulAvxGFNI_6x4Xor_loop: + // Load 4 outputs + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x4Xor_loop + VZEROUPPER + +mulAvxGFNI_6x4Xor_end: + RET + +// func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x5_64(SB), $0-88 + // Loading 25 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulGFNI_6x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x5_64_loop + VZEROUPPER + +mulGFNI_6x5_64_end: + RET + +// func mulAvxGFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x5(SB), $0-88 + // Loading 9 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulAvxGFNI_6x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x5_loop + VZEROUPPER + +mulAvxGFNI_6x5_end: + RET + +// func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x5_64Xor(SB), $0-88 + // Loading 25 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulGFNI_6x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R10), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x5_64Xor_loop + VZEROUPPER + +mulGFNI_6x5_64Xor_end: + RET + +// func mulAvxGFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x5Xor(SB), $0-88 + // Loading 9 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulAvxGFNI_6x5Xor_loop: + // Load 5 outputs + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x5Xor_loop + VZEROUPPER + +mulAvxGFNI_6x5Xor_end: + RET + +// func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x6_64(SB), $8-88 + // Loading 24 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulGFNI_6x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x6_64_loop + VZEROUPPER + +mulGFNI_6x6_64_end: + RET + +// func mulAvxGFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x6(SB), $8-88 + // Loading 8 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulAvxGFNI_6x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x6_loop + VZEROUPPER + +mulAvxGFNI_6x6_end: + RET + +// func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x6_64Xor(SB), $8-88 + // Loading 24 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulGFNI_6x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R10), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x6_64Xor_loop + VZEROUPPER + +mulGFNI_6x6_64Xor_end: + RET + +// func mulAvxGFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x6Xor(SB), $8-88 + // Loading 8 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulAvxGFNI_6x6Xor_loop: + // Load 6 outputs + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x6Xor_loop + VZEROUPPER + +mulAvxGFNI_6x6Xor_end: + RET + +// func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x7_64(SB), $8-88 + // Loading 23 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_6x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_6x7_64_loop + VZEROUPPER + +mulGFNI_6x7_64_end: + RET + +// func mulAvxGFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x7(SB), $8-88 + // Loading 7 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_6x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_6x7_loop + VZEROUPPER + +mulAvxGFNI_6x7_end: + RET + +// func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x7_64Xor(SB), $8-88 + // Loading 23 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_6x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R9), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_6x7_64Xor_loop + VZEROUPPER + +mulGFNI_6x7_64Xor_end: + RET + +// func mulAvxGFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x7Xor(SB), $8-88 + // Loading 7 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_6x7Xor_loop: + // Load 7 outputs + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_6x7Xor_loop + VZEROUPPER + +mulAvxGFNI_6x7Xor_end: + RET + +// func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x8_64(SB), $0-88 + // Loading 22 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x8_64_loop + VZEROUPPER + +mulGFNI_6x8_64_end: + RET + +// func mulAvxGFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x8(SB), $0-88 + // Loading 6 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x8_loop + VZEROUPPER + +mulAvxGFNI_6x8_end: + RET + +// func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x8_64Xor(SB), $0-88 + // Loading 22 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x8_64Xor_loop: + // Load 8 outputs + MOVQ (R10), R12 + VMOVDQU64 (R12)(R11*1), Z22 + MOVQ 24(R10), R12 + VMOVDQU64 (R12)(R11*1), Z23 + MOVQ 48(R10), R12 + VMOVDQU64 (R12)(R11*1), Z24 + MOVQ 72(R10), R12 + VMOVDQU64 (R12)(R11*1), Z25 + MOVQ 96(R10), R12 + VMOVDQU64 (R12)(R11*1), Z26 + MOVQ 120(R10), R12 + VMOVDQU64 (R12)(R11*1), Z27 + MOVQ 144(R10), R12 + VMOVDQU64 (R12)(R11*1), Z28 + MOVQ 168(R10), R12 + VMOVDQU64 (R12)(R11*1), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x8_64Xor_loop + VZEROUPPER + +mulGFNI_6x8_64Xor_end: + RET + +// func mulAvxGFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x8Xor(SB), $0-88 + // Loading 6 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x8Xor_loop: + // Load 8 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x8Xor_loop + VZEROUPPER + +mulAvxGFNI_6x8Xor_end: + RET + +// func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x9_64(SB), $0-88 + // Loading 21 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU64 Z21, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x9_64_loop + VZEROUPPER + +mulGFNI_6x9_64_end: + RET + +// func mulAvxGFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x9(SB), $0-88 + // Loading 5 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x9_loop + VZEROUPPER + +mulAvxGFNI_6x9_end: + RET + +// func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x9_64Xor(SB), $0-88 + // Loading 21 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x9_64Xor_loop: + // Load 9 outputs + MOVQ (R10), R12 + VMOVDQU64 (R12)(R11*1), Z21 + MOVQ 24(R10), R12 + VMOVDQU64 (R12)(R11*1), Z22 + MOVQ 48(R10), R12 + VMOVDQU64 (R12)(R11*1), Z23 + MOVQ 72(R10), R12 + VMOVDQU64 (R12)(R11*1), Z24 + MOVQ 96(R10), R12 + VMOVDQU64 (R12)(R11*1), Z25 + MOVQ 120(R10), R12 + VMOVDQU64 (R12)(R11*1), Z26 + MOVQ 144(R10), R12 + VMOVDQU64 (R12)(R11*1), Z27 + MOVQ 168(R10), R12 + VMOVDQU64 (R12)(R11*1), Z28 + MOVQ 192(R10), R12 + VMOVDQU64 (R12)(R11*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU64 Z21, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x9_64Xor_loop + VZEROUPPER + +mulGFNI_6x9_64Xor_end: + RET + +// func mulAvxGFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x9Xor(SB), $0-88 + // Loading 5 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x9Xor_loop: + // Load 9 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y5 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x9Xor_loop + VZEROUPPER + +mulAvxGFNI_6x9Xor_end: + RET + +// func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x10_64(SB), $0-88 + // Loading 20 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU64 Z20, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z21, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x10_64_loop + VZEROUPPER + +mulGFNI_6x10_64_end: + RET + +// func mulAvxGFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x10(SB), $0-88 + // Loading 4 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x10_loop + VZEROUPPER + +mulAvxGFNI_6x10_end: + RET + +// func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x10_64Xor(SB), $0-88 + // Loading 20 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x10_64Xor_loop: + // Load 10 outputs + MOVQ (R10), R12 + VMOVDQU64 (R12)(R11*1), Z20 + MOVQ 24(R10), R12 + VMOVDQU64 (R12)(R11*1), Z21 + MOVQ 48(R10), R12 + VMOVDQU64 (R12)(R11*1), Z22 + MOVQ 72(R10), R12 + VMOVDQU64 (R12)(R11*1), Z23 + MOVQ 96(R10), R12 + VMOVDQU64 (R12)(R11*1), Z24 + MOVQ 120(R10), R12 + VMOVDQU64 (R12)(R11*1), Z25 + MOVQ 144(R10), R12 + VMOVDQU64 (R12)(R11*1), Z26 + MOVQ 168(R10), R12 + VMOVDQU64 (R12)(R11*1), Z27 + MOVQ 192(R10), R12 + VMOVDQU64 (R12)(R11*1), Z28 + MOVQ 216(R10), R12 + VMOVDQU64 (R12)(R11*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU64 Z20, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z21, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x10_64Xor_loop + VZEROUPPER + +mulGFNI_6x10_64Xor_end: + RET + +// func mulAvxGFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x10Xor(SB), $0-88 + // Loading 4 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x10Xor_loop: + // Load 10 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y4 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y5 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 216(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x10Xor_loop + VZEROUPPER + +mulAvxGFNI_6x10Xor_end: + RET + +// func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulGFNI_7x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z7 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z8 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z8 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Store 1 outputs + VMOVDQU64 Z7, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x1_64_loop + VZEROUPPER + +mulGFNI_7x1_64_end: + RET + +// func mulAvxGFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulAvxGFNI_7x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y7 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Store 1 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x1_loop + VZEROUPPER + +mulAvxGFNI_7x1_end: + RET + +// func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulGFNI_7x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R10), Z7 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z8 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z8 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Store 1 outputs + VMOVDQU64 Z7, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x1_64Xor_loop + VZEROUPPER + +mulGFNI_7x1_64Xor_end: + RET + +// func mulAvxGFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulAvxGFNI_7x1Xor_loop: + // Load 1 outputs + VMOVDQU (R10), Y7 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Store 1 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x1Xor_loop + VZEROUPPER + +mulAvxGFNI_7x1Xor_end: + RET + +// func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R10 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + ADDQ R12, R10 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, CX + +mulGFNI_7x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z16 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z14 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z15 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z16 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z16 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z16 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z16 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z9, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z16 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z11, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z13, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 2 outputs + VMOVDQU64 Z14, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x2_64_loop + VZEROUPPER + +mulGFNI_7x2_64_end: + RET + +// func mulAvxGFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x2(SB), $0-88 + // Loading 12 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + +mulAvxGFNI_7x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x2_loop + VZEROUPPER + +mulAvxGFNI_7x2_end: + RET + +// func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R10 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + ADDQ R12, R10 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, CX + +mulGFNI_7x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R11), Z14 + VMOVDQU64 (R10), Z15 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z16 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z16 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z16 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z16 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z16 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z9, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z16 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z11, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z13, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 2 outputs + VMOVDQU64 Z14, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x2_64Xor_loop + VZEROUPPER + +mulGFNI_7x2_64Xor_end: + RET + +// func mulAvxGFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x2Xor(SB), $0-88 + // Loading 12 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + +mulAvxGFNI_7x2Xor_loop: + // Load 2 outputs + VMOVDQU (R12), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x2Xor_loop + VZEROUPPER + +mulAvxGFNI_7x2Xor_end: + RET + +// func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, CX + +mulGFNI_7x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z23 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z24 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z24 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z24 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z24 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 3 outputs + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x3_64_loop + VZEROUPPER + +mulGFNI_7x3_64_end: + RET + +// func mulAvxGFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x3(SB), $0-88 + // Loading 11 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + +mulAvxGFNI_7x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x3_loop + VZEROUPPER + +mulAvxGFNI_7x3_end: + RET + +// func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, CX + +mulGFNI_7x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R11), Z21 + VMOVDQU64 (R12), Z22 + VMOVDQU64 (R10), Z23 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z24 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z24 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z24 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z24 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 3 outputs + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x3_64Xor_loop + VZEROUPPER + +mulGFNI_7x3_64Xor_end: + RET + +// func mulAvxGFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x3Xor(SB), $0-88 + // Loading 11 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + +mulAvxGFNI_7x3Xor_loop: + // Load 3 outputs + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x3Xor_loop + VZEROUPPER + +mulAvxGFNI_7x3Xor_end: + RET + +// func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x4_64(SB), $0-88 + // Loading 26 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulGFNI_7x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x4_64_loop + VZEROUPPER + +mulGFNI_7x4_64_end: + RET + +// func mulAvxGFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x4(SB), $0-88 + // Loading 10 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulAvxGFNI_7x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x4_loop + VZEROUPPER + +mulAvxGFNI_7x4_end: + RET + +// func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x4_64Xor(SB), $0-88 + // Loading 26 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulGFNI_7x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R11), Z29 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x4_64Xor_loop + VZEROUPPER + +mulGFNI_7x4_64Xor_end: + RET + +// func mulAvxGFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x4Xor(SB), $0-88 + // Loading 10 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulAvxGFNI_7x4Xor_loop: + // Load 4 outputs + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x4Xor_loop + VZEROUPPER + +mulAvxGFNI_7x4Xor_end: + RET + +// func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x5_64(SB), $8-88 + // Loading 25 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulGFNI_7x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x5_64_loop + VZEROUPPER + +mulGFNI_7x5_64_end: + RET + +// func mulAvxGFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x5(SB), $8-88 + // Loading 9 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulAvxGFNI_7x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x5_loop + VZEROUPPER + +mulAvxGFNI_7x5_end: + RET + +// func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x5_64Xor(SB), $8-88 + // Loading 25 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulGFNI_7x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R11), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x5_64Xor_loop + VZEROUPPER + +mulGFNI_7x5_64Xor_end: + RET + +// func mulAvxGFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x5Xor(SB), $8-88 + // Loading 9 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulAvxGFNI_7x5Xor_loop: + // Load 5 outputs + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x5Xor_loop + VZEROUPPER + +mulAvxGFNI_7x5Xor_end: + RET + +// func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x6_64(SB), $8-88 + // Loading 24 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_7x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_7x6_64_loop + VZEROUPPER + +mulGFNI_7x6_64_end: + RET + +// func mulAvxGFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x6(SB), $8-88 + // Loading 8 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_7x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_7x6_loop + VZEROUPPER + +mulAvxGFNI_7x6_end: + RET + +// func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x6_64Xor(SB), $8-88 + // Loading 24 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_7x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R10), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_7x6_64Xor_loop + VZEROUPPER + +mulGFNI_7x6_64Xor_end: + RET + +// func mulAvxGFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x6Xor(SB), $8-88 + // Loading 8 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_7x6Xor_loop: + // Load 6 outputs + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_7x6Xor_loop + VZEROUPPER + +mulAvxGFNI_7x6Xor_end: + RET + +// func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x7_64(SB), $0-88 + // Loading 23 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x7_64_loop + VZEROUPPER + +mulGFNI_7x7_64_end: + RET + +// func mulAvxGFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x7(SB), $0-88 + // Loading 7 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x7_loop + VZEROUPPER + +mulAvxGFNI_7x7_end: + RET + +// func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x7_64Xor(SB), $0-88 + // Loading 23 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x7_64Xor_loop: + // Load 7 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z23 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z24 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z25 + MOVQ 72(R11), R13 + VMOVDQU64 (R13)(R12*1), Z26 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z27 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z28 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x7_64Xor_loop + VZEROUPPER + +mulGFNI_7x7_64Xor_end: + RET + +// func mulAvxGFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x7Xor(SB), $0-88 + // Loading 7 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x7Xor_loop: + // Load 7 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x7Xor_loop + VZEROUPPER + +mulAvxGFNI_7x7Xor_end: + RET + +// func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x8_64(SB), $0-88 + // Loading 22 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x8_64_loop + VZEROUPPER + +mulGFNI_7x8_64_end: + RET + +// func mulAvxGFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x8(SB), $0-88 + // Loading 6 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x8_loop + VZEROUPPER + +mulAvxGFNI_7x8_end: + RET + +// func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x8_64Xor(SB), $0-88 + // Loading 22 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x8_64Xor_loop: + // Load 8 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z22 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z23 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z24 + MOVQ 72(R11), R13 + VMOVDQU64 (R13)(R12*1), Z25 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z26 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z27 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z28 + MOVQ 168(R11), R13 + VMOVDQU64 (R13)(R12*1), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x8_64Xor_loop + VZEROUPPER + +mulGFNI_7x8_64Xor_end: + RET + +// func mulAvxGFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x8Xor(SB), $0-88 + // Loading 6 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x8Xor_loop: + // Load 8 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x8Xor_loop + VZEROUPPER + +mulAvxGFNI_7x8Xor_end: + RET + +// func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x9_64(SB), $0-88 + // Loading 21 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU64 Z21, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x9_64_loop + VZEROUPPER + +mulGFNI_7x9_64_end: + RET + +// func mulAvxGFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x9(SB), $0-88 + // Loading 5 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x9_loop + VZEROUPPER + +mulAvxGFNI_7x9_end: + RET + +// func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x9_64Xor(SB), $0-88 + // Loading 21 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x9_64Xor_loop: + // Load 9 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z21 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z22 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z23 + MOVQ 72(R11), R13 + VMOVDQU64 (R13)(R12*1), Z24 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z25 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z26 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z27 + MOVQ 168(R11), R13 + VMOVDQU64 (R13)(R12*1), Z28 + MOVQ 192(R11), R13 + VMOVDQU64 (R13)(R12*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU64 Z21, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x9_64Xor_loop + VZEROUPPER + +mulGFNI_7x9_64Xor_end: + RET + +// func mulAvxGFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x9Xor(SB), $0-88 + // Loading 5 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x9Xor_loop: + // Load 9 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y5 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x9Xor_loop + VZEROUPPER + +mulAvxGFNI_7x9Xor_end: + RET + +// func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x10_64(SB), $0-88 + // Loading 20 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU64 Z20, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z21, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x10_64_loop + VZEROUPPER + +mulGFNI_7x10_64_end: + RET + +// func mulAvxGFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x10(SB), $0-88 + // Loading 4 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x10_loop + VZEROUPPER + +mulAvxGFNI_7x10_end: + RET + +// func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x10_64Xor(SB), $0-88 + // Loading 20 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x10_64Xor_loop: + // Load 10 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z20 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z21 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z22 + MOVQ 72(R11), R13 + VMOVDQU64 (R13)(R12*1), Z23 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z24 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z25 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z26 + MOVQ 168(R11), R13 + VMOVDQU64 (R13)(R12*1), Z27 + MOVQ 192(R11), R13 + VMOVDQU64 (R13)(R12*1), Z28 + MOVQ 216(R11), R13 + VMOVDQU64 (R13)(R12*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU64 Z20, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z21, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x10_64Xor_loop + VZEROUPPER + +mulGFNI_7x10_64Xor_end: + RET + +// func mulAvxGFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x10Xor(SB), $0-88 + // Loading 4 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x10Xor_loop: + // Load 10 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y4 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 216(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x10Xor_loop + VZEROUPPER + +mulAvxGFNI_7x10Xor_end: + RET + +// func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulGFNI_8x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z8 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z9 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z7, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Store 1 outputs + VMOVDQU64 Z8, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x1_64_loop + VZEROUPPER + +mulGFNI_8x1_64_end: + RET + +// func mulAvxGFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulAvxGFNI_8x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y8 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y7, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Store 1 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x1_loop + VZEROUPPER + +mulAvxGFNI_8x1_end: + RET + +// func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulGFNI_8x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R11), Z8 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z9 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z7, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Store 1 outputs + VMOVDQU64 Z8, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x1_64Xor_loop + VZEROUPPER + +mulGFNI_8x1_64Xor_end: + RET + +// func mulAvxGFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulAvxGFNI_8x1Xor_loop: + // Load 1 outputs + VMOVDQU (R11), Y8 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y7, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Store 1 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x1Xor_loop + VZEROUPPER + +mulAvxGFNI_8x1Xor_end: + RET + +// func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, CX + +mulGFNI_8x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z17 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z18 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z18 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z18 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z13, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z14, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z15, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 2 outputs + VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x2_64_loop + VZEROUPPER + +mulGFNI_8x2_64_end: + RET + +// func mulAvxGFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x2(SB), $0-88 + // Loading 12 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + +mulAvxGFNI_8x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x2_loop + VZEROUPPER + +mulAvxGFNI_8x2_end: + RET + +// func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, CX + +mulGFNI_8x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R12), Z16 + VMOVDQU64 (R11), Z17 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z18 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z18 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z18 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z13, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z14, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z15, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 2 outputs + VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x2_64Xor_loop + VZEROUPPER + +mulGFNI_8x2_64Xor_end: + RET + +// func mulAvxGFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x2Xor(SB), $0-88 + // Loading 12 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + +mulAvxGFNI_8x2Xor_loop: + // Load 2 outputs + VMOVDQU (R13), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x2Xor_loop + VZEROUPPER + +mulAvxGFNI_8x2Xor_end: + RET + +// func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, CX + +mulGFNI_8x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z27, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z27, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z27, Z26 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z27 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z27 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z27 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z10, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z27 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z27 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z27 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z21, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z22, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z23, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Store 3 outputs + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z26, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x3_64_loop + VZEROUPPER + +mulGFNI_8x3_64_end: + RET + +// func mulAvxGFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x3(SB), $0-88 + // Loading 11 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + +mulAvxGFNI_8x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x3_loop + VZEROUPPER + +mulAvxGFNI_8x3_end: + RET + +// func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, CX + +mulGFNI_8x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R12), Z24 + VMOVDQU64 (R13), Z25 + VMOVDQU64 (R11), Z26 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z27 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z27 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z27 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z10, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z27 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z27 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z27 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z21, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z22, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z23, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Store 3 outputs + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z26, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x3_64Xor_loop + VZEROUPPER + +mulGFNI_8x3_64Xor_end: + RET + +// func mulAvxGFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x3Xor(SB), $0-88 + // Loading 11 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + +mulAvxGFNI_8x3Xor_loop: + // Load 3 outputs + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x3Xor_loop + VZEROUPPER + +mulAvxGFNI_8x3Xor_end: + RET + +// func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x4_64(SB), $8-88 + // Loading 26 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulGFNI_8x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x4_64_loop + VZEROUPPER + +mulGFNI_8x4_64_end: + RET + +// func mulAvxGFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x4(SB), $8-88 + // Loading 10 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulAvxGFNI_8x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x4_loop + VZEROUPPER + +mulAvxGFNI_8x4_end: + RET + +// func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x4_64Xor(SB), $8-88 + // Loading 26 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulGFNI_8x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R12), Z29 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x4_64Xor_loop + VZEROUPPER + +mulGFNI_8x4_64Xor_end: + RET + +// func mulAvxGFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x4Xor(SB), $8-88 + // Loading 10 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulAvxGFNI_8x4Xor_loop: + // Load 4 outputs + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x4Xor_loop + VZEROUPPER + +mulAvxGFNI_8x4Xor_end: + RET + +// func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x5_64(SB), $8-88 + // Loading 25 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_8x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_8x5_64_loop + VZEROUPPER + +mulGFNI_8x5_64_end: + RET + +// func mulAvxGFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x5(SB), $8-88 + // Loading 9 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_8x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_8x5_loop + VZEROUPPER + +mulAvxGFNI_8x5_end: + RET + +// func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x5_64Xor(SB), $8-88 + // Loading 25 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_8x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R11), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_8x5_64Xor_loop + VZEROUPPER + +mulGFNI_8x5_64Xor_end: + RET + +// func mulAvxGFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x5Xor(SB), $8-88 + // Loading 9 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_8x5Xor_loop: + // Load 5 outputs + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_8x5Xor_loop + VZEROUPPER + +mulAvxGFNI_8x5Xor_end: + RET + +// func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x6_64(SB), $0-88 + // Loading 24 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x6_64_loop + VZEROUPPER + +mulGFNI_8x6_64_end: + RET + +// func mulAvxGFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x6(SB), $0-88 + // Loading 8 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x6_loop + VZEROUPPER + +mulAvxGFNI_8x6_end: + RET + +// func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x6_64Xor(SB), $0-88 + // Loading 24 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x6_64Xor_loop: + // Load 6 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z27 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z28 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x6_64Xor_loop + VZEROUPPER + +mulGFNI_8x6_64Xor_end: + RET + +// func mulAvxGFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x6Xor(SB), $0-88 + // Loading 8 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x6Xor_loop: + // Load 6 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x6Xor_loop + VZEROUPPER + +mulAvxGFNI_8x6Xor_end: + RET + +// func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x7_64(SB), $0-88 + // Loading 23 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x7_64_loop + VZEROUPPER + +mulGFNI_8x7_64_end: + RET + +// func mulAvxGFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x7(SB), $0-88 + // Loading 7 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x7_loop + VZEROUPPER + +mulAvxGFNI_8x7_end: + RET + +// func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x7_64Xor(SB), $0-88 + // Loading 23 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x7_64Xor_loop: + // Load 7 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z23 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z27 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z28 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x7_64Xor_loop + VZEROUPPER + +mulGFNI_8x7_64Xor_end: + RET + +// func mulAvxGFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x7Xor(SB), $0-88 + // Loading 7 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x7Xor_loop: + // Load 7 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x7Xor_loop + VZEROUPPER + +mulAvxGFNI_8x7Xor_end: + RET + +// func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x8_64(SB), $0-88 + // Loading 22 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x8_64_loop + VZEROUPPER + +mulGFNI_8x8_64_end: + RET + +// func mulAvxGFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x8(SB), $0-88 + // Loading 6 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x8_loop + VZEROUPPER + +mulAvxGFNI_8x8_end: + RET + +// func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x8_64Xor(SB), $0-88 + // Loading 22 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x8_64Xor_loop: + // Load 8 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z22 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z23 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z27 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z28 + MOVQ 168(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x8_64Xor_loop + VZEROUPPER + +mulGFNI_8x8_64Xor_end: + RET + +// func mulAvxGFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x8Xor(SB), $0-88 + // Loading 6 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x8Xor_loop: + // Load 8 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x8Xor_loop + VZEROUPPER + +mulAvxGFNI_8x8Xor_end: + RET + +// func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x9_64(SB), $0-88 + // Loading 21 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU64 Z21, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x9_64_loop + VZEROUPPER + +mulGFNI_8x9_64_end: + RET + +// func mulAvxGFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x9(SB), $0-88 + // Loading 5 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x9_loop + VZEROUPPER + +mulAvxGFNI_8x9_end: + RET + +// func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x9_64Xor(SB), $0-88 + // Loading 21 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x9_64Xor_loop: + // Load 9 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z21 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z22 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z23 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z27 + MOVQ 168(R12), R14 + VMOVDQU64 (R14)(R13*1), Z28 + MOVQ 192(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU64 Z21, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x9_64Xor_loop + VZEROUPPER + +mulGFNI_8x9_64Xor_end: + RET + +// func mulAvxGFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x9Xor(SB), $0-88 + // Loading 5 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x9Xor_loop: + // Load 9 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y5 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x9Xor_loop + VZEROUPPER + +mulAvxGFNI_8x9Xor_end: + RET + +// func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x10_64(SB), $0-88 + // Loading 20 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU64 Z20, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z21, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x10_64_loop + VZEROUPPER + +mulGFNI_8x10_64_end: + RET + +// func mulAvxGFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x10(SB), $0-88 + // Loading 4 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x10_loop + VZEROUPPER + +mulAvxGFNI_8x10_end: + RET + +// func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x10_64Xor(SB), $0-88 + // Loading 20 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x10_64Xor_loop: + // Load 10 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z20 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z21 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z22 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z23 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 168(R12), R14 + VMOVDQU64 (R14)(R13*1), Z27 + MOVQ 192(R12), R14 + VMOVDQU64 (R14)(R13*1), Z28 + MOVQ 216(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU64 Z20, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z21, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x10_64Xor_loop + VZEROUPPER + +mulGFNI_8x10_64Xor_end: + RET + +// func mulAvxGFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x10Xor(SB), $0-88 + // Loading 4 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x10Xor_loop: + // Load 10 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y4 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 216(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x10Xor_loop + VZEROUPPER + +mulAvxGFNI_8x10Xor_end: + RET + +// func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulGFNI_9x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z10, Z9 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z10 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z10 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (R11), Z10 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z7, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Store 1 outputs + VMOVDQU64 Z9, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x1_64_loop + VZEROUPPER + +mulGFNI_9x1_64_end: + RET + +// func mulAvxGFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulAvxGFNI_9x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y9 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Store 1 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x1_loop + VZEROUPPER + +mulAvxGFNI_9x1_end: + RET + +// func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulGFNI_9x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R12), Z9 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z10 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z10 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (R11), Z10 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z7, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Store 1 outputs + VMOVDQU64 Z9, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x1_64Xor_loop + VZEROUPPER + +mulGFNI_9x1_64Xor_end: + RET + +// func mulAvxGFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulAvxGFNI_9x1Xor_loop: + // Load 1 outputs + VMOVDQU (R12), Y9 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Store 1 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x1Xor_loop + VZEROUPPER + +mulAvxGFNI_9x1Xor_end: + RET + +// func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, CX + +mulGFNI_9x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z19 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z20 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z20 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z20 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z20 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z20 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z16, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z17, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 2 outputs + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x2_64_loop + VZEROUPPER + +mulGFNI_9x2_64_end: + RET + +// func mulAvxGFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x2(SB), $0-88 + // Loading 12 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + +mulAvxGFNI_9x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x2_loop + VZEROUPPER + +mulAvxGFNI_9x2_end: + RET + +// func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, CX + +mulGFNI_9x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R13), Z18 + VMOVDQU64 (R12), Z19 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z20 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z20 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z20 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z20 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z20 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z16, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z17, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 2 outputs + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x2_64Xor_loop + VZEROUPPER + +mulGFNI_9x2_64Xor_end: + RET + +// func mulAvxGFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x2Xor(SB), $0-88 + // Loading 12 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + +mulAvxGFNI_9x2Xor_loop: + // Load 2 outputs + VMOVDQU (R14), Y12 + VMOVDQU (R13), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x2Xor_loop + VZEROUPPER + +mulAvxGFNI_9x2Xor_end: + RET + +// func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, CX + +mulGFNI_9x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x3_64_loop + VZEROUPPER + +mulGFNI_9x3_64_end: + RET + +// func mulAvxGFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x3(SB), $8-88 + // Loading 11 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + +mulAvxGFNI_9x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x3_loop + VZEROUPPER + +mulAvxGFNI_9x3_end: + RET + +// func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, CX + +mulGFNI_9x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R12), Z29 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x3_64Xor_loop + VZEROUPPER + +mulGFNI_9x3_64Xor_end: + RET + +// func mulAvxGFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x3Xor(SB), $8-88 + // Loading 11 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + +mulAvxGFNI_9x3Xor_loop: + // Load 3 outputs + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R13), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x3Xor_loop + VZEROUPPER + +mulAvxGFNI_9x3Xor_end: + RET + +// func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x4_64(SB), $8-88 + // Loading 26 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_9x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_9x4_64_loop + VZEROUPPER + +mulGFNI_9x4_64_end: + RET + +// func mulAvxGFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x4(SB), $8-88 + // Loading 10 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_9x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_9x4_loop + VZEROUPPER + +mulAvxGFNI_9x4_end: + RET + +// func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x4_64Xor(SB), $8-88 + // Loading 26 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_9x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R12), Z29 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_9x4_64Xor_loop + VZEROUPPER + +mulGFNI_9x4_64Xor_end: + RET + +// func mulAvxGFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x4Xor(SB), $8-88 + // Loading 10 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_9x4Xor_loop: + // Load 4 outputs + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_9x4Xor_loop + VZEROUPPER + +mulAvxGFNI_9x4Xor_end: + RET + +// func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x5_64(SB), $0-88 + // Loading 25 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x5_64_loop + VZEROUPPER + +mulGFNI_9x5_64_end: + RET + +// func mulAvxGFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x5(SB), $0-88 + // Loading 9 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x5_loop + VZEROUPPER + +mulAvxGFNI_9x5_end: + RET + +// func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x5_64Xor(SB), $0-88 + // Loading 25 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x5_64Xor_loop: + // Load 5 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x5_64Xor_loop + VZEROUPPER + +mulGFNI_9x5_64Xor_end: + RET + +// func mulAvxGFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x5Xor(SB), $0-88 + // Loading 9 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x5Xor_loop: + // Load 5 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x5Xor_loop + VZEROUPPER + +mulAvxGFNI_9x5Xor_end: + RET + +// func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x6_64(SB), $0-88 + // Loading 24 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x6_64_loop + VZEROUPPER + +mulGFNI_9x6_64_end: + RET + +// func mulAvxGFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x6(SB), $0-88 + // Loading 8 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x6_loop + VZEROUPPER + +mulAvxGFNI_9x6_end: + RET + +// func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x6_64Xor(SB), $0-88 + // Loading 24 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x6_64Xor_loop: + // Load 6 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z24 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x6_64Xor_loop + VZEROUPPER + +mulGFNI_9x6_64Xor_end: + RET + +// func mulAvxGFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x6Xor(SB), $0-88 + // Loading 8 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x6Xor_loop: + // Load 6 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x6Xor_loop + VZEROUPPER + +mulAvxGFNI_9x6Xor_end: + RET + +// func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x7_64(SB), $0-88 + // Loading 23 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x7_64_loop + VZEROUPPER + +mulGFNI_9x7_64_end: + RET + +// func mulAvxGFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x7(SB), $0-88 + // Loading 7 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x7_loop + VZEROUPPER + +mulAvxGFNI_9x7_end: + RET + +// func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x7_64Xor(SB), $0-88 + // Loading 23 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x7_64Xor_loop: + // Load 7 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z23 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z24 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 144(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x7_64Xor_loop + VZEROUPPER + +mulGFNI_9x7_64Xor_end: + RET + +// func mulAvxGFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x7Xor(SB), $0-88 + // Loading 7 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x7Xor_loop: + // Load 7 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x7Xor_loop + VZEROUPPER + +mulAvxGFNI_9x7Xor_end: + RET + +// func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x8_64(SB), $0-88 + // Loading 22 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x8_64_loop + VZEROUPPER + +mulGFNI_9x8_64_end: + RET + +// func mulAvxGFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x8(SB), $0-88 + // Loading 6 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x8_loop + VZEROUPPER + +mulAvxGFNI_9x8_end: + RET + +// func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x8_64Xor(SB), $0-88 + // Loading 22 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x8_64Xor_loop: + // Load 8 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z22 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z23 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z24 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 144(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 168(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x8_64Xor_loop + VZEROUPPER + +mulGFNI_9x8_64Xor_end: + RET + +// func mulAvxGFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x8Xor(SB), $0-88 + // Loading 6 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x8Xor_loop: + // Load 8 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x8Xor_loop + VZEROUPPER + +mulAvxGFNI_9x8Xor_end: + RET + +// func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x9_64(SB), $0-88 + // Loading 21 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU64 Z21, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x9_64_loop + VZEROUPPER + +mulGFNI_9x9_64_end: + RET + +// func mulAvxGFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x9(SB), $0-88 + // Loading 5 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x9_loop + VZEROUPPER + +mulAvxGFNI_9x9_end: + RET + +// func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x9_64Xor(SB), $0-88 + // Loading 21 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x9_64Xor_loop: + // Load 9 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z21 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z22 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z23 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z24 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 144(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 168(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 192(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU64 Z21, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x9_64Xor_loop + VZEROUPPER + +mulGFNI_9x9_64Xor_end: + RET + +// func mulAvxGFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x9Xor(SB), $0-88 + // Loading 5 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x9Xor_loop: + // Load 9 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y5 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x9Xor_loop + VZEROUPPER + +mulAvxGFNI_9x9Xor_end: + RET + +// func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x10_64(SB), $0-88 + // Loading 20 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU64 Z20, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z21, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x10_64_loop + VZEROUPPER + +mulGFNI_9x10_64_end: + RET + +// func mulAvxGFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x10(SB), $0-88 + // Loading 4 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x10_loop + VZEROUPPER + +mulAvxGFNI_9x10_end: + RET + +// func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x10_64Xor(SB), $0-88 + // Loading 20 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x10_64Xor_loop: + // Load 10 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z20 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z21 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z22 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z23 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z24 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 144(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 168(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 192(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 216(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU64 Z20, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z21, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x10_64Xor_loop + VZEROUPPER + +mulGFNI_9x10_64Xor_end: + RET + +// func mulAvxGFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x10Xor(SB), $0-88 + // Loading 4 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x10Xor_loop: + // Load 10 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y4 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 216(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x10Xor_loop + VZEROUPPER + +mulAvxGFNI_9x10Xor_end: + RET + +// func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulGFNI_10x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z11 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z11, Z10 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z11 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z11 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z11 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z11 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z11 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z11 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (R11), Z11 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z7, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU64 (R12), Z11 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z8, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 9 to 1 outputs + VMOVDQU64 (CX), Z11 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Store 1 outputs + VMOVDQU64 Z10, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x1_64_loop + VZEROUPPER + +mulGFNI_10x1_64_end: + RET + +// func mulAvxGFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulAvxGFNI_10x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y10 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VGF2P8AFFINEQB $0x00, Y8, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 9 to 1 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y9, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Store 1 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x1_loop + VZEROUPPER + +mulAvxGFNI_10x1_end: + RET + +// func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulGFNI_10x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R13), Z10 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z11 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z11 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z11 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z11 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z11 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z11 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z11 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (R11), Z11 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z7, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU64 (R12), Z11 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z8, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 9 to 1 outputs + VMOVDQU64 (CX), Z11 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Store 1 outputs + VMOVDQU64 Z10, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x1_64Xor_loop + VZEROUPPER + +mulGFNI_10x1_64Xor_end: + RET + +// func mulAvxGFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulAvxGFNI_10x1Xor_loop: + // Load 1 outputs + VMOVDQU (R13), Y10 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VGF2P8AFFINEQB $0x00, Y8, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 9 to 1 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y9, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Store 1 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x1Xor_loop + VZEROUPPER + +mulAvxGFNI_10x1Xor_end: + RET + +// func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, CX + +mulGFNI_10x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z22 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z22, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z22, Z21 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z22 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z22 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z22 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z7, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z22 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z22 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z22 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z22 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z14, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z15, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (R12), Z22 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z16, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z17, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 9 to 2 outputs + VMOVDQU64 (CX), Z22 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z19, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Store 2 outputs + VMOVDQU64 Z20, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z21, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x2_64_loop + VZEROUPPER + +mulGFNI_10x2_64_end: + RET + +// func mulAvxGFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x2(SB), $8-88 + // Loading 12 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, R14 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + +mulAvxGFNI_10x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R14) + ADDQ $0x20, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x2_loop + VZEROUPPER + +mulAvxGFNI_10x2_end: + RET + +// func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, CX + +mulGFNI_10x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R14), Z20 + VMOVDQU64 (R13), Z21 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z22 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z22 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z22 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z22 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z7, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z22 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z22 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z22 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z22 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z14, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z15, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (R12), Z22 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z16, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z17, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 9 to 2 outputs + VMOVDQU64 (CX), Z22 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z19, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Store 2 outputs + VMOVDQU64 Z20, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z21, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x2_64Xor_loop + VZEROUPPER + +mulGFNI_10x2_64Xor_end: + RET + +// func mulAvxGFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x2Xor(SB), $8-88 + // Loading 12 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, R14 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + +mulAvxGFNI_10x2Xor_loop: + // Load 2 outputs + VMOVDQU (R15), Y12 + VMOVDQU (R14), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R14) + ADDQ $0x20, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x2Xor_loop + VZEROUPPER + +mulAvxGFNI_10x2Xor_end: + RET + +// func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x3_64(SB), $8-88 + // Loading 27 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_10x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 3 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_10x3_64_loop + VZEROUPPER + +mulGFNI_10x3_64_end: + RET + +// func mulAvxGFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x3(SB), $8-88 + // Loading 11 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_10x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_10x3_loop + VZEROUPPER + +mulAvxGFNI_10x3_end: + RET + +// func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x3_64Xor(SB), $8-88 + // Loading 27 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_10x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R13), Z29 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 3 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_10x3_64Xor_loop + VZEROUPPER + +mulGFNI_10x3_64Xor_end: + RET + +// func mulAvxGFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x3Xor(SB), $8-88 + // Loading 11 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_10x3Xor_loop: + // Load 3 outputs + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R13), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_10x3Xor_loop + VZEROUPPER + +mulAvxGFNI_10x3Xor_end: + RET + +// func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x4_64(SB), $8-88 + // Loading 26 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x4_64_loop + VZEROUPPER + +mulGFNI_10x4_64_end: + RET + +// func mulAvxGFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x4(SB), $8-88 + // Loading 10 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x4_loop + VZEROUPPER + +mulAvxGFNI_10x4_end: + RET + +// func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x4_64Xor(SB), $8-88 + // Loading 26 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x4_64Xor_loop: + // Load 4 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x4_64Xor_loop + VZEROUPPER + +mulGFNI_10x4_64Xor_end: + RET + +// func mulAvxGFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x4Xor(SB), $8-88 + // Loading 10 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x4Xor_loop: + // Load 4 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x4Xor_loop + VZEROUPPER + +mulAvxGFNI_10x4Xor_end: + RET + +// func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x5_64(SB), $8-88 + // Loading 25 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x5_64_loop + VZEROUPPER + +mulGFNI_10x5_64_end: + RET + +// func mulAvxGFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x5(SB), $8-88 + // Loading 9 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x5_loop + VZEROUPPER + +mulAvxGFNI_10x5_end: + RET + +// func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x5_64Xor(SB), $8-88 + // Loading 25 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x5_64Xor_loop: + // Load 5 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x5_64Xor_loop + VZEROUPPER + +mulGFNI_10x5_64Xor_end: + RET + +// func mulAvxGFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x5Xor(SB), $8-88 + // Loading 9 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x5Xor_loop: + // Load 5 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x5Xor_loop + VZEROUPPER + +mulAvxGFNI_10x5Xor_end: + RET + +// func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x6_64(SB), $8-88 + // Loading 24 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x6_64_loop + VZEROUPPER + +mulGFNI_10x6_64_end: + RET + +// func mulAvxGFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x6(SB), $8-88 + // Loading 8 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x6_loop + VZEROUPPER + +mulAvxGFNI_10x6_end: + RET + +// func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x6_64Xor(SB), $8-88 + // Loading 24 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x6_64Xor_loop: + // Load 6 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x6_64Xor_loop + VZEROUPPER + +mulGFNI_10x6_64Xor_end: + RET + +// func mulAvxGFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x6Xor(SB), $8-88 + // Loading 8 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x6Xor_loop: + // Load 6 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x6Xor_loop + VZEROUPPER + +mulAvxGFNI_10x6Xor_end: + RET + +// func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x7_64(SB), $8-88 + // Loading 23 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 7 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x7_64_loop + VZEROUPPER + +mulGFNI_10x7_64_end: + RET + +// func mulAvxGFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x7(SB), $8-88 + // Loading 7 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x7_loop + VZEROUPPER + +mulAvxGFNI_10x7_end: + RET + +// func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x7_64Xor(SB), $8-88 + // Loading 23 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x7_64Xor_loop: + // Load 7 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z23 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 144(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 7 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x7_64Xor_loop + VZEROUPPER + +mulGFNI_10x7_64Xor_end: + RET + +// func mulAvxGFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x7Xor(SB), $8-88 + // Loading 7 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x7Xor_loop: + // Load 7 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x7Xor_loop + VZEROUPPER + +mulAvxGFNI_10x7Xor_end: + RET + +// func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x8_64(SB), $8-88 + // Loading 22 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 8 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x8_64_loop + VZEROUPPER + +mulGFNI_10x8_64_end: + RET + +// func mulAvxGFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x8(SB), $8-88 + // Loading 6 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x8_loop + VZEROUPPER + +mulAvxGFNI_10x8_end: + RET + +// func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x8_64Xor(SB), $8-88 + // Loading 22 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x8_64Xor_loop: + // Load 8 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z22 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z23 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 144(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 168(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 8 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x8_64Xor_loop + VZEROUPPER + +mulGFNI_10x8_64Xor_end: + RET + +// func mulAvxGFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x8Xor(SB), $8-88 + // Loading 6 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x8Xor_loop: + // Load 8 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x8Xor_loop + VZEROUPPER + +mulAvxGFNI_10x8Xor_end: + RET + +// func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x9_64(SB), $8-88 + // Loading 21 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 9 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU64 Z21, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x9_64_loop + VZEROUPPER + +mulGFNI_10x9_64_end: + RET + +// func mulAvxGFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x9(SB), $8-88 + // Loading 5 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x9_loop + VZEROUPPER + +mulAvxGFNI_10x9_end: + RET + +// func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x9_64Xor(SB), $8-88 + // Loading 21 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x9_64Xor_loop: + // Load 9 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z21 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z22 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z23 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 144(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 168(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 192(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 9 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU64 Z21, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x9_64Xor_loop + VZEROUPPER + +mulGFNI_10x9_64Xor_end: + RET + +// func mulAvxGFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x9Xor(SB), $8-88 + // Loading 5 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x9Xor_loop: + // Load 9 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y5 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 192(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x9Xor_loop + VZEROUPPER + +mulAvxGFNI_10x9Xor_end: + RET + +// func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x10_64(SB), $8-88 + // Loading 20 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 10 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 720(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 728(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 736(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 744(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 752(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 760(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 768(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 776(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 784(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 792(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU64 Z20, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z21, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x10_64_loop + VZEROUPPER + +mulGFNI_10x10_64_end: + RET + +// func mulAvxGFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x10(SB), $8-88 + // Loading 4 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 720(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 728(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 736(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 744(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 752(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 760(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 768(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 776(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 784(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 792(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x10_loop + VZEROUPPER + +mulAvxGFNI_10x10_end: + RET + +// func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x10_64Xor(SB), $8-88 + // Loading 20 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x10_64Xor_loop: + // Load 10 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z20 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z21 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z22 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z23 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 144(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 168(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 192(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 216(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 10 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 720(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 728(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 736(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 744(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 752(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 760(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 768(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 776(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 784(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 792(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU64 Z20, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z21, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x10_64Xor_loop + VZEROUPPER + +mulGFNI_10x10_64Xor_end: + RET + +// func mulAvxGFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x10Xor(SB), $8-88 + // Loading 4 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x10Xor_loop: + // Load 10 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y4 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y5 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 192(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 216(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 720(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 728(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 736(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 744(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 752(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 760(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 768(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 776(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 784(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 792(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x10Xor_loop + VZEROUPPER + +mulAvxGFNI_10x10Xor_end: + RET + +// func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_0(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t23+40(FP), Z1 + VBROADCASTF32X2 t02+48(FP), Z2 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z3 + VMOVDQU64 (DI), Z4 + VMOVDQU64 (R8), Z5 + VMOVDQU64 (AX), Z6 + VXORPD Z4, Z3, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z7 + VXORPD Z3, Z7, Z3 + VXORPD Z5, Z6, Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z7 + VPTERNLOGD $0x96, Z7, Z3, Z5 + VXORPD Z4, Z6, Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z5, Z7 + VXORPD Z3, Z7, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z4, Z7, Z4 + VMOVDQU64 Z3, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z4, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z5, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z6, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_0(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t23+40(FP), Z1 + VBROADCASTF32X2 t02+48(FP), Z2 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z3 + VMOVDQU64 (DI), Z4 + VMOVDQU64 (R8), Z5 + VMOVDQU64 (AX), Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z5, Z7 + VXORPD Z3, Z7, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z4, Z7, Z4 + VXORPD Z3, Z5, Z5 + VXORPD Z4, Z6, Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z7 + VXORPD Z3, Z7, Z3 + VXORPD Z4, Z3, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z7 + VXORPD Z5, Z7, Z5 + VXORPD Z5, Z6, Z6 + VMOVDQU64 Z3, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z4, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z5, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z6, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_1(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t23+40(FP), Z0 + VBROADCASTF32X2 t02+48(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z3, Z2, Z3 + VXORPD Z4, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z5, Z6 + VPTERNLOGD $0x96, Z6, Z2, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_1(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t23+40(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z2, Z4, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z4, Z6, Z4 + VXORPD Z4, Z5, Z5 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_2(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t02+48(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z4, Z5, Z5 + VXORPD Z2, Z4, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_2(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t23+40(FP), Z0 + VBROADCASTF32X2 t02+48(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VXORPD Z2, Z4, Z4 + VXORPD Z3, Z5, Z5 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z5, Z6 + VXORPD Z4, Z6, Z4 + VXORPD Z4, Z5, Z5 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_3(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t02+48(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z5 + VXORPD Z1, Z5, Z1 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VXORPD Z2, Z5, Z2 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_3(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t23+40(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + VXORPD Z2, Z1, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VXORPD Z3, Z5, Z3 + VXORPD Z3, Z4, Z4 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_4(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t23+40(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z4, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VPTERNLOGD $0x96, Z6, Z2, Z4 + VXORPD Z3, Z5, Z5 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_4(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t02+48(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VXORPD Z2, Z4, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z3, Z2, Z3 + VXORPD Z4, Z5, Z5 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_5(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t23+40(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VPTERNLOGD $0x96, Z5, Z1, Z3 + VXORPD Z2, Z4, Z4 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_5(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z2, Z5 + VXORPD Z1, Z5, Z1 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_6(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z2, Z1, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z2, Z5 + VXORPD Z1, Z5, Z1 + VXORPD Z3, Z4, Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_6(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t02+48(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z5 + VXORPD Z1, Z5, Z1 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VXORPD Z2, Z5, Z2 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·ifftDIT48_gfni_7(SB), NOSPLIT, $0-56 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z0 + VMOVDQU64 (DI), Z1 + VMOVDQU64 (R8), Z2 + VMOVDQU64 (AX), Z3 + VXORPD Z1, Z0, Z1 + VXORPD Z2, Z3, Z3 + VXORPD Z0, Z2, Z2 + VXORPD Z1, Z3, Z3 + VMOVDQU64 Z0, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z1, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z2, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z3, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·fftDIT48_gfni_7(SB), NOSPLIT, $0-56 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z0 + VMOVDQU64 (DI), Z1 + VMOVDQU64 (R8), Z2 + VMOVDQU64 (AX), Z3 + VXORPD Z0, Z2, Z2 + VXORPD Z1, Z3, Z3 + VXORPD Z1, Z0, Z1 + VXORPD Z2, Z3, Z3 + VMOVDQU64 Z0, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z1, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z2, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z3, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go index ffc1bb1..f9c36e2 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go @@ -1,7 +1,7 @@ // Code generated by command: go generate gen.go. DO NOT EDIT. -//go:build !appengine && !noasm && gc && !nogen -// +build !appengine,!noasm,gc,!nogen +//go:build !appengine && !noasm && gc && !nogen && !nopshufb +// +build !appengine,!noasm,gc,!nogen,!nopshufb package reedsolomon @@ -10,691 +10,734 @@ import ( ) const ( - avx2CodeGen = true - maxAvx2Inputs = 10 - maxAvx2Outputs = 10 - minAvx2Size = 64 - avxSizeMask = maxInt - (minAvx2Size - 1) + codeGen = true + codeGenMaxGoroutines = 8 + codeGenMaxInputs = 10 + codeGenMaxOutputs = 10 + minCodeGenSize = 64 ) -func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { - n := (stop - start) & avxSizeMask +var ( + fAvx2 = galMulSlicesAvx2 + fAvx2Xor = galMulSlicesAvx2Xor + fGFNI = galMulSlicesGFNI + fGFNIXor = galMulSlicesGFNIXor + fAvxGFNI = galMulSlicesAvxGFNI + fAvxGFNIXor = galMulSlicesAvxGFNIXor +) + +func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { + return &fAvx2, &fAvx2Xor, codeGen && pshufb && r.o.useAVX2 && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs +} + +func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + if r.o.useAvx512GFNI { + return &fGFNI, &fGFNIXor, codeGen && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs + } + return &fAvxGFNI, &fAvxGFNIXor, codeGen && r.o.useAvxGNFI && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs +} + +func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) (n int) { + n = stop - start + if raceEnabled { + defer func() { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + }() + } switch len(in) { case 1: switch len(out) { case 1: mulAvxTwo_1x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_1x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_1x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_1x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_1x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_1x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_1x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_1x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_1x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_1x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 2: switch len(out) { case 1: mulAvxTwo_2x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_2x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_2x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_2x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_2x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_2x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_2x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_2x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_2x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_2x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 3: switch len(out) { case 1: mulAvxTwo_3x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_3x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_3x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_3x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_3x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_3x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_3x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_3x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_3x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_3x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 4: switch len(out) { case 1: mulAvxTwo_4x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_4x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_4x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_4x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_4x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_4x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_4x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_4x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_4x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_4x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 5: switch len(out) { case 1: mulAvxTwo_5x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_5x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_5x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_5x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_5x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_5x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_5x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_5x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_5x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_5x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 6: switch len(out) { case 1: mulAvxTwo_6x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_6x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_6x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_6x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_6x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_6x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_6x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_6x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_6x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_6x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 7: switch len(out) { case 1: mulAvxTwo_7x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_7x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_7x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_7x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_7x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_7x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_7x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_7x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_7x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_7x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 8: switch len(out) { case 1: mulAvxTwo_8x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_8x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_8x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_8x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_8x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_8x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_8x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_8x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_8x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_8x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 9: switch len(out) { case 1: mulAvxTwo_9x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_9x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_9x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_9x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_9x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_9x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_9x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_9x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_9x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_9x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 10: switch len(out) { case 1: mulAvxTwo_10x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_10x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_10x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_10x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_10x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_10x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_10x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_10x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_10x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_10x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } } panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } -func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { - n := (stop - start) & avxSizeMask +func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) (n int) { + n = stop - start + if raceEnabled { + defer func() { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + }() + } switch len(in) { case 1: switch len(out) { case 1: mulAvxTwo_1x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_1x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_1x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_1x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_1x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_1x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_1x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_1x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_1x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_1x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 2: switch len(out) { case 1: mulAvxTwo_2x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_2x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_2x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_2x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_2x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_2x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_2x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_2x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_2x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_2x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 3: switch len(out) { case 1: mulAvxTwo_3x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_3x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_3x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_3x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_3x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_3x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_3x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_3x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_3x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_3x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 4: switch len(out) { case 1: mulAvxTwo_4x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_4x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_4x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_4x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_4x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_4x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_4x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_4x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_4x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_4x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 5: switch len(out) { case 1: mulAvxTwo_5x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_5x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_5x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_5x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_5x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_5x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_5x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_5x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_5x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_5x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 6: switch len(out) { case 1: mulAvxTwo_6x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_6x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_6x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_6x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_6x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_6x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_6x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_6x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_6x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_6x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 7: switch len(out) { case 1: mulAvxTwo_7x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_7x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_7x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_7x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_7x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_7x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_7x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_7x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_7x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_7x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 8: switch len(out) { case 1: mulAvxTwo_8x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_8x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_8x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_8x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_8x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_8x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_8x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_8x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_8x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_8x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 9: switch len(out) { case 1: mulAvxTwo_9x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_9x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_9x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_9x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_9x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_9x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_9x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_9x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_9x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_9x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 10: switch len(out) { case 1: mulAvxTwo_10x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_10x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_10x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_10x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_10x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_10x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_10x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_10x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_10x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_10x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } } panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { - n := (stop - start) & avxSizeMask + n := (stop - start) & (maxInt - (64 - 1)) + + if raceEnabled { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + } switch len(in) { case 1: @@ -1032,7 +1075,12 @@ func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { } func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { - n := (stop - start) & avxSizeMask + n := (stop - start) & (maxInt - (64 - 1)) + + if raceEnabled { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + } switch len(in) { case 1: @@ -1368,3 +1416,689 @@ func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int } panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } + +func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (32 - 1)) + + if raceEnabled { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + } + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvxGFNI_1x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_1x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_1x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_1x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_1x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_1x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_1x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_1x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_1x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_1x10(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulAvxGFNI_2x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_2x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_2x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_2x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_2x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_2x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_2x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_2x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_2x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_2x10(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulAvxGFNI_3x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_3x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_3x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_3x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_3x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_3x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_3x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_3x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_3x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_3x10(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulAvxGFNI_4x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_4x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_4x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_4x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_4x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_4x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_4x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_4x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_4x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_4x10(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulAvxGFNI_5x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_5x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_5x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_5x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_5x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_5x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_5x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_5x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_5x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_5x10(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulAvxGFNI_6x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_6x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_6x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_6x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_6x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_6x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_6x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_6x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_6x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_6x10(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulAvxGFNI_7x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_7x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_7x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_7x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_7x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_7x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_7x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_7x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_7x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_7x10(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulAvxGFNI_8x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_8x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_8x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_8x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_8x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_8x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_8x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_8x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_8x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_8x10(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulAvxGFNI_9x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_9x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_9x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_9x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_9x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_9x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_9x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_9x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_9x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_9x10(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulAvxGFNI_10x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_10x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_10x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_10x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_10x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_10x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_10x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_10x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_10x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_10x10(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (32 - 1)) + + if raceEnabled { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + } + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvxGFNI_1x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_1x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_1x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_1x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_1x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_1x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_1x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_1x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_1x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_1x10Xor(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulAvxGFNI_2x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_2x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_2x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_2x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_2x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_2x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_2x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_2x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_2x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_2x10Xor(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulAvxGFNI_3x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_3x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_3x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_3x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_3x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_3x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_3x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_3x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_3x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_3x10Xor(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulAvxGFNI_4x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_4x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_4x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_4x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_4x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_4x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_4x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_4x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_4x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_4x10Xor(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulAvxGFNI_5x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_5x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_5x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_5x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_5x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_5x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_5x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_5x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_5x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_5x10Xor(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulAvxGFNI_6x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_6x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_6x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_6x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_6x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_6x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_6x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_6x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_6x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_6x10Xor(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulAvxGFNI_7x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_7x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_7x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_7x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_7x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_7x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_7x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_7x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_7x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_7x10Xor(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulAvxGFNI_8x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_8x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_8x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_8x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_8x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_8x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_8x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_8x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_8x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_8x10Xor(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulAvxGFNI_9x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_9x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_9x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_9x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_9x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_9x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_9x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_9x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_9x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_9x10Xor(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulAvxGFNI_10x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_10x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_10x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_10x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_10x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_10x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_10x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_10x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_10x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_10x10Xor(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_arm64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_arm64.go new file mode 100644 index 0000000..656e062 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_arm64.go @@ -0,0 +1,219 @@ +//go:build !appengine && !noasm && gc && !nogen && !nopshufb +// +build !appengine,!noasm,gc,!nogen,!nopshufb + +package reedsolomon + +import ( + "fmt" +) + +const ( + codeGen = true + codeGenMaxGoroutines = 16 + codeGenMaxInputs = 10 + codeGenMaxOutputs = 10 + minCodeGenSize = 64 +) + +var ( + fSve = galMulSlicesSve + fSveXor = galMulSlicesSveXor + fNeon = galMulSlicesNeon + fNeonXor = galMulSlicesNeonXor +) + +func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { + if r.o.useSVE { + return &fSve, &fSveXor, codeGen && pshufb && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs + } + return &fNeon, &fNeonXor, codeGen && pshufb && r.o.useNEON && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs +} + +func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false +} + +// galMulSlicesSve +func galMulSlicesSve(matrix []byte, in, out [][]byte, start, stop int) (n int) { + n = stop - start + + if raceEnabled { + defer func() { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + }() + } + // fmt.Println(len(in), len(out)) + switch len(out) { + case 1: + mulSve_10x1_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulSve_10x2_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulSve_10x3_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulSve_10x4(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulSve_10x5(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulSve_10x6(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulSve_10x7(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulSve_10x8(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulSve_10x9(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulSve_10x10(matrix, in, out, start, n) + return n & (maxInt - 31) + } + panic(fmt.Sprintf("ARM SVE: unhandled size: %dx%d", len(in), len(out))) +} + +// galMulSlicesSveXor +func galMulSlicesSveXor(matrix []byte, in, out [][]byte, start, stop int) (n int) { + n = (stop - start) + + if raceEnabled { + defer func() { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + }() + } + + switch len(out) { + case 1: + mulSve_10x1_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulSve_10x2_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulSve_10x3_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulSve_10x4Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulSve_10x5Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulSve_10x6Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulSve_10x7Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulSve_10x8Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulSve_10x9Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulSve_10x10Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + } + panic(fmt.Sprintf("ARM SVE: unhandled size: %dx%d", len(in), len(out))) +} + +// galMulSlicesNeon +func galMulSlicesNeon(matrix []byte, in, out [][]byte, start, stop int) (n int) { + n = stop - start + if raceEnabled { + defer func() { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + }() + } + + switch len(out) { + case 1: + mulNeon_10x1_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulNeon_10x2_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulNeon_10x3_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulNeon_10x4(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulNeon_10x5(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulNeon_10x6(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulNeon_10x7(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulNeon_10x8(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulNeon_10x9(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulNeon_10x10(matrix, in, out, start, n) + return n & (maxInt - 31) + } + panic(fmt.Sprintf("ARM NEON: unhandled size: %dx%d", len(in), len(out))) +} + +// galMulSlicesNeonXor +func galMulSlicesNeonXor(matrix []byte, in, out [][]byte, start, stop int) (n int) { + n = (stop - start) + if raceEnabled { + defer func() { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + }() + } + switch len(out) { + case 1: + mulNeon_10x1_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulNeon_10x2_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulNeon_10x3_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulNeon_10x4Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulNeon_10x5Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulNeon_10x6Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulNeon_10x7Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulNeon_10x8Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulNeon_10x9Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulNeon_10x10Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + } + panic(fmt.Sprintf("ARM NEON: unhandled size: %dx%d", len(in), len(out))) +} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go new file mode 100644 index 0000000..3ac349d --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go @@ -0,0 +1,1415 @@ +// Code generated by command: go generate gen.go. DO NOT EDIT. + +//go:build !appengine && !noasm && gc && !nogen && nopshufb +// +build !appengine,!noasm,gc,!nogen,nopshufb + +package reedsolomon + +import ( + "fmt" +) + +const ( + codeGen = true + codeGenMaxGoroutines = 8 + codeGenMaxInputs = 10 + codeGenMaxOutputs = 10 + minCodeGenSize = 64 +) + +var ( + fGFNI = galMulSlicesGFNI + fGFNIXor = galMulSlicesGFNIXor + fAvxGFNI = galMulSlicesAvxGFNI + fAvxGFNIXor = galMulSlicesAvxGFNIXor +) + +func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false // no code generation for generic case (only GFNI cases) +} + +func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + if r.o.useAvx512GFNI { + return &fGFNI, &fGFNIXor, codeGen && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs + } + return &fAvxGFNI, &fAvxGFNIXor, codeGen && r.o.useAvxGNFI && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs +} + +func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`) } +func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`) } + +func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (64 - 1)) + + if raceEnabled { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + } + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulGFNI_1x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_1x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_1x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_1x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_1x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_1x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_1x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_1x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_1x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_1x10_64(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulGFNI_2x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_2x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_2x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_2x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_2x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_2x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_2x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_2x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_2x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_2x10_64(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulGFNI_3x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_3x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_3x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_3x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_3x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_3x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_3x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_3x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_3x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_3x10_64(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulGFNI_4x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_4x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_4x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_4x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_4x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_4x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_4x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_4x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_4x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_4x10_64(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulGFNI_5x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_5x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_5x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_5x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_5x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_5x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_5x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_5x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_5x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_5x10_64(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulGFNI_6x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_6x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_6x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_6x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_6x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_6x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_6x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_6x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_6x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_6x10_64(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulGFNI_7x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_7x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_7x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_7x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_7x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_7x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_7x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_7x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_7x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_7x10_64(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulGFNI_8x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_8x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_8x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_8x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_8x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_8x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_8x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_8x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_8x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_8x10_64(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulGFNI_9x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_9x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_9x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_9x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_9x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_9x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_9x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_9x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_9x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_9x10_64(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulGFNI_10x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_10x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_10x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_10x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_10x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_10x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_10x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_10x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_10x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_10x10_64(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (64 - 1)) + + if raceEnabled { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + } + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulGFNI_1x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_1x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_1x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_1x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_1x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_1x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_1x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_1x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_1x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_1x10_64Xor(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulGFNI_2x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_2x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_2x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_2x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_2x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_2x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_2x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_2x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_2x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_2x10_64Xor(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulGFNI_3x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_3x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_3x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_3x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_3x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_3x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_3x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_3x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_3x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_3x10_64Xor(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulGFNI_4x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_4x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_4x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_4x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_4x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_4x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_4x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_4x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_4x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_4x10_64Xor(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulGFNI_5x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_5x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_5x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_5x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_5x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_5x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_5x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_5x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_5x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_5x10_64Xor(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulGFNI_6x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_6x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_6x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_6x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_6x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_6x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_6x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_6x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_6x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_6x10_64Xor(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulGFNI_7x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_7x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_7x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_7x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_7x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_7x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_7x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_7x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_7x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_7x10_64Xor(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulGFNI_8x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_8x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_8x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_8x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_8x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_8x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_8x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_8x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_8x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_8x10_64Xor(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulGFNI_9x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_9x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_9x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_9x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_9x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_9x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_9x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_9x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_9x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_9x10_64Xor(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulGFNI_10x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_10x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_10x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_10x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_10x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_10x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_10x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_10x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_10x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_10x10_64Xor(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (32 - 1)) + + if raceEnabled { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + } + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvxGFNI_1x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_1x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_1x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_1x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_1x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_1x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_1x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_1x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_1x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_1x10(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulAvxGFNI_2x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_2x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_2x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_2x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_2x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_2x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_2x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_2x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_2x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_2x10(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulAvxGFNI_3x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_3x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_3x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_3x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_3x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_3x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_3x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_3x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_3x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_3x10(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulAvxGFNI_4x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_4x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_4x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_4x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_4x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_4x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_4x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_4x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_4x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_4x10(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulAvxGFNI_5x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_5x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_5x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_5x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_5x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_5x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_5x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_5x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_5x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_5x10(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulAvxGFNI_6x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_6x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_6x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_6x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_6x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_6x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_6x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_6x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_6x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_6x10(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulAvxGFNI_7x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_7x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_7x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_7x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_7x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_7x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_7x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_7x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_7x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_7x10(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulAvxGFNI_8x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_8x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_8x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_8x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_8x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_8x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_8x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_8x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_8x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_8x10(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulAvxGFNI_9x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_9x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_9x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_9x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_9x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_9x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_9x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_9x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_9x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_9x10(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulAvxGFNI_10x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_10x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_10x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_10x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_10x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_10x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_10x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_10x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_10x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_10x10(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (32 - 1)) + + if raceEnabled { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + } + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvxGFNI_1x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_1x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_1x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_1x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_1x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_1x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_1x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_1x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_1x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_1x10Xor(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulAvxGFNI_2x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_2x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_2x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_2x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_2x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_2x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_2x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_2x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_2x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_2x10Xor(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulAvxGFNI_3x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_3x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_3x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_3x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_3x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_3x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_3x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_3x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_3x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_3x10Xor(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulAvxGFNI_4x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_4x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_4x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_4x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_4x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_4x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_4x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_4x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_4x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_4x10Xor(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulAvxGFNI_5x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_5x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_5x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_5x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_5x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_5x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_5x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_5x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_5x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_5x10Xor(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulAvxGFNI_6x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_6x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_6x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_6x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_6x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_6x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_6x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_6x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_6x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_6x10Xor(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulAvxGFNI_7x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_7x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_7x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_7x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_7x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_7x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_7x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_7x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_7x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_7x10Xor(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulAvxGFNI_8x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_8x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_8x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_8x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_8x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_8x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_8x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_8x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_8x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_8x10Xor(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulAvxGFNI_9x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_9x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_9x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_9x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_9x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_9x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_9x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_9x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_9x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_9x10Xor(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulAvxGFNI_10x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_10x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_10x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_10x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_10x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_10x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_10x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_10x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_10x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_10x10Xor(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_arm64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_arm64.go new file mode 100644 index 0000000..db2aaa6 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_arm64.go @@ -0,0 +1,22 @@ +// Code generated by command: go generate gen.go. DO NOT EDIT. + +//go:build !appengine && !noasm && gc && !nogen && nopshufb +// +build !appengine,!noasm,gc,!nogen,nopshufb + +package reedsolomon + +const ( + codeGen = false + codeGenMaxGoroutines = 16 + codeGenMaxInputs = 10 + codeGenMaxOutputs = 10 + minCodeGenSize = 64 +) + +func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false +} + +func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false +} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_noasm.go b/vendor/github.com/klauspost/reedsolomon/galois_noasm.go index 9043601..fb5a3b6 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_noasm.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_noasm.go @@ -1,12 +1,11 @@ -//go:build (!amd64 || noasm || appengine || gccgo) && (!arm64 || noasm || appengine || gccgo) && (!ppc64le || noasm || appengine || gccgo) -// +build !amd64 noasm appengine gccgo -// +build !arm64 noasm appengine gccgo -// +build !ppc64le noasm appengine gccgo +//go:build (!amd64 || noasm || appengine || gccgo) && (!arm64 || noasm || appengine || gccgo || nopshufb) && (!ppc64le || noasm || appengine || gccgo || nopshufb) // Copyright 2015, Klaus Post, see LICENSE for details. package reedsolomon +const pshufb = false + func galMulSlice(c byte, in, out []byte, o *options) { out = out[:len(in)] if c == 1 { @@ -31,11 +30,6 @@ func galMulSliceXor(c byte, in, out []byte, o *options) { } } -// simple slice xor -func sliceXor(in, out []byte, o *options) { - sliceXorGo(in, out, o) -} - func init() { defaultOptions.useAVX512 = false } diff --git a/vendor/github.com/klauspost/reedsolomon/galois_nopshufb_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_nopshufb_amd64.go new file mode 100644 index 0000000..89c74e2 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_nopshufb_amd64.go @@ -0,0 +1,146 @@ +// Copyright 2015, Klaus Post, see LICENSE for details + +//go:build nopshufb && !noasm + +package reedsolomon + +// bigSwitchover is the size where 64 bytes are processed per loop. +const bigSwitchover = 128 + +const pshufb = false + +// simple slice xor +func sliceXor(in, out []byte, o *options) { + if o.useSSE2 { + if len(in) >= bigSwitchover { + if o.useAVX2 { + avx2XorSlice_64(in, out) + done := (len(in) >> 6) << 6 + in = in[done:] + out = out[done:] + } else { + sSE2XorSlice_64(in, out) + done := (len(in) >> 6) << 6 + in = in[done:] + out = out[done:] + } + } + if len(in) >= 16 { + sSE2XorSlice(in, out) + done := (len(in) >> 4) << 4 + in = in[done:] + out = out[done:] + } + } else { + sliceXorGo(in, out, o) + return + } + out = out[:len(in)] + for i := range in { + out[i] ^= in[i] + } +} + +func galMulSlice(c byte, in, out []byte, o *options) { + out = out[:len(in)] + if c == 1 { + copy(out, in) + return + } + mt := mulTable[c][:256] + for len(in) >= 4 { + ii := (*[4]byte)(in) + oo := (*[4]byte)(out) + oo[0] = mt[ii[0]] + oo[1] = mt[ii[1]] + oo[2] = mt[ii[2]] + oo[3] = mt[ii[3]] + in = in[4:] + out = out[4:] + } + for n, input := range in { + out[n] = mt[input] + } +} + +func galMulSliceXor(c byte, in, out []byte, o *options) { + out = out[:len(in)] + if c == 1 { + sliceXor(in, out, o) + return + } + mt := mulTable[c][:256] + for len(in) >= 4 { + ii := (*[4]byte)(in) + oo := (*[4]byte)(out) + oo[0] ^= mt[ii[0]] + oo[1] ^= mt[ii[1]] + oo[2] ^= mt[ii[2]] + oo[3] ^= mt[ii[3]] + in = in[4:] + out = out[4:] + } + for n, input := range in { + out[n] ^= mt[input] + } +} + +func init() { + defaultOptions.useAVX512 = false +} + +// 4-way butterfly +func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) +} + +// 4-way butterfly +func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) { + ifftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o) +} + +// 4-way butterfly +func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) +} + +// 4-way butterfly +func fftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) { + fftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o) +} + +// 2-way butterfly forward +func fftDIT2(x, y []byte, log_m ffe, o *options) { + // Reference version: + refMulAdd(x, y, log_m) + sliceXor(x, y, o) +} + +// 2-way butterfly forward +func fftDIT28(x, y []byte, log_m ffe8, o *options) { + // Reference version: + refMulAdd8(x, y, log_m) + sliceXor(x, y, o) +} + +// 2-way butterfly inverse +func ifftDIT2(x, y []byte, log_m ffe, o *options) { + // Reference version: + sliceXor(x, y, o) + refMulAdd(x, y, log_m) +} + +// 2-way butterfly inverse +func ifftDIT28(x, y []byte, log_m ffe8, o *options) { + // Reference version: + sliceXor(x, y, o) + refMulAdd8(x, y, log_m) +} + +func mulgf16(x, y []byte, log_m ffe, o *options) { + refMul(x, y, log_m) +} + +func mulgf8(x, y []byte, log_m ffe8, o *options) { + refMul8(x, y, log_m) +} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_notamd64.go b/vendor/github.com/klauspost/reedsolomon/galois_notamd64.go deleted file mode 100644 index e67905b..0000000 --- a/vendor/github.com/klauspost/reedsolomon/galois_notamd64.go +++ /dev/null @@ -1,14 +0,0 @@ -//go:build !amd64 || noasm || appengine || gccgo -// +build !amd64 noasm appengine gccgo - -// Copyright 2020, Klaus Post, see LICENSE for details. - -package reedsolomon - -func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, byteCount int) { - panic("codeSomeShardsAvx512 should not be called if built without asm") -} - -func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, byteCount int) { - panic("codeSomeShardsAvx512P should not be called if built without asm") -} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go b/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go index 8cd7b52..c4c8035 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go @@ -1,11 +1,12 @@ -//go:build !noasm && !appengine && !gccgo -// +build !noasm,!appengine,!gccgo +//go:build !noasm && !appengine && !gccgo && !nopshufb // Copyright 2015, Klaus Post, see LICENSE for details. // Copyright 2018, Minio, Inc. package reedsolomon +const pshufb = true + //go:noescape func galMulPpc(low, high, in, out []byte) @@ -66,11 +67,6 @@ func galMulSliceXor(c byte, in, out []byte, o *options) { } } -// slice galois add -func sliceXor(in, out []byte, o *options) { - sliceXorGo(in, out, o) -} - // 4-way butterfly func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) diff --git a/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.s b/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.s index 7213c61..c585c2b 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.s +++ b/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.s @@ -1,6 +1,7 @@ //+build !noasm //+build !appengine //+build !gccgo +//+build !pshufb // Copyright 2015, Klaus Post, see LICENSE for details. // Copyright 2018, Minio, Inc. diff --git a/vendor/github.com/klauspost/reedsolomon/leopard.go b/vendor/github.com/klauspost/reedsolomon/leopard.go index 16bec4b..adf72c8 100644 --- a/vendor/github.com/klauspost/reedsolomon/leopard.go +++ b/vendor/github.com/klauspost/reedsolomon/leopard.go @@ -303,7 +303,7 @@ func (r *leopardFF16) Split(data []byte) ([][]byte, error) { // Copy partial shards copyFrom := data[perShard*fullShards : dataLen] for i := range padding { - if len(copyFrom) <= 0 { + if len(copyFrom) == 0 { break } copyFrom = copyFrom[copy(padding[i], copyFrom):] @@ -333,7 +333,10 @@ func (r *leopardFF16) Split(data []byte) ([][]byte, error) { } func (r *leopardFF16) ReconstructSome(shards [][]byte, required []bool) error { - return r.ReconstructData(shards) + if len(required) == r.totalShards { + return r.reconstruct(shards, true) + } + return r.reconstruct(shards, false) } func (r *leopardFF16) Reconstruct(shards [][]byte) error { @@ -448,13 +451,13 @@ func (r *leopardFF16) reconstruct(shards [][]byte, recoverAll bool) error { } // Evaluate error locator polynomial - fwht(&errLocs, order, m+r.dataShards) + fwht(&errLocs, m+r.dataShards) for i := 0; i < order; i++ { errLocs[i] = ffe((uint(errLocs[i]) * uint(logWalsh[i])) % modulus) } - fwht(&errLocs, order, order) + fwht(&errLocs, order) var work [][]byte if w, ok := r.workPool.Get().([][]byte); ok { @@ -860,11 +863,11 @@ func ceilPow2(n int) int { // Decimation in time (DIT) Fast Walsh-Hadamard Transform // Unrolls pairs of layers to perform cross-layer operations in registers // mtrunc: Number of elements that are non-zero at the front of data -func fwht(data *[order]ffe, m, mtrunc int) { +func fwht(data *[order]ffe, mtrunc int) { // Decimation in time: Unroll 2 layers at a time dist := 1 dist4 := 4 - for dist4 <= m { + for dist4 <= order { // For each set of dist*4 elements: for r := 0; r < mtrunc; r += dist4 { // For each set of dist elements: @@ -895,14 +898,6 @@ func fwht(data *[order]ffe, m, mtrunc int) { dist = dist4 dist4 <<= 2 } - - // If there is one layer left: - if dist < m { - dist := uint16(dist) - for i := uint16(0); i < dist; i++ { - fwht2(&data[i], &data[i+dist]) - } - } } func fwht4(data []ffe, s int) { @@ -1033,7 +1028,7 @@ func initFFTSkew() { } logWalsh[0] = 0 - fwht(logWalsh, order, order) + fwht(logWalsh, order) } func initMul16LUT() { diff --git a/vendor/github.com/klauspost/reedsolomon/leopard8.go b/vendor/github.com/klauspost/reedsolomon/leopard8.go index 31c97ea..cd0a23e 100644 --- a/vendor/github.com/klauspost/reedsolomon/leopard8.go +++ b/vendor/github.com/klauspost/reedsolomon/leopard8.go @@ -344,7 +344,7 @@ func (r *leopardFF8) Split(data []byte) ([][]byte, error) { // Copy partial shards copyFrom := data[perShard*fullShards : dataLen] for i := range padding { - if len(copyFrom) <= 0 { + if len(copyFrom) == 0 { break } copyFrom = copyFrom[copy(padding[i], copyFrom):] @@ -369,7 +369,10 @@ func (r *leopardFF8) Split(data []byte) ([][]byte, error) { } func (r *leopardFF8) ReconstructSome(shards [][]byte, required []bool) error { - return r.ReconstructData(shards) + if len(required) == r.totalShards { + return r.reconstruct(shards, true) + } + return r.reconstruct(shards, false) } func (r *leopardFF8) Reconstruct(shards [][]byte) error { @@ -506,13 +509,13 @@ func (r *leopardFF8) reconstruct(shards [][]byte, recoverAll bool) error { } // Evaluate error locator polynomial8 - fwht8(&errLocs, order8, m+r.dataShards) + fwht8(&errLocs, m+r.dataShards) for i := 0; i < order8; i++ { errLocs[i] = ffe8((uint(errLocs[i]) * uint(logWalsh8[i])) % modulus8) } - fwht8(&errLocs, order8, order8) + fwht8(&errLocs, order8) if r.inversion != nil { c := leopardGF8cache{ @@ -940,11 +943,11 @@ func subMod8(a, b ffe8) ffe8 { // Decimation in time (DIT) Fast Walsh-Hadamard Transform // Unrolls pairs of layers to perform cross-layer operations in registers // mtrunc: Number of elements that are non-zero at the front of data -func fwht8(data *[order8]ffe8, m, mtrunc int) { +func fwht8(data *[order8]ffe8, mtrunc int) { // Decimation in time: Unroll 2 layers at a time dist := 1 dist4 := 4 - for dist4 <= m { + for dist4 <= order8 { // For each set of dist*4 elements: for r := 0; r < mtrunc; r += dist4 { // For each set of dist elements: @@ -975,14 +978,6 @@ func fwht8(data *[order8]ffe8, m, mtrunc int) { dist = dist4 dist4 <<= 2 } - - // If there is one layer left: - if dist < m { - dist := uint16(dist) - for i := uint16(0); i < dist; i++ { - fwht28(&data[i], &data[i+dist]) - } - } } func fwht48(data []ffe8, s int) { @@ -1110,7 +1105,7 @@ func initFFTSkew8() { } logWalsh8[0] = 0 - fwht8(logWalsh8, order8, order8) + fwht8(logWalsh8, order8) } func initMul8LUT() { diff --git a/vendor/github.com/klauspost/reedsolomon/matrix.go b/vendor/github.com/klauspost/reedsolomon/matrix.go index 497a3d9..bfdcca6 100644 --- a/vendor/github.com/klauspost/reedsolomon/matrix.go +++ b/vendor/github.com/klauspost/reedsolomon/matrix.go @@ -67,11 +67,11 @@ var errColSizeMismatch = errors.New("column size is not the same for all rows") func (m matrix) Check() error { rows := len(m) - if rows <= 0 { + if rows == 0 { return errInvalidRowSize } cols := len(m[0]) - if cols <= 0 { + if cols == 0 { return errInvalidColSize } @@ -231,7 +231,7 @@ func (m matrix) gaussianElimination() error { } // Scale to 1. if m[r][r] != 1 { - scale := galDivide(1, m[r][r]) + scale := galOneOver(m[r][r]) for c := 0; c < columns; c++ { m[r][c] = galMultiply(m[r][c], scale) } diff --git a/vendor/github.com/klauspost/reedsolomon/options.go b/vendor/github.com/klauspost/reedsolomon/options.go index f74fe00..cde2555 100644 --- a/vendor/github.com/klauspost/reedsolomon/options.go +++ b/vendor/github.com/klauspost/reedsolomon/options.go @@ -2,6 +2,7 @@ package reedsolomon import ( "runtime" + "strings" "github.com/klauspost/cpuid/v2" ) @@ -15,15 +16,24 @@ type options struct { shardSize int perRound int - useGFNI, useAVX512, useAVX2, useSSSE3, useSSE2 bool - useJerasureMatrix bool - usePAR1Matrix bool - useCauchy bool - fastOneParity bool - inversionCache bool - forcedInversionCache bool - customMatrix [][]byte - withLeopard leopardMode + useAvxGNFI, + useAvx512GFNI, + useAVX512, + useAVX2, + useSSSE3, + useSSE2, + useNEON, + useSVE bool + vectorLength int + + useJerasureMatrix bool + usePAR1Matrix bool + useCauchy bool + fastOneParity bool + inversionCache bool + forcedInversionCache bool + customMatrix [][]byte + withLeopard leopardMode // stream options concReads bool @@ -38,11 +48,15 @@ var defaultOptions = options{ inversionCache: true, // Detect CPU capabilities. - useSSSE3: cpuid.CPU.Supports(cpuid.SSSE3), - useSSE2: cpuid.CPU.Supports(cpuid.SSE2), - useAVX2: cpuid.CPU.Supports(cpuid.AVX2), - useAVX512: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512BW, cpuid.AVX512VL), - useGFNI: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.GFNI, cpuid.AVX512DQ), + useSSSE3: cpuid.CPU.Supports(cpuid.SSSE3), + useSSE2: cpuid.CPU.Supports(cpuid.SSE2), + useAVX2: cpuid.CPU.Supports(cpuid.AVX2), + useAVX512: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512BW, cpuid.AVX512VL), + useAvx512GFNI: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.GFNI, cpuid.AVX512DQ), + useAvxGNFI: cpuid.CPU.Supports(cpuid.AVX, cpuid.GFNI), + useNEON: cpuid.CPU.Supports(cpuid.ASIMD), + useSVE: cpuid.CPU.Supports(cpuid.SVE), + vectorLength: 32, // default vector length is 32 bytes (256 bits) for AVX2 code gen } // leopardMode controls the use of leopard GF in encoding and decoding. @@ -159,10 +173,14 @@ func WithSSSE3(enabled bool) Option { } // WithAVX2 allows to enable/disable AVX2 instructions. -// If not set, AVX2 will be turned on or off automatically based on CPU ID information. +// If not set, AVX will be turned on or off automatically based on CPU ID information. +// This will also disable AVX GFNI instructions. func WithAVX2(enabled bool) Option { return func(o *options) { o.useAVX2 = enabled + if o.useAvxGNFI { + o.useAvxGNFI = enabled + } } } @@ -178,7 +196,7 @@ func WithSSE2(enabled bool) Option { func WithAVX512(enabled bool) Option { return func(o *options) { o.useAVX512 = enabled - o.useGFNI = enabled + o.useAvx512GFNI = enabled } } @@ -186,7 +204,15 @@ func WithAVX512(enabled bool) Option { // If not set, GFNI will be turned on or off automatically based on CPU ID information. func WithGFNI(enabled bool) Option { return func(o *options) { - o.useGFNI = enabled + o.useAvx512GFNI = enabled + } +} + +// WithAVXGFNI allows to enable/disable GFNI with AVX instructions. +// If not set, GFNI will be turned on or off automatically based on CPU ID information. +func WithAVXGFNI(enabled bool) Option { + return func(o *options) { + o.useAvxGNFI = enabled } } @@ -275,3 +301,34 @@ func WithLeopardGF(enabled bool) Option { } } } + +func (o *options) cpuOptions() string { + var res []string + if o.useSSE2 { + res = append(res, "SSE2") + } + if o.useAVX2 { + res = append(res, "AVX2") + } + if o.useSSSE3 { + res = append(res, "SSSE3") + } + if o.useAVX512 { + res = append(res, "AVX512") + } + if o.useAvx512GFNI { + res = append(res, "AVX512+GFNI") + } + if o.useAvxGNFI { + res = append(res, "AVX+GFNI") + } + if o.useSVE { + res = append(res, "ARM+SVE") + } else if o.useNEON { + res = append(res, "ARM+NEON") + } + if len(res) == 0 { + return "pure Go" + } + return strings.Join(res, ",") +} diff --git a/vendor/github.com/klauspost/reedsolomon/race.go b/vendor/github.com/klauspost/reedsolomon/race.go new file mode 100644 index 0000000..4f2c0b6 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/race.go @@ -0,0 +1,61 @@ +// Copyright (c) 2024+ Klaus Post. See LICENSE for license + +//go:build race + +package reedsolomon + +import ( + "runtime" + "unsafe" +) + +const raceEnabled = true + +func raceReadSlice[T any](s []T) { + if len(s) == 0 { + return + } + runtime.RaceReadRange(unsafe.Pointer(&s[0]), len(s)*int(unsafe.Sizeof(s[0]))) +} + +func raceWriteSlice[T any](s []T) { + if len(s) == 0 { + return + } + runtime.RaceWriteRange(unsafe.Pointer(&s[0]), len(s)*int(unsafe.Sizeof(s[0]))) +} + +func raceReadSlices[T any](s [][]T, start, n int) { + if len(s) == 0 { + return + } + runtime.RaceReadRange(unsafe.Pointer(&s[0]), len(s)*int(unsafe.Sizeof(s[0]))) + for _, v := range s { + if len(v) == 0 { + continue + } + n := n + if n < 0 { + n = len(v) - start + } + runtime.RaceReadRange(unsafe.Pointer(&v[start]), n*int(unsafe.Sizeof(v[0]))) + } +} + +func raceWriteSlices[T any](s [][]T, start, n int) { + if len(s) == 0 { + return + } + runtime.RaceReadRange(unsafe.Pointer(&s[0]), len(s)*int(unsafe.Sizeof(s[0]))) + + for _, v := range s { + if len(v) == 0 { + continue + } + n := n + if n < 0 { + n = len(v) - start + } + runtime.RaceWriteRange(unsafe.Pointer(&v[start]), n*int(unsafe.Sizeof(v[0]))) + } +} diff --git a/vendor/github.com/klauspost/reedsolomon/race_none.go b/vendor/github.com/klauspost/reedsolomon/race_none.go new file mode 100644 index 0000000..c7d05f2 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/race_none.go @@ -0,0 +1,17 @@ +// Copyright (c) 2024+ Klaus Post. See LICENSE for license + +//go:build !race + +package reedsolomon + +const raceEnabled = false + +func raceReadSlice[T any](s []T) { +} + +func raceWriteSlice[T any](s []T) { +} + +func raceReadSlices[T any](s [][]T, start, n int) {} + +func raceWriteSlices[T any](s [][]T, start, n int) {} diff --git a/vendor/github.com/klauspost/reedsolomon/reedsolomon.go b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go index 20e3974..443543f 100644 --- a/vendor/github.com/klauspost/reedsolomon/reedsolomon.go +++ b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go @@ -77,11 +77,12 @@ type Encoder interface { // calling the Verify function is likely to fail. ReconstructData(shards [][]byte) error - // ReconstructSome will recreate only requested data shards, if possible. + // ReconstructSome will recreate only requested shards, if possible. // // Given a list of shards, some of which contain data, fills in the - // data shards indicated by true values in the "required" parameter. - // The length of "required" array must be equal to DataShards. + // shards indicated by true values in the "required" parameter. + // The length of the "required" array must be equal to either Shards or DataShards. + // If the length is equal to DataShards, the reconstruction of parity shards will be ignored. // // The length of "shards" array must be equal to Shards. // You indicate that a shard is missing by setting it to nil or zero-length. @@ -152,9 +153,8 @@ type Extensions interface { } const ( - avx2CodeGenMinSize = 64 - avx2CodeGenMinShards = 3 - avx2CodeGenMaxGoroutines = 8 + codeGenMinSize = 64 + codeGenMinShards = 3 gfniCodeGenMaxGoroutines = 4 intSize = 32 << (^uint(0) >> 63) // 32 or 64 @@ -283,7 +283,7 @@ func buildMatrixJerasure(dataShards, totalShards int) (matrix, error) { // Multiply by the inverted matrix (same as vm.Multiply(vm[0:dataShards].Invert())) if vm[i][i] != 1 { // Make vm[i][i] = 1 by dividing the column by vm[i][i] - tmp := galDivide(1, vm[i][i]) + tmp := galOneOver(vm[i][i]) for j := 0; j < totalShards; j++ { vm[j][i] = galMultiply(vm[j][i], tmp) } @@ -303,7 +303,7 @@ func buildMatrixJerasure(dataShards, totalShards int) (matrix, error) { for j := 0; j < dataShards; j++ { tmp := vm[dataShards][j] if tmp != 1 { - tmp = galDivide(1, tmp) + tmp = galOneOver(tmp) for i := dataShards; i < totalShards; i++ { vm[i][j] = galMultiply(vm[i][j], tmp) } @@ -314,7 +314,7 @@ func buildMatrixJerasure(dataShards, totalShards int) (matrix, error) { for i := dataShards + 1; i < totalShards; i++ { tmp := vm[i][0] if tmp != 1 { - tmp = galDivide(1, tmp) + tmp = galOneOver(tmp) for j := 0; j < dataShards; j++ { vm[i][j] = galMultiply(vm[i][j], tmp) } @@ -481,21 +481,23 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { r.o.perRound = 128 << 10 } + _, _, useCodeGen := r.hasCodeGen(codeGenMinSize, codeGenMaxInputs, codeGenMaxOutputs) + divide := parityShards + 1 - if avx2CodeGen && r.o.useAVX2 && (dataShards > maxAvx2Inputs || parityShards > maxAvx2Outputs) { + if codeGen && useCodeGen && (dataShards > codeGenMaxInputs || parityShards > codeGenMaxOutputs) { // Base on L1 cache if we have many inputs. r.o.perRound = cpuid.CPU.Cache.L1D if r.o.perRound < 32<<10 { r.o.perRound = 32 << 10 } divide = 0 - if dataShards > maxAvx2Inputs { - divide += maxAvx2Inputs + if dataShards > codeGenMaxInputs { + divide += codeGenMaxInputs } else { divide += dataShards } - if parityShards > maxAvx2Inputs { - divide += maxAvx2Outputs + if parityShards > codeGenMaxInputs { + divide += codeGenMaxOutputs } else { divide += parityShards } @@ -554,11 +556,11 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { // Generated AVX2 does not need data to stay in L1 cache between runs. // We will be purely limited by RAM speed. - if r.canAVX2C(avx2CodeGenMinSize, maxAvx2Inputs, maxAvx2Outputs) && r.o.maxGoroutines > avx2CodeGenMaxGoroutines { - r.o.maxGoroutines = avx2CodeGenMaxGoroutines + if useCodeGen && r.o.maxGoroutines > codeGenMaxGoroutines { + r.o.maxGoroutines = codeGenMaxGoroutines } - if r.canGFNI(avx2CodeGenMinSize, maxAvx2Inputs, maxAvx2Outputs) && r.o.maxGoroutines > gfniCodeGenMaxGoroutines { + if _, _, useGFNI := r.canGFNI(codeGenMinSize, codeGenMaxInputs, codeGenMaxOutputs); useGFNI && r.o.maxGoroutines > gfniCodeGenMaxGoroutines { r.o.maxGoroutines = gfniCodeGenMaxGoroutines } @@ -576,7 +578,7 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { r.parity[i] = r.m[dataShards+i] } - if avx2CodeGen && r.o.useAVX2 { + if codeGen /* && r.o.useAVX2 */ { sz := r.dataShards * r.parityShards * 2 * 32 r.mPool.New = func() interface{} { return AllocAligned(1, sz)[0] @@ -652,15 +654,15 @@ func (r *reedSolomon) EncodeIdx(dataShard []byte, idx int, parity [][]byte) erro return ErrShardSize } - if avx2CodeGen && len(dataShard) >= r.o.perRound && len(parity) >= avx2CodeGenMinShards && (r.o.useAVX2 || r.o.useGFNI) { + if codeGen && len(dataShard) >= r.o.perRound && len(parity) >= codeGenMinShards && (pshufb || r.o.useAvx512GFNI || r.o.useAvxGNFI) { m := make([][]byte, r.parityShards) for iRow := range m { m[iRow] = r.parity[iRow][idx : idx+1] } - if r.o.useGFNI { - r.codeSomeShardsGFNI(m, [][]byte{dataShard}, parity, len(dataShard), false) + if r.o.useAvx512GFNI || r.o.useAvxGNFI { + r.codeSomeShardsGFNI(m, [][]byte{dataShard}, parity, len(dataShard), false, nil, nil) } else { - r.codeSomeShardsAVXP(m, [][]byte{dataShard}, parity, len(dataShard), false) + r.codeSomeShardsAVXP(m, [][]byte{dataShard}, parity, len(dataShard), false, nil, nil) } return nil } @@ -802,18 +804,6 @@ func (r *reedSolomon) Verify(shards [][]byte) (bool, error) { return r.checkSomeShards(r.parity, shards[:r.dataShards], toCheck[:r.parityShards], len(shards[0])), nil } -func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool { - return avx2CodeGen && r.o.useAVX2 && - byteCount >= avx2CodeGenMinSize && inputs+outputs >= avx2CodeGenMinShards && - inputs <= maxAvx2Inputs && outputs <= maxAvx2Outputs -} - -func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) bool { - return avx2CodeGen && r.o.useGFNI && - byteCount >= avx2CodeGenMinSize && inputs+outputs >= avx2CodeGenMinShards && - inputs <= maxAvx2Inputs && outputs <= maxAvx2Outputs -} - // Multiplies a subset of rows from a coding matrix by a full set of // input totalShards to produce some output totalShards. // 'matrixRows' is The rows from the matrix to use. @@ -837,18 +827,18 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC if end > len(inputs[0]) { end = len(inputs[0]) } - if r.canGFNI(byteCount, len(inputs), len(outputs)) { - var gfni [maxAvx2Inputs * maxAvx2Outputs]uint64 + if galMulGFNI, galMulGFNIXor, useGFNI := r.canGFNI(byteCount, len(inputs), len(outputs)); useGFNI { + var gfni [codeGenMaxInputs * codeGenMaxOutputs]uint64 m := genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), gfni[:]) - start += galMulSlicesGFNI(m, inputs, outputs, 0, byteCount) + start += (*galMulGFNI)(m, inputs, outputs, 0, byteCount) end = len(inputs[0]) - } else if r.canAVX2C(byteCount, len(inputs), len(outputs)) { - m := genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice()) - start += galMulSlicesAvx2(m, inputs, outputs, 0, byteCount) + } else if galMulGen, _, ok := r.hasCodeGen(byteCount, len(inputs), len(outputs)); ok { + m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.o.vectorLength, r.getTmpSlice()) + start += (*galMulGen)(m, inputs, outputs, 0, byteCount) r.putTmpSlice(m) end = len(inputs[0]) - } else if len(inputs)+len(outputs) > avx2CodeGenMinShards && r.canAVX2C(byteCount, maxAvx2Inputs, maxAvx2Outputs) { - var gfni [maxAvx2Inputs * maxAvx2Outputs]uint64 + } else if galMulGen, galMulGenXor, ok := r.hasCodeGen(byteCount, codeGenMaxInputs, codeGenMaxOutputs); len(inputs)+len(outputs) > codeGenMinShards && ok { + var gfni [codeGenMaxInputs * codeGenMaxOutputs]uint64 end = len(inputs[0]) inIdx := 0 m := r.getTmpSlice() @@ -856,32 +846,31 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC ins := inputs for len(ins) > 0 { inPer := ins - if len(inPer) > maxAvx2Inputs { - inPer = inPer[:maxAvx2Inputs] + if len(inPer) > codeGenMaxInputs { + inPer = inPer[:codeGenMaxInputs] } outs := outputs outIdx := 0 for len(outs) > 0 { outPer := outs - if len(outPer) > maxAvx2Outputs { - outPer = outPer[:maxAvx2Outputs] + if len(outPer) > codeGenMaxOutputs { + outPer = outPer[:codeGenMaxOutputs] } - if r.o.useGFNI { + if useGFNI { m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), gfni[:]) if inIdx == 0 { - galMulSlicesGFNI(m, inPer, outPer, 0, byteCount) + start = (*galMulGFNI)(m, inPer, outPer, 0, byteCount) } else { - galMulSlicesGFNIXor(m, inPer, outPer, 0, byteCount) + start = (*galMulGFNIXor)(m, inPer, outPer, 0, byteCount) } } else { - m = genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m) + m = genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), r.o.vectorLength, m) if inIdx == 0 { - galMulSlicesAvx2(m, inPer, outPer, 0, byteCount) + start = (*galMulGen)(m, inPer, outPer, 0, byteCount) } else { - galMulSlicesAvx2Xor(m, inPer, outPer, 0, byteCount) + start = (*galMulGenXor)(m, inPer, outPer, 0, byteCount) } } - start = byteCount & avxSizeMask outIdx += len(outPer) outs = outs[len(outPer):] } @@ -917,27 +906,27 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte var wg sync.WaitGroup gor := r.o.maxGoroutines - var avx2Matrix []byte + var genMatrix []byte var gfniMatrix []uint64 - useAvx2 := r.canAVX2C(byteCount, len(inputs), len(outputs)) - useGFNI := r.canGFNI(byteCount, len(inputs), len(outputs)) + galMulGen, _, useCodeGen := r.hasCodeGen(byteCount, len(inputs), len(outputs)) + galMulGFNI, _, useGFNI := r.canGFNI(byteCount, len(inputs), len(outputs)) if useGFNI { - var tmp [maxAvx2Inputs * maxAvx2Outputs]uint64 + var tmp [codeGenMaxInputs * codeGenMaxOutputs]uint64 gfniMatrix = genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), tmp[:]) - } else if useAvx2 { - avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice()) - defer r.putTmpSlice(avx2Matrix) - } else if r.o.useGFNI && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards && - r.canGFNI(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) { + } else if useCodeGen { + genMatrix = genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.o.vectorLength, r.getTmpSlice()) + defer r.putTmpSlice(genMatrix) + } else if galMulGFNI, galMulGFNIXor, useGFNI := r.canGFNI(byteCount/4, codeGenMaxInputs, codeGenMaxOutputs); useGFNI && + byteCount < 10<<20 && len(inputs)+len(outputs) > codeGenMinShards { // It appears there is a switchover point at around 10MB where // Regular processing is faster... - r.codeSomeShardsGFNI(matrixRows, inputs, outputs, byteCount, true) + r.codeSomeShardsGFNI(matrixRows, inputs, outputs, byteCount, true, galMulGFNI, galMulGFNIXor) return - } else if r.o.useAVX2 && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards && - r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) { + } else if galMulGen, galMulGenXor, ok := r.hasCodeGen(byteCount/4, codeGenMaxInputs, codeGenMaxOutputs); ok && + byteCount < 10<<20 && len(inputs)+len(outputs) > codeGenMinShards { // It appears there is a switchover point at around 10MB where // Regular processing is faster... - r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount, true) + r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount, true, galMulGen, galMulGenXor) return } @@ -949,9 +938,9 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte exec := func(start, stop int) { if stop-start >= 64 { if useGFNI { - start += galMulSlicesGFNI(gfniMatrix, inputs, outputs, start, stop) - } else if useAvx2 { - start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop) + start += (*galMulGFNI)(gfniMatrix, inputs, outputs, start, stop) + } else if useCodeGen { + start += (*galMulGen)(genMatrix, inputs, outputs, start, stop) } } @@ -1002,7 +991,7 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte // Perform the same as codeSomeShards, but split the workload into // several goroutines. // If clear is set, the first write will overwrite the output. -func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool) { +func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool, galMulGen, galMulGenXor *func(matrix []byte, in [][]byte, out [][]byte, start int, stop int) int) { var wg sync.WaitGroup gor := r.o.maxGoroutines @@ -1013,7 +1002,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b first bool } // Make a plan... - plan := make([]state, 0, ((len(inputs)+maxAvx2Inputs-1)/maxAvx2Inputs)*((len(outputs)+maxAvx2Outputs-1)/maxAvx2Outputs)) + plan := make([]state, 0, ((len(inputs)+codeGenMaxInputs-1)/codeGenMaxInputs)*((len(outputs)+codeGenMaxOutputs-1)/codeGenMaxOutputs)) tmp := r.getTmpSlice() defer r.putTmpSlice(tmp) @@ -1025,18 +1014,18 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b ins := inputs for len(ins) > 0 { inPer := ins - if len(inPer) > maxAvx2Inputs { - inPer = inPer[:maxAvx2Inputs] + if len(inPer) > codeGenMaxInputs { + inPer = inPer[:codeGenMaxInputs] } outs := outputs outIdx := 0 for len(outs) > 0 { outPer := outs - if len(outPer) > maxAvx2Outputs { - outPer = outPer[:maxAvx2Outputs] + if len(outPer) > codeGenMaxOutputs { + outPer = outPer[:codeGenMaxOutputs] } // Generate local matrix - m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp) + m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), r.o.vectorLength, tmp) tmp = tmp[len(m):] plan = append(plan, state{ input: inPer, @@ -1055,19 +1044,19 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b outIdx := 0 for len(outs) > 0 { outPer := outs - if len(outPer) > maxAvx2Outputs { - outPer = outPer[:maxAvx2Outputs] + if len(outPer) > codeGenMaxOutputs { + outPer = outPer[:codeGenMaxOutputs] } inIdx := 0 ins := inputs for len(ins) > 0 { inPer := ins - if len(inPer) > maxAvx2Inputs { - inPer = inPer[:maxAvx2Inputs] + if len(inPer) > codeGenMaxInputs { + inPer = inPer[:codeGenMaxInputs] } // Generate local matrix - m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp) + m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), r.o.vectorLength, tmp) tmp = tmp[len(m):] //fmt.Println("bytes:", len(inPer)*r.o.perRound, "out:", len(outPer)*r.o.perRound) plan = append(plan, state{ @@ -1096,16 +1085,17 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b lstop = stop } for lstart < stop { - if lstop-lstart >= minAvx2Size { + if galMulGen != nil && galMulGenXor != nil && lstop-lstart >= minCodeGenSize { // Execute plan... + var n int for _, p := range plan { if p.first { - galMulSlicesAvx2(p.m, p.input, p.output, lstart, lstop) + n = (*galMulGen)(p.m, p.input, p.output, lstart, lstop) } else { - galMulSlicesAvx2Xor(p.m, p.input, p.output, lstart, lstop) + n = (*galMulGenXor)(p.m, p.input, p.output, lstart, lstop) } } - lstart += (lstop - lstart) & avxSizeMask + lstart += n if lstart == lstop { lstop += r.o.perRound if lstop > stop { @@ -1156,7 +1146,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b // Perform the same as codeSomeShards, but split the workload into // several goroutines. // If clear is set, the first write will overwrite the output. -func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool) { +func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool, galMulGFNI, galMulGFNIXor *func(matrix []uint64, in, out [][]byte, start, stop int) int) { var wg sync.WaitGroup gor := r.o.maxGoroutines @@ -1167,7 +1157,7 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b first bool } // Make a plan... - plan := make([]state, 0, ((len(inputs)+maxAvx2Inputs-1)/maxAvx2Inputs)*((len(outputs)+maxAvx2Outputs-1)/maxAvx2Outputs)) + plan := make([]state, 0, ((len(inputs)+codeGenMaxInputs-1)/codeGenMaxInputs)*((len(outputs)+codeGenMaxOutputs-1)/codeGenMaxOutputs)) // Flips between input first to output first. // We put the smallest data load in the inner loop. @@ -1176,15 +1166,15 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b ins := inputs for len(ins) > 0 { inPer := ins - if len(inPer) > maxAvx2Inputs { - inPer = inPer[:maxAvx2Inputs] + if len(inPer) > codeGenMaxInputs { + inPer = inPer[:codeGenMaxInputs] } outs := outputs outIdx := 0 for len(outs) > 0 { outPer := outs - if len(outPer) > maxAvx2Outputs { - outPer = outPer[:maxAvx2Outputs] + if len(outPer) > codeGenMaxOutputs { + outPer = outPer[:codeGenMaxOutputs] } // Generate local matrix m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), make([]uint64, len(inPer)*len(outPer))) @@ -1205,16 +1195,16 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b outIdx := 0 for len(outs) > 0 { outPer := outs - if len(outPer) > maxAvx2Outputs { - outPer = outPer[:maxAvx2Outputs] + if len(outPer) > codeGenMaxOutputs { + outPer = outPer[:codeGenMaxOutputs] } inIdx := 0 ins := inputs for len(ins) > 0 { inPer := ins - if len(inPer) > maxAvx2Inputs { - inPer = inPer[:maxAvx2Inputs] + if len(inPer) > codeGenMaxInputs { + inPer = inPer[:codeGenMaxInputs] } // Generate local matrix m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), make([]uint64, len(inPer)*len(outPer))) @@ -1245,16 +1235,17 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b lstop = stop } for lstart < stop { - if lstop-lstart >= minAvx2Size { + if galMulGFNI != nil && galMulGFNIXor != nil && lstop-lstart >= minCodeGenSize { // Execute plan... + var n int for _, p := range plan { if p.first { - galMulSlicesGFNI(p.m, p.input, p.output, lstart, lstop) + n = (*galMulGFNI)(p.m, p.input, p.output, lstart, lstop) } else { - galMulSlicesGFNIXor(p.m, p.input, p.output, lstart, lstop) + n = (*galMulGFNIXor)(p.m, p.input, p.output, lstart, lstop) } } - lstart += (lstop - lstart) & avxSizeMask + lstart += n if lstart == lstop { lstop += r.o.perRound if lstop > stop { @@ -1402,13 +1393,14 @@ func (r *reedSolomon) ReconstructData(shards [][]byte) error { return r.reconstruct(shards, true, nil) } -// ReconstructSome will recreate only requested data shards, if possible. +// ReconstructSome will recreate only requested shards, if possible. // // Given a list of shards, some of which contain data, fills in the -// data shards indicated by true values in the "required" parameter. -// The length of "required" array must be equal to dataShards. +// shards indicated by true values in the "required" parameter. +// The length of the "required" array must be equal to either Shards or DataShards. +// If the length is equal to DataShards, the reconstruction of parity shards will be ignored. // -// The length of "shards" array must be equal to shards. +// The length of "shards" array must be equal to Shards. // You indicate that a shard is missing by setting it to nil or zero-length. // If a shard is zero-length but has sufficient capacity, that memory will // be used, otherwise a new []byte will be allocated. @@ -1419,6 +1411,9 @@ func (r *reedSolomon) ReconstructData(shards [][]byte) error { // As the reconstructed shard set may contain missing parity shards, // calling the Verify function is likely to fail. func (r *reedSolomon) ReconstructSome(shards [][]byte, required []bool) error { + if len(required) == r.totalShards { + return r.reconstruct(shards, false, required) + } return r.reconstruct(shards, true, required) } @@ -1633,7 +1628,7 @@ func (r *reedSolomon) Split(data []byte) ([][]byte, error) { // Copy partial shards copyFrom := data[perShard*fullShards : dataLen] for i := range padding { - if len(copyFrom) <= 0 { + if len(copyFrom) == 0 { break } copyFrom = copyFrom[copy(padding[i], copyFrom):] diff --git a/vendor/github.com/klauspost/reedsolomon/xor_arm64.go b/vendor/github.com/klauspost/reedsolomon/xor_arm64.go new file mode 100644 index 0000000..ffda888 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/xor_arm64.go @@ -0,0 +1,23 @@ +//go:build !noasm && !appengine && !gccgo + +package reedsolomon + +//go:noescape +func xorSliceNEON(in, out []byte) + +// simple slice xor +func sliceXor(in, out []byte, o *options) { + done := (len(in) >> 5) << 5 + if raceEnabled { + raceWriteSlice(out[:done]) + raceReadSlice(in[:done]) + } + xorSliceNEON(in, out) + + remain := len(in) - done + if remain > 0 { + for i := done; i < len(in); i++ { + out[i] ^= in[i] + } + } +} diff --git a/vendor/github.com/klauspost/reedsolomon/xor_arm64.s b/vendor/github.com/klauspost/reedsolomon/xor_arm64.s new file mode 100644 index 0000000..5629873 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/xor_arm64.s @@ -0,0 +1,29 @@ +//+build !noasm +//+build !appengine +//+build !gccgo + +// func xorSliceNEON(in, out []byte) +TEXT ·xorSliceNEON(SB), 7, $0 + MOVD in_base+0(FP), R1 + MOVD in_len+8(FP), R2 // length of message + MOVD out_base+24(FP), R5 + SUBS $32, R2 + BMI completeXor + +loopXor: + // Main loop + VLD1.P 32(R1), [V0.B16, V1.B16] + VLD1 (R5), [V20.B16, V21.B16] + + VEOR V20.B16, V0.B16, V4.B16 + VEOR V21.B16, V1.B16, V5.B16 + + // Store result + VST1.P [V4.D2, V5.D2], 32(R5) + + SUBS $32, R2 + BPL loopXor + +completeXor: + RET + diff --git a/vendor/github.com/klauspost/reedsolomon/xor_noasm.go b/vendor/github.com/klauspost/reedsolomon/xor_noasm.go new file mode 100644 index 0000000..d3e29f9 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/xor_noasm.go @@ -0,0 +1,7 @@ +//go:build noasm || gccgo || appengine || (!amd64 && !arm64) + +package reedsolomon + +func sliceXor(in, out []byte, o *options) { + sliceXorGo(in, out, o) +} diff --git a/vendor/github.com/mattn/go-isatty/isatty_bsd.go b/vendor/github.com/mattn/go-isatty/isatty_bsd.go index d569c0c..d0ea68f 100644 --- a/vendor/github.com/mattn/go-isatty/isatty_bsd.go +++ b/vendor/github.com/mattn/go-isatty/isatty_bsd.go @@ -1,6 +1,7 @@ -//go:build (darwin || freebsd || openbsd || netbsd || dragonfly || hurd) && !appengine +//go:build (darwin || freebsd || openbsd || netbsd || dragonfly || hurd) && !appengine && !tinygo // +build darwin freebsd openbsd netbsd dragonfly hurd // +build !appengine +// +build !tinygo package isatty diff --git a/vendor/github.com/mattn/go-isatty/isatty_others.go b/vendor/github.com/mattn/go-isatty/isatty_others.go index 3150322..7402e06 100644 --- a/vendor/github.com/mattn/go-isatty/isatty_others.go +++ b/vendor/github.com/mattn/go-isatty/isatty_others.go @@ -1,5 +1,6 @@ -//go:build appengine || js || nacl || wasm -// +build appengine js nacl wasm +//go:build (appengine || js || nacl || tinygo || wasm) && !windows +// +build appengine js nacl tinygo wasm +// +build !windows package isatty diff --git a/vendor/github.com/mattn/go-isatty/isatty_tcgets.go b/vendor/github.com/mattn/go-isatty/isatty_tcgets.go index 6778765..0337d8c 100644 --- a/vendor/github.com/mattn/go-isatty/isatty_tcgets.go +++ b/vendor/github.com/mattn/go-isatty/isatty_tcgets.go @@ -1,6 +1,7 @@ -//go:build (linux || aix || zos) && !appengine +//go:build (linux || aix || zos) && !appengine && !tinygo // +build linux aix zos // +build !appengine +// +build !tinygo package isatty diff --git a/vendor/github.com/onsi/ginkgo/v2/ginkgo/build/build_command.go b/vendor/github.com/onsi/ginkgo/v2/ginkgo/build/build_command.go index 5db5d1a..fd17260 100644 --- a/vendor/github.com/onsi/ginkgo/v2/ginkgo/build/build_command.go +++ b/vendor/github.com/onsi/ginkgo/v2/ginkgo/build/build_command.go @@ -2,6 +2,8 @@ package build import ( "fmt" + "os" + "path" "github.com/onsi/ginkgo/v2/ginkgo/command" "github.com/onsi/ginkgo/v2/ginkgo/internal" @@ -53,7 +55,18 @@ func buildSpecs(args []string, cliConfig types.CLIConfig, goFlagsConfig types.Go if suite.State.Is(internal.TestSuiteStateFailedToCompile) { fmt.Println(suite.CompilationError.Error()) } else { - fmt.Printf("Compiled %s.test\n", suite.PackageName) + if len(goFlagsConfig.O) == 0 { + goFlagsConfig.O = path.Join(suite.Path, suite.PackageName+".test") + } else { + stat, err := os.Stat(goFlagsConfig.O) + if err != nil { + panic(err) + } + if stat.IsDir() { + goFlagsConfig.O += "/" + suite.PackageName + ".test" + } + } + fmt.Printf("Compiled %s\n", goFlagsConfig.O) } } diff --git a/vendor/github.com/onsi/ginkgo/v2/ginkgo/generators/bootstrap_command.go b/vendor/github.com/onsi/ginkgo/v2/ginkgo/generators/bootstrap_command.go index 73aff0b..b2dc59b 100644 --- a/vendor/github.com/onsi/ginkgo/v2/ginkgo/generators/bootstrap_command.go +++ b/vendor/github.com/onsi/ginkgo/v2/ginkgo/generators/bootstrap_command.go @@ -7,7 +7,7 @@ import ( "os" "text/template" - sprig "github.com/go-task/slim-sprig" + sprig "github.com/go-task/slim-sprig/v3" "github.com/onsi/ginkgo/v2/ginkgo/command" "github.com/onsi/ginkgo/v2/ginkgo/internal" "github.com/onsi/ginkgo/v2/types" diff --git a/vendor/github.com/onsi/ginkgo/v2/ginkgo/generators/generate_command.go b/vendor/github.com/onsi/ginkgo/v2/ginkgo/generators/generate_command.go index be01dec..cf3b7cb 100644 --- a/vendor/github.com/onsi/ginkgo/v2/ginkgo/generators/generate_command.go +++ b/vendor/github.com/onsi/ginkgo/v2/ginkgo/generators/generate_command.go @@ -10,7 +10,7 @@ import ( "strings" "text/template" - sprig "github.com/go-task/slim-sprig" + sprig "github.com/go-task/slim-sprig/v3" "github.com/onsi/ginkgo/v2/ginkgo/command" "github.com/onsi/ginkgo/v2/ginkgo/internal" "github.com/onsi/ginkgo/v2/types" @@ -174,6 +174,7 @@ func moduleName(modRoot string) string { if err != nil { return "" } + defer modFile.Close() mod := make([]byte, 128) _, err = modFile.Read(mod) diff --git a/vendor/github.com/onsi/ginkgo/v2/ginkgo/internal/compile.go b/vendor/github.com/onsi/ginkgo/v2/ginkgo/internal/compile.go index 86da734..48827cc 100644 --- a/vendor/github.com/onsi/ginkgo/v2/ginkgo/internal/compile.go +++ b/vendor/github.com/onsi/ginkgo/v2/ginkgo/internal/compile.go @@ -25,6 +25,18 @@ func CompileSuite(suite TestSuite, goFlagsConfig types.GoFlagsConfig) TestSuite return suite } + if len(goFlagsConfig.O) > 0 { + userDefinedPath, err := filepath.Abs(goFlagsConfig.O) + if err != nil { + suite.State = TestSuiteStateFailedToCompile + suite.CompilationError = fmt.Errorf("Failed to compute compilation target path %s:\n%s", goFlagsConfig.O, err.Error()) + return suite + } + path = userDefinedPath + } + + goFlagsConfig.O = path + ginkgoInvocationPath, _ := os.Getwd() ginkgoInvocationPath, _ = filepath.Abs(ginkgoInvocationPath) packagePath := suite.AbsPath() @@ -34,7 +46,7 @@ func CompileSuite(suite TestSuite, goFlagsConfig types.GoFlagsConfig) TestSuite suite.CompilationError = fmt.Errorf("Failed to get relative path from package to the current working directory:\n%s", err.Error()) return suite } - args, err := types.GenerateGoTestCompileArgs(goFlagsConfig, path, "./", pathToInvocationPath) + args, err := types.GenerateGoTestCompileArgs(goFlagsConfig, "./", pathToInvocationPath) if err != nil { suite.State = TestSuiteStateFailedToCompile suite.CompilationError = fmt.Errorf("Failed to generate go test compile flags:\n%s", err.Error()) diff --git a/vendor/github.com/onsi/ginkgo/v2/ginkgo/internal/gocovmerge.go b/vendor/github.com/onsi/ginkgo/v2/ginkgo/internal/gocovmerge.go new file mode 100644 index 0000000..3c5079f --- /dev/null +++ b/vendor/github.com/onsi/ginkgo/v2/ginkgo/internal/gocovmerge.go @@ -0,0 +1,129 @@ +// Copyright (c) 2015, Wade Simmons +// All rights reserved. + +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: + +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Package gocovmerge takes the results from multiple `go test -coverprofile` +// runs and merges them into one profile + +// this file was originally taken from the gocovmerge project +// see also: https://go.shabbyrobe.org/gocovmerge +package internal + +import ( + "fmt" + "io" + "sort" + + "golang.org/x/tools/cover" +) + +func AddCoverProfile(profiles []*cover.Profile, p *cover.Profile) []*cover.Profile { + i := sort.Search(len(profiles), func(i int) bool { return profiles[i].FileName >= p.FileName }) + if i < len(profiles) && profiles[i].FileName == p.FileName { + MergeCoverProfiles(profiles[i], p) + } else { + profiles = append(profiles, nil) + copy(profiles[i+1:], profiles[i:]) + profiles[i] = p + } + return profiles +} + +func DumpCoverProfiles(profiles []*cover.Profile, out io.Writer) error { + if len(profiles) == 0 { + return nil + } + if _, err := fmt.Fprintf(out, "mode: %s\n", profiles[0].Mode); err != nil { + return err + } + for _, p := range profiles { + for _, b := range p.Blocks { + if _, err := fmt.Fprintf(out, "%s:%d.%d,%d.%d %d %d\n", p.FileName, b.StartLine, b.StartCol, b.EndLine, b.EndCol, b.NumStmt, b.Count); err != nil { + return err + } + } + } + return nil +} + +func MergeCoverProfiles(into *cover.Profile, merge *cover.Profile) error { + if into.Mode != merge.Mode { + return fmt.Errorf("cannot merge profiles with different modes") + } + // Since the blocks are sorted, we can keep track of where the last block + // was inserted and only look at the blocks after that as targets for merge + startIndex := 0 + for _, b := range merge.Blocks { + var err error + startIndex, err = mergeProfileBlock(into, b, startIndex) + if err != nil { + return err + } + } + return nil +} + +func mergeProfileBlock(p *cover.Profile, pb cover.ProfileBlock, startIndex int) (int, error) { + sortFunc := func(i int) bool { + pi := p.Blocks[i+startIndex] + return pi.StartLine >= pb.StartLine && (pi.StartLine != pb.StartLine || pi.StartCol >= pb.StartCol) + } + + i := 0 + if sortFunc(i) != true { + i = sort.Search(len(p.Blocks)-startIndex, sortFunc) + } + + i += startIndex + if i < len(p.Blocks) && p.Blocks[i].StartLine == pb.StartLine && p.Blocks[i].StartCol == pb.StartCol { + if p.Blocks[i].EndLine != pb.EndLine || p.Blocks[i].EndCol != pb.EndCol { + return i, fmt.Errorf("gocovmerge: overlapping merge %v %v %v", p.FileName, p.Blocks[i], pb) + } + switch p.Mode { + case "set": + p.Blocks[i].Count |= pb.Count + case "count", "atomic": + p.Blocks[i].Count += pb.Count + default: + return i, fmt.Errorf("gocovmerge: unsupported covermode '%s'", p.Mode) + } + + } else { + if i > 0 { + pa := p.Blocks[i-1] + if pa.EndLine >= pb.EndLine && (pa.EndLine != pb.EndLine || pa.EndCol > pb.EndCol) { + return i, fmt.Errorf("gocovmerge: overlap before %v %v %v", p.FileName, pa, pb) + } + } + if i < len(p.Blocks)-1 { + pa := p.Blocks[i+1] + if pa.StartLine <= pb.StartLine && (pa.StartLine != pb.StartLine || pa.StartCol < pb.StartCol) { + return i, fmt.Errorf("gocovmerge: overlap after %v %v %v", p.FileName, pa, pb) + } + } + p.Blocks = append(p.Blocks, cover.ProfileBlock{}) + copy(p.Blocks[i+1:], p.Blocks[i:]) + p.Blocks[i] = pb + } + + return i + 1, nil +} diff --git a/vendor/github.com/onsi/ginkgo/v2/ginkgo/internal/profiles_and_reports.go b/vendor/github.com/onsi/ginkgo/v2/ginkgo/internal/profiles_and_reports.go index bd3c6d0..8e16d2b 100644 --- a/vendor/github.com/onsi/ginkgo/v2/ginkgo/internal/profiles_and_reports.go +++ b/vendor/github.com/onsi/ginkgo/v2/ginkgo/internal/profiles_and_reports.go @@ -1,7 +1,6 @@ package internal import ( - "bytes" "fmt" "os" "os/exec" @@ -12,6 +11,7 @@ import ( "github.com/google/pprof/profile" "github.com/onsi/ginkgo/v2/reporters" "github.com/onsi/ginkgo/v2/types" + "golang.org/x/tools/cover" ) func AbsPathForGeneratedAsset(assetName string, suite TestSuite, cliConfig types.CLIConfig, process int) string { @@ -144,38 +144,27 @@ func FinalizeProfilesAndReportsForSuites(suites TestSuites, cliConfig types.CLIC return messages, nil } -//loads each profile, combines them, deletes them, stores them in destination +// loads each profile, merges them, deletes them, stores them in destination func MergeAndCleanupCoverProfiles(profiles []string, destination string) error { - combined := &bytes.Buffer{} - modeRegex := regexp.MustCompile(`^mode: .*\n`) - for i, profile := range profiles { - contents, err := os.ReadFile(profile) + var merged []*cover.Profile + for _, file := range profiles { + parsedProfiles, err := cover.ParseProfiles(file) if err != nil { - return fmt.Errorf("Unable to read coverage file %s:\n%s", profile, err.Error()) + return err } - os.Remove(profile) - - // remove the cover mode line from every file - // except the first one - if i > 0 { - contents = modeRegex.ReplaceAll(contents, []byte{}) - } - - _, err = combined.Write(contents) - - // Add a newline to the end of every file if missing. - if err == nil && len(contents) > 0 && contents[len(contents)-1] != '\n' { - _, err = combined.Write([]byte("\n")) - } - - if err != nil { - return fmt.Errorf("Unable to append to coverprofile:\n%s", err.Error()) + os.Remove(file) + for _, p := range parsedProfiles { + merged = AddCoverProfile(merged, p) } } - - err := os.WriteFile(destination, combined.Bytes(), 0666) + dst, err := os.OpenFile(destination, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0666) + if err != nil { + return err + } + defer dst.Close() + err = DumpCoverProfiles(merged, dst) if err != nil { - return fmt.Errorf("Unable to create combined cover profile:\n%s", err.Error()) + return err } return nil } @@ -184,7 +173,7 @@ func GetCoverageFromCoverProfile(profile string) (float64, error) { cmd := exec.Command("go", "tool", "cover", "-func", profile) output, err := cmd.CombinedOutput() if err != nil { - return 0, fmt.Errorf("Could not process Coverprofile %s: %s", profile, err.Error()) + return 0, fmt.Errorf("Could not process Coverprofile %s: %s - %s", profile, err.Error(), string(output)) } re := regexp.MustCompile(`total:\s*\(statements\)\s*(\d*\.\d*)\%`) matches := re.FindStringSubmatch(string(output)) @@ -208,6 +197,7 @@ func MergeProfiles(profilePaths []string, destination string) error { return fmt.Errorf("Could not open profile: %s\n%s", profilePath, err.Error()) } prof, err := profile.Parse(proFile) + _ = proFile.Close() if err != nil { return fmt.Errorf("Could not parse profile: %s\n%s", profilePath, err.Error()) } diff --git a/vendor/github.com/onsi/ginkgo/v2/ginkgo/internal/test_suite.go b/vendor/github.com/onsi/ginkgo/v2/ginkgo/internal/test_suite.go index 64dcb1b..df99875 100644 --- a/vendor/github.com/onsi/ginkgo/v2/ginkgo/internal/test_suite.go +++ b/vendor/github.com/onsi/ginkgo/v2/ginkgo/internal/test_suite.go @@ -7,6 +7,7 @@ import ( "path" "path/filepath" "regexp" + "runtime" "strings" "github.com/onsi/ginkgo/v2/types" @@ -192,7 +193,7 @@ func precompiledTestSuite(path string) (TestSuite, error) { return TestSuite{}, errors.New("this is not a .test binary") } - if filepath.Ext(path) == ".test" && info.Mode()&0111 == 0 { + if filepath.Ext(path) == ".test" && runtime.GOOS != "windows" && info.Mode()&0111 == 0 { return TestSuite{}, errors.New("this is not executable") } @@ -225,7 +226,7 @@ func suitesInDir(dir string, recurse bool) TestSuites { files, _ := os.ReadDir(dir) re := regexp.MustCompile(`^[^._].*_test\.go$`) for _, file := range files { - if !file.IsDir() && re.Match([]byte(file.Name())) { + if !file.IsDir() && re.MatchString(file.Name()) { suite := TestSuite{ Path: relPath(dir), PackageName: packageNameForSuite(dir), @@ -240,7 +241,7 @@ func suitesInDir(dir string, recurse bool) TestSuites { if recurse { re = regexp.MustCompile(`^[._]`) for _, file := range files { - if file.IsDir() && !re.Match([]byte(file.Name())) { + if file.IsDir() && !re.MatchString(file.Name()) { suites = append(suites, suitesInDir(dir+"/"+file.Name(), recurse)...) } } @@ -271,7 +272,7 @@ func filesHaveGinkgoSuite(dir string, files []os.DirEntry) bool { reGinkgo := regexp.MustCompile(`package ginkgo|\/ginkgo"|\/ginkgo\/v2"|\/ginkgo\/v2/dsl/`) for _, file := range files { - if !file.IsDir() && reTestFile.Match([]byte(file.Name())) { + if !file.IsDir() && reTestFile.MatchString(file.Name()) { contents, _ := os.ReadFile(dir + "/" + file.Name()) if reGinkgo.Match(contents) { return true diff --git a/vendor/github.com/onsi/ginkgo/v2/ginkgo/outline/ginkgo.go b/vendor/github.com/onsi/ginkgo/v2/ginkgo/outline/ginkgo.go index 958dacc..5d8d00b 100644 --- a/vendor/github.com/onsi/ginkgo/v2/ginkgo/outline/ginkgo.go +++ b/vendor/github.com/onsi/ginkgo/v2/ginkgo/outline/ginkgo.go @@ -1,10 +1,11 @@ package outline import ( - "github.com/onsi/ginkgo/v2/types" "go/ast" "go/token" "strconv" + + "github.com/onsi/ginkgo/v2/types" ) const ( diff --git a/vendor/github.com/onsi/ginkgo/v2/ginkgo/outline/import.go b/vendor/github.com/onsi/ginkgo/v2/ginkgo/outline/import.go index 67ec5ab..f0a6b5d 100644 --- a/vendor/github.com/onsi/ginkgo/v2/ginkgo/outline/import.go +++ b/vendor/github.com/onsi/ginkgo/v2/ginkgo/outline/import.go @@ -28,14 +28,7 @@ func packageNameForImport(f *ast.File, path string) *string { } name := spec.Name.String() if name == "" { - // If the package name is not explicitly specified, - // make an educated guess. This is not guaranteed to be correct. - lastSlash := strings.LastIndex(path, "/") - if lastSlash == -1 { - name = path - } else { - name = path[lastSlash+1:] - } + name = "ginkgo" } if name == "." { name = "" diff --git a/vendor/github.com/onsi/ginkgo/v2/ginkgo/watch/dependencies.go b/vendor/github.com/onsi/ginkgo/v2/ginkgo/watch/dependencies.go index f5ddff3..a34d943 100644 --- a/vendor/github.com/onsi/ginkgo/v2/ginkgo/watch/dependencies.go +++ b/vendor/github.com/onsi/ginkgo/v2/ginkgo/watch/dependencies.go @@ -78,7 +78,7 @@ func (d Dependencies) resolveAndAdd(deps []string, depth int) { if err != nil { continue } - if !pkg.Goroot && (!ginkgoAndGomegaFilter.Match([]byte(pkg.Dir)) || ginkgoIntegrationTestFilter.Match([]byte(pkg.Dir))) { + if !pkg.Goroot && (!ginkgoAndGomegaFilter.MatchString(pkg.Dir) || ginkgoIntegrationTestFilter.MatchString(pkg.Dir)) { d.addDepIfNotPresent(pkg.Dir, depth) } } diff --git a/vendor/github.com/onsi/ginkgo/v2/ginkgo/watch/package_hash.go b/vendor/github.com/onsi/ginkgo/v2/ginkgo/watch/package_hash.go index e9f7ec0..0e6ae1f 100644 --- a/vendor/github.com/onsi/ginkgo/v2/ginkgo/watch/package_hash.go +++ b/vendor/github.com/onsi/ginkgo/v2/ginkgo/watch/package_hash.go @@ -4,6 +4,7 @@ import ( "fmt" "os" "regexp" + "strings" "time" ) @@ -79,7 +80,11 @@ func (p *PackageHash) computeHashes() (codeHash string, codeModifiedTime time.Ti continue } - if goTestRegExp.Match([]byte(info.Name())) { + if isHiddenFile(info) { + continue + } + + if goTestRegExp.MatchString(info.Name()) { testHash += p.hashForFileInfo(info) if info.ModTime().After(testModifiedTime) { testModifiedTime = info.ModTime() @@ -87,7 +92,7 @@ func (p *PackageHash) computeHashes() (codeHash string, codeModifiedTime time.Ti continue } - if p.watchRegExp.Match([]byte(info.Name())) { + if p.watchRegExp.MatchString(info.Name()) { codeHash += p.hashForFileInfo(info) if info.ModTime().After(codeModifiedTime) { codeModifiedTime = info.ModTime() @@ -103,6 +108,10 @@ func (p *PackageHash) computeHashes() (codeHash string, codeModifiedTime time.Ti return } +func isHiddenFile(info os.FileInfo) bool { + return strings.HasPrefix(info.Name(), ".") || strings.HasPrefix(info.Name(), "_") +} + func (p *PackageHash) hashForFileInfo(info os.FileInfo) string { return fmt.Sprintf("%s_%d_%d", info.Name(), info.Size(), info.ModTime().UnixNano()) } diff --git a/vendor/github.com/onsi/ginkgo/v2/reporters/default_reporter.go b/vendor/github.com/onsi/ginkgo/v2/reporters/default_reporter.go index 56b7be7..4807304 100644 --- a/vendor/github.com/onsi/ginkgo/v2/reporters/default_reporter.go +++ b/vendor/github.com/onsi/ginkgo/v2/reporters/default_reporter.go @@ -182,10 +182,31 @@ func (r *DefaultReporter) WillRun(report types.SpecReport) { r.emitBlock(r.f(r.codeLocationBlock(report, "{{/}}", v.Is(types.VerbosityLevelVeryVerbose), false))) } +func (r *DefaultReporter) wrapTextBlock(sectionName string, fn func()) { + r.emitBlock("\n") + if r.conf.GithubOutput { + r.emitBlock(r.fi(1, "::group::%s", sectionName)) + } else { + r.emitBlock(r.fi(1, "{{gray}}%s >>{{/}}", sectionName)) + } + fn() + if r.conf.GithubOutput { + r.emitBlock(r.fi(1, "::endgroup::")) + } else { + r.emitBlock(r.fi(1, "{{gray}}<< %s{{/}}", sectionName)) + } + +} + func (r *DefaultReporter) DidRun(report types.SpecReport) { v := r.conf.Verbosity() inParallel := report.RunningInParallel + //should we completely omit this spec? + if report.State.Is(types.SpecStateSkipped) && r.conf.SilenceSkips { + return + } + header := r.specDenoter if report.LeafNodeType.Is(types.NodeTypesForSuiteLevelNodes) { header = fmt.Sprintf("[%s]", report.LeafNodeType) @@ -262,9 +283,12 @@ func (r *DefaultReporter) DidRun(report types.SpecReport) { } } - // If we have no content to show, jsut emit the header and return + // If we have no content to show, just emit the header and return if !reportHasContent { r.emit(r.f(highlightColor + header + "{{/}}")) + if r.conf.ForceNewlines { + r.emit("\n") + } return } @@ -283,26 +307,23 @@ func (r *DefaultReporter) DidRun(report types.SpecReport) { //Emit Stdout/Stderr Output if showSeparateStdSection { - r.emitBlock("\n") - r.emitBlock(r.fi(1, "{{gray}}Captured StdOut/StdErr Output >>{{/}}")) - r.emitBlock(r.fi(1, "%s", report.CapturedStdOutErr)) - r.emitBlock(r.fi(1, "{{gray}}<< Captured StdOut/StdErr Output{{/}}")) + r.wrapTextBlock("Captured StdOut/StdErr Output", func() { + r.emitBlock(r.fi(1, "%s", report.CapturedStdOutErr)) + }) } if showSeparateVisibilityAlwaysReportsSection { - r.emitBlock("\n") - r.emitBlock(r.fi(1, "{{gray}}Report Entries >>{{/}}")) - for _, entry := range report.ReportEntries.WithVisibility(types.ReportEntryVisibilityAlways) { - r.emitReportEntry(1, entry) - } - r.emitBlock(r.fi(1, "{{gray}}<< Report Entries{{/}}")) + r.wrapTextBlock("Report Entries", func() { + for _, entry := range report.ReportEntries.WithVisibility(types.ReportEntryVisibilityAlways) { + r.emitReportEntry(1, entry) + } + }) } if showTimeline { - r.emitBlock("\n") - r.emitBlock(r.fi(1, "{{gray}}Timeline >>{{/}}")) - r.emitTimeline(1, report, timeline) - r.emitBlock(r.fi(1, "{{gray}}<< Timeline{{/}}")) + r.wrapTextBlock("Timeline", func() { + r.emitTimeline(1, report, timeline) + }) } // Emit Failure Message @@ -405,7 +426,15 @@ func (r *DefaultReporter) emitShortFailure(indent uint, state types.SpecState, f func (r *DefaultReporter) emitFailure(indent uint, state types.SpecState, failure types.Failure, includeAdditionalFailure bool) { highlightColor := r.highlightColorForState(state) r.emitBlock(r.fi(indent, highlightColor+"[%s] %s{{/}}", r.humanReadableState(state), failure.Message)) - r.emitBlock(r.fi(indent, highlightColor+"In {{bold}}[%s]{{/}}"+highlightColor+" at: {{bold}}%s{{/}} {{gray}}@ %s{{/}}\n", failure.FailureNodeType, failure.Location, failure.TimelineLocation.Time.Format(types.GINKGO_TIME_FORMAT))) + if r.conf.GithubOutput { + level := "error" + if state.Is(types.SpecStateSkipped) { + level = "notice" + } + r.emitBlock(r.fi(indent, "::%s file=%s,line=%d::%s %s", level, failure.Location.FileName, failure.Location.LineNumber, failure.FailureNodeType, failure.TimelineLocation.Time.Format(types.GINKGO_TIME_FORMAT))) + } else { + r.emitBlock(r.fi(indent, highlightColor+"In {{bold}}[%s]{{/}}"+highlightColor+" at: {{bold}}%s{{/}} {{gray}}@ %s{{/}}\n", failure.FailureNodeType, failure.Location, failure.TimelineLocation.Time.Format(types.GINKGO_TIME_FORMAT))) + } if failure.ForwardedPanic != "" { r.emitBlock("\n") r.emitBlock(r.fi(indent, highlightColor+"%s{{/}}", failure.ForwardedPanic)) diff --git a/vendor/github.com/onsi/ginkgo/v2/reporters/json_report.go b/vendor/github.com/onsi/ginkgo/v2/reporters/json_report.go index be506f9..5d3e8db 100644 --- a/vendor/github.com/onsi/ginkgo/v2/reporters/json_report.go +++ b/vendor/github.com/onsi/ginkgo/v2/reporters/json_report.go @@ -18,6 +18,7 @@ func GenerateJSONReport(report types.Report, destination string) error { if err != nil { return err } + defer f.Close() enc := json.NewEncoder(f) enc.SetIndent("", " ") err = enc.Encode([]types.Report{ @@ -26,7 +27,7 @@ func GenerateJSONReport(report types.Report, destination string) error { if err != nil { return err } - return f.Close() + return nil } // MergeJSONReports produces a single JSON-formatted report at the passed in destination by merging the JSON-formatted reports provided in sources @@ -57,11 +58,12 @@ func MergeAndCleanupJSONReports(sources []string, destination string) ([]string, if err != nil { return messages, err } + defer f.Close() enc := json.NewEncoder(f) enc.SetIndent("", " ") err = enc.Encode(allReports) if err != nil { return messages, err } - return messages, f.Close() + return messages, nil } diff --git a/vendor/github.com/onsi/ginkgo/v2/reporters/junit_report.go b/vendor/github.com/onsi/ginkgo/v2/reporters/junit_report.go index 8160422..562e0f6 100644 --- a/vendor/github.com/onsi/ginkgo/v2/reporters/junit_report.go +++ b/vendor/github.com/onsi/ginkgo/v2/reporters/junit_report.go @@ -15,6 +15,7 @@ import ( "fmt" "os" "path" + "regexp" "strings" "github.com/onsi/ginkgo/v2/config" @@ -104,6 +105,8 @@ type JUnitProperty struct { Value string `xml:"value,attr"` } +var ownerRE = regexp.MustCompile(`(?i)^owner:(.*)$`) + type JUnitTestCase struct { // Name maps onto the full text of the spec - equivalent to "[SpecReport.LeafNodeType] SpecReport.FullText()" Name string `xml:"name,attr"` @@ -113,6 +116,8 @@ type JUnitTestCase struct { Status string `xml:"status,attr"` // Time is the time in seconds to execute the spec - maps onto SpecReport.RunTime Time float64 `xml:"time,attr"` + // Owner is the owner the spec - is set if a label matching Label("owner:X") is provided. The last matching label is used as the owner, thereby allowing specs to override owners specified in container nodes. + Owner string `xml:"owner,attr,omitempty"` //Skipped is populated with a message if the test was skipped or pending Skipped *JUnitSkipped `xml:"skipped,omitempty"` //Error is populated if the test panicked or was interrupted @@ -172,6 +177,7 @@ func GenerateJUnitReportWithConfig(report types.Report, dst string, config Junit {"FocusFiles", strings.Join(report.SuiteConfig.FocusFiles, ";")}, {"SkipFiles", strings.Join(report.SuiteConfig.SkipFiles, ";")}, {"FailOnPending", fmt.Sprintf("%t", report.SuiteConfig.FailOnPending)}, + {"FailOnEmpty", fmt.Sprintf("%t", report.SuiteConfig.FailOnEmpty)}, {"FailFast", fmt.Sprintf("%t", report.SuiteConfig.FailFast)}, {"FlakeAttempts", fmt.Sprintf("%d", report.SuiteConfig.FlakeAttempts)}, {"DryRun", fmt.Sprintf("%t", report.SuiteConfig.DryRun)}, @@ -195,6 +201,12 @@ func GenerateJUnitReportWithConfig(report types.Report, dst string, config Junit if len(labels) > 0 && !config.OmitSpecLabels { name = name + " [" + strings.Join(labels, ", ") + "]" } + owner := "" + for _, label := range labels { + if matches := ownerRE.FindStringSubmatch(label); len(matches) == 2 { + owner = matches[1] + } + } name = strings.TrimSpace(name) test := JUnitTestCase{ @@ -202,6 +214,7 @@ func GenerateJUnitReportWithConfig(report types.Report, dst string, config Junit Classname: report.SuiteDescription, Status: spec.State.String(), Time: spec.RunTime.Seconds(), + Owner: owner, } if !spec.State.Is(config.OmitTimelinesForSpecState) { test.SystemErr = systemErrForUnstructuredReporters(spec) @@ -312,6 +325,7 @@ func MergeAndCleanupJUnitReports(sources []string, dst string) ([]string, error) continue } err = xml.NewDecoder(f).Decode(&report) + _ = f.Close() if err != nil { messages = append(messages, fmt.Sprintf("Could not decode %s:\n%s", source, err.Error())) continue diff --git a/vendor/github.com/onsi/ginkgo/v2/types/code_location.go b/vendor/github.com/onsi/ginkgo/v2/types/code_location.go index 9cd5768..57e8751 100644 --- a/vendor/github.com/onsi/ginkgo/v2/types/code_location.go +++ b/vendor/github.com/onsi/ginkgo/v2/types/code_location.go @@ -149,7 +149,7 @@ func PruneStack(fullStackTrace string, skip int) string { re := regexp.MustCompile(`\/ginkgo\/|\/pkg\/testing\/|\/pkg\/runtime\/`) for i := 0; i < len(stack)/2; i++ { // We filter out based on the source code file name. - if !re.Match([]byte(stack[i*2+1])) { + if !re.MatchString(stack[i*2+1]) { prunedStack = append(prunedStack, stack[i*2]) prunedStack = append(prunedStack, stack[i*2+1]) } diff --git a/vendor/github.com/onsi/ginkgo/v2/types/config.go b/vendor/github.com/onsi/ginkgo/v2/types/config.go index c88fc85..97a049e 100644 --- a/vendor/github.com/onsi/ginkgo/v2/types/config.go +++ b/vendor/github.com/onsi/ginkgo/v2/types/config.go @@ -25,6 +25,7 @@ type SuiteConfig struct { SkipFiles []string LabelFilter string FailOnPending bool + FailOnEmpty bool FailFast bool FlakeAttempts int MustPassRepeatedly int @@ -89,6 +90,9 @@ type ReporterConfig struct { VeryVerbose bool FullTrace bool ShowNodeEvents bool + GithubOutput bool + SilenceSkips bool + ForceNewlines bool JSONReport string JUnitReport string @@ -198,6 +202,7 @@ type GoFlagsConfig struct { A bool ASMFlags string BuildMode string + BuildVCS bool Compiler string GCCGoFlags string GCFlags string @@ -215,6 +220,7 @@ type GoFlagsConfig struct { ToolExec string Work bool X bool + O string } func NewDefaultGoFlagsConfig() GoFlagsConfig { @@ -264,7 +270,7 @@ var FlagSections = GinkgoFlagSections{ // SuiteConfigFlags provides flags for the Ginkgo test process, and CLI var SuiteConfigFlags = GinkgoFlags{ {KeyPath: "S.RandomSeed", Name: "seed", SectionKey: "order", UsageDefaultValue: "randomly generated by Ginkgo", - Usage: "The seed used to randomize the spec suite."}, + Usage: "The seed used to randomize the spec suite.", AlwaysExport: true}, {KeyPath: "S.RandomizeAllSpecs", Name: "randomize-all", SectionKey: "order", DeprecatedName: "randomizeAllSpecs", DeprecatedDocLink: "changed-command-line-flags", Usage: "If set, ginkgo will randomize all specs together. By default, ginkgo only randomizes the top level Describe, Context and When containers."}, @@ -274,6 +280,8 @@ var SuiteConfigFlags = GinkgoFlags{ Usage: "If set, ginkgo will stop running a test suite after a failure occurs."}, {KeyPath: "S.FlakeAttempts", Name: "flake-attempts", SectionKey: "failure", UsageDefaultValue: "0 - failed tests are not retried", DeprecatedName: "flakeAttempts", DeprecatedDocLink: "changed-command-line-flags", Usage: "Make up to this many attempts to run each spec. If any of the attempts succeed, the suite will not be failed."}, + {KeyPath: "S.FailOnEmpty", Name: "fail-on-empty", SectionKey: "failure", + Usage: "If set, ginkgo will mark the test suite as failed if no specs are run."}, {KeyPath: "S.DryRun", Name: "dry-run", SectionKey: "debug", DeprecatedName: "dryRun", DeprecatedDocLink: "changed-command-line-flags", Usage: "If set, ginkgo will walk the test hierarchy without actually running anything. Best paired with -v."}, @@ -331,6 +339,12 @@ var ReporterConfigFlags = GinkgoFlags{ Usage: "If set, default reporter prints out the full stack trace when a failure occurs"}, {KeyPath: "R.ShowNodeEvents", Name: "show-node-events", SectionKey: "output", Usage: "If set, default reporter prints node > Enter and < Exit events when specs fail"}, + {KeyPath: "R.GithubOutput", Name: "github-output", SectionKey: "output", + Usage: "If set, default reporter prints easier to manage output in Github Actions."}, + {KeyPath: "R.SilenceSkips", Name: "silence-skips", SectionKey: "output", + Usage: "If set, default reporter will not print out skipped tests."}, + {KeyPath: "R.ForceNewlines", Name: "force-newlines", SectionKey: "output", + Usage: "If set, default reporter will ensure a newline appears after each test."}, {KeyPath: "R.JSONReport", Name: "json-report", UsageArgument: "filename.json", SectionKey: "output", Usage: "If set, Ginkgo will generate a JSON-formatted test report at the specified location."}, @@ -499,7 +513,7 @@ var GinkgoCLIWatchFlags = GinkgoFlags{ // GoBuildFlags provides flags for the Ginkgo CLI build, run, and watch commands that capture go's build-time flags. These are passed to go test -c by the ginkgo CLI var GoBuildFlags = GinkgoFlags{ {KeyPath: "Go.Race", Name: "race", SectionKey: "code-and-coverage-analysis", - Usage: "enable data race detection. Supported only on linux/amd64, freebsd/amd64, darwin/amd64, windows/amd64, linux/ppc64le and linux/arm64 (only for 48-bit VMA)."}, + Usage: "enable data race detection. Supported on linux/amd64, linux/ppc64le, linux/arm64, linux/s390x, freebsd/amd64, netbsd/amd64, darwin/amd64, darwin/arm64, and windows/amd64."}, {KeyPath: "Go.Vet", Name: "vet", UsageArgument: "list", SectionKey: "code-and-coverage-analysis", Usage: `Configure the invocation of "go vet" during "go test" to use the comma-separated list of vet checks. If list is empty, "go test" runs "go vet" with a curated list of checks believed to be always worth addressing. If list is "off", "go test" does not run "go vet" at all. Available checks can be found by running 'go doc cmd/vet'`}, {KeyPath: "Go.Cover", Name: "cover", SectionKey: "code-and-coverage-analysis", @@ -515,6 +529,8 @@ var GoBuildFlags = GinkgoFlags{ Usage: "arguments to pass on each go tool asm invocation."}, {KeyPath: "Go.BuildMode", Name: "buildmode", UsageArgument: "mode", SectionKey: "go-build", Usage: "build mode to use. See 'go help buildmode' for more."}, + {KeyPath: "Go.BuildVCS", Name: "buildvcs", SectionKey: "go-build", + Usage: "adds version control information."}, {KeyPath: "Go.Compiler", Name: "compiler", UsageArgument: "name", SectionKey: "go-build", Usage: "name of compiler to use, as in runtime.Compiler (gccgo or gc)."}, {KeyPath: "Go.GCCGoFlags", Name: "gccgoflags", UsageArgument: "'[pattern=]arg list'", SectionKey: "go-build", @@ -549,6 +565,8 @@ var GoBuildFlags = GinkgoFlags{ Usage: "print the name of the temporary work directory and do not delete it when exiting."}, {KeyPath: "Go.X", Name: "x", SectionKey: "go-build", Usage: "print the commands."}, + {KeyPath: "Go.O", Name: "o", SectionKey: "go-build", + Usage: "output binary path (including name)."}, } // GoRunFlags provides flags for the Ginkgo CLI run, and watch commands that capture go's run-time flags. These are passed to the compiled test binary by the ginkgo CLI @@ -602,7 +620,7 @@ func VetAndInitializeCLIAndGoConfig(cliConfig CLIConfig, goFlagsConfig GoFlagsCo } // GenerateGoTestCompileArgs is used by the Ginkgo CLI to generate command line arguments to pass to the go test -c command when compiling the test -func GenerateGoTestCompileArgs(goFlagsConfig GoFlagsConfig, destination string, packageToBuild string, pathToInvocationPath string) ([]string, error) { +func GenerateGoTestCompileArgs(goFlagsConfig GoFlagsConfig, packageToBuild string, pathToInvocationPath string) ([]string, error) { // if the user has set the CoverProfile run-time flag make sure to set the build-time cover flag to make sure // the built test binary can generate a coverprofile if goFlagsConfig.CoverProfile != "" { @@ -625,7 +643,7 @@ func GenerateGoTestCompileArgs(goFlagsConfig GoFlagsConfig, destination string, goFlagsConfig.CoverPkg = strings.Join(adjustedCoverPkgs, ",") } - args := []string{"test", "-c", "-o", destination, packageToBuild} + args := []string{"test", "-c", packageToBuild} goArgs, err := GenerateFlagArgs( GoBuildFlags, map[string]interface{}{ diff --git a/vendor/github.com/onsi/ginkgo/v2/types/errors.go b/vendor/github.com/onsi/ginkgo/v2/types/errors.go index 4fbdc3e..6bb72d0 100644 --- a/vendor/github.com/onsi/ginkgo/v2/types/errors.go +++ b/vendor/github.com/onsi/ginkgo/v2/types/errors.go @@ -505,6 +505,15 @@ func (g ginkgoErrors) IncorrectVariadicParameterTypeToTableFunction(expected, ac } } +func (g ginkgoErrors) ContextsCannotBeUsedInSubtreeTables(cl CodeLocation) error { + return GinkgoError{ + Heading: "Contexts cannot be used in subtree tables", + Message: "You''ve defined a subtree body function that accepts a context but did not provide one in the table entry. Ginkgo SpecContexts can only be passed in to subject and setup nodes - so if you are trying to implement a spec timeout you should request a context in the It function within your subtree body function, not in the subtree body function itself.", + CodeLocation: cl, + DocLink: "table-specs", + } +} + /* Parallel Synchronization errors */ func (g ginkgoErrors) AggregatedReportUnavailableDueToNodeDisappearing() error { diff --git a/vendor/github.com/onsi/ginkgo/v2/types/flags.go b/vendor/github.com/onsi/ginkgo/v2/types/flags.go index 9186ae8..de69f30 100644 --- a/vendor/github.com/onsi/ginkgo/v2/types/flags.go +++ b/vendor/github.com/onsi/ginkgo/v2/types/flags.go @@ -24,7 +24,8 @@ type GinkgoFlag struct { DeprecatedDocLink string DeprecatedVersion string - ExportAs string + ExportAs string + AlwaysExport bool } type GinkgoFlags []GinkgoFlag @@ -431,7 +432,7 @@ func (ssv stringSliceVar) Set(s string) error { return nil } -//given a set of GinkgoFlags and bindings, generate flag arguments suitable to be passed to an application with that set of flags configured. +// given a set of GinkgoFlags and bindings, generate flag arguments suitable to be passed to an application with that set of flags configured. func GenerateFlagArgs(flags GinkgoFlags, bindings interface{}) ([]string, error) { result := []string{} for _, flag := range flags { @@ -451,19 +452,19 @@ func GenerateFlagArgs(flags GinkgoFlags, bindings interface{}) ([]string, error) iface := value.Interface() switch value.Type() { case reflect.TypeOf(string("")): - if iface.(string) != "" { + if iface.(string) != "" || flag.AlwaysExport { result = append(result, fmt.Sprintf("--%s=%s", name, iface)) } case reflect.TypeOf(int64(0)): - if iface.(int64) != 0 { + if iface.(int64) != 0 || flag.AlwaysExport { result = append(result, fmt.Sprintf("--%s=%d", name, iface)) } case reflect.TypeOf(float64(0)): - if iface.(float64) != 0 { + if iface.(float64) != 0 || flag.AlwaysExport { result = append(result, fmt.Sprintf("--%s=%f", name, iface)) } case reflect.TypeOf(int(0)): - if iface.(int) != 0 { + if iface.(int) != 0 || flag.AlwaysExport { result = append(result, fmt.Sprintf("--%s=%d", name, iface)) } case reflect.TypeOf(bool(true)): @@ -471,7 +472,7 @@ func GenerateFlagArgs(flags GinkgoFlags, bindings interface{}) ([]string, error) result = append(result, fmt.Sprintf("--%s", name)) } case reflect.TypeOf(time.Duration(0)): - if iface.(time.Duration) != time.Duration(0) { + if iface.(time.Duration) != time.Duration(0) || flag.AlwaysExport { result = append(result, fmt.Sprintf("--%s=%s", name, iface)) } diff --git a/vendor/github.com/onsi/ginkgo/v2/types/label_filter.go b/vendor/github.com/onsi/ginkgo/v2/types/label_filter.go index b0d3b65..7fdc8aa 100644 --- a/vendor/github.com/onsi/ginkgo/v2/types/label_filter.go +++ b/vendor/github.com/onsi/ginkgo/v2/types/label_filter.go @@ -45,6 +45,83 @@ func orAction(a, b LabelFilter) LabelFilter { return func(labels []string) bool { return a(labels) || b(labels) } } +func labelSetFor(key string, labels []string) map[string]bool { + key = strings.ToLower(strings.TrimSpace(key)) + out := map[string]bool{} + for _, label := range labels { + components := strings.SplitN(label, ":", 2) + if len(components) < 2 { + continue + } + if key == strings.ToLower(strings.TrimSpace(components[0])) { + out[strings.ToLower(strings.TrimSpace(components[1]))] = true + } + } + + return out +} + +func isEmptyLabelSetAction(key string) LabelFilter { + return func(labels []string) bool { + return len(labelSetFor(key, labels)) == 0 + } +} + +func containsAnyLabelSetAction(key string, expectedValues []string) LabelFilter { + return func(labels []string) bool { + set := labelSetFor(key, labels) + for _, value := range expectedValues { + if set[value] { + return true + } + } + return false + } +} + +func containsAllLabelSetAction(key string, expectedValues []string) LabelFilter { + return func(labels []string) bool { + set := labelSetFor(key, labels) + for _, value := range expectedValues { + if !set[value] { + return false + } + } + return true + } +} + +func consistsOfLabelSetAction(key string, expectedValues []string) LabelFilter { + return func(labels []string) bool { + set := labelSetFor(key, labels) + if len(set) != len(expectedValues) { + return false + } + for _, value := range expectedValues { + if !set[value] { + return false + } + } + return true + } +} + +func isSubsetOfLabelSetAction(key string, expectedValues []string) LabelFilter { + expectedSet := map[string]bool{} + for _, value := range expectedValues { + expectedSet[value] = true + } + return func(labels []string) bool { + set := labelSetFor(key, labels) + for value := range set { + if !expectedSet[value] { + return false + } + } + return true + } +} + type lfToken uint const ( @@ -58,6 +135,9 @@ const ( lfTokenOr lfTokenRegexp lfTokenLabel + lfTokenSetKey + lfTokenSetOperation + lfTokenSetArgument lfTokenEOF ) @@ -71,6 +151,8 @@ func (l lfToken) Precedence() int { return 2 case lfTokenNot: return 3 + case lfTokenSetOperation: + return 4 } return -1 } @@ -93,6 +175,12 @@ func (l lfToken) String() string { return "/regexp/" case lfTokenLabel: return "label" + case lfTokenSetKey: + return "set_key" + case lfTokenSetOperation: + return "set_operation" + case lfTokenSetArgument: + return "set_argument" case lfTokenEOF: return "EOF" } @@ -148,6 +236,35 @@ func (tn *treeNode) constructLabelFilter(input string) (LabelFilter, error) { return nil, GinkgoErrors.SyntaxErrorParsingLabelFilter(input, tn.location, fmt.Sprintf("RegExp compilation error: %s", err)) } return matchLabelRegexAction(re), nil + case lfTokenSetOperation: + tokenSetOperation := strings.ToLower(tn.value) + if tokenSetOperation == "isempty" { + return isEmptyLabelSetAction(tn.leftNode.value), nil + } + if tn.rightNode == nil { + return nil, GinkgoErrors.SyntaxErrorParsingLabelFilter(input, tn.location, fmt.Sprintf("Set operation '%s' is missing an argument.", tn.value)) + } + + rawValues := strings.Split(tn.rightNode.value, ",") + values := make([]string, len(rawValues)) + for i := range rawValues { + values[i] = strings.ToLower(strings.TrimSpace(rawValues[i])) + if strings.ContainsAny(values[i], "&|!,()/") { + return nil, GinkgoErrors.SyntaxErrorParsingLabelFilter(input, tn.rightNode.location, fmt.Sprintf("Invalid label value '%s' in set operation argument.", values[i])) + } else if values[i] == "" { + return nil, GinkgoErrors.SyntaxErrorParsingLabelFilter(input, tn.rightNode.location, "Empty label value in set operation argument.") + } + } + switch tokenSetOperation { + case "containsany": + return containsAnyLabelSetAction(tn.leftNode.value, values), nil + case "containsall": + return containsAllLabelSetAction(tn.leftNode.value, values), nil + case "consistsof": + return consistsOfLabelSetAction(tn.leftNode.value, values), nil + case "issubsetof": + return isSubsetOfLabelSetAction(tn.leftNode.value, values), nil + } } if tn.rightNode == nil { @@ -203,7 +320,17 @@ func (tn *treeNode) toString(indent int) string { return out } +var validSetOperations = map[string]string{ + "containsany": "containsAny", + "containsall": "containsAll", + "consistsof": "consistsOf", + "issubsetof": "isSubsetOf", + "isempty": "isEmpty", +} + func tokenize(input string) func() (*treeNode, error) { + lastToken := lfTokenInvalid + lastValue := "" runes, i := []rune(input), 0 peekIs := func(r rune) bool { @@ -233,6 +360,53 @@ func tokenize(input string) func() (*treeNode, error) { } node := &treeNode{location: i} + defer func() { + lastToken = node.token + lastValue = node.value + }() + + if lastToken == lfTokenSetKey { + //we should get a valid set operation next + value, n := consumeUntil(" )") + if validSetOperations[strings.ToLower(value)] == "" { + return &treeNode{}, GinkgoErrors.SyntaxErrorParsingLabelFilter(input, i, fmt.Sprintf("Invalid set operation '%s'.", value)) + } + i += n + node.token, node.value = lfTokenSetOperation, value + return node, nil + } + if lastToken == lfTokenSetOperation { + //we should get an argument next, if we aren't isempty + var arg = "" + origI := i + if runes[i] == '{' { + i += 1 + value, n := consumeUntil("}") + if i+n >= len(runes) { + return &treeNode{}, GinkgoErrors.SyntaxErrorParsingLabelFilter(input, i-1, "Missing closing '}' in set operation argument?") + } + i += n + 1 + arg = value + } else { + value, n := consumeUntil("&|!,()/") + i += n + arg = strings.TrimSpace(value) + } + if strings.ToLower(lastValue) == "isempty" && arg != "" { + return &treeNode{}, GinkgoErrors.SyntaxErrorParsingLabelFilter(input, origI, fmt.Sprintf("isEmpty does not take arguments, was passed '%s'.", arg)) + } + if arg == "" && strings.ToLower(lastValue) != "isempty" { + if i < len(runes) && runes[i] == '/' { + return &treeNode{}, GinkgoErrors.SyntaxErrorParsingLabelFilter(input, origI, "Set operations do not support regular expressions.") + } else { + return &treeNode{}, GinkgoErrors.SyntaxErrorParsingLabelFilter(input, origI, fmt.Sprintf("Set operation '%s' requires an argument.", lastValue)) + } + } + // note that we sent an empty SetArgument token if we are isempty + node.token, node.value = lfTokenSetArgument, arg + return node, nil + } + switch runes[i] { case '&': if !peekIs('&') { @@ -264,8 +438,38 @@ func tokenize(input string) func() (*treeNode, error) { i += n + 1 node.token, node.value = lfTokenRegexp, value default: - value, n := consumeUntil("&|!,()/") + value, n := consumeUntil("&|!,()/:") i += n + value = strings.TrimSpace(value) + + //are we the beginning of a set operation? + if i < len(runes) && runes[i] == ':' { + if peekIs(' ') { + if value == "" { + return &treeNode{}, GinkgoErrors.SyntaxErrorParsingLabelFilter(input, i, "Missing set key.") + } + i += 1 + //we are the beginning of a set operation + node.token, node.value = lfTokenSetKey, value + return node, nil + } + additionalValue, n := consumeUntil("&|!,()/") + additionalValue = strings.TrimSpace(additionalValue) + if additionalValue == ":" { + return &treeNode{}, GinkgoErrors.SyntaxErrorParsingLabelFilter(input, i, "Missing set operation.") + } + i += n + value += additionalValue + } + + valueToCheckForSetOperation := strings.ToLower(value) + for setOperation := range validSetOperations { + idx := strings.Index(valueToCheckForSetOperation, " "+setOperation) + if idx > 0 { + return &treeNode{}, GinkgoErrors.SyntaxErrorParsingLabelFilter(input, i-n+idx+1, fmt.Sprintf("Looks like you are using the set operator '%s' but did not provide a set key. Did you forget the ':'?", validSetOperations[setOperation])) + } + } + node.token, node.value = lfTokenLabel, strings.TrimSpace(value) } return node, nil @@ -307,7 +511,7 @@ LOOP: switch node.token { case lfTokenEOF: break LOOP - case lfTokenLabel, lfTokenRegexp: + case lfTokenLabel, lfTokenRegexp, lfTokenSetKey: if current.rightNode != nil { return nil, GinkgoErrors.SyntaxErrorParsingLabelFilter(input, node.location, "Found two adjacent labels. You need an operator between them.") } @@ -326,6 +530,18 @@ LOOP: node.setLeftNode(nodeToStealFrom.rightNode) nodeToStealFrom.setRightNode(node) current = node + case lfTokenSetOperation: + if current.rightNode == nil { + return nil, GinkgoErrors.SyntaxErrorParsingLabelFilter(input, node.location, fmt.Sprintf("Set operation '%s' missing left hand operand.", node.value)) + } + node.setLeftNode(current.rightNode) + current.setRightNode(node) + current = node + case lfTokenSetArgument: + if current.rightNode != nil { + return nil, GinkgoErrors.SyntaxErrorParsingLabelFilter(input, node.location, fmt.Sprintf("Unexpected set argument '%s'.", node.token)) + } + current.setRightNode(node) case lfTokenCloseGroup: firstUnmatchedOpenNode := current.firstUnmatchedOpenNode() if firstUnmatchedOpenNode == nil { @@ -354,5 +570,14 @@ func ValidateAndCleanupLabel(label string, cl CodeLocation) (string, error) { if strings.ContainsAny(out, "&|!,()/") { return "", GinkgoErrors.InvalidLabel(label, cl) } + if out[0] == ':' { + return "", GinkgoErrors.InvalidLabel(label, cl) + } + if strings.Contains(out, ":") { + components := strings.SplitN(out, ":", 2) + if len(components) < 2 || components[1] == "" { + return "", GinkgoErrors.InvalidLabel(label, cl) + } + } return out, nil } diff --git a/vendor/github.com/onsi/ginkgo/v2/types/version.go b/vendor/github.com/onsi/ginkgo/v2/types/version.go index 21fb22b..6dfb25f 100644 --- a/vendor/github.com/onsi/ginkgo/v2/types/version.go +++ b/vendor/github.com/onsi/ginkgo/v2/types/version.go @@ -1,3 +1,3 @@ package types -const VERSION = "2.12.0" +const VERSION = "2.20.2" diff --git a/vendor/github.com/pires/go-proxyproto/header.go b/vendor/github.com/pires/go-proxyproto/header.go index 81ebeb3..209c2cc 100644 --- a/vendor/github.com/pires/go-proxyproto/header.go +++ b/vendor/github.com/pires/go-proxyproto/header.go @@ -155,11 +155,11 @@ func (header *Header) EqualsTo(otherHeader *Header) bool { if otherHeader == nil { return false } - // TLVs only exist for version 2 - if header.Version == 2 && !bytes.Equal(header.rawTLVs, otherHeader.rawTLVs) { + if header.Version != otherHeader.Version || header.Command != otherHeader.Command || header.TransportProtocol != otherHeader.TransportProtocol { return false } - if header.Version != otherHeader.Version || header.Command != otherHeader.Command || header.TransportProtocol != otherHeader.TransportProtocol { + // TLVs only exist for version 2 + if header.Version == 2 && !bytes.Equal(header.rawTLVs, otherHeader.rawTLVs) { return false } // Return early for header with LOCAL command, which contains no address information diff --git a/vendor/github.com/pires/go-proxyproto/policy.go b/vendor/github.com/pires/go-proxyproto/policy.go index 71ad62b..ebef8b9 100644 --- a/vendor/github.com/pires/go-proxyproto/policy.go +++ b/vendor/github.com/pires/go-proxyproto/policy.go @@ -14,6 +14,21 @@ import ( // In case an error is returned the connection is denied. type PolicyFunc func(upstream net.Addr) (Policy, error) +// ConnPolicyFunc can be used to decide whether to trust the PROXY info +// based on connection policy options. If set, the connecting addresses +// (remote and local) are passed in as argument. +// +// See below for the different policies. +// +// In case an error is returned the connection is denied. +type ConnPolicyFunc func(connPolicyOptions ConnPolicyOptions) (Policy, error) + +// ConnPolicyOptions contains the remote and local addresses of a connection. +type ConnPolicyOptions struct { + Upstream net.Addr + Downstream net.Addr +} + // Policy defines how a connection with a PROXY header address is treated. type Policy int @@ -32,8 +47,31 @@ const ( // a PROXY header is not present, subsequent reads do not. It is the task // of the code using the connection to handle that case properly. REQUIRE + // SKIP accepts a connection without requiring the PROXY header + // Note: an example usage can be found in the SkipProxyHeaderForCIDR + // function. + SKIP ) +// SkipProxyHeaderForCIDR returns a PolicyFunc which can be used to accept a +// connection from a skipHeaderCIDR without requiring a PROXY header, e.g. +// Kubernetes pods local traffic. The def is a policy to use when an upstream +// address doesn't match the skipHeaderCIDR. +func SkipProxyHeaderForCIDR(skipHeaderCIDR *net.IPNet, def Policy) PolicyFunc { + return func(upstream net.Addr) (Policy, error) { + ip, err := ipFromAddr(upstream) + if err != nil { + return def, err + } + + if skipHeaderCIDR != nil && skipHeaderCIDR.Contains(ip) { + return SKIP, nil + } + + return def, nil + } +} + // WithPolicy adds given policy to a connection when passed as option to NewConn() func WithPolicy(p Policy) func(*Conn) { return func(c *Conn) { @@ -147,3 +185,22 @@ func ipFromAddr(upstream net.Addr) (net.IP, error) { return upstreamIP, nil } + +// IgnoreProxyHeaderNotOnInterface retuns a ConnPolicyFunc which can be used to +// decide whether to use or ignore PROXY headers depending on the connection +// being made on a specific interface. This policy can be used when the server +// is bound to multiple interfaces but wants to allow on only one interface. +func IgnoreProxyHeaderNotOnInterface(allowedIP net.IP) ConnPolicyFunc { + return func(connOpts ConnPolicyOptions) (Policy, error) { + ip, err := ipFromAddr(connOpts.Downstream) + if err != nil { + return REJECT, err + } + + if allowedIP.Equal(ip) { + return USE, nil + } + + return IGNORE, nil + } +} diff --git a/vendor/github.com/pires/go-proxyproto/protocol.go b/vendor/github.com/pires/go-proxyproto/protocol.go index 6f5641d..270b90d 100644 --- a/vendor/github.com/pires/go-proxyproto/protocol.go +++ b/vendor/github.com/pires/go-proxyproto/protocol.go @@ -2,6 +2,8 @@ package proxyproto import ( "bufio" + "errors" + "fmt" "io" "net" "sync" @@ -9,11 +11,17 @@ import ( "time" ) -// DefaultReadHeaderTimeout is how long header processing waits for header to -// be read from the wire, if Listener.ReaderHeaderTimeout is not set. -// It's kept as a global variable so to make it easier to find and override, -// e.g. go build -ldflags -X "github.com/pires/go-proxyproto.DefaultReadHeaderTimeout=1s" -var DefaultReadHeaderTimeout = 200 * time.Millisecond +var ( + // DefaultReadHeaderTimeout is how long header processing waits for header to + // be read from the wire, if Listener.ReaderHeaderTimeout is not set. + // It's kept as a global variable so to make it easier to find and override, + // e.g. go build -ldflags -X "github.com/pires/go-proxyproto.DefaultReadHeaderTimeout=1s" + DefaultReadHeaderTimeout = 10 * time.Second + + // ErrInvalidUpstream should be returned when an upstream connection address + // is not trusted, and therefore is invalid. + ErrInvalidUpstream = fmt.Errorf("proxyproto: upstream connection address not trusted for PROXY information") +) // Listener is used to wrap an underlying listener, // whose connections may be using the HAProxy Proxy Protocol. @@ -22,9 +30,14 @@ var DefaultReadHeaderTimeout = 200 * time.Millisecond // connections in order to prevent blocking operations. If no ReadHeaderTimeout // is set, a default of 200ms will be used. This can be disabled by setting the // timeout to < 0. +// +// Only one of Policy or ConnPolicy should be provided. If both are provided then +// a panic would occur during accept. type Listener struct { - Listener net.Listener + Listener net.Listener + // Deprecated: use ConnPolicyFunc instead. This will be removed in future release. Policy PolicyFunc + ConnPolicy ConnPolicyFunc ValidateHeader Validator ReadHeaderTimeout time.Duration } @@ -38,10 +51,11 @@ type Conn struct { once sync.Once readErr error conn net.Conn - Validate Validator bufReader *bufio.Reader + reader io.Reader header *Header ProxyHeaderPolicy Policy + Validate Validator readHeaderTimeout time.Duration } @@ -58,39 +72,70 @@ func ValidateHeader(v Validator) func(*Conn) { } } -// Accept waits for and returns the next connection to the listener. -func (p *Listener) Accept() (net.Conn, error) { - // Get the underlying connection - conn, err := p.Listener.Accept() - if err != nil { - return nil, err +// SetReadHeaderTimeout sets the readHeaderTimeout for a connection when passed as option to NewConn() +func SetReadHeaderTimeout(t time.Duration) func(*Conn) { + return func(c *Conn) { + if t >= 0 { + c.readHeaderTimeout = t + } } +} - proxyHeaderPolicy := USE - if p.Policy != nil { - proxyHeaderPolicy, err = p.Policy(conn.RemoteAddr()) +// Accept waits for and returns the next valid connection to the listener. +func (p *Listener) Accept() (net.Conn, error) { + for { + // Get the underlying connection + conn, err := p.Listener.Accept() if err != nil { - // can't decide the policy, we can't accept the connection - conn.Close() return nil, err } - } - newConn := NewConn( - conn, - WithPolicy(proxyHeaderPolicy), - ValidateHeader(p.ValidateHeader), - ) + proxyHeaderPolicy := USE + if p.Policy != nil && p.ConnPolicy != nil { + panic("only one of policy or connpolicy must be provided.") + } + if p.Policy != nil || p.ConnPolicy != nil { + if p.Policy != nil { + proxyHeaderPolicy, err = p.Policy(conn.RemoteAddr()) + } else { + proxyHeaderPolicy, err = p.ConnPolicy(ConnPolicyOptions{ + Upstream: conn.RemoteAddr(), + Downstream: conn.LocalAddr(), + }) + } + if err != nil { + // can't decide the policy, we can't accept the connection + conn.Close() - // If the ReadHeaderTimeout for the listener is unset, use the default timeout. - if p.ReadHeaderTimeout == 0 { - p.ReadHeaderTimeout = DefaultReadHeaderTimeout - } + if errors.Is(err, ErrInvalidUpstream) { + // keep listening for other connections + continue + } + + return nil, err + } + // Handle a connection as a regular one + if proxyHeaderPolicy == SKIP { + return conn, nil + } + } - // Set the readHeaderTimeout of the new conn to the value of the listener - newConn.readHeaderTimeout = p.ReadHeaderTimeout + newConn := NewConn( + conn, + WithPolicy(proxyHeaderPolicy), + ValidateHeader(p.ValidateHeader), + ) - return newConn, nil + // If the ReadHeaderTimeout for the listener is unset, use the default timeout. + if p.ReadHeaderTimeout == 0 { + p.ReadHeaderTimeout = DefaultReadHeaderTimeout + } + + // Set the readHeaderTimeout of the new conn to the value of the listener + newConn.readHeaderTimeout = p.ReadHeaderTimeout + + return newConn, nil + } } // Close closes the underlying listener. @@ -106,8 +151,15 @@ func (p *Listener) Addr() net.Addr { // NewConn is used to wrap a net.Conn that may be speaking // the proxy protocol into a proxyproto.Conn func NewConn(conn net.Conn, opts ...func(*Conn)) *Conn { + // For v1 the header length is at most 108 bytes. + // For v2 the header length is at most 52 bytes plus the length of the TLVs. + // We use 256 bytes to be safe. + const bufSize = 256 + br := bufio.NewReaderSize(conn, bufSize) + pConn := &Conn{ - bufReader: bufio.NewReader(conn), + bufReader: br, + reader: io.MultiReader(br, conn), conn: conn, } @@ -129,7 +181,7 @@ func (p *Conn) Read(b []byte) (int, error) { return 0, p.readErr } - return p.bufReader.Read(b) + return p.reader.Read(b) } // Write wraps original conn.Write @@ -240,7 +292,9 @@ func (p *Conn) readHeader() error { // run on the connection, as we don't want to override the previous // read deadline the user may have used. if p.readHeaderTimeout > 0 { - p.conn.SetReadDeadline(time.Now().Add(p.readHeaderTimeout)) + if err := p.conn.SetReadDeadline(time.Now().Add(p.readHeaderTimeout)); err != nil { + return err + } } header, err := Read(p.bufReader) @@ -255,7 +309,9 @@ func (p *Conn) readHeader() error { if t == nil { t = time.Time{} } - p.conn.SetReadDeadline(t.(time.Time)) + if err := p.conn.SetReadDeadline(t.(time.Time)); err != nil { + return err + } if netErr, ok := err.(net.Error); ok && netErr.Timeout() { err = ErrNoProxyProtocol } @@ -307,5 +363,27 @@ func (p *Conn) WriteTo(w io.Writer) (int64, error) { if p.readErr != nil { return 0, p.readErr } - return p.bufReader.WriteTo(w) + + b := make([]byte, p.bufReader.Buffered()) + if _, err := p.bufReader.Read(b); err != nil { + return 0, err // this should never as we read buffered data + } + + var n int64 + { + nn, err := w.Write(b) + n += int64(nn) + if err != nil { + return n, err + } + } + { + nn, err := io.Copy(w, p.conn) + n += nn + if err != nil { + return n, err + } + } + + return n, nil } diff --git a/vendor/github.com/pires/go-proxyproto/v1.go b/vendor/github.com/pires/go-proxyproto/v1.go index 23de95e..0d34ba5 100644 --- a/vendor/github.com/pires/go-proxyproto/v1.go +++ b/vendor/github.com/pires/go-proxyproto/v1.go @@ -5,6 +5,7 @@ import ( "bytes" "fmt" "net" + "net/netip" "strconv" "strings" ) @@ -221,11 +222,22 @@ func parseV1PortNumber(portStr string) (int, error) { return port, nil } -func parseV1IPAddress(protocol AddressFamilyAndProtocol, addrStr string) (addr net.IP, err error) { - addr = net.ParseIP(addrStr) - tryV4 := addr.To4() - if (protocol == TCPv4 && tryV4 == nil) || (protocol == TCPv6 && tryV4 != nil) { - err = ErrInvalidAddress +func parseV1IPAddress(protocol AddressFamilyAndProtocol, addrStr string) (net.IP, error) { + addr, err := netip.ParseAddr(addrStr) + if err != nil { + return nil, ErrInvalidAddress } - return + + switch protocol { + case TCPv4: + if addr.Is4() { + return net.IP(addr.AsSlice()), nil + } + case TCPv6: + if addr.Is6() || addr.Is4In6() { + return net.IP(addr.AsSlice()), nil + } + } + + return nil, ErrInvalidAddress } diff --git a/vendor/github.com/quic-go/quic-go/.gitignore b/vendor/github.com/quic-go/quic-go/.gitignore index 3cc06f2..b454729 100644 --- a/vendor/github.com/quic-go/quic-go/.gitignore +++ b/vendor/github.com/quic-go/quic-go/.gitignore @@ -4,6 +4,7 @@ main mockgen_tmp.go *.qtr *.qlog +*.sqlog *.txt race.[0-9]* diff --git a/vendor/github.com/quic-go/quic-go/.golangci.yml b/vendor/github.com/quic-go/quic-go/.golangci.yml index 469d54c..63b40cc 100644 --- a/vendor/github.com/quic-go/quic-go/.golangci.yml +++ b/vendor/github.com/quic-go/quic-go/.golangci.yml @@ -1,21 +1,29 @@ -run: - skip-files: - - internal/handshake/cipher_suite.go linters-settings: misspell: ignore-words: - ect + depguard: + rules: + quicvarint: + list-mode: strict + files: + - "**/github.com/quic-go/quic-go/quicvarint/*" + - "!$test" + allow: + - $gostd linters: disable-all: true enable: - asciicheck + - depguard - exhaustive - exportloopref - goimports - gofmt # redundant, since gofmt *should* be a no-op after gofumpt - gofumpt - gosimple + - govet - ineffassign - misspell - prealloc @@ -24,10 +32,15 @@ linters: - unconvert - unparam - unused - - vet issues: + exclude-files: + - internal/handshake/cipher_suite.go exclude-rules: - path: internal/qtls linters: - depguard + - path: _test\.go + linters: + - exhaustive + - prealloc diff --git a/vendor/github.com/quic-go/quic-go/README.md b/vendor/github.com/quic-go/quic-go/README.md index faba82f..94823d9 100644 --- a/vendor/github.com/quic-go/quic-go/README.md +++ b/vendor/github.com/quic-go/quic-go/README.md @@ -2,11 +2,12 @@ +[![Documentation](https://img.shields.io/badge/docs-quic--go.net-red?style=flat)](https://quic-go.net/docs/) [![PkgGoDev](https://pkg.go.dev/badge/github.com/quic-go/quic-go)](https://pkg.go.dev/github.com/quic-go/quic-go) [![Code Coverage](https://img.shields.io/codecov/c/github/quic-go/quic-go/master.svg?style=flat-square)](https://codecov.io/gh/quic-go/quic-go/) [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/quic-go.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:quic-go) -quic-go is an implementation of the QUIC protocol ([RFC 9000](https://datatracker.ietf.org/doc/html/rfc9000), [RFC 9001](https://datatracker.ietf.org/doc/html/rfc9001), [RFC 9002](https://datatracker.ietf.org/doc/html/rfc9002)) in Go. It has support for HTTP/3 ([RFC 9114](https://datatracker.ietf.org/doc/html/rfc9114)), including QPACK ([RFC 9204](https://datatracker.ietf.org/doc/html/rfc9204)). +quic-go is an implementation of the QUIC protocol ([RFC 9000](https://datatracker.ietf.org/doc/html/rfc9000), [RFC 9001](https://datatracker.ietf.org/doc/html/rfc9001), [RFC 9002](https://datatracker.ietf.org/doc/html/rfc9002)) in Go. It has support for HTTP/3 ([RFC 9114](https://datatracker.ietf.org/doc/html/rfc9114)), including QPACK ([RFC 9204](https://datatracker.ietf.org/doc/html/rfc9204)) and HTTP Datagrams ([RFC 9297](https://datatracker.ietf.org/doc/html/rfc9297)). In addition to these base RFCs, it also implements the following RFCs: * Unreliable Datagram Extension ([RFC 9221](https://datatracker.ietf.org/doc/html/rfc9221)) @@ -16,207 +17,7 @@ In addition to these base RFCs, it also implements the following RFCs: Support for WebTransport over HTTP/3 ([draft-ietf-webtrans-http3](https://datatracker.ietf.org/doc/draft-ietf-webtrans-http3/)) is implemented in [webtransport-go](https://github.com/quic-go/webtransport-go). -## Using QUIC - -### Running a Server - -The central entry point is the `quic.Transport`. A transport manages QUIC connections running on a single UDP socket. Since QUIC uses Connection IDs, it can demultiplex a listener (accepting incoming connections) and an arbitrary number of outgoing QUIC connections on the same UDP socket. - -```go -udpConn, err := net.ListenUDP("udp4", &net.UDPAddr{Port: 1234}) -// ... error handling -tr := quic.Transport{ - Conn: udpConn, -} -ln, err := tr.Listen(tlsConf, quicConf) -// ... error handling -go func() { - for { - conn, err := ln.Accept() - // ... error handling - // handle the connection, usually in a new Go routine - } -}() -``` - -The listener `ln` can now be used to accept incoming QUIC connections by (repeatedly) calling the `Accept` method (see below for more information on the `quic.Connection`). - -As a shortcut, `quic.Listen` and `quic.ListenAddr` can be used without explicitly initializing a `quic.Transport`: - -``` -ln, err := quic.Listen(udpConn, tlsConf, quicConf) -``` - -When using the shortcut, it's not possible to reuse the same UDP socket for outgoing connections. - -### Running a Client - -As mentioned above, multiple outgoing connections can share a single UDP socket, since QUIC uses Connection IDs to demultiplex connections. - -```go -ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) // 3s handshake timeout -defer cancel() -conn, err := tr.Dial(ctx, , , ) -// ... error handling -``` - -As a shortcut, `quic.Dial` and `quic.DialAddr` can be used without explictly initializing a `quic.Transport`: - -```go -ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) // 3s handshake timeout -defer cancel() -conn, err := quic.Dial(ctx, conn, , , ) -``` - -Just as we saw before when used a similar shortcut to run a server, it's also not possible to reuse the same UDP socket for other outgoing connections, or to listen for incoming connections. - -### Using a QUIC Connection - -#### Accepting Streams - -QUIC is a stream-multiplexed transport. A `quic.Connection` fundamentally differs from the `net.Conn` and the `net.PacketConn` interface defined in the standard library. Data is sent and received on (unidirectional and bidirectional) streams (and, if supported, in [datagrams](#quic-datagrams)), not on the connection itself. The stream state machine is described in detail in [Section 3 of RFC 9000](https://datatracker.ietf.org/doc/html/rfc9000#section-3). - -Note: A unidirectional stream is a stream that the initiator can only write to (`quic.SendStream`), and the receiver can only read from (`quic.ReceiveStream`). A bidirectional stream (`quic.Stream`) allows reading from and writing to for both sides. - -On the receiver side, streams are accepted using the `AcceptStream` (for bidirectional) and `AcceptUniStream` functions. For most user cases, it makes sense to call these functions in a loop: - -```go -for { - str, err := conn.AcceptStream(context.Background()) // for bidirectional streams - // ... error handling - // handle the stream, usually in a new Go routine -} -``` - -These functions return an error when the underlying QUIC connection is closed. - -#### Opening Streams - -There are two slightly different ways to open streams, one synchronous and one (potentially) asynchronous. This API is necessary since the receiver grants us a certain number of streams that we're allowed to open. It may grant us additional streams later on (typically when existing streams are closed), but it means that at the time we want to open a new stream, we might not be able to do so. - -Using the synchronous method `OpenStreamSync` for bidirectional streams, and `OpenUniStreamSync` for unidirectional streams, an application can block until the peer allows opening additional streams. In case that we're allowed to open a new stream, these methods return right away: - -```go -ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) -defer cancel() -str, err := conn.OpenStreamSync(ctx) // wait up to 5s to open a new bidirectional stream -``` - -The asynchronous version never blocks. If it's currently not possible to open a new stream, it returns a `net.Error` timeout error: - -```go -str, err := conn.OpenStream() -if nerr, ok := err.(net.Error); ok && nerr.Timeout() { - // It's currently not possible to open another stream, - // but it might be possible later, once the peer allowed us to do so. -} -``` - -These functions return an error when the underlying QUIC connection is closed. - -#### Using Streams - -Using QUIC streams is pretty straightforward. The `quic.ReceiveStream` implements the `io.Reader` interface, and the `quic.SendStream` implements the `io.Writer` interface. A bidirectional stream (`quic.Stream`) implements both these interfaces. Conceptually, a bidirectional stream can be thought of as the composition of two unidirectional streams in opposite directions. - -Calling `Close` on a `quic.SendStream` or a `quic.Stream` closes the send side of the stream. On the receiver side, this will be surfaced as an `io.EOF` returned from the `io.Reader` once all data has been consumed. Note that for bidirectional streams, `Close` _only_ closes the send side of the stream. It is still possible to read from the stream until the peer closes or resets the stream. - -In case the application wishes to abort sending on a `quic.SendStream` or a `quic.Stream` , it can reset the send side by calling `CancelWrite` with an application-defined error code (an unsigned 62-bit number). On the receiver side, this surfaced as a `quic.StreamError` containing that error code on the `io.Reader`. Note that for bidirectional streams, `CancelWrite` _only_ resets the send side of the stream. It is still possible to read from the stream until the peer closes or resets the stream. - -Conversely, in case the application wishes to abort receiving from a `quic.ReceiveStream` or a `quic.Stream`, it can ask the sender to abort data transmission by calling `CancelRead` with an application-defined error code (an unsigned 62-bit number). On the receiver side, this surfaced as a `quic.StreamError` containing that error code on the `io.Writer`. Note that for bidirectional streams, `CancelWrite` _only_ resets the receive side of the stream. It is still possible to write to the stream. - -A bidirectional stream is only closed once both the read and the write side of the stream have been either closed or reset. Only then the peer is granted a new stream according to the maximum number of concurrent streams configured via `quic.Config.MaxIncomingStreams`. - -### Configuring QUIC - -The `quic.Config` struct passed to both the listen and dial calls (see above) contains a wide range of configuration options for QUIC connections, incl. the ability to fine-tune flow control limits, the number of streams that the peer is allowed to open concurrently, keep-alives, idle timeouts, and many more. Please refer to the documentation for the `quic.Config` for details. - -The `quic.Transport` contains a few configuration options that don't apply to any single QUIC connection, but to all connections handled by that transport. It is highly recommend to set the `StatelessResetToken`, which allows endpoints to quickly recover from crashes / reboots of our node (see [Section 10.3 of RFC 9000](https://datatracker.ietf.org/doc/html/rfc9000#section-10.3)). - -### Closing a Connection - -#### When the remote Peer closes the Connection - -In case the peer closes the QUIC connection, all calls to open streams, accept streams, as well as all methods on streams immediately return an error. Additionally, it is set as cancellation cause of the connection context. Users can use errors assertions to find out what exactly went wrong: - -* `quic.VersionNegotiationError`: Happens during the handshake, if there is no overlap between our and the remote's supported QUIC versions. -* `quic.HandshakeTimeoutError`: Happens if the QUIC handshake doesn't complete within the time specified in `quic.Config.HandshakeTimeout`. -* `quic.IdleTimeoutError`: Happens after completion of the handshake if the connection is idle for longer than the minimum of both peers idle timeouts (as configured by `quic.Config.IdleTimeout`). The connection is considered idle when no stream data (and datagrams, if applicable) are exchanged for that period. The QUIC connection can be instructed to regularly send a packet to prevent a connection from going idle by setting `quic.Config.KeepAlive`. However, this is no guarantee that the peer doesn't suddenly go away (e.g. by abruptly shutting down the node or by crashing), or by a NAT binding expiring, in which case this error might still occur. -* `quic.StatelessResetError`: Happens when the remote peer lost the state required to decrypt the packet. This requires the `quic.Transport.StatelessResetToken` to be configured by the peer. -* `quic.TransportError`: Happens if when the QUIC protocol is violated. Unless the error code is `APPLICATION_ERROR`, this will not happen unless one of the QUIC stacks involved is misbehaving. Please open an issue if you encounter this error. -* `quic.ApplicationError`: Happens when the remote decides to close the connection, see below. - -#### Initiated by the Application - -A `quic.Connection` can be closed using `CloseWithError`: - -```go -conn.CloseWithError(0x42, "error 0x42 occurred") -``` - -Applications can transmit both an error code (an unsigned 62-bit number) as well as a UTF-8 encoded human-readable reason. The error code allows the receiver to learn why the connection was closed, and the reason can be useful for debugging purposes. - -On the receiver side, this is surfaced as a `quic.ApplicationError`. - -### QUIC Datagrams - -Unreliable datagrams are a QUIC extension ([RFC 9221](https://datatracker.ietf.org/doc/html/rfc9221)) that is negotiated during the handshake. Support can be enabled by setting the `quic.Config.EnableDatagram` flag. Note that this doesn't guarantee that the peer also supports datagrams. Whether or not the feature negotiation succeeded can be learned from the `quic.ConnectionState.SupportsDatagrams` obtained from `quic.Connection.ConnectionState()`. - -QUIC DATAGRAMs are a new QUIC frame type sent in QUIC 1-RTT packets (i.e. after completion of the handshake). Therefore, they're end-to-end encrypted and congestion-controlled. However, if a DATAGRAM frame is deemed lost by QUIC's loss detection mechanism, they are not retransmitted. - -Datagrams are sent using the `SendDatagram` method on the `quic.Connection`: - -```go -conn.SendDatagram([]byte("foobar")) -``` - -And received using `ReceiveDatagram`: - -```go -msg, err := conn.ReceiveDatagram() -``` - -Note that this code path is currently not optimized. It works for datagrams that are sent occasionally, but it doesn't achieve the same throughput as writing data on a stream. Please get in touch on issue #3766 if your use case relies on high datagram throughput, or if you'd like to help fix this issue. There are also some restrictions regarding the maximum message size (see #3599). - -### QUIC Event Logging using qlog - -quic-go logs a wide range of events defined in [draft-ietf-quic-qlog-quic-events](https://datatracker.ietf.org/doc/draft-ietf-quic-qlog-quic-events/), providing comprehensive insights in the internals of a QUIC connection. - -qlog files can be processed by a number of 3rd-party tools. [qviz](https://qvis.quictools.info/) has proven very useful for debugging all kinds of QUIC connection failures. - -qlog can be activated by setting the `Tracer` callback on the `Config`. It is called as soon as quic-go decides to start the QUIC handshake on a new connection. -`qlog.DefaultTracer` provides a tracer implementation which writes qlog files to a directory specified by the `QLOGDIR` environment variable, if set. -The default qlog tracer can be used like this: -```go -quic.Config{ - Tracer: qlog.DefaultTracer, -} -``` - -This example creates a new qlog file under `/_.qlog`, e.g. `qlogs/2e0407da_client.qlog`. - - -For custom qlog behavior, `qlog.NewConnectionTracer` can be used. - -## Using HTTP/3 - -### As a server - -See the [example server](example/main.go). Starting a QUIC server is very similar to the standard library http package in Go: - -```go -http.Handle("/", http.FileServer(http.Dir(wwwDir))) -http3.ListenAndServeQUIC("localhost:4242", "/path/to/cert/chain.pem", "/path/to/privkey.pem", nil) -``` - -### As a client - -See the [example client](example/client/main.go). Use a `http3.RoundTripper` as a `Transport` in a `http.Client`. - -```go -http.Client{ - Transport: &http3.RoundTripper{}, -} -``` +Detailed documentation can be found on [quic-go.net](https://quic-go.net/docs/). ## Projects using quic-go diff --git a/vendor/github.com/quic-go/quic-go/client.go b/vendor/github.com/quic-go/quic-go/client.go index 70dd5e1..1c5654f 100644 --- a/vendor/github.com/quic-go/quic-go/client.go +++ b/vendor/github.com/quic-go/quic-go/client.go @@ -35,7 +35,7 @@ type client struct { conn quicConn tracer *logging.ConnectionTracer - tracingID uint64 + tracingID ConnectionTracingID logger utils.Logger } @@ -191,6 +191,7 @@ func (c *client) dial(ctx context.Context) error { c.logger.Infof("Starting new connection to %s (%s -> %s), source connection ID %s, destination connection ID %s, version %s", c.tlsConf.ServerName, c.sendConn.LocalAddr(), c.sendConn.RemoteAddr(), c.srcConnID, c.destConnID, c.version) c.conn = newClientConnection( + context.WithValue(context.WithoutCancel(ctx), ConnectionTracingKey, c.tracingID), c.sendConn, c.packetHandlers, c.destConnID, @@ -202,7 +203,6 @@ func (c *client) dial(ctx context.Context) error { c.use0RTT, c.hasNegotiatedVersion, c.tracer, - c.tracingID, c.logger, c.version, ) diff --git a/vendor/github.com/quic-go/quic-go/config.go b/vendor/github.com/quic-go/quic-go/config.go index ee032e6..d42bdc1 100644 --- a/vendor/github.com/quic-go/quic-go/config.go +++ b/vendor/github.com/quic-go/quic-go/config.go @@ -39,6 +39,12 @@ func validateConfig(config *Config) error { if config.MaxConnectionReceiveWindow > quicvarint.Max { config.MaxConnectionReceiveWindow = quicvarint.Max } + if config.InitialPacketSize > 0 && config.InitialPacketSize < protocol.MinInitialPacketSize { + config.InitialPacketSize = protocol.MinInitialPacketSize + } + if config.InitialPacketSize > protocol.MaxPacketBufferSize { + config.InitialPacketSize = protocol.MaxPacketBufferSize + } // check that all QUIC versions are actually supported for _, v := range config.Versions { if !protocol.IsValidVersion(v) { @@ -94,6 +100,10 @@ func populateConfig(config *Config) *Config { } else if maxIncomingUniStreams < 0 { maxIncomingUniStreams = 0 } + initialPacketSize := config.InitialPacketSize + if initialPacketSize == 0 { + initialPacketSize = protocol.InitialPacketSize + } return &Config{ GetConfigForClient: config.GetConfigForClient, @@ -110,6 +120,7 @@ func populateConfig(config *Config) *Config { MaxIncomingUniStreams: maxIncomingUniStreams, TokenStore: config.TokenStore, EnableDatagrams: config.EnableDatagrams, + InitialPacketSize: initialPacketSize, DisablePathMTUDiscovery: config.DisablePathMTUDiscovery, Allow0RTT: config.Allow0RTT, Tracer: config.Tracer, diff --git a/vendor/github.com/quic-go/quic-go/connection.go b/vendor/github.com/quic-go/quic-go/connection.go index f8bcd61..1411a77 100644 --- a/vendor/github.com/quic-go/quic-go/connection.go +++ b/vendor/github.com/quic-go/quic-go/connection.go @@ -16,7 +16,6 @@ import ( "github.com/quic-go/quic-go/internal/ackhandler" "github.com/quic-go/quic-go/internal/flowcontrol" "github.com/quic-go/quic-go/internal/handshake" - "github.com/quic-go/quic-go/internal/logutils" "github.com/quic-go/quic-go/internal/protocol" "github.com/quic-go/quic-go/internal/qerr" "github.com/quic-go/quic-go/internal/utils" @@ -25,15 +24,10 @@ import ( ) type unpacker interface { - UnpackLongHeader(hdr *wire.Header, rcvTime time.Time, data []byte, v protocol.Version) (*unpackedPacket, error) + UnpackLongHeader(hdr *wire.Header, data []byte) (*unpackedPacket, error) UnpackShortHeader(rcvTime time.Time, data []byte) (protocol.PacketNumber, protocol.PacketNumberLen, protocol.KeyPhaseBit, []byte, error) } -type streamGetter interface { - GetOrOpenReceiveStream(protocol.StreamID) (receiveStreamI, error) - GetOrOpenSendStream(protocol.StreamID) (sendStreamI, error) -} - type streamManager interface { GetOrOpenSendStream(protocol.StreamID) (sendStreamI, error) GetOrOpenReceiveStream(protocol.StreamID) (receiveStreamI, error) @@ -52,13 +46,14 @@ type streamManager interface { } type cryptoStreamHandler interface { - StartHandshake() error + StartHandshake(context.Context) error ChangeConnectionID(protocol.ConnectionID) SetLargest1RTTAcked(protocol.PacketNumber) error SetHandshakeConfirmed() GetSessionTicket() ([]byte, error) NextEvent() handshake.Event DiscardInitialKeys() + HandleMessage([]byte, protocol.EncryptionLevel) error io.Closer ConnectionState() handshake.ConnectionState } @@ -113,8 +108,8 @@ func (e *errCloseForRecreating) Error() string { return "closing connection in order to recreate it" } -var connTracingID uint64 // to be accessed atomically -func nextConnTracingID() uint64 { return atomic.AddUint64(&connTracingID, 1) } +var connTracingID atomic.Uint64 // to be accessed atomically +func nextConnTracingID() ConnectionTracingID { return ConnectionTracingID(connTracingID.Add(1)) } // A Connection is a QUIC connection type connection struct { @@ -144,8 +139,7 @@ type connection struct { sentPacketHandler ackhandler.SentPacketHandler receivedPacketHandler ackhandler.ReceivedPacketHandler retransmissionQueue *retransmissionQueue - framer framer - windowUpdateQueue *windowUpdateQueue + framer *framer connFlowController flowcontrol.ConnectionFlowController tokenStoreKey string // only set for the client tokenGenerator *handshake.TokenGenerator // only set for the server @@ -153,11 +147,13 @@ type connection struct { unpacker unpacker frameParser wire.FrameParser packer packer - mtuDiscoverer mtuDiscoverer // initialized when the handshake completes + mtuDiscoverer mtuDiscoverer // initialized when the transport parameters are received + + maxPayloadSizeEstimate atomic.Uint32 - initialStream cryptoStream - handshakeStream cryptoStream - oneRTTStream cryptoStream // only set for the server + initialStream *cryptoStream + handshakeStream *cryptoStream + oneRTTStream *cryptoStream // only set for the server cryptoStreamHandler cryptoStreamHandler receivedPackets chan receivedPacket @@ -167,10 +163,9 @@ type connection struct { // closeChan is used to notify the run loop that it should terminate closeChan chan closeError - ctx context.Context - ctxCancel context.CancelCauseFunc - handshakeCtx context.Context - handshakeCtxCancel context.CancelFunc + ctx context.Context + ctxCancel context.CancelCauseFunc + handshakeCompleteChan chan struct{} undecryptablePackets []receivedPacket // undecryptable packets, waiting for a change in encryption level undecryptablePacketsToProcess []receivedPacket @@ -220,6 +215,8 @@ var ( ) var newConnection = func( + ctx context.Context, + ctxCancel context.CancelCauseFunc, conn sendConn, runner connRunner, origDestConnID protocol.ConnectionID, @@ -234,11 +231,12 @@ var newConnection = func( tokenGenerator *handshake.TokenGenerator, clientAddressValidated bool, tracer *logging.ConnectionTracer, - tracingID uint64, logger utils.Logger, v protocol.Version, ) quicConn { s := &connection{ + ctx: ctx, + ctxCancel: ctxCancel, conn: conn, config: conf, handshakeDestConnID: destConnID, @@ -273,10 +271,9 @@ var newConnection = func( connIDGenerator, ) s.preSetup() - s.ctx, s.ctxCancel = context.WithCancelCause(context.WithValue(context.Background(), ConnectionTracingKey, tracingID)) s.sentPacketHandler, s.receivedPacketHandler = ackhandler.NewAckHandler( 0, - getMaxPacketSize(s.conn.RemoteAddr()), + protocol.ByteCount(s.config.InitialPacketSize), s.rttStats, clientAddressValidated, s.conn.capabilities().ECN, @@ -284,7 +281,7 @@ var newConnection = func( s.tracer, s.logger, ) - s.mtuDiscoverer = newMTUDiscoverer(s.rttStats, getMaxPacketSize(s.conn.RemoteAddr()), s.sentPacketHandler.SetMaxDatagramSize) + s.maxPayloadSizeEstimate.Store(uint32(estimateMaxPayloadSize(protocol.ByteCount(s.config.InitialPacketSize)))) params := &wire.TransportParameters{ InitialMaxStreamDataBidiLocal: protocol.ByteCount(s.config.InitialStreamReceiveWindow), InitialMaxStreamDataBidiRemote: protocol.ByteCount(s.config.InitialStreamReceiveWindow), @@ -295,6 +292,7 @@ var newConnection = func( MaxUniStreamNum: protocol.StreamNum(s.config.MaxIncomingUniStreams), MaxAckDelay: protocol.MaxAckDelayInclGranularity, AckDelayExponent: protocol.AckDelayExponent, + MaxUDPPayloadSize: protocol.MaxPacketBufferSize, DisableActiveMigration: true, StatelessResetToken: &statelessResetToken, OriginalDestinationConnectionID: origDestConnID, @@ -330,12 +328,13 @@ var newConnection = func( s.cryptoStreamHandler = cs s.packer = newPacketPacker(srcConnID, s.connIDManager.Get, s.initialStream, s.handshakeStream, s.sentPacketHandler, s.retransmissionQueue, cs, s.framer, s.receivedPacketHandler, s.datagramQueue, s.perspective) s.unpacker = newPacketUnpacker(cs, s.srcConnIDLen) - s.cryptoStreamManager = newCryptoStreamManager(cs, s.initialStream, s.handshakeStream, s.oneRTTStream) + s.cryptoStreamManager = newCryptoStreamManager(s.initialStream, s.handshakeStream, s.oneRTTStream) return s } // declare this as a variable, such that we can it mock it in the tests var newClientConnection = func( + ctx context.Context, conn sendConn, runner connRunner, destConnID protocol.ConnectionID, @@ -347,7 +346,6 @@ var newClientConnection = func( enable0RTT bool, hasNegotiatedVersion bool, tracer *logging.ConnectionTracer, - tracingID uint64, logger utils.Logger, v protocol.Version, ) quicConn { @@ -381,11 +379,11 @@ var newClientConnection = func( s.queueControlFrame, connIDGenerator, ) + s.ctx, s.ctxCancel = context.WithCancelCause(ctx) s.preSetup() - s.ctx, s.ctxCancel = context.WithCancelCause(context.WithValue(context.Background(), ConnectionTracingKey, tracingID)) s.sentPacketHandler, s.receivedPacketHandler = ackhandler.NewAckHandler( initialPacketNumber, - getMaxPacketSize(s.conn.RemoteAddr()), + protocol.ByteCount(s.config.InitialPacketSize), s.rttStats, false, // has no effect s.conn.capabilities().ECN, @@ -393,7 +391,7 @@ var newClientConnection = func( s.tracer, s.logger, ) - s.mtuDiscoverer = newMTUDiscoverer(s.rttStats, getMaxPacketSize(s.conn.RemoteAddr()), s.sentPacketHandler.SetMaxDatagramSize) + s.maxPayloadSizeEstimate.Store(uint32(estimateMaxPayloadSize(protocol.ByteCount(s.config.InitialPacketSize)))) oneRTTStream := newCryptoStream() params := &wire.TransportParameters{ InitialMaxStreamDataBidiRemote: protocol.ByteCount(s.config.InitialStreamReceiveWindow), @@ -404,6 +402,7 @@ var newClientConnection = func( MaxBidiStreamNum: protocol.StreamNum(s.config.MaxIncomingStreams), MaxUniStreamNum: protocol.StreamNum(s.config.MaxIncomingUniStreams), MaxAckDelay: protocol.MaxAckDelayInclGranularity, + MaxUDPPayloadSize: protocol.MaxPacketBufferSize, AckDelayExponent: protocol.AckDelayExponent, DisableActiveMigration: true, // For interoperability with quic-go versions before May 2023, this value must be set to a value @@ -433,7 +432,7 @@ var newClientConnection = func( s.version, ) s.cryptoStreamHandler = cs - s.cryptoStreamManager = newCryptoStreamManager(cs, s.initialStream, s.handshakeStream, oneRTTStream) + s.cryptoStreamManager = newCryptoStreamManager(s.initialStream, s.handshakeStream, oneRTTStream) s.unpacker = newPacketUnpacker(cs, s.srcConnIDLen) s.packer = newPacketPacker(srcConnID, s.connIDManager.Get, s.initialStream, s.handshakeStream, s.sentPacketHandler, s.retransmissionQueue, cs, s.framer, s.receivedPacketHandler, s.datagramQueue, s.perspective) if len(tlsConf.ServerName) > 0 { @@ -459,7 +458,6 @@ func (s *connection) preSetup() { s.connFlowController = flowcontrol.NewConnectionFlowController( protocol.ByteCount(s.config.InitialConnectionReceiveWindow), protocol.ByteCount(s.config.MaxConnectionReceiveWindow), - s.onHasConnectionWindowUpdate, func(size protocol.ByteCount) bool { if s.config.AllowConnectionWindowIncrease == nil { return true @@ -471,23 +469,24 @@ func (s *connection) preSetup() { ) s.earlyConnReadyChan = make(chan struct{}) s.streamsMap = newStreamsMap( + s.ctx, s, + s.queueControlFrame, s.newFlowController, uint64(s.config.MaxIncomingStreams), uint64(s.config.MaxIncomingUniStreams), s.perspective, ) - s.framer = newFramer(s.streamsMap) + s.framer = newFramer() s.receivedPackets = make(chan receivedPacket, protocol.MaxConnUnprocessedPackets) s.closeChan = make(chan closeError, 1) s.sendingScheduled = make(chan struct{}, 1) - s.handshakeCtx, s.handshakeCtxCancel = context.WithCancel(context.Background()) + s.handshakeCompleteChan = make(chan struct{}) now := time.Now() s.lastPacketReceivedTime = now s.creationTime = now - s.windowUpdateQueue = newWindowUpdateQueue(s.streamsMap, s.connFlowController, s.framer.QueueControlFrame) s.datagramQueue = newDatagramQueue(s.scheduleSending, s.logger) s.connState.Version = s.version } @@ -495,13 +494,11 @@ func (s *connection) preSetup() { // run the connection main loop func (s *connection) run() error { var closeErr closeError - defer func() { - s.ctxCancel(closeErr.err) - }() + defer func() { s.ctxCancel(closeErr.err) }() s.timer = *newTimer() - if err := s.cryptoStreamHandler.StartHandshake(); err != nil { + if err := s.cryptoStreamHandler.StartHandshake(s.ctx); err != nil { return err } if err := s.handleHandshakeEvents(); err != nil { @@ -662,7 +659,7 @@ func (s *connection) earlyConnReady() <-chan struct{} { } func (s *connection) HandshakeComplete() <-chan struct{} { - return s.handshakeCtx.Done() + return s.handshakeCompleteChan } func (s *connection) Context() context.Context { @@ -702,10 +699,10 @@ func (s *connection) nextKeepAliveTime() time.Time { func (s *connection) maybeResetTimer() { var deadline time.Time if !s.handshakeComplete { - deadline = utils.MinTime( - s.creationTime.Add(s.config.handshakeTimeout()), - s.idleTimeoutStartTime().Add(s.config.HandshakeIdleTimeout), - ) + deadline = s.creationTime.Add(s.config.handshakeTimeout()) + if t := s.idleTimeoutStartTime().Add(s.config.HandshakeIdleTimeout); t.Before(deadline) { + deadline = t + } } else { if keepAliveTime := s.nextKeepAliveTime(); !keepAliveTime.IsZero() { deadline = keepAliveTime @@ -723,11 +720,15 @@ func (s *connection) maybeResetTimer() { } func (s *connection) idleTimeoutStartTime() time.Time { - return utils.MaxTime(s.lastPacketReceivedTime, s.firstAckElicitingPacketAfterIdleSentTime) + startTime := s.lastPacketReceivedTime + if t := s.firstAckElicitingPacketAfterIdleSentTime; t.After(startTime) { + startTime = t + } + return startTime } func (s *connection) handleHandshakeComplete() error { - defer s.handshakeCtxCancel() + defer close(s.handshakeCompleteChan) // Once the handshake completes, we have derived 1-RTT keys. // There's no point in queueing undecryptable packets for later decryption anymore. s.undecryptablePackets = nil @@ -780,11 +781,7 @@ func (s *connection) handleHandshakeConfirmed() error { s.cryptoStreamHandler.SetHandshakeConfirmed() if !s.config.DisablePathMTUDiscovery && s.conn.capabilities().DF { - maxPacketSize := s.peerParams.MaxUDPPayloadSize - if maxPacketSize == 0 { - maxPacketSize = protocol.MaxByteCount - } - s.mtuDiscoverer.Start(min(maxPacketSize, protocol.MaxPacketBufferSize)) + s.mtuDiscoverer.Start() } return nil } @@ -803,13 +800,11 @@ func (s *connection) handlePacketImpl(rp receivedPacket) bool { data := rp.data p := rp for len(data) > 0 { - var destConnID protocol.ConnectionID if counter > 0 { p = *(p.Clone()) p.data = data - var err error - destConnID, err = wire.ParseConnectionID(p.data, s.srcConnIDLen) + destConnID, err := wire.ParseConnectionID(p.data, s.srcConnIDLen) if err != nil { if s.tracer != nil && s.tracer.DroppedPacket != nil { s.tracer.DroppedPacket(logging.PacketTypeNotDetermined, protocol.InvalidPacketNumber, protocol.ByteCount(len(data)), logging.PacketDropHeaderParseError) @@ -869,7 +864,9 @@ func (s *connection) handlePacketImpl(rp receivedPacket) bool { if counter > 0 { p.buffer.Split() } - processed = s.handleShortHeaderPacket(p, destConnID) + if wasProcessed := s.handleShortHeaderPacket(p); wasProcessed { + processed = true + } break } } @@ -878,7 +875,7 @@ func (s *connection) handlePacketImpl(rp receivedPacket) bool { return processed } -func (s *connection) handleShortHeaderPacket(p receivedPacket, destConnID protocol.ConnectionID) bool { +func (s *connection) handleShortHeaderPacket(p receivedPacket) bool { var wasQueued bool defer func() { @@ -888,6 +885,11 @@ func (s *connection) handleShortHeaderPacket(p receivedPacket, destConnID protoc } }() + destConnID, err := wire.ParseConnectionID(p.data, s.srcConnIDLen) + if err != nil { + s.tracer.DroppedPacket(logging.PacketType1RTT, protocol.InvalidPacketNumber, protocol.ByteCount(len(p.data)), logging.PacketDropHeaderParseError) + return false + } pn, pnLen, keyPhase, data, err := s.unpacker.UnpackShortHeader(p.rcvTime, p.data) if err != nil { wasQueued = s.handleUnpackError(err, p, logging.PacketType1RTT) @@ -961,7 +963,7 @@ func (s *connection) handleLongHeaderPacket(p receivedPacket, hdr *wire.Header) return false } - packet, err := s.unpacker.UnpackLongHeader(hdr, p.rcvTime, p.data, s.version) + packet, err := s.unpacker.UnpackLongHeader(hdr, p.data) if err != nil { wasQueued = s.handleUnpackError(err, p, logging.PacketTypeFromHeader(hdr)) return false @@ -1261,7 +1263,7 @@ func (s *connection) handleFrames( isAckEliciting = true } if log != nil { - frames = append(frames, logutils.ConvertFrame(frame)) + frames = append(frames, toLoggingFrame(frame)) } // An error occurred handling a previous frame. // Don't handle the current frame. @@ -1378,6 +1380,15 @@ func (s *connection) handleCryptoFrame(frame *wire.CryptoFrame, encLevel protoco if err := s.cryptoStreamManager.HandleCryptoFrame(frame, encLevel); err != nil { return err } + for { + data := s.cryptoStreamManager.GetCryptoData(encLevel) + if data == nil { + break + } + if err := s.cryptoStreamHandler.HandleMessage(data, encLevel); err != nil { + return err + } + } return s.handleHandshakeEvents() } @@ -1668,10 +1679,8 @@ func (s *connection) dropEncryptionLevel(encLevel protocol.EncryptionLevel) erro s.cryptoStreamHandler.DiscardInitialKeys() case protocol.Encryption0RTT: s.streamsMap.ResetFor0RTT() - if err := s.connFlowController.Reset(); err != nil { - return err - } - return s.framer.Handle0RTTRejection() + s.framer.Handle0RTTRejection() + return s.connFlowController.Reset() } return s.cryptoStreamManager.Drop(encLevel) } @@ -1758,7 +1767,11 @@ func (s *connection) checkTransportParameters(params *wire.TransportParameters) func (s *connection) applyTransportParameters() { params := s.peerParams // Our local idle timeout will always be > 0. - s.idleTimeout = utils.MinNonZeroDuration(s.config.MaxIdleTimeout, params.MaxIdleTimeout) + s.idleTimeout = s.config.MaxIdleTimeout + // If the peer advertised an idle timeout, take the minimum of the values. + if params.MaxIdleTimeout > 0 { + s.idleTimeout = min(s.idleTimeout, params.MaxIdleTimeout) + } s.keepAliveInterval = min(s.config.KeepAlivePeriod, min(s.idleTimeout/2, protocol.MaxKeepAliveInterval)) s.streamsMap.UpdateLimits(params) s.frameParser.SetAckDelayExponent(params.AckDelayExponent) @@ -1773,6 +1786,17 @@ func (s *connection) applyTransportParameters() { // Retire the connection ID. s.connIDManager.AddFromPreferredAddress(params.PreferredAddress.ConnectionID, params.PreferredAddress.StatelessResetToken) } + maxPacketSize := protocol.ByteCount(protocol.MaxPacketBufferSize) + if params.MaxUDPPayloadSize > 0 && params.MaxUDPPayloadSize < maxPacketSize { + maxPacketSize = params.MaxUDPPayloadSize + } + s.mtuDiscoverer = newMTUDiscoverer( + s.rttStats, + protocol.ByteCount(s.config.InitialPacketSize), + maxPacketSize, + s.onMTUIncreased, + s.tracer, + ) } func (s *connection) triggerSending(now time.Time) error { @@ -1855,13 +1879,15 @@ func (s *connection) sendPackets(now time.Time) error { if isBlocked, offset := s.connFlowController.IsNewlyBlocked(); isBlocked { s.framer.QueueControlFrame(&wire.DataBlockedFrame{MaximumData: offset}) } - s.windowUpdateQueue.QueueAll() + if offset := s.connFlowController.GetWindowUpdate(); offset > 0 { + s.framer.QueueControlFrame(&wire.MaxDataFrame{MaximumData: offset}) + } if cf := s.cryptoStreamManager.GetPostHandshakeData(protocol.MaxPostHandshakeCryptoFrameSize); cf != nil { s.queueControlFrame(cf) } if !s.handshakeConfirmed { - packet, err := s.packer.PackCoalescedPacket(false, s.mtuDiscoverer.CurrentSize(), s.version) + packet, err := s.packer.PackCoalescedPacket(false, s.maxPacketSize(), s.version) if err != nil || packet == nil { return err } @@ -1888,7 +1914,7 @@ func (s *connection) sendPacketsWithoutGSO(now time.Time) error { for { buf := getPacketBuffer() ecn := s.sentPacketHandler.ECNMode(true) - if _, err := s.appendOneShortHeaderPacket(buf, s.mtuDiscoverer.CurrentSize(), ecn, now); err != nil { + if _, err := s.appendOneShortHeaderPacket(buf, s.maxPacketSize(), ecn, now); err != nil { if err == errNothingToPack { buf.Release() return nil @@ -1919,7 +1945,7 @@ func (s *connection) sendPacketsWithoutGSO(now time.Time) error { func (s *connection) sendPacketsWithGSO(now time.Time) error { buf := getLargePacketBuffer() - maxSize := s.mtuDiscoverer.CurrentSize() + maxSize := s.maxPacketSize() ecn := s.sentPacketHandler.ECNMode(true) for { @@ -1988,7 +2014,7 @@ func (s *connection) resetPacingDeadline() { func (s *connection) maybeSendAckOnlyPacket(now time.Time) error { if !s.handshakeConfirmed { ecn := s.sentPacketHandler.ECNMode(false) - packet, err := s.packer.PackCoalescedPacket(true, s.mtuDiscoverer.CurrentSize(), s.version) + packet, err := s.packer.PackCoalescedPacket(true, s.maxPacketSize(), s.version) if err != nil { return err } @@ -1999,7 +2025,7 @@ func (s *connection) maybeSendAckOnlyPacket(now time.Time) error { } ecn := s.sentPacketHandler.ECNMode(true) - p, buf, err := s.packer.PackAckOnlyPacket(s.mtuDiscoverer.CurrentSize(), s.version) + p, buf, err := s.packer.PackAckOnlyPacket(s.maxPacketSize(), s.version) if err != nil { if err == errNothingToPack { return nil @@ -2021,7 +2047,7 @@ func (s *connection) sendProbePacket(encLevel protocol.EncryptionLevel, now time break } var err error - packet, err = s.packer.MaybePackProbePacket(encLevel, s.mtuDiscoverer.CurrentSize(), s.version) + packet, err = s.packer.MaybePackProbePacket(encLevel, s.maxPacketSize(), s.version) if err != nil { return err } @@ -2032,7 +2058,7 @@ func (s *connection) sendProbePacket(encLevel protocol.EncryptionLevel, now time if packet == nil { s.retransmissionQueue.AddPing(encLevel) var err error - packet, err = s.packer.MaybePackProbePacket(encLevel, s.mtuDiscoverer.CurrentSize(), s.version) + packet, err = s.packer.MaybePackProbePacket(encLevel, s.maxPacketSize(), s.version) if err != nil { return err } @@ -2111,14 +2137,14 @@ func (s *connection) sendConnectionClose(e error) ([]byte, error) { var transportErr *qerr.TransportError var applicationErr *qerr.ApplicationError if errors.As(e, &transportErr) { - packet, err = s.packer.PackConnectionClose(transportErr, s.mtuDiscoverer.CurrentSize(), s.version) + packet, err = s.packer.PackConnectionClose(transportErr, s.maxPacketSize(), s.version) } else if errors.As(e, &applicationErr) { - packet, err = s.packer.PackApplicationClose(applicationErr, s.mtuDiscoverer.CurrentSize(), s.version) + packet, err = s.packer.PackApplicationClose(applicationErr, s.maxPacketSize(), s.version) } else { packet, err = s.packer.PackConnectionClose(&qerr.TransportError{ ErrorCode: qerr.InternalError, ErrorMessage: fmt.Sprintf("connection BUG: unspecified error type (msg: %s)", e.Error()), - }, s.mtuDiscoverer.CurrentSize(), s.version) + }, s.maxPacketSize(), s.version) } if err != nil { return nil, err @@ -2128,126 +2154,22 @@ func (s *connection) sendConnectionClose(e error) ([]byte, error) { return packet.buffer.Data, s.conn.Write(packet.buffer.Data, 0, ecn) } -func (s *connection) logLongHeaderPacket(p *longHeaderPacket, ecn protocol.ECN) { - // quic-go logging - if s.logger.Debug() { - p.header.Log(s.logger) - if p.ack != nil { - wire.LogFrame(s.logger, p.ack, true) - } - for _, frame := range p.frames { - wire.LogFrame(s.logger, frame.Frame, true) - } - for _, frame := range p.streamFrames { - wire.LogFrame(s.logger, frame.Frame, true) - } - } - - // tracing - if s.tracer != nil && s.tracer.SentLongHeaderPacket != nil { - frames := make([]logging.Frame, 0, len(p.frames)) - for _, f := range p.frames { - frames = append(frames, logutils.ConvertFrame(f.Frame)) - } - for _, f := range p.streamFrames { - frames = append(frames, logutils.ConvertFrame(f.Frame)) +func (s *connection) maxPacketSize() protocol.ByteCount { + if s.mtuDiscoverer == nil { + // Use the configured packet size on the client side. + // If the server sends a max_udp_payload_size that's smaller than this size, we can ignore this: + // Apparently the server still processed the (fully padded) Initial packet anyway. + if s.perspective == protocol.PerspectiveClient { + return protocol.ByteCount(s.config.InitialPacketSize) } - var ack *logging.AckFrame - if p.ack != nil { - ack = logutils.ConvertAckFrame(p.ack) - } - s.tracer.SentLongHeaderPacket(p.header, p.length, ecn, ack, frames) - } -} - -func (s *connection) logShortHeaderPacket( - destConnID protocol.ConnectionID, - ackFrame *wire.AckFrame, - frames []ackhandler.Frame, - streamFrames []ackhandler.StreamFrame, - pn protocol.PacketNumber, - pnLen protocol.PacketNumberLen, - kp protocol.KeyPhaseBit, - ecn protocol.ECN, - size protocol.ByteCount, - isCoalesced bool, -) { - if s.logger.Debug() && !isCoalesced { - s.logger.Debugf("-> Sending packet %d (%d bytes) for connection %s, 1-RTT (ECN: %s)", pn, size, s.logID, ecn) - } - // quic-go logging - if s.logger.Debug() { - wire.LogShortHeader(s.logger, destConnID, pn, pnLen, kp) - if ackFrame != nil { - wire.LogFrame(s.logger, ackFrame, true) - } - for _, f := range frames { - wire.LogFrame(s.logger, f.Frame, true) - } - for _, f := range streamFrames { - wire.LogFrame(s.logger, f.Frame, true) - } - } - - // tracing - if s.tracer != nil && s.tracer.SentShortHeaderPacket != nil { - fs := make([]logging.Frame, 0, len(frames)+len(streamFrames)) - for _, f := range frames { - fs = append(fs, logutils.ConvertFrame(f.Frame)) - } - for _, f := range streamFrames { - fs = append(fs, logutils.ConvertFrame(f.Frame)) - } - var ack *logging.AckFrame - if ackFrame != nil { - ack = logutils.ConvertAckFrame(ackFrame) - } - s.tracer.SentShortHeaderPacket( - &logging.ShortHeader{ - DestConnectionID: destConnID, - PacketNumber: pn, - PacketNumberLen: pnLen, - KeyPhase: kp, - }, - size, - ecn, - ack, - fs, - ) - } -} - -func (s *connection) logCoalescedPacket(packet *coalescedPacket, ecn protocol.ECN) { - if s.logger.Debug() { - // There's a short period between dropping both Initial and Handshake keys and completion of the handshake, - // during which we might call PackCoalescedPacket but just pack a short header packet. - if len(packet.longHdrPackets) == 0 && packet.shortHdrPacket != nil { - s.logShortHeaderPacket( - packet.shortHdrPacket.DestConnID, - packet.shortHdrPacket.Ack, - packet.shortHdrPacket.Frames, - packet.shortHdrPacket.StreamFrames, - packet.shortHdrPacket.PacketNumber, - packet.shortHdrPacket.PacketNumberLen, - packet.shortHdrPacket.KeyPhase, - ecn, - packet.shortHdrPacket.Length, - false, - ) - return - } - if len(packet.longHdrPackets) > 1 { - s.logger.Debugf("-> Sending coalesced packet (%d parts, %d bytes) for connection %s", len(packet.longHdrPackets), packet.buffer.Len(), s.logID) - } else { - s.logger.Debugf("-> Sending packet %d (%d bytes) for connection %s, %s", packet.longHdrPackets[0].header.PacketNumber, packet.buffer.Len(), s.logID, packet.longHdrPackets[0].EncryptionLevel()) - } - } - for _, p := range packet.longHdrPackets { - s.logLongHeaderPacket(p, ecn) - } - if p := packet.shortHdrPacket; p != nil { - s.logShortHeaderPacket(p.DestConnID, p.Ack, p.Frames, p.StreamFrames, p.PacketNumber, p.PacketNumberLen, p.KeyPhase, ecn, p.Length, true) + // On the server side, there's no downside to using 1200 bytes until we received the client's transport + // parameters: + // * If the first packet didn't contain the entire ClientHello, all we can do is ACK that packet. We don't + // need a lot of bytes for that. + // * If it did, we will have processed the transport parameters and initialized the MTU discoverer. + return protocol.MinInitialPacketSize } + return s.mtuDiscoverer.CurrentSize() } // AcceptStream returns the next stream openend by the peer @@ -2291,7 +2213,6 @@ func (s *connection) newFlowController(id protocol.StreamID) flowcontrol.StreamF protocol.ByteCount(s.config.InitialStreamReceiveWindow), protocol.ByteCount(s.config.MaxStreamReceiveWindow), initialSendWindow, - s.onHasStreamWindowUpdate, s.rttStats, s.logger, ) @@ -2330,18 +2251,13 @@ func (s *connection) queueControlFrame(f wire.Frame) { s.scheduleSending() } -func (s *connection) onHasStreamWindowUpdate(id protocol.StreamID) { - s.windowUpdateQueue.AddStream(id) - s.scheduleSending() -} - -func (s *connection) onHasConnectionWindowUpdate() { - s.windowUpdateQueue.AddConnection() +func (s *connection) onHasStreamData(id protocol.StreamID, str sendStreamI) { + s.framer.AddActiveStream(id, str) s.scheduleSending() } -func (s *connection) onHasStreamData(id protocol.StreamID) { - s.framer.AddActiveStream(id) +func (s *connection) onHasStreamControlFrame(id protocol.StreamID, str streamControlFrameGetter) { + s.framer.AddStreamWithControlFrames(id, str) s.scheduleSending() } @@ -2349,6 +2265,12 @@ func (s *connection) onStreamCompleted(id protocol.StreamID) { if err := s.streamsMap.DeleteStream(id); err != nil { s.closeLocal(err) } + s.framer.RemoveActiveStream(id) +} + +func (s *connection) onMTUIncreased(mtu protocol.ByteCount) { + s.maxPayloadSizeEstimate.Store(uint32(estimateMaxPayloadSize(mtu))) + s.sentPacketHandler.SetMaxDatagramSize(mtu) } func (s *connection) SendDatagram(p []byte) error { @@ -2357,10 +2279,14 @@ func (s *connection) SendDatagram(p []byte) error { } f := &wire.DatagramFrame{DataLenPresent: true} - if protocol.ByteCount(len(p)) > f.MaxDataLen(s.peerParams.MaxDatagramFrameSize, s.version) { - return &DatagramTooLargeError{ - PeerMaxDatagramFrameSize: int64(s.peerParams.MaxDatagramFrameSize), - } + // The payload size estimate is conservative. + // Under many circumstances we could send a few more bytes. + maxDataLen := min( + f.MaxDataLen(s.peerParams.MaxDatagramFrameSize, s.version), + protocol.ByteCount(s.maxPayloadSizeEstimate.Load()), + ) + if protocol.ByteCount(len(p)) > maxDataLen { + return &DatagramTooLargeError{MaxDatagramPayloadSize: int64(maxDataLen)} } f.Data = make([]byte, len(p)) copy(f.Data, p) @@ -2386,8 +2312,22 @@ func (s *connection) GetVersion() protocol.Version { return s.version } -func (s *connection) NextConnection() Connection { - <-s.HandshakeComplete() - s.streamsMap.UseResetMaps() - return s +func (s *connection) NextConnection(ctx context.Context) (Connection, error) { + // The handshake might fail after the server rejected 0-RTT. + // This could happen if the Finished message is malformed or never received. + select { + case <-ctx.Done(): + return nil, context.Cause(ctx) + case <-s.Context().Done(): + case <-s.HandshakeComplete(): + s.streamsMap.UseResetMaps() + } + return s, nil +} + +// estimateMaxPayloadSize estimates the maximum payload size for short header packets. +// It is not very sophisticated: it just subtracts the size of header (assuming the maximum +// connection ID length), and the size of the encryption tag. +func estimateMaxPayloadSize(mtu protocol.ByteCount) protocol.ByteCount { + return mtu - 1 /* type byte */ - 20 /* maximum connection ID length */ - 16 /* tag size */ } diff --git a/vendor/github.com/quic-go/quic-go/connection_logging.go b/vendor/github.com/quic-go/quic-go/connection_logging.go new file mode 100644 index 0000000..f75b39f --- /dev/null +++ b/vendor/github.com/quic-go/quic-go/connection_logging.go @@ -0,0 +1,173 @@ +package quic + +import ( + "slices" + + "github.com/quic-go/quic-go/internal/ackhandler" + "github.com/quic-go/quic-go/internal/protocol" + "github.com/quic-go/quic-go/internal/wire" + "github.com/quic-go/quic-go/logging" +) + +// ConvertFrame converts a wire.Frame into a logging.Frame. +// This makes it possible for external packages to access the frames. +// Furthermore, it removes the data slices from CRYPTO and STREAM frames. +func toLoggingFrame(frame wire.Frame) logging.Frame { + switch f := frame.(type) { + case *wire.AckFrame: + // We use a pool for ACK frames. + // Implementations of the tracer interface may hold on to frames, so we need to make a copy here. + return toLoggingAckFrame(f) + case *wire.CryptoFrame: + return &logging.CryptoFrame{ + Offset: f.Offset, + Length: protocol.ByteCount(len(f.Data)), + } + case *wire.StreamFrame: + return &logging.StreamFrame{ + StreamID: f.StreamID, + Offset: f.Offset, + Length: f.DataLen(), + Fin: f.Fin, + } + case *wire.DatagramFrame: + return &logging.DatagramFrame{ + Length: logging.ByteCount(len(f.Data)), + } + default: + return logging.Frame(frame) + } +} + +func toLoggingAckFrame(f *wire.AckFrame) *logging.AckFrame { + ack := &logging.AckFrame{ + AckRanges: slices.Clone(f.AckRanges), + DelayTime: f.DelayTime, + ECNCE: f.ECNCE, + ECT0: f.ECT0, + ECT1: f.ECT1, + } + return ack +} + +func (s *connection) logLongHeaderPacket(p *longHeaderPacket, ecn protocol.ECN) { + // quic-go logging + if s.logger.Debug() { + p.header.Log(s.logger) + if p.ack != nil { + wire.LogFrame(s.logger, p.ack, true) + } + for _, frame := range p.frames { + wire.LogFrame(s.logger, frame.Frame, true) + } + for _, frame := range p.streamFrames { + wire.LogFrame(s.logger, frame.Frame, true) + } + } + + // tracing + if s.tracer != nil && s.tracer.SentLongHeaderPacket != nil { + frames := make([]logging.Frame, 0, len(p.frames)) + for _, f := range p.frames { + frames = append(frames, toLoggingFrame(f.Frame)) + } + for _, f := range p.streamFrames { + frames = append(frames, toLoggingFrame(f.Frame)) + } + var ack *logging.AckFrame + if p.ack != nil { + ack = toLoggingAckFrame(p.ack) + } + s.tracer.SentLongHeaderPacket(p.header, p.length, ecn, ack, frames) + } +} + +func (s *connection) logShortHeaderPacket( + destConnID protocol.ConnectionID, + ackFrame *wire.AckFrame, + frames []ackhandler.Frame, + streamFrames []ackhandler.StreamFrame, + pn protocol.PacketNumber, + pnLen protocol.PacketNumberLen, + kp protocol.KeyPhaseBit, + ecn protocol.ECN, + size protocol.ByteCount, + isCoalesced bool, +) { + if s.logger.Debug() && !isCoalesced { + s.logger.Debugf("-> Sending packet %d (%d bytes) for connection %s, 1-RTT (ECN: %s)", pn, size, s.logID, ecn) + } + // quic-go logging + if s.logger.Debug() { + wire.LogShortHeader(s.logger, destConnID, pn, pnLen, kp) + if ackFrame != nil { + wire.LogFrame(s.logger, ackFrame, true) + } + for _, f := range frames { + wire.LogFrame(s.logger, f.Frame, true) + } + for _, f := range streamFrames { + wire.LogFrame(s.logger, f.Frame, true) + } + } + + // tracing + if s.tracer != nil && s.tracer.SentShortHeaderPacket != nil { + fs := make([]logging.Frame, 0, len(frames)+len(streamFrames)) + for _, f := range frames { + fs = append(fs, toLoggingFrame(f.Frame)) + } + for _, f := range streamFrames { + fs = append(fs, toLoggingFrame(f.Frame)) + } + var ack *logging.AckFrame + if ackFrame != nil { + ack = toLoggingAckFrame(ackFrame) + } + s.tracer.SentShortHeaderPacket( + &logging.ShortHeader{ + DestConnectionID: destConnID, + PacketNumber: pn, + PacketNumberLen: pnLen, + KeyPhase: kp, + }, + size, + ecn, + ack, + fs, + ) + } +} + +func (s *connection) logCoalescedPacket(packet *coalescedPacket, ecn protocol.ECN) { + if s.logger.Debug() { + // There's a short period between dropping both Initial and Handshake keys and completion of the handshake, + // during which we might call PackCoalescedPacket but just pack a short header packet. + if len(packet.longHdrPackets) == 0 && packet.shortHdrPacket != nil { + s.logShortHeaderPacket( + packet.shortHdrPacket.DestConnID, + packet.shortHdrPacket.Ack, + packet.shortHdrPacket.Frames, + packet.shortHdrPacket.StreamFrames, + packet.shortHdrPacket.PacketNumber, + packet.shortHdrPacket.PacketNumberLen, + packet.shortHdrPacket.KeyPhase, + ecn, + packet.shortHdrPacket.Length, + false, + ) + return + } + if len(packet.longHdrPackets) > 1 { + s.logger.Debugf("-> Sending coalesced packet (%d parts, %d bytes) for connection %s", len(packet.longHdrPackets), packet.buffer.Len(), s.logID) + } else { + s.logger.Debugf("-> Sending packet %d (%d bytes) for connection %s, %s", packet.longHdrPackets[0].header.PacketNumber, packet.buffer.Len(), s.logID, packet.longHdrPackets[0].EncryptionLevel()) + } + } + for _, p := range packet.longHdrPackets { + s.logLongHeaderPacket(p, ecn) + } + if p := packet.shortHdrPacket; p != nil { + s.logShortHeaderPacket(p.DestConnID, p.Ack, p.Frames, p.StreamFrames, p.PacketNumber, p.PacketNumberLen, p.KeyPhase, ecn, p.Length, true) + } +} diff --git a/vendor/github.com/quic-go/quic-go/crypto_stream.go b/vendor/github.com/quic-go/quic-go/crypto_stream.go index abc7ddc..9a387ba 100644 --- a/vendor/github.com/quic-go/quic-go/crypto_stream.go +++ b/vendor/github.com/quic-go/quic-go/crypto_stream.go @@ -2,27 +2,14 @@ package quic import ( "fmt" - "io" "github.com/quic-go/quic-go/internal/protocol" "github.com/quic-go/quic-go/internal/qerr" "github.com/quic-go/quic-go/internal/wire" ) -type cryptoStream interface { - // for receiving data - HandleCryptoFrame(*wire.CryptoFrame) error - GetCryptoData() []byte - Finish() error - // for sending data - io.Writer - HasData() bool - PopCryptoFrame(protocol.ByteCount) *wire.CryptoFrame -} - -type cryptoStreamImpl struct { - queue *frameSorter - msgBuf []byte +type cryptoStream struct { + queue frameSorter highestOffset protocol.ByteCount finished bool @@ -31,11 +18,11 @@ type cryptoStreamImpl struct { writeBuf []byte } -func newCryptoStream() cryptoStream { - return &cryptoStreamImpl{queue: newFrameSorter()} +func newCryptoStream() *cryptoStream { + return &cryptoStream{queue: *newFrameSorter()} } -func (s *cryptoStreamImpl) HandleCryptoFrame(f *wire.CryptoFrame) error { +func (s *cryptoStream) HandleCryptoFrame(f *wire.CryptoFrame) error { highestOffset := f.Offset + protocol.ByteCount(len(f.Data)) if maxOffset := highestOffset; maxOffset > protocol.MaxCryptoStreamOffset { return &qerr.TransportError{ @@ -56,26 +43,16 @@ func (s *cryptoStreamImpl) HandleCryptoFrame(f *wire.CryptoFrame) error { return nil } s.highestOffset = max(s.highestOffset, highestOffset) - if err := s.queue.Push(f.Data, f.Offset, nil); err != nil { - return err - } - for { - _, data, _ := s.queue.Pop() - if data == nil { - return nil - } - s.msgBuf = append(s.msgBuf, data...) - } + return s.queue.Push(f.Data, f.Offset, nil) } // GetCryptoData retrieves data that was received in CRYPTO frames -func (s *cryptoStreamImpl) GetCryptoData() []byte { - b := s.msgBuf - s.msgBuf = nil - return b +func (s *cryptoStream) GetCryptoData() []byte { + _, data, _ := s.queue.Pop() + return data } -func (s *cryptoStreamImpl) Finish() error { +func (s *cryptoStream) Finish() error { if s.queue.HasMoreData() { return &qerr.TransportError{ ErrorCode: qerr.ProtocolViolation, @@ -87,16 +64,16 @@ func (s *cryptoStreamImpl) Finish() error { } // Writes writes data that should be sent out in CRYPTO frames -func (s *cryptoStreamImpl) Write(p []byte) (int, error) { +func (s *cryptoStream) Write(p []byte) (int, error) { s.writeBuf = append(s.writeBuf, p...) return len(p), nil } -func (s *cryptoStreamImpl) HasData() bool { +func (s *cryptoStream) HasData() bool { return len(s.writeBuf) > 0 } -func (s *cryptoStreamImpl) PopCryptoFrame(maxLen protocol.ByteCount) *wire.CryptoFrame { +func (s *cryptoStream) PopCryptoFrame(maxLen protocol.ByteCount) *wire.CryptoFrame { f := &wire.CryptoFrame{Offset: s.writeOffset} n := min(f.MaxDataLen(maxLen), protocol.ByteCount(len(s.writeBuf))) f.Data = s.writeBuf[:n] diff --git a/vendor/github.com/quic-go/quic-go/crypto_stream_manager.go b/vendor/github.com/quic-go/quic-go/crypto_stream_manager.go index c48e238..d70b9b0 100644 --- a/vendor/github.com/quic-go/quic-go/crypto_stream_manager.go +++ b/vendor/github.com/quic-go/quic-go/crypto_stream_manager.go @@ -3,32 +3,22 @@ package quic import ( "fmt" - "github.com/quic-go/quic-go/internal/handshake" "github.com/quic-go/quic-go/internal/protocol" "github.com/quic-go/quic-go/internal/wire" ) -type cryptoDataHandler interface { - HandleMessage([]byte, protocol.EncryptionLevel) error - NextEvent() handshake.Event -} - type cryptoStreamManager struct { - cryptoHandler cryptoDataHandler - - initialStream cryptoStream - handshakeStream cryptoStream - oneRTTStream cryptoStream + initialStream *cryptoStream + handshakeStream *cryptoStream + oneRTTStream *cryptoStream } func newCryptoStreamManager( - cryptoHandler cryptoDataHandler, - initialStream cryptoStream, - handshakeStream cryptoStream, - oneRTTStream cryptoStream, + initialStream *cryptoStream, + handshakeStream *cryptoStream, + oneRTTStream *cryptoStream, ) *cryptoStreamManager { return &cryptoStreamManager{ - cryptoHandler: cryptoHandler, initialStream: initialStream, handshakeStream: handshakeStream, oneRTTStream: oneRTTStream, @@ -36,7 +26,7 @@ func newCryptoStreamManager( } func (m *cryptoStreamManager) HandleCryptoFrame(frame *wire.CryptoFrame, encLevel protocol.EncryptionLevel) error { - var str cryptoStream + var str *cryptoStream //nolint:exhaustive // CRYPTO frames cannot be sent in 0-RTT packets. switch encLevel { case protocol.EncryptionInitial: @@ -48,18 +38,23 @@ func (m *cryptoStreamManager) HandleCryptoFrame(frame *wire.CryptoFrame, encLeve default: return fmt.Errorf("received CRYPTO frame with unexpected encryption level: %s", encLevel) } - if err := str.HandleCryptoFrame(frame); err != nil { - return err - } - for { - data := str.GetCryptoData() - if data == nil { - return nil - } - if err := m.cryptoHandler.HandleMessage(data, encLevel); err != nil { - return err - } + return str.HandleCryptoFrame(frame) +} + +func (m *cryptoStreamManager) GetCryptoData(encLevel protocol.EncryptionLevel) []byte { + var str *cryptoStream + //nolint:exhaustive // CRYPTO frames cannot be sent in 0-RTT packets. + switch encLevel { + case protocol.EncryptionInitial: + str = m.initialStream + case protocol.EncryptionHandshake: + str = m.handshakeStream + case protocol.Encryption1RTT: + str = m.oneRTTStream + default: + panic(fmt.Sprintf("received CRYPTO frame with unexpected encryption level: %s", encLevel)) } + return str.GetCryptoData() } func (m *cryptoStreamManager) GetPostHandshakeData(maxSize protocol.ByteCount) *wire.CryptoFrame { diff --git a/vendor/github.com/quic-go/quic-go/errors.go b/vendor/github.com/quic-go/quic-go/errors.go index fda3c92..3fe1e0a 100644 --- a/vendor/github.com/quic-go/quic-go/errors.go +++ b/vendor/github.com/quic-go/quic-go/errors.go @@ -64,7 +64,7 @@ func (e *StreamError) Error() string { // DatagramTooLargeError is returned from Connection.SendDatagram if the payload is too large to be sent. type DatagramTooLargeError struct { - PeerMaxDatagramFrameSize int64 + MaxDatagramPayloadSize int64 } func (e *DatagramTooLargeError) Is(target error) bool { diff --git a/vendor/github.com/quic-go/quic-go/framer.go b/vendor/github.com/quic-go/quic-go/framer.go index 1e6219a..e162f6b 100644 --- a/vendor/github.com/quic-go/quic-go/framer.go +++ b/vendor/github.com/quic-go/quic-go/framer.go @@ -1,7 +1,7 @@ package quic import ( - "errors" + "slices" "sync" "github.com/quic-go/quic-go/internal/ackhandler" @@ -11,37 +11,25 @@ import ( "github.com/quic-go/quic-go/quicvarint" ) -type framer interface { - HasData() bool - - QueueControlFrame(wire.Frame) - AppendControlFrames([]ackhandler.Frame, protocol.ByteCount, protocol.Version) ([]ackhandler.Frame, protocol.ByteCount) - - AddActiveStream(protocol.StreamID) - AppendStreamFrames([]ackhandler.StreamFrame, protocol.ByteCount, protocol.Version) ([]ackhandler.StreamFrame, protocol.ByteCount) - - Handle0RTTRejection() error - - // QueuedTooManyControlFrames says if the control frame queue exceeded its maximum queue length. - // This is a hack. - // It is easier to implement than propagating an error return value in QueueControlFrame. - // The correct solution would be to queue frames with their respective structs. - // See https://github.com/quic-go/quic-go/issues/4271 for the queueing of stream-related control frames. - QueuedTooManyControlFrames() bool -} - const ( maxPathResponses = 256 maxControlFrames = 16 << 10 ) -type framerI struct { - mutex sync.Mutex +// This is the largest possible size of a stream-related control frame +// (which is the RESET_STREAM frame). +const maxStreamControlFrameSize = 25 - streamGetter streamGetter +type streamControlFrameGetter interface { + getControlFrame() (_ ackhandler.Frame, ok, hasMore bool) +} - activeStreams map[protocol.StreamID]struct{} - streamQueue ringbuffer.RingBuffer[protocol.StreamID] +type framer struct { + mutex sync.Mutex + + activeStreams map[protocol.StreamID]sendStreamI + streamQueue ringbuffer.RingBuffer[protocol.StreamID] + streamsWithControlFrames map[protocol.StreamID]streamControlFrameGetter controlFrameMutex sync.Mutex controlFrames []wire.Frame @@ -49,16 +37,14 @@ type framerI struct { queuedTooManyControlFrames bool } -var _ framer = &framerI{} - -func newFramer(streamGetter streamGetter) framer { - return &framerI{ - streamGetter: streamGetter, - activeStreams: make(map[protocol.StreamID]struct{}), +func newFramer() *framer { + return &framer{ + activeStreams: make(map[protocol.StreamID]sendStreamI), + streamsWithControlFrames: make(map[protocol.StreamID]streamControlFrameGetter), } } -func (f *framerI) HasData() bool { +func (f *framer) HasData() bool { f.mutex.Lock() hasData := !f.streamQueue.Empty() f.mutex.Unlock() @@ -67,10 +53,10 @@ func (f *framerI) HasData() bool { } f.controlFrameMutex.Lock() defer f.controlFrameMutex.Unlock() - return len(f.controlFrames) > 0 || len(f.pathResponses) > 0 + return len(f.streamsWithControlFrames) > 0 || len(f.controlFrames) > 0 || len(f.pathResponses) > 0 } -func (f *framerI) QueueControlFrame(frame wire.Frame) { +func (f *framer) QueueControlFrame(frame wire.Frame) { f.controlFrameMutex.Lock() defer f.controlFrameMutex.Unlock() @@ -92,7 +78,7 @@ func (f *framerI) QueueControlFrame(frame wire.Frame) { f.controlFrames = append(f.controlFrames, frame) } -func (f *framerI) AppendControlFrames(frames []ackhandler.Frame, maxLen protocol.ByteCount, v protocol.Version) ([]ackhandler.Frame, protocol.ByteCount) { +func (f *framer) AppendControlFrames(frames []ackhandler.Frame, maxLen protocol.ByteCount, v protocol.Version) ([]ackhandler.Frame, protocol.ByteCount) { f.controlFrameMutex.Lock() defer f.controlFrameMutex.Unlock() @@ -108,6 +94,29 @@ func (f *framerI) AppendControlFrames(frames []ackhandler.Frame, maxLen protocol } } + // add stream-related control frames + for id, str := range f.streamsWithControlFrames { + start: + remainingLen := maxLen - length + if remainingLen <= maxStreamControlFrameSize { + break + } + fr, ok, hasMore := str.getControlFrame() + if !hasMore { + delete(f.streamsWithControlFrames, id) + } + if !ok { + continue + } + frames = append(frames, fr) + length += fr.Frame.Length(v) + if hasMore { + // It is rare that a stream has more than one control frame to queue. + // We don't want to spawn another loop for just to cover that case. + goto start + } + } + for len(f.controlFrames) > 0 { frame := f.controlFrames[len(f.controlFrames)-1] frameLen := frame.Length(v) @@ -118,27 +127,51 @@ func (f *framerI) AppendControlFrames(frames []ackhandler.Frame, maxLen protocol length += frameLen f.controlFrames = f.controlFrames[:len(f.controlFrames)-1] } + return frames, length } -func (f *framerI) QueuedTooManyControlFrames() bool { +// QueuedTooManyControlFrames says if the control frame queue exceeded its maximum queue length. +// This is a hack. +// It is easier to implement than propagating an error return value in QueueControlFrame. +// The correct solution would be to queue frames with their respective structs. +// See https://github.com/quic-go/quic-go/issues/4271 for the queueing of stream-related control frames. +func (f *framer) QueuedTooManyControlFrames() bool { return f.queuedTooManyControlFrames } -func (f *framerI) AddActiveStream(id protocol.StreamID) { +func (f *framer) AddActiveStream(id protocol.StreamID, str sendStreamI) { f.mutex.Lock() if _, ok := f.activeStreams[id]; !ok { f.streamQueue.PushBack(id) - f.activeStreams[id] = struct{}{} + f.activeStreams[id] = str } f.mutex.Unlock() } -func (f *framerI) AppendStreamFrames(frames []ackhandler.StreamFrame, maxLen protocol.ByteCount, v protocol.Version) ([]ackhandler.StreamFrame, protocol.ByteCount) { +func (f *framer) AddStreamWithControlFrames(id protocol.StreamID, str streamControlFrameGetter) { + f.controlFrameMutex.Lock() + if _, ok := f.streamsWithControlFrames[id]; !ok { + f.streamsWithControlFrames[id] = str + } + f.controlFrameMutex.Unlock() +} + +// RemoveActiveStream is called when a stream completes. +func (f *framer) RemoveActiveStream(id protocol.StreamID) { + f.mutex.Lock() + delete(f.activeStreams, id) + // We don't delete the stream from the streamQueue, + // since we'd have to iterate over the ringbuffer. + // Instead, we check if the stream is still in activeStreams in AppendStreamFrames. + f.mutex.Unlock() +} + +func (f *framer) AppendStreamFrames(frames []ackhandler.StreamFrame, maxLen protocol.ByteCount, v protocol.Version) ([]ackhandler.StreamFrame, protocol.ByteCount) { startLen := len(frames) var length protocol.ByteCount f.mutex.Lock() - // pop STREAM frames, until less than MinStreamFrameSize bytes are left in the packet + // pop STREAM frames, until less than 128 bytes are left in the packet numActiveStreams := f.streamQueue.Len() for i := 0; i < numActiveStreams; i++ { if protocol.MinStreamFrameSize+length > maxLen { @@ -147,17 +180,16 @@ func (f *framerI) AppendStreamFrames(frames []ackhandler.StreamFrame, maxLen pro id := f.streamQueue.PopFront() // This should never return an error. Better check it anyway. // The stream will only be in the streamQueue, if it enqueued itself there. - str, err := f.streamGetter.GetOrOpenSendStream(id) - // The stream can be nil if it completed after it said it had data. - if str == nil || err != nil { - delete(f.activeStreams, id) + str, ok := f.activeStreams[id] + // The stream might have been removed after being enqueued. + if !ok { continue } remainingLen := maxLen - length // For the last STREAM frame, we'll remove the DataLen field later. // Therefore, we can pretend to have more bytes available when popping // the STREAM frame (which will always have the DataLen set). - remainingLen += quicvarint.Len(uint64(remainingLen)) + remainingLen += protocol.ByteCount(quicvarint.Len(uint64(remainingLen))) frame, ok, hasMoreData := str.popStreamFrame(remainingLen, v) if hasMoreData { // put the stream back in the queue (at the end) f.streamQueue.PushBack(id) @@ -165,7 +197,7 @@ func (f *framerI) AppendStreamFrames(frames []ackhandler.StreamFrame, maxLen pro delete(f.activeStreams, id) } // The frame can be "nil" - // * if the receiveStream was canceled after it said it had data + // * if the stream was canceled after it said it had data // * the remaining size doesn't allow us to add another STREAM frame if !ok { continue @@ -183,11 +215,12 @@ func (f *framerI) AppendStreamFrames(frames []ackhandler.StreamFrame, maxLen pro return frames, length } -func (f *framerI) Handle0RTTRejection() error { +func (f *framer) Handle0RTTRejection() { f.mutex.Lock() defer f.mutex.Unlock() - f.controlFrameMutex.Lock() + defer f.controlFrameMutex.Unlock() + f.streamQueue.Clear() for id := range f.activeStreams { delete(f.activeStreams, id) @@ -195,16 +228,13 @@ func (f *framerI) Handle0RTTRejection() error { var j int for i, frame := range f.controlFrames { switch frame.(type) { - case *wire.MaxDataFrame, *wire.MaxStreamDataFrame, *wire.MaxStreamsFrame: - return errors.New("didn't expect MAX_DATA / MAX_STREAM_DATA / MAX_STREAMS frame to be sent in 0-RTT") - case *wire.DataBlockedFrame, *wire.StreamDataBlockedFrame, *wire.StreamsBlockedFrame: + case *wire.MaxDataFrame, *wire.MaxStreamDataFrame, *wire.MaxStreamsFrame, + *wire.DataBlockedFrame, *wire.StreamDataBlockedFrame, *wire.StreamsBlockedFrame: continue default: f.controlFrames[j] = f.controlFrames[i] j++ } } - f.controlFrames = f.controlFrames[:j] - f.controlFrameMutex.Unlock() - return nil + f.controlFrames = slices.Delete(f.controlFrames, j, len(f.controlFrames)) } diff --git a/vendor/github.com/quic-go/quic-go/interface.go b/vendor/github.com/quic-go/quic-go/interface.go index ca8544d..2071b59 100644 --- a/vendor/github.com/quic-go/quic-go/interface.go +++ b/vendor/github.com/quic-go/quic-go/interface.go @@ -19,10 +19,6 @@ type StreamID = protocol.StreamID // A Version is a QUIC version number. type Version = protocol.Version -// A VersionNumber is a QUIC version number. -// Deprecated: VersionNumber was renamed to Version. -type VersionNumber = Version - const ( // Version1 is RFC 9000 Version1 = protocol.Version1 @@ -57,8 +53,13 @@ var Err0RTTRejected = errors.New("0-RTT rejected") // ConnectionTracingKey can be used to associate a ConnectionTracer with a Connection. // It is set on the Connection.Context() context, // as well as on the context passed to logging.Tracer.NewConnectionTracer. +// Deprecated: Applications can set their own tracing key using Transport.ConnContext. var ConnectionTracingKey = connTracingCtxKey{} +// ConnectionTracingID is the type of the context value saved under the ConnectionTracingKey. +// Deprecated: Applications can set their own tracing key using Transport.ConnContext. +type ConnectionTracingID uint64 + type connTracingCtxKey struct{} // QUICVersionContextKey can be used to find out the QUIC version of a TLS handshake from the @@ -84,8 +85,8 @@ type ReceiveStream interface { // Read reads data from the stream. // Read can be made to time out and return a net.Error with Timeout() == true // after a fixed time limit; see SetDeadline and SetReadDeadline. - // If the stream was canceled by the peer, the error implements the StreamError - // interface, and Canceled() == true. + // If the stream was canceled by the peer, the error is a StreamError and + // Remote == true. // If the connection was closed due to a timeout, the error satisfies // the net.Error interface, and Timeout() will be true. io.Reader @@ -108,8 +109,8 @@ type SendStream interface { // Write writes data to the stream. // Write can be made to time out and return a net.Error with Timeout() == true // after a fixed time limit; see SetDeadline and SetWriteDeadline. - // If the stream was canceled by the peer, the error implements the StreamError - // interface, and Canceled() == true. + // If the stream was canceled by the peer, the error is a StreamError and + // Remote == true. // If the connection was closed due to a timeout, the error satisfies // the net.Error interface, and Timeout() will be true. io.Writer @@ -121,7 +122,9 @@ type SendStream interface { // CancelWrite aborts sending on this stream. // Data already written, but not yet delivered to the peer is not guaranteed to be delivered reliably. // Write will unblock immediately, and future calls to Write will fail. - // When called multiple times or after closing the stream it is a no-op. + // When called multiple times it is a no-op. + // When called after Close, it aborts delivery. Note that there is no guarantee if + // the peer will receive the FIN or the reset first. CancelWrite(StreamErrorCode) // The Context is canceled as soon as the write-side of the stream is closed. // This happens when Close() or CancelWrite() is called, or when the peer @@ -143,7 +146,7 @@ type SendStream interface { // * TransportError: for errors triggered by the QUIC transport (in many cases a misbehaving peer) // * IdleTimeoutError: when the peer goes away unexpectedly (this is a net.Error timeout error) // * HandshakeTimeoutError: when the cryptographic handshake takes too long (this is a net.Error timeout error) -// * StatelessResetError: when we receive a stateless reset (this is a net.Error temporary error) +// * StatelessResetError: when we receive a stateless reset // * VersionNegotiationError: returned by the client, when there's no version overlap between the peers type Connection interface { // AcceptStream returns the next stream opened by the peer, blocking until one is available. @@ -156,28 +159,29 @@ type Connection interface { AcceptUniStream(context.Context) (ReceiveStream, error) // OpenStream opens a new bidirectional QUIC stream. // There is no signaling to the peer about new streams: - // The peer can only accept the stream after data has been sent on the stream. - // If the error is non-nil, it satisfies the net.Error interface. - // When reaching the peer's stream limit, err.Temporary() will be true. - // If the connection was closed due to a timeout, Timeout() will be true. + // The peer can only accept the stream after data has been sent on the stream, + // or the stream has been reset or closed. + // When reaching the peer's stream limit, it is not possible to open a new stream until the + // peer raises the stream limit. In that case, a StreamLimitReachedError is returned. OpenStream() (Stream, error) // OpenStreamSync opens a new bidirectional QUIC stream. // It blocks until a new stream can be opened. // There is no signaling to the peer about new streams: // The peer can only accept the stream after data has been sent on the stream, // or the stream has been reset or closed. - // If the error is non-nil, it satisfies the net.Error interface. - // If the connection was closed due to a timeout, Timeout() will be true. OpenStreamSync(context.Context) (Stream, error) // OpenUniStream opens a new outgoing unidirectional QUIC stream. - // If the error is non-nil, it satisfies the net.Error interface. - // When reaching the peer's stream limit, Temporary() will be true. - // If the connection was closed due to a timeout, Timeout() will be true. + // There is no signaling to the peer about new streams: + // The peer can only accept the stream after data has been sent on the stream, + // or the stream has been reset or closed. + // When reaching the peer's stream limit, it is not possible to open a new stream until the + // peer raises the stream limit. In that case, a StreamLimitReachedError is returned. OpenUniStream() (SendStream, error) // OpenUniStreamSync opens a new outgoing unidirectional QUIC stream. // It blocks until a new stream can be opened. - // If the error is non-nil, it satisfies the net.Error interface. - // If the connection was closed due to a timeout, Timeout() will be true. + // There is no signaling to the peer about new streams: + // The peer can only accept the stream after data has been sent on the stream, + // or the stream has been reset or closed. OpenUniStreamSync(context.Context) (SendStream, error) // LocalAddr returns the local address. LocalAddr() net.Addr @@ -217,7 +221,7 @@ type EarlyConnection interface { // however the client's identity is only verified once the handshake completes. HandshakeComplete() <-chan struct{} - NextConnection() Connection + NextConnection(context.Context) (Connection, error) } // StatelessResetKey is a key used to derive stateless reset tokens. @@ -320,10 +324,15 @@ type Config struct { // If set to 0, then no keep alive is sent. Otherwise, the keep alive is sent on that period (or at most // every half of MaxIdleTimeout, whichever is smaller). KeepAlivePeriod time.Duration + // InitialPacketSize is the initial size of packets sent. + // It is usually not necessary to manually set this value, + // since Path MTU discovery very quickly finds the path's MTU. + // If set too high, the path might not support packets that large, leading to a timeout of the QUIC handshake. + // Values below 1200 are invalid. + InitialPacketSize uint16 // DisablePathMTUDiscovery disables Path MTU Discovery (RFC 8899). // This allows the sending of QUIC packets that fully utilize the available MTU of the path. // Path MTU discovery is only available on systems that allow setting of the Don't Fragment (DF) bit. - // If unavailable or disabled, packets will be at most 1252 (IPv4) / 1232 (IPv6) bytes in size. DisablePathMTUDiscovery bool // Allow0RTT allows the application to decide if a 0-RTT connection attempt should be accepted. // Only valid for the server. diff --git a/vendor/github.com/quic-go/quic-go/internal/ackhandler/received_packet_history.go b/vendor/github.com/quic-go/quic-go/internal/ackhandler/received_packet_history.go index 3143bfe..f9feae1 100644 --- a/vendor/github.com/quic-go/quic-go/internal/ackhandler/received_packet_history.go +++ b/vendor/github.com/quic-go/quic-go/internal/ackhandler/received_packet_history.go @@ -1,10 +1,9 @@ package ackhandler import ( - "sync" + "slices" "github.com/quic-go/quic-go/internal/protocol" - list "github.com/quic-go/quic-go/internal/utils/linkedlist" "github.com/quic-go/quic-go/internal/wire" ) @@ -14,25 +13,17 @@ type interval struct { End protocol.PacketNumber } -var intervalElementPool sync.Pool - -func init() { - intervalElementPool = *list.NewPool[interval]() -} - // The receivedPacketHistory stores if a packet number has already been received. // It generates ACK ranges which can be used to assemble an ACK frame. // It does not store packet contents. type receivedPacketHistory struct { - ranges *list.List[interval] + ranges []interval // maximum length: protocol.MaxNumAckRanges deletedBelow protocol.PacketNumber } func newReceivedPacketHistory() *receivedPacketHistory { - return &receivedPacketHistory{ - ranges: list.NewWithPool[interval](&intervalElementPool), - } + return &receivedPacketHistory{} } // ReceivedPacket registers a packet with PacketNumber p and updates the ranges @@ -41,58 +32,54 @@ func (h *receivedPacketHistory) ReceivedPacket(p protocol.PacketNumber) bool /* if p < h.deletedBelow { return false } + isNew := h.addToRanges(p) - h.maybeDeleteOldRanges() + // Delete old ranges, if we're tracking too many of them. + // This is a DoS defense against a peer that sends us too many gaps. + if len(h.ranges) > protocol.MaxNumAckRanges { + h.ranges = slices.Delete(h.ranges, 0, len(h.ranges)-protocol.MaxNumAckRanges) + } return isNew } func (h *receivedPacketHistory) addToRanges(p protocol.PacketNumber) bool /* is a new packet (and not a duplicate / delayed packet) */ { - if h.ranges.Len() == 0 { - h.ranges.PushBack(interval{Start: p, End: p}) + if len(h.ranges) == 0 { + h.ranges = append(h.ranges, interval{Start: p, End: p}) return true } - for el := h.ranges.Back(); el != nil; el = el.Prev() { + for i := len(h.ranges) - 1; i >= 0; i-- { // p already included in an existing range. Nothing to do here - if p >= el.Value.Start && p <= el.Value.End { + if p >= h.ranges[i].Start && p <= h.ranges[i].End { return false } - if el.Value.End == p-1 { // extend a range at the end - el.Value.End = p + if h.ranges[i].End == p-1 { // extend a range at the end + h.ranges[i].End = p return true } - if el.Value.Start == p+1 { // extend a range at the beginning - el.Value.Start = p + if h.ranges[i].Start == p+1 { // extend a range at the beginning + h.ranges[i].Start = p - prev := el.Prev() - if prev != nil && prev.Value.End+1 == el.Value.Start { // merge two ranges - prev.Value.End = el.Value.End - h.ranges.Remove(el) + if i > 0 && h.ranges[i-1].End+1 == h.ranges[i].Start { // merge two ranges + h.ranges[i-1].End = h.ranges[i].End + h.ranges = slices.Delete(h.ranges, i, i+1) } return true } - // create a new range at the end - if p > el.Value.End { - h.ranges.InsertAfter(interval{Start: p, End: p}, el) + // create a new range after the current one + if p > h.ranges[i].End { + h.ranges = slices.Insert(h.ranges, i+1, interval{Start: p, End: p}) return true } } // create a new range at the beginning - h.ranges.InsertBefore(interval{Start: p, End: p}, h.ranges.Front()) + h.ranges = slices.Insert(h.ranges, 0, interval{Start: p, End: p}) return true } -// Delete old ranges, if we're tracking more than 500 of them. -// This is a DoS defense against a peer that sends us too many gaps. -func (h *receivedPacketHistory) maybeDeleteOldRanges() { - for h.ranges.Len() > protocol.MaxNumAckRanges { - h.ranges.Remove(h.ranges.Front()) - } -} - // DeleteBelow deletes all entries below (but not including) p func (h *receivedPacketHistory) DeleteBelow(p protocol.PacketNumber) { if p < h.deletedBelow { @@ -100,37 +87,39 @@ func (h *receivedPacketHistory) DeleteBelow(p protocol.PacketNumber) { } h.deletedBelow = p - nextEl := h.ranges.Front() - for el := h.ranges.Front(); nextEl != nil; el = nextEl { - nextEl = el.Next() + if len(h.ranges) == 0 { + return + } - if el.Value.End < p { // delete a whole range - h.ranges.Remove(el) - } else if p > el.Value.Start && p <= el.Value.End { - el.Value.Start = p - return + idx := -1 + for i := 0; i < len(h.ranges); i++ { + if h.ranges[i].End < p { // delete a whole range + idx = i + } else if p > h.ranges[i].Start && p <= h.ranges[i].End { + h.ranges[i].Start = p + break } else { // no ranges affected. Nothing to do - return + break } } + if idx >= 0 { + h.ranges = slices.Delete(h.ranges, 0, idx+1) + } } // AppendAckRanges appends to a slice of all AckRanges that can be used in an AckFrame func (h *receivedPacketHistory) AppendAckRanges(ackRanges []wire.AckRange) []wire.AckRange { - if h.ranges.Len() > 0 { - for el := h.ranges.Back(); el != nil; el = el.Prev() { - ackRanges = append(ackRanges, wire.AckRange{Smallest: el.Value.Start, Largest: el.Value.End}) - } + for i := len(h.ranges) - 1; i >= 0; i-- { + ackRanges = append(ackRanges, wire.AckRange{Smallest: h.ranges[i].Start, Largest: h.ranges[i].End}) } return ackRanges } func (h *receivedPacketHistory) GetHighestAckRange() wire.AckRange { ackRange := wire.AckRange{} - if h.ranges.Len() > 0 { - r := h.ranges.Back().Value - ackRange.Smallest = r.Start - ackRange.Largest = r.End + if len(h.ranges) > 0 { + ackRange.Smallest = h.ranges[len(h.ranges)-1].Start + ackRange.Largest = h.ranges[len(h.ranges)-1].End } return ackRange } @@ -139,11 +128,12 @@ func (h *receivedPacketHistory) IsPotentiallyDuplicate(p protocol.PacketNumber) if p < h.deletedBelow { return true } - for el := h.ranges.Back(); el != nil; el = el.Prev() { - if p > el.Value.End { + // Iterating over the slices is faster than using a binary search (using slices.BinarySearchFunc). + for i := len(h.ranges) - 1; i >= 0; i-- { + if p > h.ranges[i].End { return false } - if p <= el.Value.End && p >= el.Value.Start { + if p <= h.ranges[i].End && p >= h.ranges[i].Start { return true } } diff --git a/vendor/github.com/quic-go/quic-go/internal/ackhandler/sent_packet_handler.go b/vendor/github.com/quic-go/quic-go/internal/ackhandler/sent_packet_handler.go index 3cef892..b84f0dc 100644 --- a/vendor/github.com/quic-go/quic-go/internal/ackhandler/sent_packet_handler.go +++ b/vendor/github.com/quic-go/quic-go/internal/ackhandler/sent_packet_handler.go @@ -28,7 +28,7 @@ const ( ) type packetNumberSpace struct { - history *sentPacketHistory + history sentPacketHistory pns packetNumberGenerator lossTime time.Time @@ -38,15 +38,15 @@ type packetNumberSpace struct { largestSent protocol.PacketNumber } -func newPacketNumberSpace(initialPN protocol.PacketNumber, skipPNs bool) *packetNumberSpace { +func newPacketNumberSpace(initialPN protocol.PacketNumber, isAppData bool) *packetNumberSpace { var pns packetNumberGenerator - if skipPNs { + if isAppData { pns = newSkippingPacketNumberGenerator(initialPN, protocol.SkipPacketInitialPeriod, protocol.SkipPacketMaxPeriod) } else { pns = newSequentialPacketNumberGenerator(initialPN) } return &packetNumberSpace{ - history: newSentPacketHistory(), + history: *newSentPacketHistory(isAppData), pns: pns, largestSent: protocol.InvalidPacketNumber, largestAcked: protocol.InvalidPacketNumber, @@ -756,7 +756,7 @@ func (h *sentPacketHandler) PeekPacketNumber(encLevel protocol.EncryptionLevel) pnSpace := h.getPacketNumberSpace(encLevel) pn := pnSpace.pns.Peek() // See section 17.1 of RFC 9000. - return pn, protocol.GetPacketNumberLengthForHeader(pn, pnSpace.largestAcked) + return pn, protocol.PacketNumberLengthForHeader(pn, pnSpace.largestAcked) } func (h *sentPacketHandler) PopPacketNumber(encLevel protocol.EncryptionLevel) protocol.PacketNumber { diff --git a/vendor/github.com/quic-go/quic-go/internal/ackhandler/sent_packet_history.go b/vendor/github.com/quic-go/quic-go/internal/ackhandler/sent_packet_history.go index c14c0f4..9968df6 100644 --- a/vendor/github.com/quic-go/quic-go/internal/ackhandler/sent_packet_history.go +++ b/vendor/github.com/quic-go/quic-go/internal/ackhandler/sent_packet_history.go @@ -14,11 +14,16 @@ type sentPacketHistory struct { highestPacketNumber protocol.PacketNumber } -func newSentPacketHistory() *sentPacketHistory { - return &sentPacketHistory{ - packets: make([]*packet, 0, 32), +func newSentPacketHistory(isAppData bool) *sentPacketHistory { + h := &sentPacketHistory{ highestPacketNumber: protocol.InvalidPacketNumber, } + if isAppData { + h.packets = make([]*packet, 0, 32) + } else { + h.packets = make([]*packet, 0, 6) + } + return h } func (h *sentPacketHistory) checkSequentialPacketNumberUse(pn protocol.PacketNumber) { diff --git a/vendor/github.com/quic-go/quic-go/internal/congestion/cubic.go b/vendor/github.com/quic-go/quic-go/internal/congestion/cubic.go index 4e30de6..b35d40d 100644 --- a/vendor/github.com/quic-go/quic-go/internal/congestion/cubic.go +++ b/vendor/github.com/quic-go/quic-go/internal/congestion/cubic.go @@ -17,11 +17,11 @@ import ( // 1024*1024^3 (first 1024 is from 0.100^3) // where 0.100 is 100 ms which is the scaling round trip time. const ( - cubeScale = 40 - cubeCongestionWindowScale = 410 - cubeFactor protocol.ByteCount = 1 << cubeScale / cubeCongestionWindowScale / maxDatagramSize + cubeScale = 40 + cubeCongestionWindowScale = 410 + cubeFactor = 1 << cubeScale / cubeCongestionWindowScale / maxDatagramSize // TODO: when re-enabling cubic, make sure to use the actual packet size here - maxDatagramSize = protocol.ByteCount(protocol.InitialPacketSizeIPv4) + maxDatagramSize = protocol.ByteCount(protocol.InitialPacketSize) ) const defaultNumConnections = 1 diff --git a/vendor/github.com/quic-go/quic-go/internal/congestion/cubic_sender.go b/vendor/github.com/quic-go/quic-go/internal/congestion/cubic_sender.go index a1b06ab..075b08e 100644 --- a/vendor/github.com/quic-go/quic-go/internal/congestion/cubic_sender.go +++ b/vendor/github.com/quic-go/quic-go/internal/congestion/cubic_sender.go @@ -12,7 +12,7 @@ import ( const ( // maxDatagramSize is the default maximum packet size used in the Linux TCP implementation. // Used in QUIC for congestion window computations in bytes. - initialMaxDatagramSize = protocol.ByteCount(protocol.InitialPacketSizeIPv4) + initialMaxDatagramSize = protocol.ByteCount(protocol.InitialPacketSize) maxBurstPackets = 3 renoBeta = 0.7 // Reno backoff factor. minCongestionWindowPackets = 2 diff --git a/vendor/github.com/quic-go/quic-go/internal/flowcontrol/connection_flow_controller.go b/vendor/github.com/quic-go/quic-go/internal/flowcontrol/connection_flow_controller.go index 8504cdc..2efcad7 100644 --- a/vendor/github.com/quic-go/quic-go/internal/flowcontrol/connection_flow_controller.go +++ b/vendor/github.com/quic-go/quic-go/internal/flowcontrol/connection_flow_controller.go @@ -12,8 +12,6 @@ import ( type connectionFlowController struct { baseFlowController - - queueWindowUpdate func() } var _ ConnectionFlowController = &connectionFlowController{} @@ -23,7 +21,6 @@ var _ ConnectionFlowController = &connectionFlowController{} func NewConnectionFlowController( receiveWindow protocol.ByteCount, maxReceiveWindow protocol.ByteCount, - queueWindowUpdate func(), allowWindowIncrease func(size protocol.ByteCount) bool, rttStats *utils.RTTStats, logger utils.Logger, @@ -37,7 +34,6 @@ func NewConnectionFlowController( allowWindowIncrease: allowWindowIncrease, logger: logger, }, - queueWindowUpdate: queueWindowUpdate, } } @@ -63,18 +59,14 @@ func (c *connectionFlowController) IncrementHighestReceived(increment protocol.B func (c *connectionFlowController) AddBytesRead(n protocol.ByteCount) { c.mutex.Lock() c.baseFlowController.addBytesRead(n) - shouldQueueWindowUpdate := c.hasWindowUpdate() c.mutex.Unlock() - if shouldQueueWindowUpdate { - c.queueWindowUpdate() - } } func (c *connectionFlowController) GetWindowUpdate() protocol.ByteCount { c.mutex.Lock() oldWindowSize := c.receiveWindowSize offset := c.baseFlowController.getWindowUpdate() - if oldWindowSize < c.receiveWindowSize { + if c.logger.Debug() && oldWindowSize < c.receiveWindowSize { c.logger.Debugf("Increasing receive flow control window for the connection to %d kB", c.receiveWindowSize/(1<<10)) } c.mutex.Unlock() diff --git a/vendor/github.com/quic-go/quic-go/internal/flowcontrol/interface.go b/vendor/github.com/quic-go/quic-go/internal/flowcontrol/interface.go index fc5f9de..57d12a9 100644 --- a/vendor/github.com/quic-go/quic-go/internal/flowcontrol/interface.go +++ b/vendor/github.com/quic-go/quic-go/internal/flowcontrol/interface.go @@ -8,14 +8,13 @@ type flowController interface { UpdateSendWindow(protocol.ByteCount) (updated bool) AddBytesSent(protocol.ByteCount) // for receiving - AddBytesRead(protocol.ByteCount) GetWindowUpdate() protocol.ByteCount // returns 0 if no update is necessary - IsNewlyBlocked() (bool, protocol.ByteCount) } // A StreamFlowController is a flow controller for a QUIC stream. type StreamFlowController interface { flowController + AddBytesRead(protocol.ByteCount) (shouldQueueWindowUpdate bool) // UpdateHighestReceived is called when a new highest offset is received // final has to be to true if this is the final offset of the stream, // as contained in a STREAM frame with FIN bit, and the RESET_STREAM frame @@ -23,12 +22,15 @@ type StreamFlowController interface { // Abandon is called when reading from the stream is aborted early, // and there won't be any further calls to AddBytesRead. Abandon() + IsNewlyBlocked() bool } // The ConnectionFlowController is the flow controller for the connection. type ConnectionFlowController interface { flowController + AddBytesRead(protocol.ByteCount) Reset() error + IsNewlyBlocked() (bool, protocol.ByteCount) } type connectionFlowControllerI interface { diff --git a/vendor/github.com/quic-go/quic-go/internal/flowcontrol/stream_flow_controller.go b/vendor/github.com/quic-go/quic-go/internal/flowcontrol/stream_flow_controller.go index 1a69fb2..2d58351 100644 --- a/vendor/github.com/quic-go/quic-go/internal/flowcontrol/stream_flow_controller.go +++ b/vendor/github.com/quic-go/quic-go/internal/flowcontrol/stream_flow_controller.go @@ -13,8 +13,6 @@ type streamFlowController struct { streamID protocol.StreamID - queueWindowUpdate func() - connection connectionFlowControllerI receivedFinalOffset bool @@ -29,14 +27,12 @@ func NewStreamFlowController( receiveWindow protocol.ByteCount, maxReceiveWindow protocol.ByteCount, initialSendWindow protocol.ByteCount, - queueWindowUpdate func(protocol.StreamID), rttStats *utils.RTTStats, logger utils.Logger, ) StreamFlowController { return &streamFlowController{ - streamID: streamID, - connection: cfc.(connectionFlowControllerI), - queueWindowUpdate: func() { queueWindowUpdate(streamID) }, + streamID: streamID, + connection: cfc.(connectionFlowControllerI), baseFlowController: baseFlowController{ rttStats: rttStats, receiveWindow: receiveWindow, @@ -97,20 +93,19 @@ func (c *streamFlowController) UpdateHighestReceived(offset protocol.ByteCount, return c.connection.IncrementHighestReceived(increment) } -func (c *streamFlowController) AddBytesRead(n protocol.ByteCount) { +func (c *streamFlowController) AddBytesRead(n protocol.ByteCount) (shouldQueueWindowUpdate bool) { c.mutex.Lock() c.baseFlowController.addBytesRead(n) - shouldQueueWindowUpdate := c.shouldQueueWindowUpdate() + shouldQueueWindowUpdate = c.shouldQueueWindowUpdate() c.mutex.Unlock() - if shouldQueueWindowUpdate { - c.queueWindowUpdate() - } c.connection.AddBytesRead(n) + return } func (c *streamFlowController) Abandon() { c.mutex.Lock() unread := c.highestReceived - c.bytesRead + c.bytesRead = c.highestReceived c.mutex.Unlock() if unread > 0 { c.connection.AddBytesRead(unread) @@ -126,6 +121,11 @@ func (c *streamFlowController) SendWindowSize() protocol.ByteCount { return min(c.baseFlowController.sendWindowSize(), c.connection.SendWindowSize()) } +func (c *streamFlowController) IsNewlyBlocked() bool { + blocked, _ := c.baseFlowController.IsNewlyBlocked() + return blocked +} + func (c *streamFlowController) shouldQueueWindowUpdate() bool { return !c.receivedFinalOffset && c.hasWindowUpdate() } diff --git a/vendor/github.com/quic-go/quic-go/internal/handshake/crypto_setup.go b/vendor/github.com/quic-go/quic-go/internal/handshake/crypto_setup.go index adf74fe..c8e6cb3 100644 --- a/vendor/github.com/quic-go/quic-go/internal/handshake/crypto_setup.go +++ b/vendor/github.com/quic-go/quic-go/internal/handshake/crypto_setup.go @@ -1,7 +1,6 @@ package handshake import ( - "bytes" "context" "crypto/tls" "errors" @@ -124,44 +123,12 @@ func NewCryptoSetupServer( ) cs.allow0RTT = allow0RTT - quicConf := &tls.QUICConfig{TLSConfig: tlsConf} - qtls.SetupConfigForServer(quicConf, cs.allow0RTT, cs.getDataForSessionTicket, cs.handleSessionTicket) - addConnToClientHelloInfo(quicConf.TLSConfig, localAddr, remoteAddr) - - cs.tlsConf = quicConf.TLSConfig - cs.conn = tls.QUICServer(quicConf) - + tlsConf = qtls.SetupConfigForServer(tlsConf, localAddr, remoteAddr, cs.getDataForSessionTicket, cs.handleSessionTicket) + cs.tlsConf = tlsConf + cs.conn = tls.QUICServer(&tls.QUICConfig{TLSConfig: tlsConf}) return cs } -// The tls.Config contains two callbacks that pass in a tls.ClientHelloInfo. -// Since crypto/tls doesn't do it, we need to make sure to set the Conn field with a fake net.Conn -// that allows the caller to get the local and the remote address. -func addConnToClientHelloInfo(conf *tls.Config, localAddr, remoteAddr net.Addr) { - if conf.GetConfigForClient != nil { - gcfc := conf.GetConfigForClient - conf.GetConfigForClient = func(info *tls.ClientHelloInfo) (*tls.Config, error) { - info.Conn = &conn{localAddr: localAddr, remoteAddr: remoteAddr} - c, err := gcfc(info) - if c != nil { - c = c.Clone() - // This won't be necessary anymore once https://github.com/golang/go/issues/63722 is accepted. - c.MinVersion = tls.VersionTLS13 - // We're returning a tls.Config here, so we need to apply this recursively. - addConnToClientHelloInfo(c, localAddr, remoteAddr) - } - return c, err - } - } - if conf.GetCertificate != nil { - gc := conf.GetCertificate - conf.GetCertificate = func(info *tls.ClientHelloInfo) (*tls.Certificate, error) { - info.Conn = &conn{localAddr: localAddr, remoteAddr: remoteAddr} - return gc(info) - } - } -} - func newCryptoSetup( connID protocol.ConnectionID, tp *wire.TransportParameters, @@ -204,8 +171,8 @@ func (h *cryptoSetup) SetLargest1RTTAcked(pn protocol.PacketNumber) error { return h.aead.SetLargestAcked(pn) } -func (h *cryptoSetup) StartHandshake() error { - err := h.conn.Start(context.WithValue(context.Background(), QUICVersionContextKey, h.version)) +func (h *cryptoSetup) StartHandshake(ctx context.Context) error { + err := h.conn.Start(context.WithValue(ctx, QUICVersionContextKey, h.version)) if err != nil { return wrapError(err) } @@ -262,6 +229,9 @@ func (h *cryptoSetup) handleMessage(data []byte, encLevel protocol.EncryptionLev } func (h *cryptoSetup) handleEvent(ev tls.QUICEvent) (done bool, err error) { + //nolint:exhaustive + // Go 1.23 added new 0-RTT events, see https://github.com/quic-go/quic-go/issues/4272. + // We will start using these events when dropping support for Go 1.22. switch ev.Kind { case tls.QUICNoEvent: return true, nil @@ -286,7 +256,10 @@ func (h *cryptoSetup) handleEvent(ev tls.QUICEvent) (done bool, err error) { h.handshakeComplete() return false, nil default: - return false, fmt.Errorf("unexpected event: %d", ev.Kind) + // Unknown events should be ignored. + // crypto/tls will ensure that this is safe to do. + // See the discussion following https://github.com/golang/go/issues/68124#issuecomment-2187042510 for details. + return false, nil } } @@ -338,25 +311,26 @@ func (h *cryptoSetup) handleDataFromSessionState(data []byte, earlyData bool) (a return false } -func decodeDataFromSessionState(data []byte, earlyData bool) (time.Duration, *wire.TransportParameters, error) { - r := bytes.NewReader(data) - ver, err := quicvarint.Read(r) +func decodeDataFromSessionState(b []byte, earlyData bool) (time.Duration, *wire.TransportParameters, error) { + ver, l, err := quicvarint.Parse(b) if err != nil { return 0, nil, err } + b = b[l:] if ver != clientSessionStateRevision { return 0, nil, fmt.Errorf("mismatching version. Got %d, expected %d", ver, clientSessionStateRevision) } - rttEncoded, err := quicvarint.Read(r) + rttEncoded, l, err := quicvarint.Parse(b) if err != nil { return 0, nil, err } + b = b[l:] rtt := time.Duration(rttEncoded) * time.Microsecond if !earlyData { return rtt, nil, nil } var tp wire.TransportParameters - if err := tp.UnmarshalFromSessionTicket(r); err != nil { + if err := tp.UnmarshalFromSessionTicket(b); err != nil { return 0, nil, err } return rtt, &tp, nil @@ -376,9 +350,7 @@ func (h *cryptoSetup) getDataForSessionTicket() []byte { // Due to limitations in crypto/tls, it's only possible to generate a single session ticket per connection. // It is only valid for the server. func (h *cryptoSetup) GetSessionTicket() ([]byte, error) { - if err := h.conn.SendSessionTicket(tls.QUICSessionTicketOptions{ - EarlyData: h.allow0RTT, - }); err != nil { + if err := h.conn.SendSessionTicket(tls.QUICSessionTicketOptions{EarlyData: h.allow0RTT}); err != nil { // Session tickets might be disabled by tls.Config.SessionTicketsDisabled. // We can't check h.tlsConfig here, since the actual config might have been obtained from // the GetConfigForClient callback. @@ -655,8 +627,7 @@ func (h *cryptoSetup) ConnectionState() ConnectionState { } func wrapError(err error) error { - // alert 80 is an internal error - if alertErr := tls.AlertError(0); errors.As(err, &alertErr) && alertErr != 80 { + if alertErr := tls.AlertError(0); errors.As(err, &alertErr) { return qerr.NewLocalCryptoError(uint8(alertErr), err) } return &qerr.TransportError{ErrorCode: qerr.InternalError, ErrorMessage: err.Error()} diff --git a/vendor/github.com/quic-go/quic-go/internal/handshake/interface.go b/vendor/github.com/quic-go/quic-go/internal/handshake/interface.go index fab224f..c3a59fc 100644 --- a/vendor/github.com/quic-go/quic-go/internal/handshake/interface.go +++ b/vendor/github.com/quic-go/quic-go/internal/handshake/interface.go @@ -1,6 +1,7 @@ package handshake import ( + "context" "crypto/tls" "errors" "io" @@ -82,6 +83,29 @@ const ( EventHandshakeComplete ) +func (k EventKind) String() string { + switch k { + case EventNoEvent: + return "EventNoEvent" + case EventWriteInitialData: + return "EventWriteInitialData" + case EventWriteHandshakeData: + return "EventWriteHandshakeData" + case EventReceivedReadKeys: + return "EventReceivedReadKeys" + case EventDiscard0RTTKeys: + return "EventDiscard0RTTKeys" + case EventReceivedTransportParameters: + return "EventReceivedTransportParameters" + case EventRestoredTransportParameters: + return "EventRestoredTransportParameters" + case EventHandshakeComplete: + return "EventHandshakeComplete" + default: + return "Unknown EventKind" + } +} + // Event is a handshake event. type Event struct { Kind EventKind @@ -91,7 +115,7 @@ type Event struct { // CryptoSetup handles the handshake and protecting / unprotecting packets type CryptoSetup interface { - StartHandshake() error + StartHandshake(context.Context) error io.Closer ChangeConnectionID(protocol.ConnectionID) GetSessionTicket() ([]byte, error) diff --git a/vendor/github.com/quic-go/quic-go/internal/handshake/session_ticket.go b/vendor/github.com/quic-go/quic-go/internal/handshake/session_ticket.go index 9481af5..b67f010 100644 --- a/vendor/github.com/quic-go/quic-go/internal/handshake/session_ticket.go +++ b/vendor/github.com/quic-go/quic-go/internal/handshake/session_ticket.go @@ -1,7 +1,6 @@ package handshake import ( - "bytes" "errors" "fmt" "time" @@ -28,25 +27,26 @@ func (t *sessionTicket) Marshal() []byte { } func (t *sessionTicket) Unmarshal(b []byte, using0RTT bool) error { - r := bytes.NewReader(b) - rev, err := quicvarint.Read(r) + rev, l, err := quicvarint.Parse(b) if err != nil { return errors.New("failed to read session ticket revision") } + b = b[l:] if rev != sessionTicketRevision { return fmt.Errorf("unknown session ticket revision: %d", rev) } - rtt, err := quicvarint.Read(r) + rtt, l, err := quicvarint.Parse(b) if err != nil { return errors.New("failed to read RTT") } + b = b[l:] if using0RTT { var tp wire.TransportParameters - if err := tp.UnmarshalFromSessionTicket(r); err != nil { + if err := tp.UnmarshalFromSessionTicket(b); err != nil { return fmt.Errorf("unmarshaling transport parameters from session ticket failed: %s", err.Error()) } t.Parameters = &tp - } else if r.Len() > 0 { + } else if len(b) > 0 { return fmt.Errorf("the session ticket has more bytes than expected") } t.RTT = time.Duration(rtt) * time.Microsecond diff --git a/vendor/github.com/quic-go/quic-go/internal/handshake/token_generator.go b/vendor/github.com/quic-go/quic-go/internal/handshake/token_generator.go index 2d91e6b..84e58cf 100644 --- a/vendor/github.com/quic-go/quic-go/internal/handshake/token_generator.go +++ b/vendor/github.com/quic-go/quic-go/internal/handshake/token_generator.go @@ -46,7 +46,7 @@ type TokenGenerator struct { // NewTokenGenerator initializes a new TokenGenerator func NewTokenGenerator(key TokenProtectorKey) *TokenGenerator { - return &TokenGenerator{tokenProtector: newTokenProtector(key)} + return &TokenGenerator{tokenProtector: *newTokenProtector(key)} } // NewRetryToken generates a new token for a Retry for a given source address diff --git a/vendor/github.com/quic-go/quic-go/internal/handshake/token_protector.go b/vendor/github.com/quic-go/quic-go/internal/handshake/token_protector.go index f3a99e4..1577918 100644 --- a/vendor/github.com/quic-go/quic-go/internal/handshake/token_protector.go +++ b/vendor/github.com/quic-go/quic-go/internal/handshake/token_protector.go @@ -14,28 +14,20 @@ import ( // TokenProtectorKey is the key used to encrypt both Retry and session resumption tokens. type TokenProtectorKey [32]byte -// TokenProtector is used to create and verify a token -type tokenProtector interface { - // NewToken creates a new token - NewToken([]byte) ([]byte, error) - // DecodeToken decodes a token - DecodeToken([]byte) ([]byte, error) -} - const tokenNonceSize = 32 // tokenProtector is used to create and verify a token -type tokenProtectorImpl struct { +type tokenProtector struct { key TokenProtectorKey } // newTokenProtector creates a source for source address tokens -func newTokenProtector(key TokenProtectorKey) tokenProtector { - return &tokenProtectorImpl{key: key} +func newTokenProtector(key TokenProtectorKey) *tokenProtector { + return &tokenProtector{key: key} } // NewToken encodes data into a new token. -func (s *tokenProtectorImpl) NewToken(data []byte) ([]byte, error) { +func (s *tokenProtector) NewToken(data []byte) ([]byte, error) { var nonce [tokenNonceSize]byte if _, err := rand.Read(nonce[:]); err != nil { return nil, err @@ -48,7 +40,7 @@ func (s *tokenProtectorImpl) NewToken(data []byte) ([]byte, error) { } // DecodeToken decodes a token. -func (s *tokenProtectorImpl) DecodeToken(p []byte) ([]byte, error) { +func (s *tokenProtector) DecodeToken(p []byte) ([]byte, error) { if len(p) < tokenNonceSize { return nil, fmt.Errorf("token too short: %d", len(p)) } @@ -60,7 +52,7 @@ func (s *tokenProtectorImpl) DecodeToken(p []byte) ([]byte, error) { return aead.Open(nil, aeadNonce, p[tokenNonceSize:], nil) } -func (s *tokenProtectorImpl) createAEAD(nonce []byte) (cipher.AEAD, []byte, error) { +func (s *tokenProtector) createAEAD(nonce []byte) (cipher.AEAD, []byte, error) { h := hkdf.New(sha256.New, s.key[:], nonce, []byte("quic-go token source")) key := make([]byte, 32) // use a 32 byte key, in order to select AES-256 if _, err := io.ReadFull(h, key); err != nil { diff --git a/vendor/github.com/quic-go/quic-go/internal/logutils/frame.go b/vendor/github.com/quic-go/quic-go/internal/logutils/frame.go deleted file mode 100644 index a6032fc..0000000 --- a/vendor/github.com/quic-go/quic-go/internal/logutils/frame.go +++ /dev/null @@ -1,50 +0,0 @@ -package logutils - -import ( - "github.com/quic-go/quic-go/internal/protocol" - "github.com/quic-go/quic-go/internal/wire" - "github.com/quic-go/quic-go/logging" -) - -// ConvertFrame converts a wire.Frame into a logging.Frame. -// This makes it possible for external packages to access the frames. -// Furthermore, it removes the data slices from CRYPTO and STREAM frames. -func ConvertFrame(frame wire.Frame) logging.Frame { - switch f := frame.(type) { - case *wire.AckFrame: - // We use a pool for ACK frames. - // Implementations of the tracer interface may hold on to frames, so we need to make a copy here. - return ConvertAckFrame(f) - case *wire.CryptoFrame: - return &logging.CryptoFrame{ - Offset: f.Offset, - Length: protocol.ByteCount(len(f.Data)), - } - case *wire.StreamFrame: - return &logging.StreamFrame{ - StreamID: f.StreamID, - Offset: f.Offset, - Length: f.DataLen(), - Fin: f.Fin, - } - case *wire.DatagramFrame: - return &logging.DatagramFrame{ - Length: logging.ByteCount(len(f.Data)), - } - default: - return logging.Frame(frame) - } -} - -func ConvertAckFrame(f *wire.AckFrame) *logging.AckFrame { - ranges := make([]wire.AckRange, 0, len(f.AckRanges)) - ranges = append(ranges, f.AckRanges...) - ack := &logging.AckFrame{ - AckRanges: ranges, - DelayTime: f.DelayTime, - ECNCE: f.ECNCE, - ECT0: f.ECT0, - ECT1: f.ECT1, - } - return ack -} diff --git a/vendor/github.com/quic-go/quic-go/internal/protocol/packet_number.go b/vendor/github.com/quic-go/quic-go/internal/protocol/packet_number.go index bd34016..9422db9 100644 --- a/vendor/github.com/quic-go/quic-go/internal/protocol/packet_number.go +++ b/vendor/github.com/quic-go/quic-go/internal/protocol/packet_number.go @@ -21,58 +21,36 @@ const ( PacketNumberLen4 PacketNumberLen = 4 ) -// DecodePacketNumber calculates the packet number based on the received packet number, its length and the last seen packet number -func DecodePacketNumber( - packetNumberLength PacketNumberLen, - lastPacketNumber PacketNumber, - wirePacketNumber PacketNumber, -) PacketNumber { - var epochDelta PacketNumber - switch packetNumberLength { - case PacketNumberLen1: - epochDelta = PacketNumber(1) << 8 - case PacketNumberLen2: - epochDelta = PacketNumber(1) << 16 - case PacketNumberLen3: - epochDelta = PacketNumber(1) << 24 - case PacketNumberLen4: - epochDelta = PacketNumber(1) << 32 +// DecodePacketNumber calculates the packet number based its length and the last seen packet number +// This function is taken from https://www.rfc-editor.org/rfc/rfc9000.html#section-a.3. +func DecodePacketNumber(length PacketNumberLen, largest PacketNumber, truncated PacketNumber) PacketNumber { + expected := largest + 1 + win := PacketNumber(1 << (length * 8)) + hwin := win / 2 + mask := win - 1 + candidate := (expected & ^mask) | truncated + if candidate <= expected-hwin && candidate < 1<<62-win { + return candidate + win } - epoch := lastPacketNumber & ^(epochDelta - 1) - var prevEpochBegin PacketNumber - if epoch > epochDelta { - prevEpochBegin = epoch - epochDelta + if candidate > expected+hwin && candidate >= win { + return candidate - win } - nextEpochBegin := epoch + epochDelta - return closestTo( - lastPacketNumber+1, - epoch+wirePacketNumber, - closestTo(lastPacketNumber+1, prevEpochBegin+wirePacketNumber, nextEpochBegin+wirePacketNumber), - ) + return candidate } -func closestTo(target, a, b PacketNumber) PacketNumber { - if delta(target, a) < delta(target, b) { - return a - } - return b -} - -func delta(a, b PacketNumber) PacketNumber { - if a < b { - return b - a - } - return a - b -} - -// GetPacketNumberLengthForHeader gets the length of the packet number for the public header +// PacketNumberLengthForHeader gets the length of the packet number for the public header // it never chooses a PacketNumberLen of 1 byte, since this is too short under certain circumstances -func GetPacketNumberLengthForHeader(packetNumber, leastUnacked PacketNumber) PacketNumberLen { - diff := uint64(packetNumber - leastUnacked) - if diff < (1 << (16 - 1)) { +func PacketNumberLengthForHeader(pn, largestAcked PacketNumber) PacketNumberLen { + var numUnacked PacketNumber + if largestAcked == InvalidPacketNumber { + numUnacked = pn + 1 + } else { + numUnacked = pn - largestAcked + } + if numUnacked < 1<<(16-1) { return PacketNumberLen2 } - if diff < (1 << (24 - 1)) { + if numUnacked < 1<<(24-1) { return PacketNumberLen3 } return PacketNumberLen4 diff --git a/vendor/github.com/quic-go/quic-go/internal/protocol/params.go b/vendor/github.com/quic-go/quic-go/internal/protocol/params.go index 487cbc0..7c4d8d4 100644 --- a/vendor/github.com/quic-go/quic-go/internal/protocol/params.go +++ b/vendor/github.com/quic-go/quic-go/internal/protocol/params.go @@ -3,16 +3,13 @@ package protocol import "time" // DesiredReceiveBufferSize is the kernel UDP receive buffer size that we'd like to use. -const DesiredReceiveBufferSize = (1 << 20) * 2 // 2 MB +const DesiredReceiveBufferSize = (1 << 20) * 7 // 7 MB // DesiredSendBufferSize is the kernel UDP send buffer size that we'd like to use. -const DesiredSendBufferSize = (1 << 20) * 2 // 2 MB +const DesiredSendBufferSize = (1 << 20) * 7 // 7 MB -// InitialPacketSizeIPv4 is the maximum packet size that we use for sending IPv4 packets. -const InitialPacketSizeIPv4 = 1252 - -// InitialPacketSizeIPv6 is the maximum packet size that we use for sending IPv6 packets. -const InitialPacketSizeIPv6 = 1232 +// InitialPacketSize is the initial (before Path MTU discovery) maximum packet size used. +const InitialPacketSize = 1280 // MaxCongestionWindowPackets is the maximum congestion window in packet. const MaxCongestionWindowPackets = 10000 diff --git a/vendor/github.com/quic-go/quic-go/internal/handshake/conn.go b/vendor/github.com/quic-go/quic-go/internal/qtls/conn.go similarity index 97% rename from vendor/github.com/quic-go/quic-go/internal/handshake/conn.go rename to vendor/github.com/quic-go/quic-go/internal/qtls/conn.go index 54af823..6660ac6 100644 --- a/vendor/github.com/quic-go/quic-go/internal/handshake/conn.go +++ b/vendor/github.com/quic-go/quic-go/internal/qtls/conn.go @@ -1,4 +1,4 @@ -package handshake +package qtls import ( "net" diff --git a/vendor/github.com/quic-go/quic-go/internal/qtls/qtls.go b/vendor/github.com/quic-go/quic-go/internal/qtls/qtls.go index ebcd9d4..cdfe82a 100644 --- a/vendor/github.com/quic-go/quic-go/internal/qtls/qtls.go +++ b/vendor/github.com/quic-go/quic-go/internal/qtls/qtls.go @@ -4,20 +4,23 @@ import ( "bytes" "crypto/tls" "fmt" + "net" "github.com/quic-go/quic-go/internal/protocol" ) -func SetupConfigForServer(qconf *tls.QUICConfig, _ bool, getData func() []byte, handleSessionTicket func([]byte, bool) bool) { - conf := qconf.TLSConfig - +func SetupConfigForServer( + conf *tls.Config, + localAddr, remoteAddr net.Addr, + getData func() []byte, + handleSessionTicket func([]byte, bool) bool, +) *tls.Config { // Workaround for https://github.com/golang/go/issues/60506. // This initializes the session tickets _before_ cloning the config. _, _ = conf.DecryptTicket(nil, tls.ConnectionState{}) conf = conf.Clone() conf.MinVersion = tls.VersionTLS13 - qconf.TLSConfig = conf // add callbacks to save transport parameters into the session ticket origWrapSession := conf.WrapSession @@ -58,6 +61,29 @@ func SetupConfigForServer(qconf *tls.QUICConfig, _ bool, getData func() []byte, return state, nil } + // The tls.Config contains two callbacks that pass in a tls.ClientHelloInfo. + // Since crypto/tls doesn't do it, we need to make sure to set the Conn field with a fake net.Conn + // that allows the caller to get the local and the remote address. + if conf.GetConfigForClient != nil { + gcfc := conf.GetConfigForClient + conf.GetConfigForClient = func(info *tls.ClientHelloInfo) (*tls.Config, error) { + info.Conn = &conn{localAddr: localAddr, remoteAddr: remoteAddr} + c, err := gcfc(info) + if c != nil { + // We're returning a tls.Config here, so we need to apply this recursively. + c = SetupConfigForServer(c, localAddr, remoteAddr, getData, handleSessionTicket) + } + return c, err + } + } + if conf.GetCertificate != nil { + gc := conf.GetCertificate + conf.GetCertificate = func(info *tls.ClientHelloInfo) (*tls.Certificate, error) { + info.Conn = &conn{localAddr: localAddr, remoteAddr: remoteAddr} + return gc(info) + } + } + return conf } func SetupConfigForClient( diff --git a/vendor/github.com/quic-go/quic-go/internal/utils/byteorder.go b/vendor/github.com/quic-go/quic-go/internal/utils/byteorder.go deleted file mode 100644 index a9b715e..0000000 --- a/vendor/github.com/quic-go/quic-go/internal/utils/byteorder.go +++ /dev/null @@ -1,21 +0,0 @@ -package utils - -import ( - "bytes" - "io" -) - -// A ByteOrder specifies how to convert byte sequences into 16-, 32-, or 64-bit unsigned integers. -type ByteOrder interface { - Uint32([]byte) uint32 - Uint24([]byte) uint32 - Uint16([]byte) uint16 - - ReadUint32(io.ByteReader) (uint32, error) - ReadUint24(io.ByteReader) (uint32, error) - ReadUint16(io.ByteReader) (uint16, error) - - WriteUint32(*bytes.Buffer, uint32) - WriteUint24(*bytes.Buffer, uint32) - WriteUint16(*bytes.Buffer, uint16) -} diff --git a/vendor/github.com/quic-go/quic-go/internal/utils/byteorder_big_endian.go b/vendor/github.com/quic-go/quic-go/internal/utils/byteorder_big_endian.go deleted file mode 100644 index 834a711..0000000 --- a/vendor/github.com/quic-go/quic-go/internal/utils/byteorder_big_endian.go +++ /dev/null @@ -1,103 +0,0 @@ -package utils - -import ( - "bytes" - "encoding/binary" - "io" -) - -// BigEndian is the big-endian implementation of ByteOrder. -var BigEndian ByteOrder = bigEndian{} - -type bigEndian struct{} - -var _ ByteOrder = &bigEndian{} - -// ReadUintN reads N bytes -func (bigEndian) ReadUintN(b io.ByteReader, length uint8) (uint64, error) { - var res uint64 - for i := uint8(0); i < length; i++ { - bt, err := b.ReadByte() - if err != nil { - return 0, err - } - res ^= uint64(bt) << ((length - 1 - i) * 8) - } - return res, nil -} - -// ReadUint32 reads a uint32 -func (bigEndian) ReadUint32(b io.ByteReader) (uint32, error) { - var b1, b2, b3, b4 uint8 - var err error - if b4, err = b.ReadByte(); err != nil { - return 0, err - } - if b3, err = b.ReadByte(); err != nil { - return 0, err - } - if b2, err = b.ReadByte(); err != nil { - return 0, err - } - if b1, err = b.ReadByte(); err != nil { - return 0, err - } - return uint32(b1) + uint32(b2)<<8 + uint32(b3)<<16 + uint32(b4)<<24, nil -} - -// ReadUint24 reads a uint24 -func (bigEndian) ReadUint24(b io.ByteReader) (uint32, error) { - var b1, b2, b3 uint8 - var err error - if b3, err = b.ReadByte(); err != nil { - return 0, err - } - if b2, err = b.ReadByte(); err != nil { - return 0, err - } - if b1, err = b.ReadByte(); err != nil { - return 0, err - } - return uint32(b1) + uint32(b2)<<8 + uint32(b3)<<16, nil -} - -// ReadUint16 reads a uint16 -func (bigEndian) ReadUint16(b io.ByteReader) (uint16, error) { - var b1, b2 uint8 - var err error - if b2, err = b.ReadByte(); err != nil { - return 0, err - } - if b1, err = b.ReadByte(); err != nil { - return 0, err - } - return uint16(b1) + uint16(b2)<<8, nil -} - -func (bigEndian) Uint32(b []byte) uint32 { - return binary.BigEndian.Uint32(b) -} - -func (bigEndian) Uint24(b []byte) uint32 { - _ = b[2] // bounds check hint to compiler; see golang.org/issue/14808 - return uint32(b[2]) | uint32(b[1])<<8 | uint32(b[0])<<16 -} - -func (bigEndian) Uint16(b []byte) uint16 { - return binary.BigEndian.Uint16(b) -} - -// WriteUint32 writes a uint32 -func (bigEndian) WriteUint32(b *bytes.Buffer, i uint32) { - b.Write([]byte{uint8(i >> 24), uint8(i >> 16), uint8(i >> 8), uint8(i)}) -} - -// WriteUint24 writes a uint24 -func (bigEndian) WriteUint24(b *bytes.Buffer, i uint32) { - b.Write([]byte{uint8(i >> 16), uint8(i >> 8), uint8(i)}) -} - -// WriteUint16 writes a uint16 -func (bigEndian) WriteUint16(b *bytes.Buffer, i uint16) { - b.Write([]byte{uint8(i >> 8), uint8(i)}) -} diff --git a/vendor/github.com/quic-go/quic-go/internal/utils/ip.go b/vendor/github.com/quic-go/quic-go/internal/utils/ip.go deleted file mode 100644 index 7ac7ffe..0000000 --- a/vendor/github.com/quic-go/quic-go/internal/utils/ip.go +++ /dev/null @@ -1,10 +0,0 @@ -package utils - -import "net" - -func IsIPv4(ip net.IP) bool { - // If ip is not an IPv4 address, To4 returns nil. - // Note that there might be some corner cases, where this is not correct. - // See https://stackoverflow.com/questions/22751035/golang-distinguish-ipv4-ipv6. - return ip.To4() != nil -} diff --git a/vendor/github.com/quic-go/quic-go/internal/utils/minmax.go b/vendor/github.com/quic-go/quic-go/internal/utils/minmax.go deleted file mode 100644 index 03a9c9a..0000000 --- a/vendor/github.com/quic-go/quic-go/internal/utils/minmax.go +++ /dev/null @@ -1,36 +0,0 @@ -package utils - -import ( - "math" - "time" -) - -// InfDuration is a duration of infinite length -const InfDuration = time.Duration(math.MaxInt64) - -// MinNonZeroDuration return the minimum duration that's not zero. -func MinNonZeroDuration(a, b time.Duration) time.Duration { - if a == 0 { - return b - } - if b == 0 { - return a - } - return min(a, b) -} - -// MinTime returns the earlier time -func MinTime(a, b time.Time) time.Time { - if a.After(b) { - return b - } - return a -} - -// MaxTime returns the later time -func MaxTime(a, b time.Time) time.Time { - if a.After(b) { - return a - } - return b -} diff --git a/vendor/github.com/quic-go/quic-go/internal/utils/rtt_stats.go b/vendor/github.com/quic-go/quic-go/internal/utils/rtt_stats.go index 463b954..dcfac67 100644 --- a/vendor/github.com/quic-go/quic-go/internal/utils/rtt_stats.go +++ b/vendor/github.com/quic-go/quic-go/internal/utils/rtt_stats.go @@ -27,11 +27,6 @@ type RTTStats struct { maxAckDelay time.Duration } -// NewRTTStats makes a properly initialized RTTStats object -func NewRTTStats() *RTTStats { - return &RTTStats{} -} - // MinRTT Returns the minRTT for the entire connection. // May return Zero if no valid updates have occurred. func (r *RTTStats) MinRTT() time.Duration { return r.minRTT } @@ -64,7 +59,7 @@ func (r *RTTStats) PTO(includeMaxAckDelay bool) time.Duration { // UpdateRTT updates the RTT based on a new sample. func (r *RTTStats) UpdateRTT(sendDelta, ackDelay time.Duration, now time.Time) { - if sendDelta == InfDuration || sendDelta <= 0 { + if sendDelta <= 0 { return } @@ -113,19 +108,3 @@ func (r *RTTStats) SetInitialRTT(t time.Duration) { r.smoothedRTT = t r.latestRTT = t } - -// OnConnectionMigration is called when connection migrates and rtt measurement needs to be reset. -func (r *RTTStats) OnConnectionMigration() { - r.latestRTT = 0 - r.minRTT = 0 - r.smoothedRTT = 0 - r.meanDeviation = 0 -} - -// ExpireSmoothedMetrics causes the smoothed_rtt to be increased to the latest_rtt if the latest_rtt -// is larger. The mean deviation is increased to the most recent deviation if -// it's larger. -func (r *RTTStats) ExpireSmoothedMetrics() { - r.meanDeviation = max(r.meanDeviation, (r.smoothedRTT - r.latestRTT).Abs()) - r.smoothedRTT = max(r.smoothedRTT, r.latestRTT) -} diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/ack_frame.go b/vendor/github.com/quic-go/quic-go/internal/wire/ack_frame.go index a0f3feb..8befef4 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/ack_frame.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/ack_frame.go @@ -1,13 +1,12 @@ package wire import ( - "bytes" "errors" + "math" "sort" "time" "github.com/quic-go/quic-go/internal/protocol" - "github.com/quic-go/quic-go/internal/utils" "github.com/quic-go/quic-go/quicvarint" ) @@ -22,91 +21,101 @@ type AckFrame struct { } // parseAckFrame reads an ACK frame -func parseAckFrame(frame *AckFrame, r *bytes.Reader, typ uint64, ackDelayExponent uint8, _ protocol.Version) error { +func parseAckFrame(frame *AckFrame, b []byte, typ uint64, ackDelayExponent uint8, _ protocol.Version) (int, error) { + startLen := len(b) ecn := typ == ackECNFrameType - la, err := quicvarint.Read(r) + la, l, err := quicvarint.Parse(b) if err != nil { - return err + return 0, replaceUnexpectedEOF(err) } + b = b[l:] largestAcked := protocol.PacketNumber(la) - delay, err := quicvarint.Read(r) + delay, l, err := quicvarint.Parse(b) if err != nil { - return err + return 0, replaceUnexpectedEOF(err) } + b = b[l:] delayTime := time.Duration(delay*1< largestAcked { - return errors.New("invalid first ACK range") + return 0, errors.New("invalid first ACK range") } smallest := largestAcked - ackBlock frame.AckRanges = append(frame.AckRanges, AckRange{Smallest: smallest, Largest: largestAcked}) // read all the other ACK ranges for i := uint64(0); i < numBlocks; i++ { - g, err := quicvarint.Read(r) + g, l, err := quicvarint.Parse(b) if err != nil { - return err + return 0, replaceUnexpectedEOF(err) } + b = b[l:] gap := protocol.PacketNumber(g) if smallest < gap+2 { - return errInvalidAckRanges + return 0, errInvalidAckRanges } largest := smallest - gap - 2 - ab, err := quicvarint.Read(r) + ab, l, err := quicvarint.Parse(b) if err != nil { - return err + return 0, replaceUnexpectedEOF(err) } + b = b[l:] ackBlock := protocol.PacketNumber(ab) if ackBlock > largest { - return errInvalidAckRanges + return 0, errInvalidAckRanges } smallest = largest - ackBlock frame.AckRanges = append(frame.AckRanges, AckRange{Smallest: smallest, Largest: largest}) } if !frame.validateAckRanges() { - return errInvalidAckRanges + return 0, errInvalidAckRanges } if ecn { - ect0, err := quicvarint.Read(r) + ect0, l, err := quicvarint.Parse(b) if err != nil { - return err + return 0, replaceUnexpectedEOF(err) } + b = b[l:] frame.ECT0 = ect0 - ect1, err := quicvarint.Read(r) + ect1, l, err := quicvarint.Parse(b) if err != nil { - return err + return 0, replaceUnexpectedEOF(err) } + b = b[l:] frame.ECT1 = ect1 - ecnce, err := quicvarint.Read(r) + ecnce, l, err := quicvarint.Parse(b) if err != nil { - return err + return 0, replaceUnexpectedEOF(err) } + b = b[l:] frame.ECNCE = ecnce } - return nil + return startLen - len(b), nil } // Append appends an ACK frame. @@ -163,7 +172,7 @@ func (f *AckFrame) Length(_ protocol.Version) protocol.ByteCount { length += quicvarint.Len(f.ECT1) length += quicvarint.Len(f.ECNCE) } - return length + return protocol.ByteCount(length) } // gets the number of ACK ranges that can be encoded @@ -174,7 +183,7 @@ func (f *AckFrame) numEncodableAckRanges() int { for i := 1; i < len(f.AckRanges); i++ { gap, len := f.encodeAckRange(i) rangeLen := quicvarint.Len(gap) + quicvarint.Len(len) - if length+rangeLen > protocol.MaxAckFrameSize { + if protocol.ByteCount(length+rangeLen) > protocol.MaxAckFrameSize { // Writing range i would exceed the MaxAckFrameSize. // So encode one range less than that. return i - 1 diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/connection_close_frame.go b/vendor/github.com/quic-go/quic-go/internal/wire/connection_close_frame.go index df36244..be11a1b 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/connection_close_frame.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/connection_close_frame.go @@ -1,7 +1,6 @@ package wire import ( - "bytes" "io" "github.com/quic-go/quic-go/internal/protocol" @@ -16,47 +15,45 @@ type ConnectionCloseFrame struct { ReasonPhrase string } -func parseConnectionCloseFrame(r *bytes.Reader, typ uint64, _ protocol.Version) (*ConnectionCloseFrame, error) { +func parseConnectionCloseFrame(b []byte, typ uint64, _ protocol.Version) (*ConnectionCloseFrame, int, error) { + startLen := len(b) f := &ConnectionCloseFrame{IsApplicationError: typ == applicationCloseFrameType} - ec, err := quicvarint.Read(r) + ec, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } + b = b[l:] f.ErrorCode = ec // read the Frame Type, if this is not an application error if !f.IsApplicationError { - ft, err := quicvarint.Read(r) + ft, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } + b = b[l:] f.FrameType = ft } var reasonPhraseLen uint64 - reasonPhraseLen, err = quicvarint.Read(r) + reasonPhraseLen, l, err = quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } - // shortcut to prevent the unnecessary allocation of dataLen bytes - // if the dataLen is larger than the remaining length of the packet - // reading the whole reason phrase would result in EOF when attempting to READ - if int(reasonPhraseLen) > r.Len() { - return nil, io.EOF + b = b[l:] + if int(reasonPhraseLen) > len(b) { + return nil, 0, io.EOF } reasonPhrase := make([]byte, reasonPhraseLen) - if _, err := io.ReadFull(r, reasonPhrase); err != nil { - // this should never happen, since we already checked the reasonPhraseLen earlier - return nil, err - } + copy(reasonPhrase, b) f.ReasonPhrase = string(reasonPhrase) - return f, nil + return f, startLen - len(b) + int(reasonPhraseLen), nil } // Length of a written frame func (f *ConnectionCloseFrame) Length(protocol.Version) protocol.ByteCount { - length := 1 + quicvarint.Len(f.ErrorCode) + quicvarint.Len(uint64(len(f.ReasonPhrase))) + protocol.ByteCount(len(f.ReasonPhrase)) + length := 1 + protocol.ByteCount(quicvarint.Len(f.ErrorCode)+quicvarint.Len(uint64(len(f.ReasonPhrase)))) + protocol.ByteCount(len(f.ReasonPhrase)) if !f.IsApplicationError { - length += quicvarint.Len(f.FrameType) // for the frame type + length += protocol.ByteCount(quicvarint.Len(f.FrameType)) // for the frame type } return length } diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/crypto_frame.go b/vendor/github.com/quic-go/quic-go/internal/wire/crypto_frame.go index d421463..0aa7fe7 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/crypto_frame.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/crypto_frame.go @@ -1,7 +1,6 @@ package wire import ( - "bytes" "io" "github.com/quic-go/quic-go/internal/protocol" @@ -14,28 +13,28 @@ type CryptoFrame struct { Data []byte } -func parseCryptoFrame(r *bytes.Reader, _ protocol.Version) (*CryptoFrame, error) { +func parseCryptoFrame(b []byte, _ protocol.Version) (*CryptoFrame, int, error) { + startLen := len(b) frame := &CryptoFrame{} - offset, err := quicvarint.Read(r) + offset, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } + b = b[l:] frame.Offset = protocol.ByteCount(offset) - dataLen, err := quicvarint.Read(r) + dataLen, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } - if dataLen > uint64(r.Len()) { - return nil, io.EOF + b = b[l:] + if dataLen > uint64(len(b)) { + return nil, 0, io.EOF } if dataLen != 0 { frame.Data = make([]byte, dataLen) - if _, err := io.ReadFull(r, frame.Data); err != nil { - // this should never happen, since we already checked the dataLen earlier - return nil, err - } + copy(frame.Data, b) } - return frame, nil + return frame, startLen - len(b) + int(dataLen), nil } func (f *CryptoFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { @@ -48,14 +47,14 @@ func (f *CryptoFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { // Length of a written frame func (f *CryptoFrame) Length(_ protocol.Version) protocol.ByteCount { - return 1 + quicvarint.Len(uint64(f.Offset)) + quicvarint.Len(uint64(len(f.Data))) + protocol.ByteCount(len(f.Data)) + return protocol.ByteCount(1 + quicvarint.Len(uint64(f.Offset)) + quicvarint.Len(uint64(len(f.Data))) + len(f.Data)) } // MaxDataLen returns the maximum data length func (f *CryptoFrame) MaxDataLen(maxSize protocol.ByteCount) protocol.ByteCount { // pretend that the data size will be 1 bytes // if it turns out that varint encoding the length will consume 2 bytes, we need to adjust the data length afterwards - headerLen := 1 + quicvarint.Len(uint64(f.Offset)) + 1 + headerLen := protocol.ByteCount(1 + quicvarint.Len(uint64(f.Offset)) + 1) if headerLen > maxSize { return 0 } diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/data_blocked_frame.go b/vendor/github.com/quic-go/quic-go/internal/wire/data_blocked_frame.go index 8fe2acb..c97d4c6 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/data_blocked_frame.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/data_blocked_frame.go @@ -1,8 +1,6 @@ package wire import ( - "bytes" - "github.com/quic-go/quic-go/internal/protocol" "github.com/quic-go/quic-go/quicvarint" ) @@ -12,12 +10,12 @@ type DataBlockedFrame struct { MaximumData protocol.ByteCount } -func parseDataBlockedFrame(r *bytes.Reader, _ protocol.Version) (*DataBlockedFrame, error) { - offset, err := quicvarint.Read(r) +func parseDataBlockedFrame(b []byte, _ protocol.Version) (*DataBlockedFrame, int, error) { + offset, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } - return &DataBlockedFrame{MaximumData: protocol.ByteCount(offset)}, nil + return &DataBlockedFrame{MaximumData: protocol.ByteCount(offset)}, l, nil } func (f *DataBlockedFrame) Append(b []byte, version protocol.Version) ([]byte, error) { @@ -27,5 +25,5 @@ func (f *DataBlockedFrame) Append(b []byte, version protocol.Version) ([]byte, e // Length of a written frame func (f *DataBlockedFrame) Length(version protocol.Version) protocol.ByteCount { - return 1 + quicvarint.Len(uint64(f.MaximumData)) + return 1 + protocol.ByteCount(quicvarint.Len(uint64(f.MaximumData))) } diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/datagram_frame.go b/vendor/github.com/quic-go/quic-go/internal/wire/datagram_frame.go index 8e406f1..071fda9 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/datagram_frame.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/datagram_frame.go @@ -1,7 +1,6 @@ package wire import ( - "bytes" "io" "github.com/quic-go/quic-go/internal/protocol" @@ -20,29 +19,29 @@ type DatagramFrame struct { Data []byte } -func parseDatagramFrame(r *bytes.Reader, typ uint64, _ protocol.Version) (*DatagramFrame, error) { +func parseDatagramFrame(b []byte, typ uint64, _ protocol.Version) (*DatagramFrame, int, error) { + startLen := len(b) f := &DatagramFrame{} f.DataLenPresent = typ&0x1 > 0 var length uint64 if f.DataLenPresent { var err error - len, err := quicvarint.Read(r) + var l int + length, l, err = quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } - if len > uint64(r.Len()) { - return nil, io.EOF + b = b[l:] + if length > uint64(len(b)) { + return nil, 0, io.EOF } - length = len } else { - length = uint64(r.Len()) + length = uint64(len(b)) } f.Data = make([]byte, length) - if _, err := io.ReadFull(r, f.Data); err != nil { - return nil, err - } - return f, nil + copy(f.Data, b) + return f, startLen - len(b) + int(length), nil } func (f *DatagramFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { @@ -80,7 +79,7 @@ func (f *DatagramFrame) MaxDataLen(maxSize protocol.ByteCount, version protocol. func (f *DatagramFrame) Length(_ protocol.Version) protocol.ByteCount { length := 1 + protocol.ByteCount(len(f.Data)) if f.DataLenPresent { - length += quicvarint.Len(uint64(len(f.Data))) + length += protocol.ByteCount(quicvarint.Len(uint64(len(f.Data)))) } return length } diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/extended_header.go b/vendor/github.com/quic-go/quic-go/internal/wire/extended_header.go index e04d91b..1c6ad99 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/extended_header.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/extended_header.go @@ -1,7 +1,6 @@ package wire import ( - "bytes" "encoding/binary" "errors" "fmt" @@ -32,66 +31,23 @@ type ExtendedHeader struct { parsedLen protocol.ByteCount } -func (h *ExtendedHeader) parse(b *bytes.Reader, v protocol.Version) (bool /* reserved bits valid */, error) { - startLen := b.Len() +func (h *ExtendedHeader) parse(data []byte) (bool /* reserved bits valid */, error) { // read the (now unencrypted) first byte - var err error - h.typeByte, err = b.ReadByte() - if err != nil { - return false, err - } - if _, err := b.Seek(int64(h.Header.ParsedLen())-1, io.SeekCurrent); err != nil { - return false, err - } - reservedBitsValid, err := h.parseLongHeader(b, v) - if err != nil { - return false, err + h.typeByte = data[0] + h.PacketNumberLen = protocol.PacketNumberLen(h.typeByte&0x3) + 1 + if protocol.ByteCount(len(data)) < h.Header.ParsedLen()+protocol.ByteCount(h.PacketNumberLen) { + return false, io.EOF } - h.parsedLen = protocol.ByteCount(startLen - b.Len()) - return reservedBitsValid, err -} -func (h *ExtendedHeader) parseLongHeader(b *bytes.Reader, _ protocol.Version) (bool /* reserved bits valid */, error) { - if err := h.readPacketNumber(b); err != nil { - return false, err - } - if h.typeByte&0xc != 0 { - return false, nil + pn, err := readPacketNumber(data[h.Header.ParsedLen():], h.PacketNumberLen) + if err != nil { + return true, nil } - return true, nil -} + h.PacketNumber = pn + reservedBitsValid := h.typeByte&0xc == 0 -func (h *ExtendedHeader) readPacketNumber(b *bytes.Reader) error { - h.PacketNumberLen = protocol.PacketNumberLen(h.typeByte&0x3) + 1 - switch h.PacketNumberLen { - case protocol.PacketNumberLen1: - n, err := b.ReadByte() - if err != nil { - return err - } - h.PacketNumber = protocol.PacketNumber(n) - case protocol.PacketNumberLen2: - n, err := utils.BigEndian.ReadUint16(b) - if err != nil { - return err - } - h.PacketNumber = protocol.PacketNumber(n) - case protocol.PacketNumberLen3: - n, err := utils.BigEndian.ReadUint24(b) - if err != nil { - return err - } - h.PacketNumber = protocol.PacketNumber(n) - case protocol.PacketNumberLen4: - n, err := utils.BigEndian.ReadUint32(b) - if err != nil { - return err - } - h.PacketNumber = protocol.PacketNumber(n) - default: - return fmt.Errorf("invalid packet number length: %d", h.PacketNumberLen) - } - return nil + h.parsedLen = h.Header.ParsedLen() + protocol.ByteCount(h.PacketNumberLen) + return reservedBitsValid, err } // Append appends the Header. @@ -165,7 +121,7 @@ func (h *ExtendedHeader) ParsedLen() protocol.ByteCount { func (h *ExtendedHeader) GetLength(_ protocol.Version) protocol.ByteCount { length := 1 /* type byte */ + 4 /* version */ + 1 /* dest conn ID len */ + protocol.ByteCount(h.DestConnectionID.Len()) + 1 /* src conn ID len */ + protocol.ByteCount(h.SrcConnectionID.Len()) + protocol.ByteCount(h.PacketNumberLen) + 2 /* length */ if h.Type == protocol.PacketTypeInitial { - length += quicvarint.Len(uint64(len(h.Token))) + protocol.ByteCount(len(h.Token)) + length += protocol.ByteCount(quicvarint.Len(uint64(len(h.Token))) + len(h.Token)) } return length } diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/frame_parser.go b/vendor/github.com/quic-go/quic-go/internal/wire/frame_parser.go index cf7d4ce..59d4144 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/frame_parser.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/frame_parser.go @@ -1,9 +1,9 @@ package wire import ( - "bytes" "errors" "fmt" + "io" "reflect" "github.com/quic-go/quic-go/internal/protocol" @@ -38,8 +38,6 @@ const ( // The FrameParser parses QUIC frames, one by one. type FrameParser struct { - r bytes.Reader // cached bytes.Reader, so we don't have to repeatedly allocate them - ackDelayExponent uint8 supportsDatagrams bool @@ -51,7 +49,6 @@ type FrameParser struct { // NewFrameParser creates a new frame parser. func NewFrameParser(supportsDatagrams bool) *FrameParser { return &FrameParser{ - r: *bytes.NewReader(nil), supportsDatagrams: supportsDatagrams, ackFrame: &AckFrame{}, } @@ -60,45 +57,46 @@ func NewFrameParser(supportsDatagrams bool) *FrameParser { // ParseNext parses the next frame. // It skips PADDING frames. func (p *FrameParser) ParseNext(data []byte, encLevel protocol.EncryptionLevel, v protocol.Version) (int, Frame, error) { - startLen := len(data) - p.r.Reset(data) - frame, err := p.parseNext(&p.r, encLevel, v) - n := startLen - p.r.Len() - p.r.Reset(nil) - return n, frame, err + frame, l, err := p.parseNext(data, encLevel, v) + return l, frame, err } -func (p *FrameParser) parseNext(r *bytes.Reader, encLevel protocol.EncryptionLevel, v protocol.Version) (Frame, error) { - for r.Len() != 0 { - typ, err := quicvarint.Read(r) +func (p *FrameParser) parseNext(b []byte, encLevel protocol.EncryptionLevel, v protocol.Version) (Frame, int, error) { + var parsed int + for len(b) != 0 { + typ, l, err := quicvarint.Parse(b) + parsed += l if err != nil { - return nil, &qerr.TransportError{ + return nil, parsed, &qerr.TransportError{ ErrorCode: qerr.FrameEncodingError, ErrorMessage: err.Error(), } } + b = b[l:] if typ == 0x0 { // skip PADDING frames continue } - f, err := p.parseFrame(r, typ, encLevel, v) + f, l, err := p.parseFrame(b, typ, encLevel, v) + parsed += l if err != nil { - return nil, &qerr.TransportError{ + return nil, parsed, &qerr.TransportError{ FrameType: typ, ErrorCode: qerr.FrameEncodingError, ErrorMessage: err.Error(), } } - return f, nil + return f, parsed, nil } - return nil, nil + return nil, parsed, nil } -func (p *FrameParser) parseFrame(r *bytes.Reader, typ uint64, encLevel protocol.EncryptionLevel, v protocol.Version) (Frame, error) { +func (p *FrameParser) parseFrame(b []byte, typ uint64, encLevel protocol.EncryptionLevel, v protocol.Version) (Frame, int, error) { var frame Frame var err error + var l int if typ&0xf8 == 0x8 { - frame, err = parseStreamFrame(r, typ, v) + frame, l, err = parseStreamFrame(b, typ, v) } else { switch typ { case pingFrameType: @@ -109,43 +107,43 @@ func (p *FrameParser) parseFrame(r *bytes.Reader, typ uint64, encLevel protocol. ackDelayExponent = protocol.DefaultAckDelayExponent } p.ackFrame.Reset() - err = parseAckFrame(p.ackFrame, r, typ, ackDelayExponent, v) + l, err = parseAckFrame(p.ackFrame, b, typ, ackDelayExponent, v) frame = p.ackFrame case resetStreamFrameType: - frame, err = parseResetStreamFrame(r, v) + frame, l, err = parseResetStreamFrame(b, v) case stopSendingFrameType: - frame, err = parseStopSendingFrame(r, v) + frame, l, err = parseStopSendingFrame(b, v) case cryptoFrameType: - frame, err = parseCryptoFrame(r, v) + frame, l, err = parseCryptoFrame(b, v) case newTokenFrameType: - frame, err = parseNewTokenFrame(r, v) + frame, l, err = parseNewTokenFrame(b, v) case maxDataFrameType: - frame, err = parseMaxDataFrame(r, v) + frame, l, err = parseMaxDataFrame(b, v) case maxStreamDataFrameType: - frame, err = parseMaxStreamDataFrame(r, v) + frame, l, err = parseMaxStreamDataFrame(b, v) case bidiMaxStreamsFrameType, uniMaxStreamsFrameType: - frame, err = parseMaxStreamsFrame(r, typ, v) + frame, l, err = parseMaxStreamsFrame(b, typ, v) case dataBlockedFrameType: - frame, err = parseDataBlockedFrame(r, v) + frame, l, err = parseDataBlockedFrame(b, v) case streamDataBlockedFrameType: - frame, err = parseStreamDataBlockedFrame(r, v) + frame, l, err = parseStreamDataBlockedFrame(b, v) case bidiStreamBlockedFrameType, uniStreamBlockedFrameType: - frame, err = parseStreamsBlockedFrame(r, typ, v) + frame, l, err = parseStreamsBlockedFrame(b, typ, v) case newConnectionIDFrameType: - frame, err = parseNewConnectionIDFrame(r, v) + frame, l, err = parseNewConnectionIDFrame(b, v) case retireConnectionIDFrameType: - frame, err = parseRetireConnectionIDFrame(r, v) + frame, l, err = parseRetireConnectionIDFrame(b, v) case pathChallengeFrameType: - frame, err = parsePathChallengeFrame(r, v) + frame, l, err = parsePathChallengeFrame(b, v) case pathResponseFrameType: - frame, err = parsePathResponseFrame(r, v) + frame, l, err = parsePathResponseFrame(b, v) case connectionCloseFrameType, applicationCloseFrameType: - frame, err = parseConnectionCloseFrame(r, typ, v) + frame, l, err = parseConnectionCloseFrame(b, typ, v) case handshakeDoneFrameType: frame = &HandshakeDoneFrame{} case 0x30, 0x31: if p.supportsDatagrams { - frame, err = parseDatagramFrame(r, typ, v) + frame, l, err = parseDatagramFrame(b, typ, v) break } fallthrough @@ -154,12 +152,12 @@ func (p *FrameParser) parseFrame(r *bytes.Reader, typ uint64, encLevel protocol. } } if err != nil { - return nil, err + return nil, 0, err } if !p.isAllowedAtEncLevel(frame, encLevel) { - return nil, fmt.Errorf("%s not allowed at encryption level %s", reflect.TypeOf(frame).Elem().Name(), encLevel) + return nil, l, fmt.Errorf("%s not allowed at encryption level %s", reflect.TypeOf(frame).Elem().Name(), encLevel) } - return frame, nil + return frame, l, nil } func (p *FrameParser) isAllowedAtEncLevel(f Frame, encLevel protocol.EncryptionLevel) bool { @@ -190,3 +188,10 @@ func (p *FrameParser) isAllowedAtEncLevel(f Frame, encLevel protocol.EncryptionL func (p *FrameParser) SetAckDelayExponent(exp uint8) { p.ackDelayExponent = exp } + +func replaceUnexpectedEOF(e error) error { + if e == io.ErrUnexpectedEOF { + return io.EOF + } + return e +} diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/header.go b/vendor/github.com/quic-go/quic-go/internal/wire/header.go index 2991168..678a04a 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/header.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/header.go @@ -1,14 +1,12 @@ package wire import ( - "bytes" "encoding/binary" "errors" "fmt" "io" "github.com/quic-go/quic-go/internal/protocol" - "github.com/quic-go/quic-go/internal/utils" "github.com/quic-go/quic-go/quicvarint" ) @@ -41,37 +39,27 @@ func ParseConnectionID(data []byte, shortHeaderConnIDLen int) (protocol.Connecti // https://datatracker.ietf.org/doc/html/rfc8999#section-5.1. // This function should only be called on Long Header packets for which we don't support the version. func ParseArbitraryLenConnectionIDs(data []byte) (bytesParsed int, dest, src protocol.ArbitraryLenConnectionID, _ error) { - r := bytes.NewReader(data) - remaining := r.Len() - src, dest, err := parseArbitraryLenConnectionIDs(r) - return remaining - r.Len(), src, dest, err -} - -func parseArbitraryLenConnectionIDs(r *bytes.Reader) (dest, src protocol.ArbitraryLenConnectionID, _ error) { - r.Seek(5, io.SeekStart) // skip first byte and version field - destConnIDLen, err := r.ReadByte() - if err != nil { - return nil, nil, err + startLen := len(data) + if len(data) < 6 { + return 0, nil, nil, io.EOF } + data = data[5:] // skip first byte and version field + destConnIDLen := data[0] + data = data[1:] destConnID := make(protocol.ArbitraryLenConnectionID, destConnIDLen) - if _, err := io.ReadFull(r, destConnID); err != nil { - if err == io.ErrUnexpectedEOF { - err = io.EOF - } - return nil, nil, err + if len(data) < int(destConnIDLen)+1 { + return 0, nil, nil, io.EOF } - srcConnIDLen, err := r.ReadByte() - if err != nil { - return nil, nil, err + copy(destConnID, data) + data = data[destConnIDLen:] + srcConnIDLen := data[0] + data = data[1:] + if len(data) < int(srcConnIDLen) { + return 0, nil, nil, io.EOF } srcConnID := make(protocol.ArbitraryLenConnectionID, srcConnIDLen) - if _, err := io.ReadFull(r, srcConnID); err != nil { - if err == io.ErrUnexpectedEOF { - err = io.EOF - } - return nil, nil, err - } - return destConnID, srcConnID, nil + copy(srcConnID, data) + return startLen - len(data) + int(srcConnIDLen), destConnID, srcConnID, nil } func IsPotentialQUICPacket(firstByte byte) bool { @@ -139,18 +127,18 @@ type Header struct { parsedLen protocol.ByteCount // how many bytes were read while parsing this header } -// ParsePacket parses a packet. -// If the packet has a long header, the packet is cut according to the length field. -// If we understand the version, the packet is header up unto the packet number. +// ParsePacket parses a long header packet. +// The packet is cut according to the length field. +// If we understand the version, the packet is parsed up unto the packet number. // Otherwise, only the invariant part of the header is parsed. func ParsePacket(data []byte) (*Header, []byte, []byte, error) { if len(data) == 0 || !IsLongHeaderPacket(data[0]) { return nil, nil, nil, errors.New("not a long header packet") } - hdr, err := parseHeader(bytes.NewReader(data)) + hdr, err := parseHeader(data) if err != nil { - if err == ErrUnsupportedVersion { - return hdr, nil, nil, ErrUnsupportedVersion + if errors.Is(err, ErrUnsupportedVersion) { + return hdr, nil, nil, err } return nil, nil, nil, err } @@ -161,55 +149,55 @@ func ParsePacket(data []byte) (*Header, []byte, []byte, error) { return hdr, data[:packetLen], data[packetLen:], nil } -// ParseHeader parses the header. -// For short header packets: up to the packet number. -// For long header packets: +// ParseHeader parses the header: // * if we understand the version: up to the packet number // * if not, only the invariant part of the header -func parseHeader(b *bytes.Reader) (*Header, error) { - startLen := b.Len() - typeByte, err := b.ReadByte() - if err != nil { - return nil, err +func parseHeader(b []byte) (*Header, error) { + if len(b) == 0 { + return nil, io.EOF } + typeByte := b[0] h := &Header{typeByte: typeByte} - err = h.parseLongHeader(b) - h.parsedLen = protocol.ByteCount(startLen - b.Len()) + l, err := h.parseLongHeader(b[1:]) + h.parsedLen = protocol.ByteCount(l) + 1 return h, err } -func (h *Header) parseLongHeader(b *bytes.Reader) error { - v, err := utils.BigEndian.ReadUint32(b) - if err != nil { - return err +func (h *Header) parseLongHeader(b []byte) (int, error) { + startLen := len(b) + if len(b) < 5 { + return 0, io.EOF } - h.Version = protocol.Version(v) + h.Version = protocol.Version(binary.BigEndian.Uint32(b[:4])) if h.Version != 0 && h.typeByte&0x40 == 0 { - return errors.New("not a QUIC packet") + return startLen - len(b), errors.New("not a QUIC packet") } - destConnIDLen, err := b.ReadByte() - if err != nil { - return err + destConnIDLen := int(b[4]) + if destConnIDLen > protocol.MaxConnIDLen { + return startLen - len(b), protocol.ErrInvalidConnectionIDLen } - h.DestConnectionID, err = protocol.ReadConnectionID(b, int(destConnIDLen)) - if err != nil { - return err + b = b[5:] + if len(b) < destConnIDLen+1 { + return startLen - len(b), io.EOF } - srcConnIDLen, err := b.ReadByte() - if err != nil { - return err + h.DestConnectionID = protocol.ParseConnectionID(b[:destConnIDLen]) + srcConnIDLen := int(b[destConnIDLen]) + if srcConnIDLen > protocol.MaxConnIDLen { + return startLen - len(b), protocol.ErrInvalidConnectionIDLen } - h.SrcConnectionID, err = protocol.ReadConnectionID(b, int(srcConnIDLen)) - if err != nil { - return err + b = b[destConnIDLen+1:] + if len(b) < srcConnIDLen { + return startLen - len(b), io.EOF } + h.SrcConnectionID = protocol.ParseConnectionID(b[:srcConnIDLen]) + b = b[srcConnIDLen:] if h.Version == 0 { // version negotiation packet - return nil + return startLen - len(b), nil } // If we don't understand the version, we have no idea how to interpret the rest of the bytes if !protocol.IsSupportedVersion(protocol.SupportedVersions, h.Version) { - return ErrUnsupportedVersion + return startLen - len(b), ErrUnsupportedVersion } if h.Version == protocol.Version2 { @@ -237,38 +225,35 @@ func (h *Header) parseLongHeader(b *bytes.Reader) error { } if h.Type == protocol.PacketTypeRetry { - tokenLen := b.Len() - 16 + tokenLen := len(b) - 16 if tokenLen <= 0 { - return io.EOF + return startLen - len(b), io.EOF } h.Token = make([]byte, tokenLen) - if _, err := io.ReadFull(b, h.Token); err != nil { - return err - } - _, err := b.Seek(16, io.SeekCurrent) - return err + copy(h.Token, b[:tokenLen]) + return startLen - len(b) + tokenLen + 16, nil } if h.Type == protocol.PacketTypeInitial { - tokenLen, err := quicvarint.Read(b) + tokenLen, n, err := quicvarint.Parse(b) if err != nil { - return err + return startLen - len(b), err } - if tokenLen > uint64(b.Len()) { - return io.EOF + b = b[n:] + if tokenLen > uint64(len(b)) { + return startLen - len(b), io.EOF } h.Token = make([]byte, tokenLen) - if _, err := io.ReadFull(b, h.Token); err != nil { - return err - } + copy(h.Token, b[:tokenLen]) + b = b[tokenLen:] } - pl, err := quicvarint.Read(b) + pl, n, err := quicvarint.Parse(b) if err != nil { - return err + return 0, err } h.Length = protocol.ByteCount(pl) - return nil + return startLen - len(b) + n, nil } // ParsedLen returns the number of bytes that were consumed when parsing the header @@ -278,9 +263,9 @@ func (h *Header) ParsedLen() protocol.ByteCount { // ParseExtended parses the version dependent part of the header. // The Reader has to be set such that it points to the first byte of the header. -func (h *Header) ParseExtended(b *bytes.Reader, ver protocol.Version) (*ExtendedHeader, error) { +func (h *Header) ParseExtended(data []byte) (*ExtendedHeader, error) { extHdr := h.toExtendedHeader() - reservedBitsValid, err := extHdr.parse(b, ver) + reservedBitsValid, err := extHdr.parse(data) if err != nil { return nil, err } @@ -298,3 +283,20 @@ func (h *Header) toExtendedHeader() *ExtendedHeader { func (h *Header) PacketType() string { return h.Type.String() } + +func readPacketNumber(data []byte, pnLen protocol.PacketNumberLen) (protocol.PacketNumber, error) { + var pn protocol.PacketNumber + switch pnLen { + case protocol.PacketNumberLen1: + pn = protocol.PacketNumber(data[0]) + case protocol.PacketNumberLen2: + pn = protocol.PacketNumber(binary.BigEndian.Uint16(data[:2])) + case protocol.PacketNumberLen3: + pn = protocol.PacketNumber(uint32(data[2]) + uint32(data[1])<<8 + uint32(data[0])<<16) + case protocol.PacketNumberLen4: + pn = protocol.PacketNumber(binary.BigEndian.Uint32(data[:4])) + default: + return 0, fmt.Errorf("invalid packet number length: %d", pnLen) + } + return pn, nil +} diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/max_data_frame.go b/vendor/github.com/quic-go/quic-go/internal/wire/max_data_frame.go index 3dfd761..5819c02 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/max_data_frame.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/max_data_frame.go @@ -1,8 +1,6 @@ package wire import ( - "bytes" - "github.com/quic-go/quic-go/internal/protocol" "github.com/quic-go/quic-go/quicvarint" ) @@ -13,14 +11,14 @@ type MaxDataFrame struct { } // parseMaxDataFrame parses a MAX_DATA frame -func parseMaxDataFrame(r *bytes.Reader, _ protocol.Version) (*MaxDataFrame, error) { +func parseMaxDataFrame(b []byte, _ protocol.Version) (*MaxDataFrame, int, error) { frame := &MaxDataFrame{} - byteOffset, err := quicvarint.Read(r) + byteOffset, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } frame.MaximumData = protocol.ByteCount(byteOffset) - return frame, nil + return frame, l, nil } func (f *MaxDataFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { @@ -31,5 +29,5 @@ func (f *MaxDataFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { // Length of a written frame func (f *MaxDataFrame) Length(_ protocol.Version) protocol.ByteCount { - return 1 + quicvarint.Len(uint64(f.MaximumData)) + return 1 + protocol.ByteCount(quicvarint.Len(uint64(f.MaximumData))) } diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/max_stream_data_frame.go b/vendor/github.com/quic-go/quic-go/internal/wire/max_stream_data_frame.go index cb5eab1..db9091a 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/max_stream_data_frame.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/max_stream_data_frame.go @@ -1,8 +1,6 @@ package wire import ( - "bytes" - "github.com/quic-go/quic-go/internal/protocol" "github.com/quic-go/quic-go/quicvarint" ) @@ -13,23 +11,26 @@ type MaxStreamDataFrame struct { MaximumStreamData protocol.ByteCount } -func parseMaxStreamDataFrame(r *bytes.Reader, _ protocol.Version) (*MaxStreamDataFrame, error) { - sid, err := quicvarint.Read(r) +func parseMaxStreamDataFrame(b []byte, _ protocol.Version) (*MaxStreamDataFrame, int, error) { + startLen := len(b) + sid, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } - offset, err := quicvarint.Read(r) + b = b[l:] + offset, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } + b = b[l:] return &MaxStreamDataFrame{ StreamID: protocol.StreamID(sid), MaximumStreamData: protocol.ByteCount(offset), - }, nil + }, startLen - len(b), nil } -func (f *MaxStreamDataFrame) Append(b []byte, version protocol.Version) ([]byte, error) { +func (f *MaxStreamDataFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { b = append(b, maxStreamDataFrameType) b = quicvarint.Append(b, uint64(f.StreamID)) b = quicvarint.Append(b, uint64(f.MaximumStreamData)) @@ -37,6 +38,6 @@ func (f *MaxStreamDataFrame) Append(b []byte, version protocol.Version) ([]byte, } // Length of a written frame -func (f *MaxStreamDataFrame) Length(version protocol.Version) protocol.ByteCount { - return 1 + quicvarint.Len(uint64(f.StreamID)) + quicvarint.Len(uint64(f.MaximumStreamData)) +func (f *MaxStreamDataFrame) Length(protocol.Version) protocol.ByteCount { + return 1 + protocol.ByteCount(quicvarint.Len(uint64(f.StreamID))+quicvarint.Len(uint64(f.MaximumStreamData))) } diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/max_streams_frame.go b/vendor/github.com/quic-go/quic-go/internal/wire/max_streams_frame.go index d902933..a8745bd 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/max_streams_frame.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/max_streams_frame.go @@ -1,7 +1,6 @@ package wire import ( - "bytes" "fmt" "github.com/quic-go/quic-go/internal/protocol" @@ -14,7 +13,7 @@ type MaxStreamsFrame struct { MaxStreamNum protocol.StreamNum } -func parseMaxStreamsFrame(r *bytes.Reader, typ uint64, _ protocol.Version) (*MaxStreamsFrame, error) { +func parseMaxStreamsFrame(b []byte, typ uint64, _ protocol.Version) (*MaxStreamsFrame, int, error) { f := &MaxStreamsFrame{} switch typ { case bidiMaxStreamsFrameType: @@ -22,15 +21,15 @@ func parseMaxStreamsFrame(r *bytes.Reader, typ uint64, _ protocol.Version) (*Max case uniMaxStreamsFrameType: f.Type = protocol.StreamTypeUni } - streamID, err := quicvarint.Read(r) + streamID, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } f.MaxStreamNum = protocol.StreamNum(streamID) if f.MaxStreamNum > protocol.MaxStreamCount { - return nil, fmt.Errorf("%d exceeds the maximum stream count", f.MaxStreamNum) + return nil, 0, fmt.Errorf("%d exceeds the maximum stream count", f.MaxStreamNum) } - return f, nil + return f, l, nil } func (f *MaxStreamsFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { @@ -46,5 +45,5 @@ func (f *MaxStreamsFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { // Length of a written frame func (f *MaxStreamsFrame) Length(protocol.Version) protocol.ByteCount { - return 1 + quicvarint.Len(uint64(f.MaxStreamNum)) + return 1 + protocol.ByteCount(quicvarint.Len(uint64(f.MaxStreamNum))) } diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/new_connection_id_frame.go b/vendor/github.com/quic-go/quic-go/internal/wire/new_connection_id_frame.go index afae010..852d46e 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/new_connection_id_frame.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/new_connection_id_frame.go @@ -1,7 +1,6 @@ package wire import ( - "bytes" "errors" "fmt" "io" @@ -18,43 +17,47 @@ type NewConnectionIDFrame struct { StatelessResetToken protocol.StatelessResetToken } -func parseNewConnectionIDFrame(r *bytes.Reader, _ protocol.Version) (*NewConnectionIDFrame, error) { - seq, err := quicvarint.Read(r) +func parseNewConnectionIDFrame(b []byte, _ protocol.Version) (*NewConnectionIDFrame, int, error) { + startLen := len(b) + seq, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } - ret, err := quicvarint.Read(r) + b = b[l:] + ret, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } + b = b[l:] if ret > seq { //nolint:stylecheck - return nil, fmt.Errorf("Retire Prior To value (%d) larger than Sequence Number (%d)", ret, seq) + return nil, 0, fmt.Errorf("Retire Prior To value (%d) larger than Sequence Number (%d)", ret, seq) } - connIDLen, err := r.ReadByte() - if err != nil { - return nil, err + if len(b) == 0 { + return nil, 0, io.EOF } + connIDLen := int(b[0]) + b = b[1:] if connIDLen == 0 { - return nil, errors.New("invalid zero-length connection ID") + return nil, 0, errors.New("invalid zero-length connection ID") } - connID, err := protocol.ReadConnectionID(r, int(connIDLen)) - if err != nil { - return nil, err + if connIDLen > protocol.MaxConnIDLen { + return nil, 0, protocol.ErrInvalidConnectionIDLen + } + if len(b) < connIDLen { + return nil, 0, io.EOF } frame := &NewConnectionIDFrame{ SequenceNumber: seq, RetirePriorTo: ret, - ConnectionID: connID, + ConnectionID: protocol.ParseConnectionID(b[:connIDLen]), } - if _, err := io.ReadFull(r, frame.StatelessResetToken[:]); err != nil { - if err == io.ErrUnexpectedEOF { - return nil, io.EOF - } - return nil, err + b = b[connIDLen:] + if len(b) < len(frame.StatelessResetToken) { + return nil, 0, io.EOF } - - return frame, nil + copy(frame.StatelessResetToken[:], b) + return frame, startLen - len(b) + len(frame.StatelessResetToken), nil } func (f *NewConnectionIDFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { @@ -73,5 +76,5 @@ func (f *NewConnectionIDFrame) Append(b []byte, _ protocol.Version) ([]byte, err // Length of a written frame func (f *NewConnectionIDFrame) Length(protocol.Version) protocol.ByteCount { - return 1 + quicvarint.Len(f.SequenceNumber) + quicvarint.Len(f.RetirePriorTo) + 1 /* connection ID length */ + protocol.ByteCount(f.ConnectionID.Len()) + 16 + return 1 + protocol.ByteCount(quicvarint.Len(f.SequenceNumber)+quicvarint.Len(f.RetirePriorTo)+1 /* connection ID length */ +f.ConnectionID.Len()) + 16 } diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/new_token_frame.go b/vendor/github.com/quic-go/quic-go/internal/wire/new_token_frame.go index 6a2eac9..f1d4d00 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/new_token_frame.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/new_token_frame.go @@ -1,7 +1,6 @@ package wire import ( - "bytes" "errors" "io" @@ -14,22 +13,21 @@ type NewTokenFrame struct { Token []byte } -func parseNewTokenFrame(r *bytes.Reader, _ protocol.Version) (*NewTokenFrame, error) { - tokenLen, err := quicvarint.Read(r) +func parseNewTokenFrame(b []byte, _ protocol.Version) (*NewTokenFrame, int, error) { + tokenLen, l, err := quicvarint.Parse(b) if err != nil { - return nil, err - } - if uint64(r.Len()) < tokenLen { - return nil, io.EOF + return nil, 0, replaceUnexpectedEOF(err) } + b = b[l:] if tokenLen == 0 { - return nil, errors.New("token must not be empty") + return nil, 0, errors.New("token must not be empty") } - token := make([]byte, int(tokenLen)) - if _, err := io.ReadFull(r, token); err != nil { - return nil, err + if uint64(len(b)) < tokenLen { + return nil, 0, io.EOF } - return &NewTokenFrame{Token: token}, nil + token := make([]byte, int(tokenLen)) + copy(token, b) + return &NewTokenFrame{Token: token}, l + int(tokenLen), nil } func (f *NewTokenFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { @@ -41,5 +39,5 @@ func (f *NewTokenFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { // Length of a written frame func (f *NewTokenFrame) Length(protocol.Version) protocol.ByteCount { - return 1 + quicvarint.Len(uint64(len(f.Token))) + protocol.ByteCount(len(f.Token)) + return 1 + protocol.ByteCount(quicvarint.Len(uint64(len(f.Token)))+len(f.Token)) } diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/path_challenge_frame.go b/vendor/github.com/quic-go/quic-go/internal/wire/path_challenge_frame.go index 772041a..2aca989 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/path_challenge_frame.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/path_challenge_frame.go @@ -1,7 +1,6 @@ package wire import ( - "bytes" "io" "github.com/quic-go/quic-go/internal/protocol" @@ -12,15 +11,13 @@ type PathChallengeFrame struct { Data [8]byte } -func parsePathChallengeFrame(r *bytes.Reader, _ protocol.Version) (*PathChallengeFrame, error) { - frame := &PathChallengeFrame{} - if _, err := io.ReadFull(r, frame.Data[:]); err != nil { - if err == io.ErrUnexpectedEOF { - return nil, io.EOF - } - return nil, err +func parsePathChallengeFrame(b []byte, _ protocol.Version) (*PathChallengeFrame, int, error) { + f := &PathChallengeFrame{} + if len(b) < 8 { + return nil, 0, io.EOF } - return frame, nil + copy(f.Data[:], b) + return f, 8, nil } func (f *PathChallengeFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/path_response_frame.go b/vendor/github.com/quic-go/quic-go/internal/wire/path_response_frame.go index 86bbe61..76532c8 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/path_response_frame.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/path_response_frame.go @@ -1,7 +1,6 @@ package wire import ( - "bytes" "io" "github.com/quic-go/quic-go/internal/protocol" @@ -12,15 +11,13 @@ type PathResponseFrame struct { Data [8]byte } -func parsePathResponseFrame(r *bytes.Reader, _ protocol.Version) (*PathResponseFrame, error) { - frame := &PathResponseFrame{} - if _, err := io.ReadFull(r, frame.Data[:]); err != nil { - if err == io.ErrUnexpectedEOF { - return nil, io.EOF - } - return nil, err +func parsePathResponseFrame(b []byte, _ protocol.Version) (*PathResponseFrame, int, error) { + f := &PathResponseFrame{} + if len(b) < 8 { + return nil, 0, io.EOF } - return frame, nil + copy(f.Data[:], b) + return f, 8, nil } func (f *PathResponseFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/reset_stream_frame.go b/vendor/github.com/quic-go/quic-go/internal/wire/reset_stream_frame.go index e60f1db..a20029a 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/reset_stream_frame.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/reset_stream_frame.go @@ -1,8 +1,6 @@ package wire import ( - "bytes" - "github.com/quic-go/quic-go/internal/protocol" "github.com/quic-go/quic-go/internal/qerr" "github.com/quic-go/quic-go/quicvarint" @@ -15,21 +13,24 @@ type ResetStreamFrame struct { FinalSize protocol.ByteCount } -func parseResetStreamFrame(r *bytes.Reader, _ protocol.Version) (*ResetStreamFrame, error) { +func parseResetStreamFrame(b []byte, _ protocol.Version) (*ResetStreamFrame, int, error) { + startLen := len(b) var streamID protocol.StreamID var byteOffset protocol.ByteCount - sid, err := quicvarint.Read(r) + sid, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } + b = b[l:] streamID = protocol.StreamID(sid) - errorCode, err := quicvarint.Read(r) + errorCode, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } - bo, err := quicvarint.Read(r) + b = b[l:] + bo, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } byteOffset = protocol.ByteCount(bo) @@ -37,7 +38,7 @@ func parseResetStreamFrame(r *bytes.Reader, _ protocol.Version) (*ResetStreamFra StreamID: streamID, ErrorCode: qerr.StreamErrorCode(errorCode), FinalSize: byteOffset, - }, nil + }, startLen - len(b) + l, nil } func (f *ResetStreamFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { @@ -49,6 +50,6 @@ func (f *ResetStreamFrame) Append(b []byte, _ protocol.Version) ([]byte, error) } // Length of a written frame -func (f *ResetStreamFrame) Length(version protocol.Version) protocol.ByteCount { - return 1 + quicvarint.Len(uint64(f.StreamID)) + quicvarint.Len(uint64(f.ErrorCode)) + quicvarint.Len(uint64(f.FinalSize)) +func (f *ResetStreamFrame) Length(protocol.Version) protocol.ByteCount { + return 1 + protocol.ByteCount(quicvarint.Len(uint64(f.StreamID))+quicvarint.Len(uint64(f.ErrorCode))+quicvarint.Len(uint64(f.FinalSize))) } diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/retire_connection_id_frame.go b/vendor/github.com/quic-go/quic-go/internal/wire/retire_connection_id_frame.go index 9815362..27aeff8 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/retire_connection_id_frame.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/retire_connection_id_frame.go @@ -1,8 +1,6 @@ package wire import ( - "bytes" - "github.com/quic-go/quic-go/internal/protocol" "github.com/quic-go/quic-go/quicvarint" ) @@ -12,12 +10,12 @@ type RetireConnectionIDFrame struct { SequenceNumber uint64 } -func parseRetireConnectionIDFrame(r *bytes.Reader, _ protocol.Version) (*RetireConnectionIDFrame, error) { - seq, err := quicvarint.Read(r) +func parseRetireConnectionIDFrame(b []byte, _ protocol.Version) (*RetireConnectionIDFrame, int, error) { + seq, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } - return &RetireConnectionIDFrame{SequenceNumber: seq}, nil + return &RetireConnectionIDFrame{SequenceNumber: seq}, l, nil } func (f *RetireConnectionIDFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { @@ -28,5 +26,5 @@ func (f *RetireConnectionIDFrame) Append(b []byte, _ protocol.Version) ([]byte, // Length of a written frame func (f *RetireConnectionIDFrame) Length(protocol.Version) protocol.ByteCount { - return 1 + quicvarint.Len(f.SequenceNumber) + return 1 + protocol.ByteCount(quicvarint.Len(f.SequenceNumber)) } diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/short_header.go b/vendor/github.com/quic-go/quic-go/internal/wire/short_header.go index 69aa834..cf2889c 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/short_header.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/short_header.go @@ -2,7 +2,6 @@ package wire import ( "errors" - "fmt" "io" "github.com/quic-go/quic-go/internal/protocol" @@ -28,25 +27,15 @@ func ParseShortHeader(data []byte, connIDLen int) (length int, _ protocol.Packet } pos := 1 + connIDLen - var pn protocol.PacketNumber - switch pnLen { - case protocol.PacketNumberLen1: - pn = protocol.PacketNumber(data[pos]) - case protocol.PacketNumberLen2: - pn = protocol.PacketNumber(utils.BigEndian.Uint16(data[pos : pos+2])) - case protocol.PacketNumberLen3: - pn = protocol.PacketNumber(utils.BigEndian.Uint24(data[pos : pos+3])) - case protocol.PacketNumberLen4: - pn = protocol.PacketNumber(utils.BigEndian.Uint32(data[pos : pos+4])) - default: - return 0, 0, 0, 0, fmt.Errorf("invalid packet number length: %d", pnLen) + pn, err := readPacketNumber(data[pos:], pnLen) + if err != nil { + return 0, 0, 0, 0, err } kp := protocol.KeyPhaseZero if data[0]&0b100 > 0 { kp = protocol.KeyPhaseOne } - var err error if data[0]&0x18 != 0 { err = ErrInvalidReservedBits } diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/stop_sending_frame.go b/vendor/github.com/quic-go/quic-go/internal/wire/stop_sending_frame.go index d314a56..a2326f8 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/stop_sending_frame.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/stop_sending_frame.go @@ -1,8 +1,6 @@ package wire import ( - "bytes" - "github.com/quic-go/quic-go/internal/protocol" "github.com/quic-go/quic-go/internal/qerr" "github.com/quic-go/quic-go/quicvarint" @@ -15,25 +13,28 @@ type StopSendingFrame struct { } // parseStopSendingFrame parses a STOP_SENDING frame -func parseStopSendingFrame(r *bytes.Reader, _ protocol.Version) (*StopSendingFrame, error) { - streamID, err := quicvarint.Read(r) +func parseStopSendingFrame(b []byte, _ protocol.Version) (*StopSendingFrame, int, error) { + startLen := len(b) + streamID, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } - errorCode, err := quicvarint.Read(r) + b = b[l:] + errorCode, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } + b = b[l:] return &StopSendingFrame{ StreamID: protocol.StreamID(streamID), ErrorCode: qerr.StreamErrorCode(errorCode), - }, nil + }, startLen - len(b), nil } // Length of a written frame func (f *StopSendingFrame) Length(_ protocol.Version) protocol.ByteCount { - return 1 + quicvarint.Len(uint64(f.StreamID)) + quicvarint.Len(uint64(f.ErrorCode)) + return 1 + protocol.ByteCount(quicvarint.Len(uint64(f.StreamID))+quicvarint.Len(uint64(f.ErrorCode))) } func (f *StopSendingFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/stream_data_blocked_frame.go b/vendor/github.com/quic-go/quic-go/internal/wire/stream_data_blocked_frame.go index f79740f..3762ec7 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/stream_data_blocked_frame.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/stream_data_blocked_frame.go @@ -1,8 +1,6 @@ package wire import ( - "bytes" - "github.com/quic-go/quic-go/internal/protocol" "github.com/quic-go/quic-go/quicvarint" ) @@ -13,20 +11,22 @@ type StreamDataBlockedFrame struct { MaximumStreamData protocol.ByteCount } -func parseStreamDataBlockedFrame(r *bytes.Reader, _ protocol.Version) (*StreamDataBlockedFrame, error) { - sid, err := quicvarint.Read(r) +func parseStreamDataBlockedFrame(b []byte, _ protocol.Version) (*StreamDataBlockedFrame, int, error) { + startLen := len(b) + sid, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } - offset, err := quicvarint.Read(r) + b = b[l:] + offset, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } return &StreamDataBlockedFrame{ StreamID: protocol.StreamID(sid), MaximumStreamData: protocol.ByteCount(offset), - }, nil + }, startLen - len(b) + l, nil } func (f *StreamDataBlockedFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { @@ -37,6 +37,6 @@ func (f *StreamDataBlockedFrame) Append(b []byte, _ protocol.Version) ([]byte, e } // Length of a written frame -func (f *StreamDataBlockedFrame) Length(version protocol.Version) protocol.ByteCount { - return 1 + quicvarint.Len(uint64(f.StreamID)) + quicvarint.Len(uint64(f.MaximumStreamData)) +func (f *StreamDataBlockedFrame) Length(protocol.Version) protocol.ByteCount { + return 1 + protocol.ByteCount(quicvarint.Len(uint64(f.StreamID))+quicvarint.Len(uint64(f.MaximumStreamData))) } diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/stream_frame.go b/vendor/github.com/quic-go/quic-go/internal/wire/stream_frame.go index 0f6c00d..f9470ec 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/stream_frame.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/stream_frame.go @@ -1,7 +1,6 @@ package wire import ( - "bytes" "errors" "io" @@ -20,33 +19,41 @@ type StreamFrame struct { fromPool bool } -func parseStreamFrame(r *bytes.Reader, typ uint64, _ protocol.Version) (*StreamFrame, error) { +func parseStreamFrame(b []byte, typ uint64, _ protocol.Version) (*StreamFrame, int, error) { + startLen := len(b) hasOffset := typ&0b100 > 0 fin := typ&0b1 > 0 hasDataLen := typ&0b10 > 0 - streamID, err := quicvarint.Read(r) + streamID, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } + b = b[l:] var offset uint64 if hasOffset { - offset, err = quicvarint.Read(r) + offset, l, err = quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } + b = b[l:] } var dataLen uint64 if hasDataLen { var err error - dataLen, err = quicvarint.Read(r) + var l int + dataLen, l, err = quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) + } + b = b[l:] + if dataLen > uint64(len(b)) { + return nil, 0, io.EOF } } else { // The rest of the packet is data - dataLen = uint64(r.Len()) + dataLen = uint64(len(b)) } var frame *StreamFrame @@ -57,7 +64,7 @@ func parseStreamFrame(r *bytes.Reader, typ uint64, _ protocol.Version) (*StreamF // The STREAM frame can't be larger than the StreamFrame we obtained from the buffer, // since those StreamFrames have a buffer length of the maximum packet size. if dataLen > uint64(cap(frame.Data)) { - return nil, io.EOF + return nil, 0, io.EOF } frame.Data = frame.Data[:dataLen] } @@ -68,17 +75,14 @@ func parseStreamFrame(r *bytes.Reader, typ uint64, _ protocol.Version) (*StreamF frame.DataLenPresent = hasDataLen if dataLen != 0 { - if _, err := io.ReadFull(r, frame.Data); err != nil { - return nil, err - } + copy(frame.Data, b) } if frame.Offset+frame.DataLen() > protocol.MaxByteCount { - return nil, errors.New("stream data overflows maximum offset") + return nil, 0, errors.New("stream data overflows maximum offset") } - return frame, nil + return frame, startLen - len(b) + int(dataLen), nil } -// Write writes a STREAM frame func (f *StreamFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { if len(f.Data) == 0 && !f.Fin { return nil, errors.New("StreamFrame: attempting to write empty frame without FIN") @@ -108,7 +112,7 @@ func (f *StreamFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { } // Length returns the total length of the STREAM frame -func (f *StreamFrame) Length(version protocol.Version) protocol.ByteCount { +func (f *StreamFrame) Length(protocol.Version) protocol.ByteCount { length := 1 + quicvarint.Len(uint64(f.StreamID)) if f.Offset != 0 { length += quicvarint.Len(uint64(f.Offset)) @@ -116,7 +120,7 @@ func (f *StreamFrame) Length(version protocol.Version) protocol.ByteCount { if f.DataLenPresent { length += quicvarint.Len(uint64(f.DataLen())) } - return length + f.DataLen() + return protocol.ByteCount(length) + f.DataLen() } // DataLen gives the length of data in bytes @@ -126,14 +130,14 @@ func (f *StreamFrame) DataLen() protocol.ByteCount { // MaxDataLen returns the maximum data length // If 0 is returned, writing will fail (a STREAM frame must contain at least 1 byte of data). -func (f *StreamFrame) MaxDataLen(maxSize protocol.ByteCount, version protocol.Version) protocol.ByteCount { - headerLen := 1 + quicvarint.Len(uint64(f.StreamID)) +func (f *StreamFrame) MaxDataLen(maxSize protocol.ByteCount, _ protocol.Version) protocol.ByteCount { + headerLen := 1 + protocol.ByteCount(quicvarint.Len(uint64(f.StreamID))) if f.Offset != 0 { - headerLen += quicvarint.Len(uint64(f.Offset)) + headerLen += protocol.ByteCount(quicvarint.Len(uint64(f.Offset))) } if f.DataLenPresent { - // pretend that the data size will be 1 bytes - // if it turns out that varint encoding the length will consume 2 bytes, we need to adjust the data length afterwards + // Pretend that the data size will be 1 byte. + // If it turns out that varint encoding the length will consume 2 bytes, we need to adjust the data length afterward headerLen++ } if headerLen > maxSize { diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/streams_blocked_frame.go b/vendor/github.com/quic-go/quic-go/internal/wire/streams_blocked_frame.go index b24619a..c946fec 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/streams_blocked_frame.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/streams_blocked_frame.go @@ -1,7 +1,6 @@ package wire import ( - "bytes" "fmt" "github.com/quic-go/quic-go/internal/protocol" @@ -14,7 +13,7 @@ type StreamsBlockedFrame struct { StreamLimit protocol.StreamNum } -func parseStreamsBlockedFrame(r *bytes.Reader, typ uint64, _ protocol.Version) (*StreamsBlockedFrame, error) { +func parseStreamsBlockedFrame(b []byte, typ uint64, _ protocol.Version) (*StreamsBlockedFrame, int, error) { f := &StreamsBlockedFrame{} switch typ { case bidiStreamBlockedFrameType: @@ -22,15 +21,15 @@ func parseStreamsBlockedFrame(r *bytes.Reader, typ uint64, _ protocol.Version) ( case uniStreamBlockedFrameType: f.Type = protocol.StreamTypeUni } - streamLimit, err := quicvarint.Read(r) + streamLimit, l, err := quicvarint.Parse(b) if err != nil { - return nil, err + return nil, 0, replaceUnexpectedEOF(err) } f.StreamLimit = protocol.StreamNum(streamLimit) if f.StreamLimit > protocol.MaxStreamCount { - return nil, fmt.Errorf("%d exceeds the maximum stream count", f.StreamLimit) + return nil, 0, fmt.Errorf("%d exceeds the maximum stream count", f.StreamLimit) } - return f, nil + return f, l, nil } func (f *StreamsBlockedFrame) Append(b []byte, _ protocol.Version) ([]byte, error) { @@ -46,5 +45,5 @@ func (f *StreamsBlockedFrame) Append(b []byte, _ protocol.Version) ([]byte, erro // Length of a written frame func (f *StreamsBlockedFrame) Length(_ protocol.Version) protocol.ByteCount { - return 1 + quicvarint.Len(uint64(f.StreamLimit)) + return 1 + protocol.ByteCount(quicvarint.Len(uint64(f.StreamLimit))) } diff --git a/vendor/github.com/quic-go/quic-go/internal/wire/transport_parameters.go b/vendor/github.com/quic-go/quic-go/internal/wire/transport_parameters.go index c03be3c..cee74b8 100644 --- a/vendor/github.com/quic-go/quic-go/internal/wire/transport_parameters.go +++ b/vendor/github.com/quic-go/quic-go/internal/wire/transport_parameters.go @@ -1,19 +1,17 @@ package wire import ( - "bytes" "crypto/rand" "encoding/binary" "errors" "fmt" "io" "net/netip" - "sort" + "slices" "time" "github.com/quic-go/quic-go/internal/protocol" "github.com/quic-go/quic-go/internal/qerr" - "github.com/quic-go/quic-go/internal/utils" "github.com/quic-go/quic-go/quicvarint" ) @@ -89,7 +87,7 @@ type TransportParameters struct { // Unmarshal the transport parameters func (p *TransportParameters) Unmarshal(data []byte, sentBy protocol.Perspective) error { - if err := p.unmarshal(bytes.NewReader(data), sentBy, false); err != nil { + if err := p.unmarshal(data, sentBy, false); err != nil { return &qerr.TransportError{ ErrorCode: qerr.TransportParameterError, ErrorMessage: err.Error(), @@ -98,9 +96,9 @@ func (p *TransportParameters) Unmarshal(data []byte, sentBy protocol.Perspective return nil } -func (p *TransportParameters) unmarshal(r *bytes.Reader, sentBy protocol.Perspective, fromSessionTicket bool) error { +func (p *TransportParameters) unmarshal(b []byte, sentBy protocol.Perspective, fromSessionTicket bool) error { // needed to check that every parameter is only sent at most once - var parameterIDs []transportParameterID + parameterIDs := make([]transportParameterID, 0, 32) var ( readOriginalDestinationConnectionID bool @@ -112,18 +110,20 @@ func (p *TransportParameters) unmarshal(r *bytes.Reader, sentBy protocol.Perspec p.MaxAckDelay = protocol.DefaultMaxAckDelay p.MaxDatagramFrameSize = protocol.InvalidByteCount - for r.Len() > 0 { - paramIDInt, err := quicvarint.Read(r) + for len(b) > 0 { + paramIDInt, l, err := quicvarint.Parse(b) if err != nil { return err } paramID := transportParameterID(paramIDInt) - paramLen, err := quicvarint.Read(r) + b = b[l:] + paramLen, l, err := quicvarint.Parse(b) if err != nil { return err } - if uint64(r.Len()) < paramLen { - return fmt.Errorf("remaining length (%d) smaller than parameter length (%d)", r.Len(), paramLen) + b = b[l:] + if uint64(len(b)) < paramLen { + return fmt.Errorf("remaining length (%d) smaller than parameter length (%d)", len(b), paramLen) } parameterIDs = append(parameterIDs, paramID) switch paramID { @@ -141,16 +141,18 @@ func (p *TransportParameters) unmarshal(r *bytes.Reader, sentBy protocol.Perspec maxAckDelayParameterID, maxDatagramFrameSizeParameterID, ackDelayExponentParameterID: - if err := p.readNumericTransportParameter(r, paramID, int(paramLen)); err != nil { + if err := p.readNumericTransportParameter(b, paramID, int(paramLen)); err != nil { return err } + b = b[paramLen:] case preferredAddressParameterID: if sentBy == protocol.PerspectiveClient { return errors.New("client sent a preferred_address") } - if err := p.readPreferredAddress(r, int(paramLen)); err != nil { + if err := p.readPreferredAddress(b, int(paramLen)); err != nil { return err } + b = b[paramLen:] case disableActiveMigrationParameterID: if paramLen != 0 { return fmt.Errorf("wrong length for disable_active_migration: %d (expected empty)", paramLen) @@ -164,25 +166,41 @@ func (p *TransportParameters) unmarshal(r *bytes.Reader, sentBy protocol.Perspec return fmt.Errorf("wrong length for stateless_reset_token: %d (expected 16)", paramLen) } var token protocol.StatelessResetToken - r.Read(token[:]) + if len(b) < len(token) { + return io.EOF + } + copy(token[:], b) + b = b[len(token):] p.StatelessResetToken = &token case originalDestinationConnectionIDParameterID: if sentBy == protocol.PerspectiveClient { return errors.New("client sent an original_destination_connection_id") } - p.OriginalDestinationConnectionID, _ = protocol.ReadConnectionID(r, int(paramLen)) + if paramLen > protocol.MaxConnIDLen { + return protocol.ErrInvalidConnectionIDLen + } + p.OriginalDestinationConnectionID = protocol.ParseConnectionID(b[:paramLen]) + b = b[paramLen:] readOriginalDestinationConnectionID = true case initialSourceConnectionIDParameterID: - p.InitialSourceConnectionID, _ = protocol.ReadConnectionID(r, int(paramLen)) + if paramLen > protocol.MaxConnIDLen { + return protocol.ErrInvalidConnectionIDLen + } + p.InitialSourceConnectionID = protocol.ParseConnectionID(b[:paramLen]) + b = b[paramLen:] readInitialSourceConnectionID = true case retrySourceConnectionIDParameterID: if sentBy == protocol.PerspectiveClient { return errors.New("client sent a retry_source_connection_id") } - connID, _ := protocol.ReadConnectionID(r, int(paramLen)) + if paramLen > protocol.MaxConnIDLen { + return protocol.ErrInvalidConnectionIDLen + } + connID := protocol.ParseConnectionID(b[:paramLen]) + b = b[paramLen:] p.RetrySourceConnectionID = &connID default: - r.Seek(int64(paramLen), io.SeekCurrent) + b = b[paramLen:] } } @@ -202,7 +220,12 @@ func (p *TransportParameters) unmarshal(r *bytes.Reader, sentBy protocol.Perspec } // check that every transport parameter was sent at most once - sort.Slice(parameterIDs, func(i, j int) bool { return parameterIDs[i] < parameterIDs[j] }) + slices.SortFunc(parameterIDs, func(a, b transportParameterID) int { + if a < b { + return -1 + } + return 1 + }) for i := 0; i < len(parameterIDs)-1; i++ { if parameterIDs[i] == parameterIDs[i+1] { return fmt.Errorf("received duplicate transport parameter %#x", parameterIDs[i]) @@ -212,60 +235,47 @@ func (p *TransportParameters) unmarshal(r *bytes.Reader, sentBy protocol.Perspec return nil } -func (p *TransportParameters) readPreferredAddress(r *bytes.Reader, expectedLen int) error { - remainingLen := r.Len() +func (p *TransportParameters) readPreferredAddress(b []byte, expectedLen int) error { + remainingLen := len(b) pa := &PreferredAddress{} - var ipv4 [4]byte - if _, err := io.ReadFull(r, ipv4[:]); err != nil { - return err - } - port, err := utils.BigEndian.ReadUint16(r) - if err != nil { - return err + if len(b) < 4+2+16+2+1 { + return io.EOF } - pa.IPv4 = netip.AddrPortFrom(netip.AddrFrom4(ipv4), port) + var ipv4 [4]byte + copy(ipv4[:], b[:4]) + port4 := binary.BigEndian.Uint16(b[4:]) + b = b[4+2:] + pa.IPv4 = netip.AddrPortFrom(netip.AddrFrom4(ipv4), port4) var ipv6 [16]byte - if _, err := io.ReadFull(r, ipv6[:]); err != nil { - return err - } - port, err = utils.BigEndian.ReadUint16(r) - if err != nil { - return err - } - pa.IPv6 = netip.AddrPortFrom(netip.AddrFrom16(ipv6), port) - connIDLen, err := r.ReadByte() - if err != nil { - return err - } + copy(ipv6[:], b[:16]) + port6 := binary.BigEndian.Uint16(b[16:]) + pa.IPv6 = netip.AddrPortFrom(netip.AddrFrom16(ipv6), port6) + b = b[16+2:] + connIDLen := int(b[0]) + b = b[1:] if connIDLen == 0 || connIDLen > protocol.MaxConnIDLen { return fmt.Errorf("invalid connection ID length: %d", connIDLen) } - connID, err := protocol.ReadConnectionID(r, int(connIDLen)) - if err != nil { - return err - } - pa.ConnectionID = connID - if _, err := io.ReadFull(r, pa.StatelessResetToken[:]); err != nil { - return err + if len(b) < connIDLen+len(pa.StatelessResetToken) { + return io.EOF } - if bytesRead := remainingLen - r.Len(); bytesRead != expectedLen { + pa.ConnectionID = protocol.ParseConnectionID(b[:connIDLen]) + b = b[connIDLen:] + copy(pa.StatelessResetToken[:], b) + b = b[len(pa.StatelessResetToken):] + if bytesRead := remainingLen - len(b); bytesRead != expectedLen { return fmt.Errorf("expected preferred_address to be %d long, read %d bytes", expectedLen, bytesRead) } p.PreferredAddress = pa return nil } -func (p *TransportParameters) readNumericTransportParameter( - r *bytes.Reader, - paramID transportParameterID, - expectedLen int, -) error { - remainingLen := r.Len() - val, err := quicvarint.Read(r) +func (p *TransportParameters) readNumericTransportParameter(b []byte, paramID transportParameterID, expectedLen int) error { + val, l, err := quicvarint.Parse(b) if err != nil { return fmt.Errorf("error while reading transport parameter %d: %s", paramID, err) } - if remainingLen-r.Len() != expectedLen { + if l != expectedLen { return fmt.Errorf("inconsistent transport parameter length for transport parameter %#x", paramID) } //nolint:exhaustive // This only covers the numeric transport parameters. @@ -292,7 +302,7 @@ func (p *TransportParameters) readNumericTransportParameter( p.MaxIdleTimeout = max(protocol.MinRemoteIdleTimeout, time.Duration(val)*time.Millisecond) case maxUDPPayloadSizeParameterID: if val < 1200 { - return fmt.Errorf("invalid value for max_packet_size: %d (minimum 1200)", val) + return fmt.Errorf("invalid value for max_udp_payload_size: %d (minimum 1200)", val) } p.MaxUDPPayloadSize = protocol.ByteCount(val) case ackDelayExponentParameterID: @@ -347,8 +357,10 @@ func (p *TransportParameters) Marshal(pers protocol.Perspective) []byte { b = p.marshalVarintParam(b, initialMaxStreamsUniParameterID, uint64(p.MaxUniStreamNum)) // idle_timeout b = p.marshalVarintParam(b, maxIdleTimeoutParameterID, uint64(p.MaxIdleTimeout/time.Millisecond)) - // max_packet_size - b = p.marshalVarintParam(b, maxUDPPayloadSizeParameterID, uint64(protocol.MaxPacketBufferSize)) + // max_udp_payload_size + if p.MaxUDPPayloadSize > 0 { + b = p.marshalVarintParam(b, maxUDPPayloadSizeParameterID, uint64(p.MaxUDPPayloadSize)) + } // max_ack_delay // Only send it if is different from the default value. if p.MaxAckDelay != protocol.DefaultMaxAckDelay { @@ -457,15 +469,15 @@ func (p *TransportParameters) MarshalForSessionTicket(b []byte) []byte { } // UnmarshalFromSessionTicket unmarshals transport parameters from a session ticket. -func (p *TransportParameters) UnmarshalFromSessionTicket(r *bytes.Reader) error { - version, err := quicvarint.Read(r) +func (p *TransportParameters) UnmarshalFromSessionTicket(b []byte) error { + version, l, err := quicvarint.Parse(b) if err != nil { return err } if version != transportParameterMarshalingVersion { return fmt.Errorf("unknown transport parameter marshaling version: %d", version) } - return p.unmarshal(r, protocol.PerspectiveServer, true) + return p.unmarshal(b[l:], protocol.PerspectiveServer, true) } // ValidFor0RTT checks if the transport parameters match those saved in the session ticket. diff --git a/vendor/github.com/quic-go/quic-go/logging/connection_tracer.go b/vendor/github.com/quic-go/quic-go/logging/connection_tracer.go index 7f54d6c..96bf461 100644 --- a/vendor/github.com/quic-go/quic-go/logging/connection_tracer.go +++ b/vendor/github.com/quic-go/quic-go/logging/connection_tracer.go @@ -8,14 +8,14 @@ import ( // A ConnectionTracer records events. type ConnectionTracer struct { StartedConnection func(local, remote net.Addr, srcConnID, destConnID ConnectionID) - NegotiatedVersion func(chosen VersionNumber, clientVersions, serverVersions []VersionNumber) + NegotiatedVersion func(chosen Version, clientVersions, serverVersions []Version) ClosedConnection func(error) SentTransportParameters func(*TransportParameters) ReceivedTransportParameters func(*TransportParameters) RestoredTransportParameters func(parameters *TransportParameters) // for 0-RTT SentLongHeaderPacket func(*ExtendedHeader, ByteCount, ECN, *AckFrame, []Frame) SentShortHeaderPacket func(*ShortHeader, ByteCount, ECN, *AckFrame, []Frame) - ReceivedVersionNegotiationPacket func(dest, src ArbitraryLenConnectionID, _ []VersionNumber) + ReceivedVersionNegotiationPacket func(dest, src ArbitraryLenConnectionID, _ []Version) ReceivedRetry func(*Header) ReceivedLongHeaderPacket func(*ExtendedHeader, ByteCount, ECN, []Frame) ReceivedShortHeaderPacket func(*ShortHeader, ByteCount, ECN, []Frame) @@ -24,6 +24,7 @@ type ConnectionTracer struct { UpdatedMetrics func(rttStats *RTTStats, cwnd, bytesInFlight ByteCount, packetsInFlight int) AcknowledgedPacket func(EncryptionLevel, PacketNumber) LostPacket func(EncryptionLevel, PacketNumber, PacketLossReason) + UpdatedMTU func(mtu ByteCount, done bool) UpdatedCongestionState func(CongestionState) UpdatedPTOCount func(value uint32) UpdatedKeyFromTLS func(EncryptionLevel, Perspective) @@ -56,7 +57,7 @@ func NewMultiplexedConnectionTracer(tracers ...*ConnectionTracer) *ConnectionTra } } }, - NegotiatedVersion: func(chosen VersionNumber, clientVersions, serverVersions []VersionNumber) { + NegotiatedVersion: func(chosen Version, clientVersions, serverVersions []Version) { for _, t := range tracers { if t.NegotiatedVersion != nil { t.NegotiatedVersion(chosen, clientVersions, serverVersions) @@ -105,7 +106,7 @@ func NewMultiplexedConnectionTracer(tracers ...*ConnectionTracer) *ConnectionTra } } }, - ReceivedVersionNegotiationPacket: func(dest, src ArbitraryLenConnectionID, versions []VersionNumber) { + ReceivedVersionNegotiationPacket: func(dest, src ArbitraryLenConnectionID, versions []Version) { for _, t := range tracers { if t.ReceivedVersionNegotiationPacket != nil { t.ReceivedVersionNegotiationPacket(dest, src, versions) @@ -168,6 +169,13 @@ func NewMultiplexedConnectionTracer(tracers ...*ConnectionTracer) *ConnectionTra } } }, + UpdatedMTU: func(mtu ByteCount, done bool) { + for _, t := range tracers { + if t.UpdatedMTU != nil { + t.UpdatedMTU(mtu, done) + } + } + }, UpdatedCongestionState: func(state CongestionState) { for _, t := range tracers { if t.UpdatedCongestionState != nil { diff --git a/vendor/github.com/quic-go/quic-go/logging/interface.go b/vendor/github.com/quic-go/quic-go/logging/interface.go index a618a18..1f8edb9 100644 --- a/vendor/github.com/quic-go/quic-go/logging/interface.go +++ b/vendor/github.com/quic-go/quic-go/logging/interface.go @@ -36,8 +36,8 @@ type ( StreamNum = protocol.StreamNum // The StreamType is the type of the stream (unidirectional or bidirectional). StreamType = protocol.StreamType - // The VersionNumber is the QUIC version. - VersionNumber = protocol.Version + // The Version is the QUIC version. + Version = protocol.Version // The Header is the QUIC packet header, before removing header protection. Header = wire.Header @@ -72,27 +72,27 @@ const ( const ( // KeyPhaseZero is key phase bit 0 - KeyPhaseZero KeyPhaseBit = protocol.KeyPhaseZero + KeyPhaseZero = protocol.KeyPhaseZero // KeyPhaseOne is key phase bit 1 - KeyPhaseOne KeyPhaseBit = protocol.KeyPhaseOne + KeyPhaseOne = protocol.KeyPhaseOne ) const ( // PerspectiveServer is used for a QUIC server - PerspectiveServer Perspective = protocol.PerspectiveServer + PerspectiveServer = protocol.PerspectiveServer // PerspectiveClient is used for a QUIC client - PerspectiveClient Perspective = protocol.PerspectiveClient + PerspectiveClient = protocol.PerspectiveClient ) const ( // EncryptionInitial is the Initial encryption level - EncryptionInitial EncryptionLevel = protocol.EncryptionInitial + EncryptionInitial = protocol.EncryptionInitial // EncryptionHandshake is the Handshake encryption level - EncryptionHandshake EncryptionLevel = protocol.EncryptionHandshake + EncryptionHandshake = protocol.EncryptionHandshake // Encryption1RTT is the 1-RTT encryption level - Encryption1RTT EncryptionLevel = protocol.Encryption1RTT + Encryption1RTT = protocol.Encryption1RTT // Encryption0RTT is the 0-RTT encryption level - Encryption0RTT EncryptionLevel = protocol.Encryption0RTT + Encryption0RTT = protocol.Encryption0RTT ) const ( diff --git a/vendor/github.com/quic-go/quic-go/logging/tracer.go b/vendor/github.com/quic-go/quic-go/logging/tracer.go index edd85db..625a809 100644 --- a/vendor/github.com/quic-go/quic-go/logging/tracer.go +++ b/vendor/github.com/quic-go/quic-go/logging/tracer.go @@ -5,7 +5,7 @@ import "net" // A Tracer traces events. type Tracer struct { SentPacket func(net.Addr, *Header, ByteCount, []Frame) - SentVersionNegotiationPacket func(_ net.Addr, dest, src ArbitraryLenConnectionID, _ []VersionNumber) + SentVersionNegotiationPacket func(_ net.Addr, dest, src ArbitraryLenConnectionID, _ []Version) DroppedPacket func(net.Addr, PacketType, ByteCount, PacketDropReason) Debug func(name, msg string) Close func() @@ -27,7 +27,7 @@ func NewMultiplexedTracer(tracers ...*Tracer) *Tracer { } } }, - SentVersionNegotiationPacket: func(remote net.Addr, dest, src ArbitraryLenConnectionID, versions []VersionNumber) { + SentVersionNegotiationPacket: func(remote net.Addr, dest, src ArbitraryLenConnectionID, versions []Version) { for _, t := range tracers { if t.SentVersionNegotiationPacket != nil { t.SentVersionNegotiationPacket(remote, dest, src, versions) diff --git a/vendor/github.com/quic-go/quic-go/mockgen.go b/vendor/github.com/quic-go/quic-go/mockgen.go index 81cc4a5..65ec465 100644 --- a/vendor/github.com/quic-go/quic-go/mockgen.go +++ b/vendor/github.com/quic-go/quic-go/mockgen.go @@ -14,23 +14,17 @@ type Sender = sender //go:generate sh -c "go run go.uber.org/mock/mockgen -typed -build_flags=\"-tags=gomock\" -package quic -self_package github.com/quic-go/quic-go -destination mock_stream_internal_test.go github.com/quic-go/quic-go StreamI" type StreamI = streamI -//go:generate sh -c "go run go.uber.org/mock/mockgen -typed -build_flags=\"-tags=gomock\" -package quic -self_package github.com/quic-go/quic-go -destination mock_crypto_stream_test.go github.com/quic-go/quic-go CryptoStream" -type CryptoStream = cryptoStream - //go:generate sh -c "go run go.uber.org/mock/mockgen -typed -build_flags=\"-tags=gomock\" -package quic -self_package github.com/quic-go/quic-go -destination mock_receive_stream_internal_test.go github.com/quic-go/quic-go ReceiveStreamI" type ReceiveStreamI = receiveStreamI //go:generate sh -c "go run go.uber.org/mock/mockgen -typed -build_flags=\"-tags=gomock\" -package quic -self_package github.com/quic-go/quic-go -destination mock_send_stream_internal_test.go github.com/quic-go/quic-go SendStreamI" type SendStreamI = sendStreamI -//go:generate sh -c "go run go.uber.org/mock/mockgen -typed -build_flags=\"-tags=gomock\" -package quic -self_package github.com/quic-go/quic-go -destination mock_stream_getter_test.go github.com/quic-go/quic-go StreamGetter" -type StreamGetter = streamGetter - //go:generate sh -c "go run go.uber.org/mock/mockgen -typed -build_flags=\"-tags=gomock\" -package quic -self_package github.com/quic-go/quic-go -destination mock_stream_sender_test.go github.com/quic-go/quic-go StreamSender" type StreamSender = streamSender -//go:generate sh -c "go run go.uber.org/mock/mockgen -typed -build_flags=\"-tags=gomock\" -package quic -self_package github.com/quic-go/quic-go -destination mock_crypto_data_handler_test.go github.com/quic-go/quic-go CryptoDataHandler" -type CryptoDataHandler = cryptoDataHandler +//go:generate sh -c "go run go.uber.org/mock/mockgen -typed -build_flags=\"-tags=gomock\" -package quic -self_package github.com/quic-go/quic-go -destination mock_stream_control_frame_getter_test.go github.com/quic-go/quic-go StreamControlFrameGetter" +type StreamControlFrameGetter = streamControlFrameGetter //go:generate sh -c "go run go.uber.org/mock/mockgen -typed -build_flags=\"-tags=gomock\" -package quic -self_package github.com/quic-go/quic-go -destination mock_frame_source_test.go github.com/quic-go/quic-go FrameSource" type FrameSource = frameSource @@ -72,5 +66,4 @@ type PacketHandlerManager = packetHandlerManager // //go:generate sh -c "go run go.uber.org/mock/mockgen -typed -package quic -self_package github.com/quic-go/quic-go -source sys_conn_oob.go -destination mock_batch_conn_test.go -mock_names batchConn=MockBatchConn" -//go:generate sh -c "go run go.uber.org/mock/mockgen -typed -package quic -self_package github.com/quic-go/quic-go -self_package github.com/quic-go/quic-go -destination mock_token_store_test.go github.com/quic-go/quic-go TokenStore" //go:generate sh -c "go run go.uber.org/mock/mockgen -typed -package quic -self_package github.com/quic-go/quic-go -self_package github.com/quic-go/quic-go -destination mock_packetconn_test.go net PacketConn" diff --git a/vendor/github.com/quic-go/quic-go/mtu_discoverer.go b/vendor/github.com/quic-go/quic-go/mtu_discoverer.go index 317b092..3f3a640 100644 --- a/vendor/github.com/quic-go/quic-go/mtu_discoverer.go +++ b/vendor/github.com/quic-go/quic-go/mtu_discoverer.go @@ -1,19 +1,19 @@ package quic import ( - "net" "time" "github.com/quic-go/quic-go/internal/ackhandler" "github.com/quic-go/quic-go/internal/protocol" "github.com/quic-go/quic-go/internal/utils" "github.com/quic-go/quic-go/internal/wire" + "github.com/quic-go/quic-go/logging" ) type mtuDiscoverer interface { // Start starts the MTU discovery process. // It's unnecessary to call ShouldSendProbe before that. - Start(maxPacketSize protocol.ByteCount) + Start() ShouldSendProbe(now time.Time) bool CurrentSize() protocol.ByteCount GetPing() (ping ackhandler.Frame, datagramSize protocol.ByteCount) @@ -25,54 +25,129 @@ const ( maxMTUDiff = 20 // send a probe packet every mtuProbeDelay RTTs mtuProbeDelay = 5 + // Once maxLostMTUProbes MTU probe packets larger than a certain size are lost, + // MTU discovery won't probe for larger MTUs than this size. + // The algorithm used here is resilient to packet loss of (maxLostMTUProbes - 1) packets. + maxLostMTUProbes = 3 ) -func getMaxPacketSize(addr net.Addr) protocol.ByteCount { - maxSize := protocol.ByteCount(protocol.MinInitialPacketSize) - // If this is not a UDP address, we don't know anything about the MTU. - // Use the minimum size of an Initial packet as the max packet size. - if udpAddr, ok := addr.(*net.UDPAddr); ok { - if utils.IsIPv4(udpAddr.IP) { - maxSize = protocol.InitialPacketSizeIPv4 - } else { - maxSize = protocol.InitialPacketSizeIPv6 - } - } - return maxSize -} +// The Path MTU is found by sending a larger packet every now and then. +// If the packet is acknowledged, we conclude that the path supports this larger packet size. +// If the packet is lost, this can mean one of two things: +// 1. The path doesn't support this larger packet size, or +// 2. The packet was lost due to packet loss, independent of its size. +// The algorithm used here is resilient to packet loss of (maxLostMTUProbes - 1) packets. +// For simplicty, the following example use maxLostMTUProbes = 2. +// +// Initialization: +// |------------------------------------------------------------------------------| +// min max +// +// The first MTU probe packet will have size (min+max)/2. +// Assume that this packet is acknowledged. We can now move the min marker, +// and continue the search in the resulting interval. +// +// If 1st probe packet acknowledged: +// |---------------------------------------|--------------------------------------| +// min max +// +// If 1st probe packet lost: +// |---------------------------------------|--------------------------------------| +// min lost[0] max +// +// We can't conclude that the path doesn't support this packet size, since the loss of the probe +// packet could have been unrelated to the packet size. A larger probe packet will be sent later on. +// After a loss, the next probe packet has size (min+lost[0])/2. +// Now assume this probe packet is acknowledged: +// +// 2nd probe packet acknowledged: +// |------------------|--------------------|--------------------------------------| +// min lost[0] max +// +// First of all, we conclude that the path supports at least this MTU. That's progress! +// Second, we probe a bit more aggressively with the next probe packet: +// After an acknowledgement, the next probe packet has size (min+max)/2. +// This means we'll send a packet larger than the first probe packet (which was lost). +// +// If 3rd probe packet acknowledged: +// |-------------------------------------------------|----------------------------| +// min max +// +// We can conclude that the loss of the 1st probe packet was not due to its size, and +// continue searching in a much smaller interval now. +// +// If 3rd probe packet lost: +// |------------------|--------------------|---------|----------------------------| +// min lost[0] max +// +// Since in our example numPTOProbes = 2, and we lost 2 packets smaller than max, we +// conclude that this packet size is not supported on the path, and reduce the maximum +// value of the search interval. +// +// MTU discovery concludes once the interval min and max has been narrowed down to maxMTUDiff. type mtuFinder struct { lastProbeTime time.Time mtuIncreased func(protocol.ByteCount) rttStats *utils.RTTStats + inFlight protocol.ByteCount // the size of the probe packet currently in flight. InvalidByteCount if none is in flight - current protocol.ByteCount - max protocol.ByteCount // the maximum value, as advertised by the peer (or our maximum size buffer) + min protocol.ByteCount + limit protocol.ByteCount + + // on initialization, we treat the maximum size as the first "lost" packet + lost [maxLostMTUProbes]protocol.ByteCount + lastProbeWasLost bool + + tracer *logging.ConnectionTracer } var _ mtuDiscoverer = &mtuFinder{} -func newMTUDiscoverer(rttStats *utils.RTTStats, start protocol.ByteCount, mtuIncreased func(protocol.ByteCount)) *mtuFinder { - return &mtuFinder{ +func newMTUDiscoverer( + rttStats *utils.RTTStats, + start, max protocol.ByteCount, + mtuIncreased func(protocol.ByteCount), + tracer *logging.ConnectionTracer, +) *mtuFinder { + f := &mtuFinder{ inFlight: protocol.InvalidByteCount, - current: start, + min: start, + limit: max, rttStats: rttStats, mtuIncreased: mtuIncreased, + tracer: tracer, } + for i := range f.lost { + if i == 0 { + f.lost[i] = max + continue + } + f.lost[i] = protocol.InvalidByteCount + } + return f } func (f *mtuFinder) done() bool { - return f.max-f.current <= maxMTUDiff+1 + return f.max()-f.min <= maxMTUDiff+1 } -func (f *mtuFinder) Start(maxPacketSize protocol.ByteCount) { +func (f *mtuFinder) max() protocol.ByteCount { + for i, v := range f.lost { + if v == protocol.InvalidByteCount { + return f.lost[i-1] + } + } + return f.lost[len(f.lost)-1] +} + +func (f *mtuFinder) Start() { f.lastProbeTime = time.Now() // makes sure the first probe packet is not sent immediately - f.max = maxPacketSize } func (f *mtuFinder) ShouldSendProbe(now time.Time) bool { - if f.max == 0 || f.lastProbeTime.IsZero() { + if f.lastProbeTime.IsZero() { return false } if f.inFlight != protocol.InvalidByteCount || f.done() { @@ -82,20 +157,27 @@ func (f *mtuFinder) ShouldSendProbe(now time.Time) bool { } func (f *mtuFinder) GetPing() (ackhandler.Frame, protocol.ByteCount) { - size := (f.max + f.current) / 2 + var size protocol.ByteCount + if f.lastProbeWasLost { + size = (f.min + f.lost[0]) / 2 + } else { + size = (f.min + f.max()) / 2 + } f.lastProbeTime = time.Now() f.inFlight = size return ackhandler.Frame{ Frame: &wire.PingFrame{}, - Handler: (*mtuFinderAckHandler)(f), + Handler: &mtuFinderAckHandler{f}, }, size } func (f *mtuFinder) CurrentSize() protocol.ByteCount { - return f.current + return f.min } -type mtuFinderAckHandler mtuFinder +type mtuFinderAckHandler struct { + *mtuFinder +} var _ ackhandler.FrameHandler = &mtuFinderAckHandler{} @@ -105,7 +187,28 @@ func (h *mtuFinderAckHandler) OnAcked(wire.Frame) { panic("OnAcked callback called although there's no MTU probe packet in flight") } h.inFlight = protocol.InvalidByteCount - h.current = size + h.min = size + h.lastProbeWasLost = false + // remove all values smaller than size from the lost array + var j int + for i, v := range h.lost { + if size < v { + j = i + break + } + } + if j > 0 { + for i := 0; i < len(h.lost); i++ { + if i+j < len(h.lost) { + h.lost[i] = h.lost[i+j] + } else { + h.lost[i] = protocol.InvalidByteCount + } + } + } + if h.tracer != nil && h.tracer.UpdatedMTU != nil { + h.tracer.UpdatedMTU(size, h.done()) + } h.mtuIncreased(size) } @@ -114,6 +217,13 @@ func (h *mtuFinderAckHandler) OnLost(wire.Frame) { if size == protocol.InvalidByteCount { panic("OnLost callback called although there's no MTU probe packet in flight") } - h.max = size + h.lastProbeWasLost = true h.inFlight = protocol.InvalidByteCount + for i, v := range h.lost { + if size < v { + copy(h.lost[i+1:], h.lost[i:]) + h.lost[i] = size + break + } + } } diff --git a/vendor/github.com/quic-go/quic-go/oss-fuzz.sh b/vendor/github.com/quic-go/quic-go/oss-fuzz.sh index 22a577f..92a57a2 100644 --- a/vendor/github.com/quic-go/quic-go/oss-fuzz.sh +++ b/vendor/github.com/quic-go/quic-go/oss-fuzz.sh @@ -3,12 +3,12 @@ # Install Go manually, since oss-fuzz ships with an outdated Go version. # See https://github.com/google/oss-fuzz/pull/10643. export CXX="${CXX} -lresolv" # required by Go 1.20 -wget https://go.dev/dl/go1.22.0.linux-amd64.tar.gz \ +wget https://go.dev/dl/go1.23.0.linux-amd64.tar.gz \ && mkdir temp-go \ && rm -rf /root/.go/* \ - && tar -C temp-go/ -xzf go1.22.0.linux-amd64.tar.gz \ + && tar -C temp-go/ -xzf go1.23.0.linux-amd64.tar.gz \ && mv temp-go/go/* /root/.go/ \ - && rm -rf temp-go go1.22.0.linux-amd64.tar.gz + && rm -rf temp-go go1.23.0.linux-amd64.tar.gz ( # fuzz qpack diff --git a/vendor/github.com/quic-go/quic-go/packet_packer.go b/vendor/github.com/quic-go/quic-go/packet_packer.go index e707734..8b8a03d 100644 --- a/vendor/github.com/quic-go/quic-go/packet_packer.go +++ b/vendor/github.com/quic-go/quic-go/packet_packer.go @@ -121,8 +121,8 @@ type packetPacker struct { perspective protocol.Perspective cryptoSetup sealingManager - initialStream cryptoStream - handshakeStream cryptoStream + initialStream *cryptoStream + handshakeStream *cryptoStream token []byte @@ -141,7 +141,7 @@ var _ packer = &packetPacker{} func newPacketPacker( srcConnID protocol.ConnectionID, getDestConnID func() protocol.ConnectionID, - initialStream, handshakeStream cryptoStream, + initialStream, handshakeStream *cryptoStream, packetNumberManager packetNumberManager, retransmissionQueue *retransmissionQueue, cryptoSetup sealingManager, @@ -482,7 +482,7 @@ func (p *packetPacker) maybeGetCryptoPacket(maxPacketSize protocol.ByteCount, en return nil, payload{} } - var s cryptoStream + var s *cryptoStream var handler ackhandler.FrameHandler var hasRetransmission bool //nolint:exhaustive // Initial and Handshake are the only two encryption levels here. @@ -645,6 +645,9 @@ func (p *packetPacker) composeNextPacket(maxFrameSize protocol.ByteCount, onlyAc pl.length += lengthAdded // add handlers for the control frames that were added for i := startLen; i < len(pl.frames); i++ { + if pl.frames[i].Handler != nil { + continue + } switch pl.frames[i].Frame.(type) { case *wire.PathChallengeFrame, *wire.PathResponseFrame: // Path probing is currently not supported, therefore we don't need to set the OnAcked callback yet. diff --git a/vendor/github.com/quic-go/quic-go/packet_unpacker.go b/vendor/github.com/quic-go/quic-go/packet_unpacker.go index 1034aab..9e0fa9d 100644 --- a/vendor/github.com/quic-go/quic-go/packet_unpacker.go +++ b/vendor/github.com/quic-go/quic-go/packet_unpacker.go @@ -1,7 +1,6 @@ package quic import ( - "bytes" "fmt" "time" @@ -53,7 +52,7 @@ func newPacketUnpacker(cs handshake.CryptoSetup, shortHdrConnIDLen int) *packetU // If the reserved bits are invalid, the error is wire.ErrInvalidReservedBits. // If any other error occurred when parsing the header, the error is of type headerParseError. // If decrypting the payload fails for any reason, the error is the error returned by the AEAD. -func (u *packetUnpacker) UnpackLongHeader(hdr *wire.Header, rcvTime time.Time, data []byte, v protocol.Version) (*unpackedPacket, error) { +func (u *packetUnpacker) UnpackLongHeader(hdr *wire.Header, data []byte) (*unpackedPacket, error) { var encLevel protocol.EncryptionLevel var extHdr *wire.ExtendedHeader var decrypted []byte @@ -65,7 +64,7 @@ func (u *packetUnpacker) UnpackLongHeader(hdr *wire.Header, rcvTime time.Time, d if err != nil { return nil, err } - extHdr, decrypted, err = u.unpackLongHeaderPacket(opener, hdr, data, v) + extHdr, decrypted, err = u.unpackLongHeaderPacket(opener, hdr, data) if err != nil { return nil, err } @@ -75,7 +74,7 @@ func (u *packetUnpacker) UnpackLongHeader(hdr *wire.Header, rcvTime time.Time, d if err != nil { return nil, err } - extHdr, decrypted, err = u.unpackLongHeaderPacket(opener, hdr, data, v) + extHdr, decrypted, err = u.unpackLongHeaderPacket(opener, hdr, data) if err != nil { return nil, err } @@ -85,7 +84,7 @@ func (u *packetUnpacker) UnpackLongHeader(hdr *wire.Header, rcvTime time.Time, d if err != nil { return nil, err } - extHdr, decrypted, err = u.unpackLongHeaderPacket(opener, hdr, data, v) + extHdr, decrypted, err = u.unpackLongHeaderPacket(opener, hdr, data) if err != nil { return nil, err } @@ -125,8 +124,8 @@ func (u *packetUnpacker) UnpackShortHeader(rcvTime time.Time, data []byte) (prot return pn, pnLen, kp, decrypted, nil } -func (u *packetUnpacker) unpackLongHeaderPacket(opener handshake.LongHeaderOpener, hdr *wire.Header, data []byte, v protocol.Version) (*wire.ExtendedHeader, []byte, error) { - extHdr, parseErr := u.unpackLongHeader(opener, hdr, data, v) +func (u *packetUnpacker) unpackLongHeaderPacket(opener handshake.LongHeaderOpener, hdr *wire.Header, data []byte) (*wire.ExtendedHeader, []byte, error) { + extHdr, parseErr := u.unpackLongHeader(opener, hdr, data) // If the reserved bits are set incorrectly, we still need to continue unpacking. // This avoids a timing side-channel, which otherwise might allow an attacker // to gain information about the header encryption. @@ -187,17 +186,15 @@ func (u *packetUnpacker) unpackShortHeader(hd headerDecryptor, data []byte) (int } // The error is either nil, a wire.ErrInvalidReservedBits or of type headerParseError. -func (u *packetUnpacker) unpackLongHeader(hd headerDecryptor, hdr *wire.Header, data []byte, v protocol.Version) (*wire.ExtendedHeader, error) { - extHdr, err := unpackLongHeader(hd, hdr, data, v) +func (u *packetUnpacker) unpackLongHeader(hd headerDecryptor, hdr *wire.Header, data []byte) (*wire.ExtendedHeader, error) { + extHdr, err := unpackLongHeader(hd, hdr, data) if err != nil && err != wire.ErrInvalidReservedBits { return nil, &headerParseError{err: err} } return extHdr, err } -func unpackLongHeader(hd headerDecryptor, hdr *wire.Header, data []byte, v protocol.Version) (*wire.ExtendedHeader, error) { - r := bytes.NewReader(data) - +func unpackLongHeader(hd headerDecryptor, hdr *wire.Header, data []byte) (*wire.ExtendedHeader, error) { hdrLen := hdr.ParsedLen() if protocol.ByteCount(len(data)) < hdrLen+4+16 { //nolint:stylecheck @@ -214,7 +211,7 @@ func unpackLongHeader(hd headerDecryptor, hdr *wire.Header, data []byte, v proto data[hdrLen:hdrLen+4], ) // 3. parse the header (and learn the actual length of the packet number) - extHdr, parseErr := hdr.ParseExtended(r, v) + extHdr, parseErr := hdr.ParseExtended(data) if parseErr != nil && parseErr != wire.ErrInvalidReservedBits { return nil, parseErr } diff --git a/vendor/github.com/quic-go/quic-go/quicvarint/varint.go b/vendor/github.com/quic-go/quic-go/quicvarint/varint.go index 3f12c07..9a22e33 100644 --- a/vendor/github.com/quic-go/quic-go/quicvarint/varint.go +++ b/vendor/github.com/quic-go/quic-go/quicvarint/varint.go @@ -3,8 +3,6 @@ package quicvarint import ( "fmt" "io" - - "github.com/quic-go/quic-go/internal/protocol" ) // taken from the QUIC draft @@ -28,16 +26,16 @@ func Read(r io.ByteReader) (uint64, error) { return 0, err } // the first two bits of the first byte encode the length - len := 1 << ((firstByte & 0xc0) >> 6) + l := 1 << ((firstByte & 0xc0) >> 6) b1 := firstByte & (0xff - 0xc0) - if len == 1 { + if l == 1 { return uint64(b1), nil } b2, err := r.ReadByte() if err != nil { return 0, err } - if len == 2 { + if l == 2 { return uint64(b2) + uint64(b1)<<8, nil } b3, err := r.ReadByte() @@ -48,7 +46,7 @@ func Read(r io.ByteReader) (uint64, error) { if err != nil { return 0, err } - if len == 4 { + if l == 4 { return uint64(b4) + uint64(b3)<<8 + uint64(b2)<<16 + uint64(b1)<<24, nil } b5, err := r.ReadByte() @@ -70,6 +68,31 @@ func Read(r io.ByteReader) (uint64, error) { return uint64(b8) + uint64(b7)<<8 + uint64(b6)<<16 + uint64(b5)<<24 + uint64(b4)<<32 + uint64(b3)<<40 + uint64(b2)<<48 + uint64(b1)<<56, nil } +// Parse reads a number in the QUIC varint format. +// It returns the number of bytes consumed. +func Parse(b []byte) (uint64 /* value */, int /* bytes consumed */, error) { + if len(b) == 0 { + return 0, 0, io.EOF + } + firstByte := b[0] + // the first two bits of the first byte encode the length + l := 1 << ((firstByte & 0xc0) >> 6) + if len(b) < l { + return 0, 0, io.ErrUnexpectedEOF + } + b0 := firstByte & (0xff - 0xc0) + if l == 1 { + return uint64(b0), 1, nil + } + if l == 2 { + return uint64(b[1]) + uint64(b0)<<8, 2, nil + } + if l == 4 { + return uint64(b[3]) + uint64(b[2])<<8 + uint64(b[1])<<16 + uint64(b0)<<24, 4, nil + } + return uint64(b[7]) + uint64(b[6])<<8 + uint64(b[5])<<16 + uint64(b[4])<<24 + uint64(b[3])<<32 + uint64(b[2])<<40 + uint64(b[1])<<48 + uint64(b0)<<56, 8, nil +} + // Append appends i in the QUIC varint format. func Append(b []byte, i uint64) []byte { if i <= maxVarInt1 { @@ -91,7 +114,7 @@ func Append(b []byte, i uint64) []byte { } // AppendWithLen append i in the QUIC varint format with the desired length. -func AppendWithLen(b []byte, i uint64, length protocol.ByteCount) []byte { +func AppendWithLen(b []byte, i uint64, length int) []byte { if length != 1 && length != 2 && length != 4 && length != 8 { panic("invalid varint length") } @@ -109,17 +132,17 @@ func AppendWithLen(b []byte, i uint64, length protocol.ByteCount) []byte { } else if length == 8 { b = append(b, 0b11000000) } - for j := protocol.ByteCount(1); j < length-l; j++ { + for j := 1; j < length-l; j++ { b = append(b, 0) } - for j := protocol.ByteCount(0); j < l; j++ { + for j := 0; j < l; j++ { b = append(b, uint8(i>>(8*(l-1-j)))) } return b } // Len determines the number of bytes that will be needed to write the number i. -func Len(i uint64) protocol.ByteCount { +func Len(i uint64) int { if i <= maxVarInt1 { return 1 } diff --git a/vendor/github.com/quic-go/quic-go/receive_stream.go b/vendor/github.com/quic-go/quic-go/receive_stream.go index 1235ff0..b8535ef 100644 --- a/vendor/github.com/quic-go/quic-go/receive_stream.go +++ b/vendor/github.com/quic-go/quic-go/receive_stream.go @@ -6,6 +6,7 @@ import ( "sync" "time" + "github.com/quic-go/quic-go/internal/ackhandler" "github.com/quic-go/quic-go/internal/flowcontrol" "github.com/quic-go/quic-go/internal/protocol" "github.com/quic-go/quic-go/internal/qerr" @@ -19,7 +20,6 @@ type receiveStreamI interface { handleStreamFrame(*wire.StreamFrame) error handleResetStreamFrame(*wire.ResetStreamFrame) error closeForShutdown(error) - getWindowUpdate() protocol.ByteCount } type receiveStream struct { @@ -37,10 +37,17 @@ type receiveStream struct { readPosInFrame int currentFrameIsLast bool // is the currentFrame the last frame on this stream - finRead bool // set once we read a frame with a Fin + queuedStopSending bool + queuedMaxStreamData bool + + // Set once we read the io.EOF or the cancellation error. + // Note that for local cancellations, this doesn't necessarily mean that we know the final offset yet. + errorRead bool + completed bool // set once we've called streamSender.onStreamCompleted + cancelledRemotely bool + cancelledLocally bool + cancelErr *StreamError closeForShutdownErr error - cancelReadErr error - resetRemotelyErr *StreamError readChan chan struct{} readOnce chan struct{} // cap: 1, to protect against concurrent use of Read @@ -50,8 +57,9 @@ type receiveStream struct { } var ( - _ ReceiveStream = &receiveStream{} - _ receiveStreamI = &receiveStream{} + _ ReceiveStream = &receiveStream{} + _ receiveStreamI = &receiveStream{} + _ streamControlFrameGetter = &receiveStream{} ) func newReceiveStream( @@ -83,29 +91,54 @@ func (s *receiveStream) Read(p []byte) (int, error) { defer func() { <-s.readOnce }() s.mutex.Lock() - completed, n, err := s.readImpl(p) + queuedNewControlFrame, n, err := s.readImpl(p) + completed := s.isNewlyCompleted() s.mutex.Unlock() if completed { s.sender.onStreamCompleted(s.streamID) } + if queuedNewControlFrame { + s.sender.onHasStreamControlFrame(s.streamID, s) + } return n, err } -func (s *receiveStream) readImpl(p []byte) (bool /*stream completed */, int, error) { - if s.finRead { - return false, 0, io.EOF +func (s *receiveStream) isNewlyCompleted() bool { + if s.completed { + return false + } + // We need to know the final offset (either via FIN or RESET_STREAM) for flow control accounting. + if s.finalOffset == protocol.MaxByteCount { + return false + } + // We're done with the stream if it was cancelled locally... + if s.cancelledLocally { + s.completed = true + return true + } + // ... or if the error (either io.EOF or the reset error) was read + if s.errorRead { + s.completed = true + return true } - if s.cancelReadErr != nil { - return false, 0, s.cancelReadErr + return false +} + +func (s *receiveStream) readImpl(p []byte) (bool, int, error) { + if s.currentFrameIsLast && s.currentFrame == nil { + s.errorRead = true + return false, 0, io.EOF } - if s.resetRemotelyErr != nil { - return false, 0, s.resetRemotelyErr + if s.cancelledRemotely || s.cancelledLocally { + s.errorRead = true + return false, 0, s.cancelErr } if s.closeForShutdownErr != nil { return false, 0, s.closeForShutdownErr } + var queuedNewControlFrame bool var bytesRead int var deadlineTimer *utils.Timer for bytesRead < len(p) { @@ -113,25 +146,23 @@ func (s *receiveStream) readImpl(p []byte) (bool /*stream completed */, int, err s.dequeueNextFrame() } if s.currentFrame == nil && bytesRead > 0 { - return false, bytesRead, s.closeForShutdownErr + return queuedNewControlFrame, bytesRead, s.closeForShutdownErr } for { // Stop waiting on errors if s.closeForShutdownErr != nil { - return false, bytesRead, s.closeForShutdownErr - } - if s.cancelReadErr != nil { - return false, bytesRead, s.cancelReadErr + return queuedNewControlFrame, bytesRead, s.closeForShutdownErr } - if s.resetRemotelyErr != nil { - return false, bytesRead, s.resetRemotelyErr + if s.cancelledRemotely || s.cancelledLocally { + s.errorRead = true + return queuedNewControlFrame, 0, s.cancelErr } deadline := s.deadline if !deadline.IsZero() { if !time.Now().Before(deadline) { - return false, bytesRead, errDeadline + return queuedNewControlFrame, bytesRead, errDeadline } if deadlineTimer == nil { deadlineTimer = utils.NewTimer() @@ -161,10 +192,10 @@ func (s *receiveStream) readImpl(p []byte) (bool /*stream completed */, int, err } if bytesRead > len(p) { - return false, bytesRead, fmt.Errorf("BUG: bytesRead (%d) > len(p) (%d) in stream.Read", bytesRead, len(p)) + return queuedNewControlFrame, bytesRead, fmt.Errorf("BUG: bytesRead (%d) > len(p) (%d) in stream.Read", bytesRead, len(p)) } if s.readPosInFrame > len(s.currentFrame) { - return false, bytesRead, fmt.Errorf("BUG: readPosInFrame (%d) > frame.DataLen (%d) in stream.Read", s.readPosInFrame, len(s.currentFrame)) + return queuedNewControlFrame, bytesRead, fmt.Errorf("BUG: readPosInFrame (%d) > frame.DataLen (%d) in stream.Read", s.readPosInFrame, len(s.currentFrame)) } m := copy(p[bytesRead:], s.currentFrame[s.readPosInFrame:]) @@ -173,20 +204,23 @@ func (s *receiveStream) readImpl(p []byte) (bool /*stream completed */, int, err // when a RESET_STREAM was received, the flow controller was already // informed about the final byteOffset for this stream - if s.resetRemotelyErr == nil { - s.flowController.AddBytesRead(protocol.ByteCount(m)) + if !s.cancelledRemotely { + if queueMaxStreamData := s.flowController.AddBytesRead(protocol.ByteCount(m)); queueMaxStreamData { + s.queuedMaxStreamData = true + queuedNewControlFrame = true + } } if s.readPosInFrame >= len(s.currentFrame) && s.currentFrameIsLast { - s.finRead = true s.currentFrame = nil if s.currentFrameDone != nil { s.currentFrameDone() } - return true, bytesRead, io.EOF + s.errorRead = true + return queuedNewControlFrame, bytesRead, io.EOF } } - return false, bytesRead, nil + return queuedNewControlFrame, bytesRead, nil } func (s *receiveStream) dequeueNextFrame() { @@ -202,32 +236,40 @@ func (s *receiveStream) dequeueNextFrame() { func (s *receiveStream) CancelRead(errorCode StreamErrorCode) { s.mutex.Lock() - completed := s.cancelReadImpl(errorCode) + queuedNewControlFrame := s.cancelReadImpl(errorCode) + completed := s.isNewlyCompleted() s.mutex.Unlock() + if queuedNewControlFrame { + s.sender.onHasStreamControlFrame(s.streamID, s) + } if completed { s.flowController.Abandon() s.sender.onStreamCompleted(s.streamID) } } -func (s *receiveStream) cancelReadImpl(errorCode qerr.StreamErrorCode) bool /* completed */ { - if s.finRead || s.cancelReadErr != nil || s.resetRemotelyErr != nil { +func (s *receiveStream) cancelReadImpl(errorCode qerr.StreamErrorCode) (queuedNewControlFrame bool) { + if s.cancelledLocally { // duplicate call to CancelRead return false } - s.cancelReadErr = &StreamError{StreamID: s.streamID, ErrorCode: errorCode, Remote: false} + if s.closeForShutdownErr != nil { + return false + } + s.cancelledLocally = true + if s.errorRead || s.cancelledRemotely { + return false + } + s.queuedStopSending = true + s.cancelErr = &StreamError{StreamID: s.streamID, ErrorCode: errorCode, Remote: false} s.signalRead() - s.sender.queueControlFrame(&wire.StopSendingFrame{ - StreamID: s.streamID, - ErrorCode: errorCode, - }) - // We're done with this stream if the final offset was already received. - return s.finalOffset != protocol.MaxByteCount + return true } func (s *receiveStream) handleStreamFrame(frame *wire.StreamFrame) error { s.mutex.Lock() - completed, err := s.handleStreamFrameImpl(frame) + err := s.handleStreamFrameImpl(frame) + completed := s.isNewlyCompleted() s.mutex.Unlock() if completed { @@ -237,59 +279,78 @@ func (s *receiveStream) handleStreamFrame(frame *wire.StreamFrame) error { return err } -func (s *receiveStream) handleStreamFrameImpl(frame *wire.StreamFrame) (bool /* completed */, error) { +func (s *receiveStream) handleStreamFrameImpl(frame *wire.StreamFrame) error { maxOffset := frame.Offset + frame.DataLen() if err := s.flowController.UpdateHighestReceived(maxOffset, frame.Fin); err != nil { - return false, err + return err } - var newlyRcvdFinalOffset bool if frame.Fin { - newlyRcvdFinalOffset = s.finalOffset == protocol.MaxByteCount s.finalOffset = maxOffset } - if s.cancelReadErr != nil { - return newlyRcvdFinalOffset, nil + if s.cancelledLocally { + return nil } if err := s.frameQueue.Push(frame.Data, frame.Offset, frame.PutBack); err != nil { - return false, err + return err } s.signalRead() - return false, nil + return nil } func (s *receiveStream) handleResetStreamFrame(frame *wire.ResetStreamFrame) error { s.mutex.Lock() - completed, err := s.handleResetStreamFrameImpl(frame) + err := s.handleResetStreamFrameImpl(frame) + completed := s.isNewlyCompleted() s.mutex.Unlock() if completed { - s.flowController.Abandon() s.sender.onStreamCompleted(s.streamID) } return err } -func (s *receiveStream) handleResetStreamFrameImpl(frame *wire.ResetStreamFrame) (bool /*completed */, error) { +func (s *receiveStream) handleResetStreamFrameImpl(frame *wire.ResetStreamFrame) error { if s.closeForShutdownErr != nil { - return false, nil + return nil } if err := s.flowController.UpdateHighestReceived(frame.FinalSize, true); err != nil { - return false, err + return err } - newlyRcvdFinalOffset := s.finalOffset == protocol.MaxByteCount s.finalOffset = frame.FinalSize // ignore duplicate RESET_STREAM frames for this stream (after checking their final offset) - if s.resetRemotelyErr != nil { - return false, nil + if s.cancelledRemotely { + return nil } - s.resetRemotelyErr = &StreamError{ - StreamID: s.streamID, - ErrorCode: frame.ErrorCode, - Remote: true, + s.flowController.Abandon() + // don't save the error if the RESET_STREAM frames was received after CancelRead was called + if s.cancelledLocally { + return nil } + s.cancelledRemotely = true + s.cancelErr = &StreamError{StreamID: s.streamID, ErrorCode: frame.ErrorCode, Remote: true} s.signalRead() - return newlyRcvdFinalOffset, nil + return nil +} + +func (s *receiveStream) getControlFrame() (_ ackhandler.Frame, ok, hasMore bool) { + s.mutex.Lock() + defer s.mutex.Unlock() + + if !s.queuedStopSending && !s.queuedMaxStreamData { + return ackhandler.Frame{}, false, false + } + if s.queuedStopSending { + s.queuedStopSending = false + return ackhandler.Frame{ + Frame: &wire.StopSendingFrame{StreamID: s.streamID, ErrorCode: s.cancelErr.ErrorCode}, + }, true, s.queuedMaxStreamData + } + + s.queuedMaxStreamData = false + return ackhandler.Frame{ + Frame: &wire.MaxStreamDataFrame{StreamID: s.streamID, MaximumStreamData: s.flowController.GetWindowUpdate()}, + }, true, false } func (s *receiveStream) SetReadDeadline(t time.Time) error { @@ -310,10 +371,6 @@ func (s *receiveStream) closeForShutdown(err error) { s.signalRead() } -func (s *receiveStream) getWindowUpdate() protocol.ByteCount { - return s.flowController.GetWindowUpdate() -} - // signalRead performs a non-blocking send on the readChan func (s *receiveStream) signalRead() { select { diff --git a/vendor/github.com/quic-go/quic-go/send_stream.go b/vendor/github.com/quic-go/quic-go/send_stream.go index e1ce3e6..699c40e 100644 --- a/vendor/github.com/quic-go/quic-go/send_stream.go +++ b/vendor/github.com/quic-go/quic-go/send_stream.go @@ -26,7 +26,7 @@ type sendStreamI interface { type sendStream struct { mutex sync.Mutex - numOutstandingFrames int64 + numOutstandingFrames int64 // outstanding STREAM and RESET_STREAM frames retransmissionQueue []*wire.StreamFrame ctx context.Context @@ -37,12 +37,19 @@ type sendStream struct { writeOffset protocol.ByteCount - cancelWriteErr error + cancelWriteErr *StreamError closeForShutdownErr error + queuedResetStreamFrame bool + queuedBlockedFrame bool + finishedWriting bool // set once Close() is called finSent bool // set when a STREAM_FRAME with FIN bit has been sent - completed bool // set when this stream has been reported to the streamSender as completed + // Set when the application knows about the cancellation. + // This can happen because the application called CancelWrite, + // or because Write returned the error (for remote cancellations). + cancellationFlagged bool + completed bool // set when this stream has been reported to the streamSender as completed dataForWriting []byte // during a Write() call, this slice is the part of p that still needs to be sent out nextFrame *wire.StreamFrame @@ -55,11 +62,13 @@ type sendStream struct { } var ( - _ SendStream = &sendStream{} - _ sendStreamI = &sendStream{} + _ SendStream = &sendStream{} + _ sendStreamI = &sendStream{} + _ streamControlFrameGetter = &sendStream{} ) func newSendStream( + ctx context.Context, streamID protocol.StreamID, sender streamSender, flowController flowcontrol.StreamFlowController, @@ -71,7 +80,7 @@ func newSendStream( writeChan: make(chan struct{}, 1), writeOnce: make(chan struct{}, 1), // cap: 1, to protect against concurrent use of Write } - s.ctx, s.ctxCancel = context.WithCancelCause(context.Background()) + s.ctx, s.ctxCancel = context.WithCancelCause(ctx) return s } @@ -86,23 +95,32 @@ func (s *sendStream) Write(p []byte) (int, error) { s.writeOnce <- struct{}{} defer func() { <-s.writeOnce }() + isNewlyCompleted, n, err := s.write(p) + if isNewlyCompleted { + s.sender.onStreamCompleted(s.streamID) + } + return n, err +} + +func (s *sendStream) write(p []byte) (bool /* is newly completed */, int, error) { s.mutex.Lock() defer s.mutex.Unlock() if s.finishedWriting { - return 0, fmt.Errorf("write on closed stream %d", s.streamID) + return false, 0, fmt.Errorf("write on closed stream %d", s.streamID) } if s.cancelWriteErr != nil { - return 0, s.cancelWriteErr + s.cancellationFlagged = true + return s.isNewlyCompleted(), 0, s.cancelWriteErr } if s.closeForShutdownErr != nil { - return 0, s.closeForShutdownErr + return false, 0, s.closeForShutdownErr } if !s.deadline.IsZero() && !time.Now().Before(s.deadline) { - return 0, errDeadline + return false, 0, errDeadline } if len(p) == 0 { - return 0, nil + return false, 0, nil } s.dataForWriting = p @@ -143,7 +161,7 @@ func (s *sendStream) Write(p []byte) (int, error) { if !deadline.IsZero() { if !time.Now().Before(deadline) { s.dataForWriting = nil - return bytesWritten, errDeadline + return false, bytesWritten, errDeadline } if deadlineTimer == nil { deadlineTimer = utils.NewTimer() @@ -158,7 +176,7 @@ func (s *sendStream) Write(p []byte) (int, error) { s.mutex.Unlock() if !notifiedSender { - s.sender.onHasStreamData(s.streamID) // must be called without holding the mutex + s.sender.onHasStreamData(s.streamID, s) // must be called without holding the mutex notifiedSender = true } if copied { @@ -178,14 +196,15 @@ func (s *sendStream) Write(p []byte) (int, error) { } if bytesWritten == len(p) { - return bytesWritten, nil + return false, bytesWritten, nil } if s.closeForShutdownErr != nil { - return bytesWritten, s.closeForShutdownErr + return false, bytesWritten, s.closeForShutdownErr } else if s.cancelWriteErr != nil { - return bytesWritten, s.cancelWriteErr + s.cancellationFlagged = true + return s.isNewlyCompleted(), bytesWritten, s.cancelWriteErr } - return bytesWritten, nil + return false, bytesWritten, nil } func (s *sendStream) canBufferStreamFrame() bool { @@ -200,12 +219,15 @@ func (s *sendStream) canBufferStreamFrame() bool { // maxBytes is the maximum length this frame (including frame header) will have. func (s *sendStream) popStreamFrame(maxBytes protocol.ByteCount, v protocol.Version) (af ackhandler.StreamFrame, ok, hasMore bool) { s.mutex.Lock() - f, hasMoreData := s.popNewOrRetransmittedStreamFrame(maxBytes, v) + f, hasMoreData, queuedControlFrame := s.popNewOrRetransmittedStreamFrame(maxBytes, v) if f != nil { s.numOutstandingFrames++ } s.mutex.Unlock() + if queuedControlFrame { + s.sender.onHasStreamControlFrame(s.streamID, s) + } if f == nil { return ackhandler.StreamFrame{}, false, hasMoreData } @@ -215,20 +237,20 @@ func (s *sendStream) popStreamFrame(maxBytes protocol.ByteCount, v protocol.Vers }, true, hasMoreData } -func (s *sendStream) popNewOrRetransmittedStreamFrame(maxBytes protocol.ByteCount, v protocol.Version) (*wire.StreamFrame, bool /* has more data to send */) { +func (s *sendStream) popNewOrRetransmittedStreamFrame(maxBytes protocol.ByteCount, v protocol.Version) (_ *wire.StreamFrame, hasMoreData, queuedControlFrame bool) { if s.cancelWriteErr != nil || s.closeForShutdownErr != nil { - return nil, false + return nil, false, false } if len(s.retransmissionQueue) > 0 { f, hasMoreRetransmissions := s.maybeGetRetransmission(maxBytes, v) if f != nil || hasMoreRetransmissions { if f == nil { - return nil, true + return nil, true, false } // We always claim that we have more data to send. // This might be incorrect, in which case there'll be a spurious call to popStreamFrame in the future. - return f, true + return f, true, false } } @@ -240,21 +262,18 @@ func (s *sendStream) popNewOrRetransmittedStreamFrame(maxBytes protocol.ByteCoun Offset: s.writeOffset, DataLenPresent: true, Fin: true, - }, false + }, false, false } - return nil, false + return nil, false, false } sendWindow := s.flowController.SendWindowSize() if sendWindow == 0 { - if isBlocked, offset := s.flowController.IsNewlyBlocked(); isBlocked { - s.sender.queueControlFrame(&wire.StreamDataBlockedFrame{ - StreamID: s.streamID, - MaximumStreamData: offset, - }) - return nil, false + if s.flowController.IsNewlyBlocked() { + s.queuedBlockedFrame = true + return nil, false, true } - return nil, true + return nil, true, false } f, hasMoreData := s.popNewStreamFrame(maxBytes, sendWindow, v) @@ -266,7 +285,7 @@ func (s *sendStream) popNewOrRetransmittedStreamFrame(maxBytes protocol.ByteCoun if f.Fin { s.finSent = true } - return f, hasMoreData + return f, hasMoreData, false } func (s *sendStream) popNewStreamFrame(maxBytes, sendWindow protocol.ByteCount, v protocol.Version) (*wire.StreamFrame, bool) { @@ -348,8 +367,24 @@ func (s *sendStream) getDataForWriting(f *wire.StreamFrame, maxBytes protocol.By } func (s *sendStream) isNewlyCompleted() bool { - completed := (s.finSent || s.cancelWriteErr != nil) && s.numOutstandingFrames == 0 && len(s.retransmissionQueue) == 0 - if completed && !s.completed { + if s.completed { + return false + } + // We need to keep the stream around until all frames have been sent and acknowledged. + if s.numOutstandingFrames > 0 || len(s.retransmissionQueue) > 0 || s.queuedResetStreamFrame { + return false + } + // The stream is completed if we sent the FIN. + if s.finSent { + s.completed = true + return true + } + // The stream is also completed if: + // 1. the application called CancelWrite, or + // 2. we received a STOP_SENDING, and + // * the application consumed the error via Write, or + // * the application called Close + if s.cancelWriteErr != nil && (s.cancellationFlagged || s.finishedWriting) { s.completed = true return true } @@ -362,15 +397,23 @@ func (s *sendStream) Close() error { s.mutex.Unlock() return nil } - if s.cancelWriteErr != nil { - s.mutex.Unlock() - return fmt.Errorf("close called for canceled stream %d", s.streamID) - } - s.ctxCancel(nil) s.finishedWriting = true + cancelWriteErr := s.cancelWriteErr + if cancelWriteErr != nil { + s.cancellationFlagged = true + } + completed := s.isNewlyCompleted() s.mutex.Unlock() - s.sender.onHasStreamData(s.streamID) // need to send the FIN, must be called without holding the mutex + if completed { + s.sender.onStreamCompleted(s.streamID) + } + if cancelWriteErr != nil { + return fmt.Errorf("close called for canceled stream %d", s.streamID) + } + s.sender.onHasStreamData(s.streamID, s) // need to send the FIN, must be called without holding the mutex + + s.ctxCancel(nil) return nil } @@ -378,9 +421,26 @@ func (s *sendStream) CancelWrite(errorCode StreamErrorCode) { s.cancelWriteImpl(errorCode, false) } -// must be called after locking the mutex func (s *sendStream) cancelWriteImpl(errorCode qerr.StreamErrorCode, remote bool) { s.mutex.Lock() + if s.closeForShutdownErr != nil { + s.mutex.Unlock() + return + } + if !remote { + s.cancellationFlagged = true + if s.cancelWriteErr != nil { + completed := s.isNewlyCompleted() + s.mutex.Unlock() + // The user has called CancelWrite. If the previous cancellation was + // because of a STOP_SENDING, we don't need to flag the error to the + // user anymore. + if completed { + s.sender.onStreamCompleted(s.streamID) + } + return + } + } if s.cancelWriteErr != nil { s.mutex.Unlock() return @@ -389,18 +449,11 @@ func (s *sendStream) cancelWriteImpl(errorCode qerr.StreamErrorCode, remote bool s.ctxCancel(s.cancelWriteErr) s.numOutstandingFrames = 0 s.retransmissionQueue = nil - newlyCompleted := s.isNewlyCompleted() + s.queuedResetStreamFrame = true s.mutex.Unlock() s.signalWrite() - s.sender.queueControlFrame(&wire.ResetStreamFrame{ - StreamID: s.streamID, - FinalSize: s.writeOffset, - ErrorCode: errorCode, - }) - if newlyCompleted { - s.sender.onStreamCompleted(s.streamID) - } + s.sender.onHasStreamControlFrame(s.streamID, s) } func (s *sendStream) updateSendWindow(limit protocol.ByteCount) { @@ -412,7 +465,7 @@ func (s *sendStream) updateSendWindow(limit protocol.ByteCount) { hasStreamData := s.dataForWriting != nil || s.nextFrame != nil s.mutex.Unlock() if hasStreamData { - s.sender.onHasStreamData(s.streamID) + s.sender.onHasStreamData(s.streamID, s) } } @@ -420,6 +473,32 @@ func (s *sendStream) handleStopSendingFrame(frame *wire.StopSendingFrame) { s.cancelWriteImpl(frame.ErrorCode, true) } +func (s *sendStream) getControlFrame() (_ ackhandler.Frame, ok, hasMore bool) { + s.mutex.Lock() + defer s.mutex.Unlock() + + if !s.queuedBlockedFrame && !s.queuedResetStreamFrame { + return ackhandler.Frame{}, false, false + } + if s.queuedBlockedFrame { + s.queuedBlockedFrame = false + return ackhandler.Frame{ + Frame: &wire.StreamDataBlockedFrame{StreamID: s.streamID, MaximumStreamData: s.writeOffset}, + }, true, s.queuedResetStreamFrame + } + // RESET_STREAM frame + s.queuedResetStreamFrame = false + s.numOutstandingFrames++ + return ackhandler.Frame{ + Frame: &wire.ResetStreamFrame{ + StreamID: s.streamID, + FinalSize: s.writeOffset, + ErrorCode: s.cancelWriteErr.ErrorCode, + }, + Handler: (*sendStreamResetStreamHandler)(s), + }, true, false +} + func (s *sendStream) Context() context.Context { return s.ctx } @@ -437,7 +516,6 @@ func (s *sendStream) SetWriteDeadline(t time.Time) error { // The peer will NOT be informed about this: the stream is closed without sending a FIN or RST. func (s *sendStream) closeForShutdown(err error) { s.mutex.Lock() - s.ctxCancel(err) s.closeForShutdownErr = err s.mutex.Unlock() s.signalWrite() @@ -467,10 +545,10 @@ func (s *sendStreamAckHandler) OnAcked(f wire.Frame) { if s.numOutstandingFrames < 0 { panic("numOutStandingFrames negative") } - newlyCompleted := (*sendStream)(s).isNewlyCompleted() + completed := (*sendStream)(s).isNewlyCompleted() s.mutex.Unlock() - if newlyCompleted { + if completed { s.sender.onStreamCompleted(s.streamID) } } @@ -490,5 +568,30 @@ func (s *sendStreamAckHandler) OnLost(f wire.Frame) { } s.mutex.Unlock() - s.sender.onHasStreamData(s.streamID) + s.sender.onHasStreamData(s.streamID, (*sendStream)(s)) +} + +type sendStreamResetStreamHandler sendStream + +var _ ackhandler.FrameHandler = &sendStreamResetStreamHandler{} + +func (s *sendStreamResetStreamHandler) OnAcked(wire.Frame) { + s.mutex.Lock() + s.numOutstandingFrames-- + if s.numOutstandingFrames < 0 { + panic("numOutStandingFrames negative") + } + completed := (*sendStream)(s).isNewlyCompleted() + s.mutex.Unlock() + + if completed { + s.sender.onStreamCompleted(s.streamID) + } +} + +func (s *sendStreamResetStreamHandler) OnLost(wire.Frame) { + s.mutex.Lock() + s.queuedResetStreamFrame = true + s.mutex.Unlock() + s.sender.onHasStreamControlFrame(s.streamID, (*sendStream)(s)) } diff --git a/vendor/github.com/quic-go/quic-go/server.go b/vendor/github.com/quic-go/quic-go/server.go index afbd18f..0cf45ac 100644 --- a/vendor/github.com/quic-go/quic-go/server.go +++ b/vendor/github.com/quic-go/quic-go/server.go @@ -18,7 +18,12 @@ import ( ) // ErrServerClosed is returned by the Listener or EarlyListener's Accept method after a call to Close. -var ErrServerClosed = errors.New("quic: server closed") +var ErrServerClosed = errServerClosed{} + +type errServerClosed struct{} + +func (errServerClosed) Error() string { return "quic: server closed" } +func (errServerClosed) Unwrap() error { return net.ErrClosed } // packetHandler handles packets type packetHandler interface { @@ -76,8 +81,12 @@ type baseServer struct { nextZeroRTTCleanup time.Time zeroRTTQueues map[protocol.ConnectionID]*zeroRTTQueue // only initialized if acceptEarlyConns == true + connContext func(context.Context) context.Context + // set as a member, so they can be set in the tests newConn func( + context.Context, + context.CancelCauseFunc, sendConn, connRunner, protocol.ConnectionID, /* original dest connection ID */ @@ -92,7 +101,6 @@ type baseServer struct { *handshake.TokenGenerator, bool, /* client address validated by an address validation token */ *logging.ConnectionTracer, - uint64, utils.Logger, protocol.Version, ) quicConn @@ -231,6 +239,7 @@ func newServer( conn rawConn, connHandler packetHandlerManager, connIDGenerator ConnectionIDGenerator, + connContext func(context.Context) context.Context, tlsConf *tls.Config, config *Config, tracer *logging.Tracer, @@ -243,6 +252,7 @@ func newServer( ) *baseServer { s := &baseServer{ conn: conn, + connContext: connContext, tlsConf: tlsConf, config: config, tokenGenerator: handshake.NewTokenGenerator(tokenGeneratorKey), @@ -631,7 +641,26 @@ func (s *baseServer) handleInitialImpl(p receivedPacket, hdr *wire.Header) error } var conn quicConn - tracingID := nextConnTracingID() + var cancel context.CancelCauseFunc + ctx, cancel1 := context.WithCancelCause(context.Background()) + if s.connContext != nil { + ctx = s.connContext(ctx) + if ctx == nil { + panic("quic: ConnContext returned nil") + } + // There's no guarantee that the application returns a context + // that's derived from the context we passed into ConnContext. + // We need to make sure that both contexts are cancelled. + var cancel2 context.CancelCauseFunc + ctx, cancel2 = context.WithCancelCause(ctx) + cancel = func(cause error) { + cancel1(cause) + cancel2(cause) + } + } else { + cancel = cancel1 + } + ctx = context.WithValue(ctx, ConnectionTracingKey, nextConnTracingID()) var tracer *logging.ConnectionTracer if config.Tracer != nil { // Use the same connection ID that is passed to the client's GetLogWriter callback. @@ -639,7 +668,7 @@ func (s *baseServer) handleInitialImpl(p receivedPacket, hdr *wire.Header) error if origDestConnID.Len() > 0 { connID = origDestConnID } - tracer = config.Tracer(context.WithValue(context.Background(), ConnectionTracingKey, tracingID), protocol.PerspectiveServer, connID) + tracer = config.Tracer(ctx, protocol.PerspectiveServer, connID) } connID, err := s.connIDGenerator.GenerateConnectionID() if err != nil { @@ -647,6 +676,8 @@ func (s *baseServer) handleInitialImpl(p receivedPacket, hdr *wire.Header) error } s.logger.Debugf("Changing connection ID to %s.", connID) conn = s.newConn( + ctx, + cancel, newSendConn(s.conn, p.remoteAddr, p.info, s.logger), s.connHandler, origDestConnID, @@ -661,7 +692,6 @@ func (s *baseServer) handleInitialImpl(p receivedPacket, hdr *wire.Header) error s.tokenGenerator, clientAddrVerified, tracer, - tracingID, s.logger, hdr.Version, ) @@ -778,7 +808,7 @@ func (s *baseServer) maybeSendInvalidToken(p rejectedPacket) { hdr := p.hdr sealer, opener := handshake.NewInitialAEAD(hdr.DestConnectionID, protocol.PerspectiveServer, hdr.Version) data := p.data[:hdr.ParsedLen()+hdr.Length] - extHdr, err := unpackLongHeader(opener, hdr, data, hdr.Version) + extHdr, err := unpackLongHeader(opener, hdr, data) // Only send INVALID_TOKEN if we can unprotect the packet. // This makes sure that we won't send it for packets that were corrupted. if err != nil { diff --git a/vendor/github.com/quic-go/quic-go/stream.go b/vendor/github.com/quic-go/quic-go/stream.go index ce4374d..1ed2632 100644 --- a/vendor/github.com/quic-go/quic-go/stream.go +++ b/vendor/github.com/quic-go/quic-go/stream.go @@ -1,6 +1,7 @@ package quic import ( + "context" "net" "os" "sync" @@ -23,8 +24,8 @@ var errDeadline net.Error = &deadlineError{} // The streamSender is notified by the stream about various events. type streamSender interface { - queueControlFrame(wire.Frame) - onHasStreamData(protocol.StreamID) + onHasStreamData(protocol.StreamID, sendStreamI) + onHasStreamControlFrame(protocol.StreamID, streamControlFrameGetter) // must be called without holding the mutex that is acquired by closeForShutdown onStreamCompleted(protocol.StreamID) } @@ -33,19 +34,16 @@ type streamSender interface { // This is necessary in order to keep track when both halves have been completed. type uniStreamSender struct { streamSender - onStreamCompletedImpl func() + onStreamCompletedImpl func() + onHasStreamControlFrameImpl func(protocol.StreamID, streamControlFrameGetter) } -func (s *uniStreamSender) queueControlFrame(f wire.Frame) { - s.streamSender.queueControlFrame(f) +func (s *uniStreamSender) onHasStreamData(id protocol.StreamID, str sendStreamI) { + s.streamSender.onHasStreamData(id, str) } - -func (s *uniStreamSender) onHasStreamData(id protocol.StreamID) { - s.streamSender.onHasStreamData(id) -} - -func (s *uniStreamSender) onStreamCompleted(protocol.StreamID) { - s.onStreamCompletedImpl() +func (s *uniStreamSender) onStreamCompleted(protocol.StreamID) { s.onStreamCompletedImpl() } +func (s *uniStreamSender) onHasStreamControlFrame(id protocol.StreamID, str streamControlFrameGetter) { + s.onHasStreamControlFrameImpl(id, str) } var _ streamSender = &uniStreamSender{} @@ -56,7 +54,6 @@ type streamI interface { // for receiving handleStreamFrame(*wire.StreamFrame) error handleResetStreamFrame(*wire.ResetStreamFrame) error - getWindowUpdate() protocol.ByteCount // for sending hasData() bool handleStopSendingFrame(*wire.StopSendingFrame) @@ -82,10 +79,15 @@ type stream struct { sendStreamCompleted bool } -var _ Stream = &stream{} +var ( + _ Stream = &stream{} + _ streamControlFrameGetter = &receiveStream{} +) // newStream creates a new Stream -func newStream(streamID protocol.StreamID, +func newStream( + ctx context.Context, + streamID protocol.StreamID, sender streamSender, flowController flowcontrol.StreamFlowController, ) *stream { @@ -98,8 +100,11 @@ func newStream(streamID protocol.StreamID, s.checkIfCompleted() s.completedMutex.Unlock() }, + onHasStreamControlFrameImpl: func(id protocol.StreamID, str streamControlFrameGetter) { + sender.onHasStreamControlFrame(streamID, s) + }, } - s.sendStream = *newSendStream(streamID, senderForSendStream, flowController) + s.sendStream = *newSendStream(ctx, streamID, senderForSendStream, flowController) senderForReceiveStream := &uniStreamSender{ streamSender: sender, onStreamCompletedImpl: func() { @@ -108,6 +113,9 @@ func newStream(streamID protocol.StreamID, s.checkIfCompleted() s.completedMutex.Unlock() }, + onHasStreamControlFrameImpl: func(id protocol.StreamID, str streamControlFrameGetter) { + sender.onHasStreamControlFrame(streamID, s) + }, } s.receiveStream = *newReceiveStream(streamID, senderForReceiveStream, flowController) return s @@ -123,6 +131,14 @@ func (s *stream) Close() error { return s.sendStream.Close() } +func (s *stream) getControlFrame() (_ ackhandler.Frame, ok, hasMore bool) { + f, ok, _ := s.sendStream.getControlFrame() + if ok { + return f, true, true + } + return s.receiveStream.getControlFrame() +} + func (s *stream) SetDeadline(t time.Time) error { _ = s.SetReadDeadline(t) // SetReadDeadline never errors _ = s.SetWriteDeadline(t) // SetWriteDeadline never errors diff --git a/vendor/github.com/quic-go/quic-go/streams_map.go b/vendor/github.com/quic-go/quic-go/streams_map.go index b1a80eb..0ce9128 100644 --- a/vendor/github.com/quic-go/quic-go/streams_map.go +++ b/vendor/github.com/quic-go/quic-go/streams_map.go @@ -38,19 +38,31 @@ type streamOpenErr struct{ error } var _ net.Error = &streamOpenErr{} -func (e streamOpenErr) Temporary() bool { return e.error == errTooManyOpenStreams } -func (streamOpenErr) Timeout() bool { return false } +func (streamOpenErr) Timeout() bool { return false } +func (e streamOpenErr) Unwrap() error { return e.error } -// errTooManyOpenStreams is used internally by the outgoing streams maps. -var errTooManyOpenStreams = errors.New("too many open streams") +func (e streamOpenErr) Temporary() bool { + // In older versions of quic-go, the stream limit error was documented to be a net.Error.Temporary. + // This function was since deprecated, but we keep the existing behavior. + return errors.Is(e, &StreamLimitReachedError{}) +} + +// StreamLimitReachedError is returned from Connection.OpenStream and Connection.OpenUniStream +// when it is not possible to open a new stream because the number of opens streams reached +// the peer's stream limit. +type StreamLimitReachedError struct{} + +func (e StreamLimitReachedError) Error() string { return "too many open streams" } type streamsMap struct { + ctx context.Context // not used for cancellations, but carries the values associated with the connection perspective protocol.Perspective maxIncomingBidiStreams uint64 maxIncomingUniStreams uint64 sender streamSender + queueControlFrame func(wire.Frame) newFlowController func(protocol.StreamID) flowcontrol.StreamFlowController mutex sync.Mutex @@ -64,14 +76,18 @@ type streamsMap struct { var _ streamManager = &streamsMap{} func newStreamsMap( + ctx context.Context, sender streamSender, + queueControlFrame func(wire.Frame), newFlowController func(protocol.StreamID) flowcontrol.StreamFlowController, maxIncomingBidiStreams uint64, maxIncomingUniStreams uint64, perspective protocol.Perspective, -) streamManager { +) *streamsMap { m := &streamsMap{ + ctx: ctx, perspective: perspective, + queueControlFrame: queueControlFrame, newFlowController: newFlowController, maxIncomingBidiStreams: maxIncomingBidiStreams, maxIncomingUniStreams: maxIncomingUniStreams, @@ -86,26 +102,26 @@ func (m *streamsMap) initMaps() { protocol.StreamTypeBidi, func(num protocol.StreamNum) streamI { id := num.StreamID(protocol.StreamTypeBidi, m.perspective) - return newStream(id, m.sender, m.newFlowController(id)) + return newStream(m.ctx, id, m.sender, m.newFlowController(id)) }, - m.sender.queueControlFrame, + m.queueControlFrame, ) m.incomingBidiStreams = newIncomingStreamsMap( protocol.StreamTypeBidi, func(num protocol.StreamNum) streamI { id := num.StreamID(protocol.StreamTypeBidi, m.perspective.Opposite()) - return newStream(id, m.sender, m.newFlowController(id)) + return newStream(m.ctx, id, m.sender, m.newFlowController(id)) }, m.maxIncomingBidiStreams, - m.sender.queueControlFrame, + m.queueControlFrame, ) m.outgoingUniStreams = newOutgoingStreamsMap( protocol.StreamTypeUni, func(num protocol.StreamNum) sendStreamI { id := num.StreamID(protocol.StreamTypeUni, m.perspective) - return newSendStream(id, m.sender, m.newFlowController(id)) + return newSendStream(m.ctx, id, m.sender, m.newFlowController(id)) }, - m.sender.queueControlFrame, + m.queueControlFrame, ) m.incomingUniStreams = newIncomingStreamsMap( protocol.StreamTypeUni, @@ -114,7 +130,7 @@ func (m *streamsMap) initMaps() { return newReceiveStream(id, m.sender, m.newFlowController(id)) }, m.maxIncomingUniStreams, - m.sender.queueControlFrame, + m.queueControlFrame, ) } diff --git a/vendor/github.com/quic-go/quic-go/streams_map_outgoing.go b/vendor/github.com/quic-go/quic-go/streams_map_outgoing.go index fd45f4e..a8d04b0 100644 --- a/vendor/github.com/quic-go/quic-go/streams_map_outgoing.go +++ b/vendor/github.com/quic-go/quic-go/streams_map_outgoing.go @@ -60,7 +60,7 @@ func (m *outgoingStreamsMap[T]) OpenStream() (T, error) { // if there are OpenStreamSync calls waiting, return an error here if len(m.openQueue) > 0 || m.nextStream > m.maxStream { m.maybeSendBlockedFrame() - return *new(T), streamOpenErr{errTooManyOpenStreams} + return *new(T), streamOpenErr{&StreamLimitReachedError{}} } return m.openStream(), nil } diff --git a/vendor/github.com/quic-go/quic-go/sys_conn_helper_darwin.go b/vendor/github.com/quic-go/quic-go/sys_conn_helper_darwin.go index d761072..545502d 100644 --- a/vendor/github.com/quic-go/quic-go/sys_conn_helper_darwin.go +++ b/vendor/github.com/quic-go/quic-go/sys_conn_helper_darwin.go @@ -33,4 +33,6 @@ func parseIPv4PktInfo(body []byte) (ip netip.Addr, ifIndex uint32, ok bool) { return netip.AddrFrom4(*(*[4]byte)(body[8:12])), binary.LittleEndian.Uint32(body), true } -func isGSOSupported(syscall.RawConn) bool { return false } +func isGSOEnabled(syscall.RawConn) bool { return false } + +func isECNEnabled() bool { return !isECNDisabledUsingEnv() } diff --git a/vendor/github.com/quic-go/quic-go/sys_conn_helper_freebsd.go b/vendor/github.com/quic-go/quic-go/sys_conn_helper_freebsd.go index a53ca2e..521f80d 100644 --- a/vendor/github.com/quic-go/quic-go/sys_conn_helper_freebsd.go +++ b/vendor/github.com/quic-go/quic-go/sys_conn_helper_freebsd.go @@ -28,4 +28,6 @@ func parseIPv4PktInfo(body []byte) (ip netip.Addr, _ uint32, ok bool) { return netip.AddrFrom4(*(*[4]byte)(body)), 0, true } -func isGSOSupported(syscall.RawConn) bool { return false } +func isGSOEnabled(syscall.RawConn) bool { return false } + +func isECNEnabled() bool { return !isECNDisabledUsingEnv() } diff --git a/vendor/github.com/quic-go/quic-go/sys_conn_helper_linux.go b/vendor/github.com/quic-go/quic-go/sys_conn_helper_linux.go index 5fbf34a..eec1271 100644 --- a/vendor/github.com/quic-go/quic-go/sys_conn_helper_linux.go +++ b/vendor/github.com/quic-go/quic-go/sys_conn_helper_linux.go @@ -23,6 +23,12 @@ const ecnIPv4DataLen = 1 const batchSize = 8 // needs to smaller than MaxUint8 (otherwise the type of oobConn.readPos has to be changed) +var kernelVersionMajor int + +func init() { + kernelVersionMajor, _ = kernelVersion() +} + func forceSetReceiveBuffer(c syscall.RawConn, bytes int) error { var serr error if err := c.Control(func(fd uintptr) { @@ -55,9 +61,12 @@ func parseIPv4PktInfo(body []byte) (ip netip.Addr, ifIndex uint32, ok bool) { return netip.AddrFrom4(*(*[4]byte)(body[8:12])), binary.LittleEndian.Uint32(body), true } -// isGSOSupported tests if the kernel supports GSO. +// isGSOEnabled tests if the kernel supports GSO. // Sending with GSO might still fail later on, if the interface doesn't support it (see isGSOError). -func isGSOSupported(conn syscall.RawConn) bool { +func isGSOEnabled(conn syscall.RawConn) bool { + if kernelVersionMajor < 5 { + return false + } disabled, err := strconv.ParseBool(os.Getenv("QUIC_GO_DISABLE_GSO")) if err == nil && disabled { return false @@ -108,3 +117,40 @@ func isPermissionError(err error) bool { } return false } + +func isECNEnabled() bool { + return kernelVersionMajor >= 5 && !isECNDisabledUsingEnv() +} + +// kernelVersion returns major and minor kernel version numbers, parsed from +// the syscall.Uname's Release field, or 0, 0 if the version can't be obtained +// or parsed. +// +// copied from the standard library's internal/syscall/unix/kernel_version_linux.go +func kernelVersion() (major, minor int) { + var uname syscall.Utsname + if err := syscall.Uname(&uname); err != nil { + return + } + + var ( + values [2]int + value, vi int + ) + for _, c := range uname.Release { + if '0' <= c && c <= '9' { + value = (value * 10) + int(c-'0') + } else { + // Note that we're assuming N.N.N here. + // If we see anything else, we are likely to mis-parse it. + values[vi] = value + vi++ + if vi >= len(values) { + break + } + value = 0 + } + } + + return values[0], values[1] +} diff --git a/vendor/github.com/quic-go/quic-go/sys_conn_oob.go b/vendor/github.com/quic-go/quic-go/sys_conn_oob.go index 64d581c..a6795ca 100644 --- a/vendor/github.com/quic-go/quic-go/sys_conn_oob.go +++ b/vendor/github.com/quic-go/quic-go/sys_conn_oob.go @@ -59,7 +59,7 @@ func inspectWriteBuffer(c syscall.RawConn) (int, error) { return size, serr } -func isECNDisabled() bool { +func isECNDisabledUsingEnv() bool { disabled, err := strconv.ParseBool(os.Getenv("QUIC_GO_DISABLE_ECN")) return err == nil && disabled } @@ -147,8 +147,8 @@ func newConn(c OOBCapablePacketConn, supportsDF bool) (*oobConn, error) { readPos: batchSize, cap: connCapabilities{ DF: supportsDF, - GSO: isGSOSupported(rawConn), - ECN: !isECNDisabled(), + GSO: isGSOEnabled(rawConn), + ECN: isECNEnabled(), }, } for i := 0; i < batchSize; i++ { @@ -247,7 +247,7 @@ func (c *oobConn) WritePacket(b []byte, addr net.Addr, packetInfoOOB []byte, gso } if ecn != protocol.ECNUnsupported { if !c.capabilities().ECN { - panic("tried to send a ECN-marked packet although ECN is disabled") + panic("tried to send an ECN-marked packet although ECN is disabled") } if remoteUDPAddr, ok := addr.(*net.UDPAddr); ok { if remoteUDPAddr.IP.To4() != nil { diff --git a/vendor/github.com/quic-go/quic-go/transport.go b/vendor/github.com/quic-go/quic-go/transport.go index ea219c1..059f30f 100644 --- a/vendor/github.com/quic-go/quic-go/transport.go +++ b/vendor/github.com/quic-go/quic-go/transport.go @@ -89,6 +89,17 @@ type Transport struct { // implementation of this callback (negating its return value). VerifySourceAddress func(net.Addr) bool + // ConnContext is called when the server accepts a new connection. + // The context is closed when the connection is closed, or when the handshake fails for any reason. + // The context returned from the callback is used to derive every other context used during the + // lifetime of the connection: + // * the context passed to crypto/tls (and used on the tls.ClientHelloInfo) + // * the context used in Config.Tracer + // * the context returned from Connection.Context + // * the context returned from SendStream.Context + // It is not used for dialed connections. + ConnContext func(context.Context) context.Context + // A Tracer traces events that don't belong to a single QUIC connection. // Tracer.Close is called when the transport is closed. Tracer *logging.Tracer @@ -168,6 +179,7 @@ func (t *Transport) createServer(tlsConf *tls.Config, conf *Config, allow0RTT bo t.conn, t.handlerMap, t.connIDGenerator, + t.ConnContext, tlsConf, conf, t.Tracer, diff --git a/vendor/github.com/quic-go/quic-go/window_update_queue.go b/vendor/github.com/quic-go/quic-go/window_update_queue.go deleted file mode 100644 index 9ed1214..0000000 --- a/vendor/github.com/quic-go/quic-go/window_update_queue.go +++ /dev/null @@ -1,71 +0,0 @@ -package quic - -import ( - "sync" - - "github.com/quic-go/quic-go/internal/flowcontrol" - "github.com/quic-go/quic-go/internal/protocol" - "github.com/quic-go/quic-go/internal/wire" -) - -type windowUpdateQueue struct { - mutex sync.Mutex - - queue map[protocol.StreamID]struct{} // used as a set - queuedConn bool // connection-level window update - - streamGetter streamGetter - connFlowController flowcontrol.ConnectionFlowController - callback func(wire.Frame) -} - -func newWindowUpdateQueue( - streamGetter streamGetter, - connFC flowcontrol.ConnectionFlowController, - cb func(wire.Frame), -) *windowUpdateQueue { - return &windowUpdateQueue{ - queue: make(map[protocol.StreamID]struct{}), - streamGetter: streamGetter, - connFlowController: connFC, - callback: cb, - } -} - -func (q *windowUpdateQueue) AddStream(id protocol.StreamID) { - q.mutex.Lock() - q.queue[id] = struct{}{} - q.mutex.Unlock() -} - -func (q *windowUpdateQueue) AddConnection() { - q.mutex.Lock() - q.queuedConn = true - q.mutex.Unlock() -} - -func (q *windowUpdateQueue) QueueAll() { - q.mutex.Lock() - // queue a connection-level window update - if q.queuedConn { - q.callback(&wire.MaxDataFrame{MaximumData: q.connFlowController.GetWindowUpdate()}) - q.queuedConn = false - } - // queue all stream-level window updates - for id := range q.queue { - delete(q.queue, id) - str, err := q.streamGetter.GetOrOpenReceiveStream(id) - if err != nil || str == nil { // the stream can be nil if it was completed before dequeing the window update - continue - } - offset := str.getWindowUpdate() - if offset == 0 { // can happen if we received a final offset, right after queueing the window update - continue - } - q.callback(&wire.MaxStreamDataFrame{ - StreamID: id, - MaximumStreamData: offset, - }) - } - q.mutex.Unlock() -} diff --git a/vendor/github.com/skycoin/dmsg/pkg/dmsg/const.go b/vendor/github.com/skycoin/dmsg/pkg/dmsg/const.go index ac3e857..dfe8a7e 100644 --- a/vendor/github.com/skycoin/dmsg/pkg/dmsg/const.go +++ b/vendor/github.com/skycoin/dmsg/pkg/dmsg/const.go @@ -25,8 +25,7 @@ const ( // DiscAddr returns the address of the dmsg discovery func DiscAddr(testenv bool) string { if testenv { - return skywire.Prod.DmsgDiscovery - + return skywire.Test.DmsgDiscovery } - return skywire.Test.DmsgDiscovery + return skywire.Prod.DmsgDiscovery } diff --git a/vendor/github.com/skycoin/dmsg/pkg/dmsg/session_common.go b/vendor/github.com/skycoin/dmsg/pkg/dmsg/session_common.go index 12e9c81..d38afd1 100644 --- a/vendor/github.com/skycoin/dmsg/pkg/dmsg/session_common.go +++ b/vendor/github.com/skycoin/dmsg/pkg/dmsg/session_common.go @@ -3,11 +3,13 @@ package dmsg import ( "encoding/binary" + "fmt" "io" "net" "sync" "time" + "github.com/chen3feng/safecast" "github.com/hashicorp/yamux" "github.com/sirupsen/logrus" "github.com/skycoin/skywire-utilities/pkg/cipher" @@ -123,9 +125,13 @@ func (sc *SessionCommon) writeObject(w io.Writer, obj SignedObject) error { p := sc.ns.EncryptUnsafe(obj) sc.wMx.Unlock() p = append(make([]byte, 2), p...) - binary.BigEndian.PutUint16(p, uint16(len(p)-2)) - _, err := w.Write(p) - return err + lps2, ok := safecast.To[uint16](len(p) - 2) + if ok { + binary.BigEndian.PutUint16(p, lps2) + _, err := w.Write(p) + return err + } + return fmt.Errorf("writeObject failed cast to uint16") } func (sc *SessionCommon) readObject(r io.Reader) (SignedObject, error) { diff --git a/vendor/github.com/skycoin/dmsg/pkg/dmsghttp/util.go b/vendor/github.com/skycoin/dmsg/pkg/dmsghttp/util.go index af97bc7..a3ac740 100644 --- a/vendor/github.com/skycoin/dmsg/pkg/dmsghttp/util.go +++ b/vendor/github.com/skycoin/dmsg/pkg/dmsghttp/util.go @@ -52,7 +52,7 @@ func UpdateServers(ctx context.Context, dClient disc.APIClient, dmsgDisc string, for { select { case <-ctx.Done(): - return + return //nolint case <-ticker.C: servers, err := dmsgclient.AllServers(ctx) if err != nil { diff --git a/vendor/github.com/skycoin/dmsg/pkg/noise/read_writer.go b/vendor/github.com/skycoin/dmsg/pkg/noise/read_writer.go index 8315753..a393266 100644 --- a/vendor/github.com/skycoin/dmsg/pkg/noise/read_writer.go +++ b/vendor/github.com/skycoin/dmsg/pkg/noise/read_writer.go @@ -11,6 +11,7 @@ import ( "sync" "time" + "github.com/chen3feng/safecast" "github.com/skycoin/skywire-utilities/pkg/cipher" "github.com/skycoin/dmsg/pkg/ioutil" @@ -263,11 +264,15 @@ func ResponderHandshake(ns *Noise, r *bufio.Reader, w io.Writer) error { // It returns the bytes written. func WriteRawFrame(w io.Writer, p []byte) ([]byte, error) { buf := make([]byte, prefixSize+len(p)) - binary.BigEndian.PutUint16(buf, uint16(len(p))) - copy(buf[prefixSize:], p) + lenp, ok := safecast.To[uint16](len(p)) + if ok { + binary.BigEndian.PutUint16(buf, lenp) + copy(buf[prefixSize:], p) - n, err := w.Write(buf) - return buf[:n], err + n, err := w.Write(buf) + return buf[:n], err + } + return []byte{}, fmt.Errorf("failed to cast length of slice to uint16") } // ReadRawFrame attempts to read a raw frame from a buffered reader. diff --git a/vendor/github.com/spf13/cobra/.golangci.yml b/vendor/github.com/spf13/cobra/.golangci.yml index 2578d94..2c8f480 100644 --- a/vendor/github.com/spf13/cobra/.golangci.yml +++ b/vendor/github.com/spf13/cobra/.golangci.yml @@ -19,44 +19,39 @@ linters: disable-all: true enable: #- bodyclose - - deadcode + # - deadcode ! deprecated since v1.49.0; replaced by 'unused' #- depguard #- dogsled #- dupl - errcheck #- exhaustive #- funlen - - gas #- gochecknoinits - goconst - #- gocritic + - gocritic #- gocyclo - #- gofmt + - gofmt - goimports - - golint #- gomnd #- goprintffuncname - #- gosec - #- gosimple + - gosec + - gosimple - govet - ineffassign - - interfacer #- lll - - maligned - - megacheck - #- misspell + - misspell #- nakedret #- noctx - #- nolintlint + - nolintlint #- rowserrcheck #- scopelint - #- staticcheck - - structcheck - #- stylecheck + - staticcheck + #- structcheck ! deprecated since v1.49.0; replaced by 'unused' + - stylecheck #- typecheck - unconvert #- unparam - #- unused - - varcheck + - unused + # - varcheck ! deprecated since v1.49.0; replaced by 'unused' #- whitespace fast: false diff --git a/vendor/github.com/spf13/cobra/README.md b/vendor/github.com/spf13/cobra/README.md index 592c0b8..6444f4b 100644 --- a/vendor/github.com/spf13/cobra/README.md +++ b/vendor/github.com/spf13/cobra/README.md @@ -4,7 +4,7 @@ Cobra is a library for creating powerful modern CLI applications. Cobra is used in many Go projects such as [Kubernetes](https://kubernetes.io/), [Hugo](https://gohugo.io), and [GitHub CLI](https://github.com/cli/cli) to -name a few. [This list](./projects_using_cobra.md) contains a more extensive list of projects using Cobra. +name a few. [This list](site/content/projects_using_cobra.md) contains a more extensive list of projects using Cobra. [![](https://img.shields.io/github/actions/workflow/status/spf13/cobra/test.yml?branch=main&longCache=true&label=Test&logo=github%20actions&logoColor=fff)](https://github.com/spf13/cobra/actions?query=workflow%3ATest) [![Go Reference](https://pkg.go.dev/badge/github.com/spf13/cobra.svg)](https://pkg.go.dev/github.com/spf13/cobra) @@ -80,7 +80,7 @@ which maintains the same interface while adding POSIX compliance. # Installing Using Cobra is easy. First, use `go get` to install the latest version -of the library. +of the library. ``` go get -u github.com/spf13/cobra@latest @@ -105,8 +105,8 @@ go install github.com/spf13/cobra-cli@latest For complete details on using the Cobra-CLI generator, please read [The Cobra Generator README](https://github.com/spf13/cobra-cli/blob/main/README.md) -For complete details on using the Cobra library, please read the [The Cobra User Guide](user_guide.md). +For complete details on using the Cobra library, please read the [The Cobra User Guide](site/content/user_guide.md). # License -Cobra is released under the Apache 2.0 license. See [LICENSE.txt](https://github.com/spf13/cobra/blob/master/LICENSE.txt) +Cobra is released under the Apache 2.0 license. See [LICENSE.txt](LICENSE.txt) diff --git a/vendor/github.com/spf13/cobra/active_help.go b/vendor/github.com/spf13/cobra/active_help.go index 2d02394..25c30e3 100644 --- a/vendor/github.com/spf13/cobra/active_help.go +++ b/vendor/github.com/spf13/cobra/active_help.go @@ -17,15 +17,14 @@ package cobra import ( "fmt" "os" - "strings" ) const ( activeHelpMarker = "_activeHelp_ " // The below values should not be changed: programs will be using them explicitly // in their user documentation, and users will be using them explicitly. - activeHelpEnvVarSuffix = "_ACTIVE_HELP" - activeHelpGlobalEnvVar = "COBRA_ACTIVE_HELP" + activeHelpEnvVarSuffix = "ACTIVE_HELP" + activeHelpGlobalEnvVar = configEnvVarGlobalPrefix + "_" + activeHelpEnvVarSuffix activeHelpGlobalDisable = "0" ) @@ -42,7 +41,7 @@ func AppendActiveHelp(compArray []string, activeHelpStr string) []string { // GetActiveHelpConfig returns the value of the ActiveHelp environment variable // _ACTIVE_HELP where is the name of the root command in upper -// case, with all - replaced by _. +// case, with all non-ASCII-alphanumeric characters replaced by `_`. // It will always return "0" if the global environment variable COBRA_ACTIVE_HELP // is set to "0". func GetActiveHelpConfig(cmd *Command) string { @@ -55,9 +54,7 @@ func GetActiveHelpConfig(cmd *Command) string { // activeHelpEnvVar returns the name of the program-specific ActiveHelp environment // variable. It has the format _ACTIVE_HELP where is the name of the -// root command in upper case, with all - replaced by _. +// root command in upper case, with all non-ASCII-alphanumeric characters replaced by `_`. func activeHelpEnvVar(name string) string { - // This format should not be changed: users will be using it explicitly. - activeHelpEnvVar := strings.ToUpper(fmt.Sprintf("%s%s", name, activeHelpEnvVarSuffix)) - return strings.ReplaceAll(activeHelpEnvVar, "-", "_") + return configEnvVar(name, activeHelpEnvVarSuffix) } diff --git a/vendor/github.com/spf13/cobra/active_help.md b/vendor/github.com/spf13/cobra/active_help.md deleted file mode 100644 index 5e7f59a..0000000 --- a/vendor/github.com/spf13/cobra/active_help.md +++ /dev/null @@ -1,157 +0,0 @@ -# Active Help - -Active Help is a framework provided by Cobra which allows a program to define messages (hints, warnings, etc) that will be printed during program usage. It aims to make it easier for your users to learn how to use your program. If configured by the program, Active Help is printed when the user triggers shell completion. - -For example, -``` -bash-5.1$ helm repo add [tab] -You must choose a name for the repo you are adding. - -bash-5.1$ bin/helm package [tab] -Please specify the path to the chart to package - -bash-5.1$ bin/helm package [tab][tab] -bin/ internal/ scripts/ pkg/ testdata/ -``` - -**Hint**: A good place to use Active Help messages is when the normal completion system does not provide any suggestions. In such cases, Active Help nicely supplements the normal shell completions to guide the user in knowing what is expected by the program. -## Supported shells - -Active Help is currently only supported for the following shells: -- Bash (using [bash completion V2](shell_completions.md#bash-completion-v2) only). Note that bash 4.4 or higher is required for the prompt to appear when an Active Help message is printed. -- Zsh - -## Adding Active Help messages - -As Active Help uses the shell completion system, the implementation of Active Help messages is done by enhancing custom dynamic completions. If you are not familiar with dynamic completions, please refer to [Shell Completions](shell_completions.md). - -Adding Active Help is done through the use of the `cobra.AppendActiveHelp(...)` function, where the program repeatedly adds Active Help messages to the list of completions. Keep reading for details. - -### Active Help for nouns - -Adding Active Help when completing a noun is done within the `ValidArgsFunction(...)` of a command. Please notice the use of `cobra.AppendActiveHelp(...)` in the following example: - -```go -cmd := &cobra.Command{ - Use: "add [NAME] [URL]", - Short: "add a chart repository", - Args: require.ExactArgs(2), - RunE: func(cmd *cobra.Command, args []string) error { - return addRepo(args) - }, - ValidArgsFunction: func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { - var comps []string - if len(args) == 0 { - comps = cobra.AppendActiveHelp(comps, "You must choose a name for the repo you are adding") - } else if len(args) == 1 { - comps = cobra.AppendActiveHelp(comps, "You must specify the URL for the repo you are adding") - } else { - comps = cobra.AppendActiveHelp(comps, "This command does not take any more arguments") - } - return comps, cobra.ShellCompDirectiveNoFileComp - }, -} -``` -The example above defines the completions (none, in this specific example) as well as the Active Help messages for the `helm repo add` command. It yields the following behavior: -``` -bash-5.1$ helm repo add [tab] -You must choose a name for the repo you are adding - -bash-5.1$ helm repo add grafana [tab] -You must specify the URL for the repo you are adding - -bash-5.1$ helm repo add grafana https://grafana.github.io/helm-charts [tab] -This command does not take any more arguments -``` -**Hint**: As can be seen in the above example, a good place to use Active Help messages is when the normal completion system does not provide any suggestions. In such cases, Active Help nicely supplements the normal shell completions. - -### Active Help for flags - -Providing Active Help for flags is done in the same fashion as for nouns, but using the completion function registered for the flag. For example: -```go -_ = cmd.RegisterFlagCompletionFunc("version", func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { - if len(args) != 2 { - return cobra.AppendActiveHelp(nil, "You must first specify the chart to install before the --version flag can be completed"), cobra.ShellCompDirectiveNoFileComp - } - return compVersionFlag(args[1], toComplete) - }) -``` -The example above prints an Active Help message when not enough information was given by the user to complete the `--version` flag. -``` -bash-5.1$ bin/helm install myrelease --version 2.0.[tab] -You must first specify the chart to install before the --version flag can be completed - -bash-5.1$ bin/helm install myrelease bitnami/solr --version 2.0.[tab][tab] -2.0.1 2.0.2 2.0.3 -``` - -## User control of Active Help - -You may want to allow your users to disable Active Help or choose between different levels of Active Help. It is entirely up to the program to define the type of configurability of Active Help that it wants to offer, if any. -Allowing to configure Active Help is entirely optional; you can use Active Help in your program without doing anything about Active Help configuration. - -The way to configure Active Help is to use the program's Active Help environment -variable. That variable is named `_ACTIVE_HELP` where `` is the name of your -program in uppercase with any `-` replaced by an `_`. The variable should be set by the user to whatever -Active Help configuration values are supported by the program. - -For example, say `helm` has chosen to support three levels for Active Help: `on`, `off`, `local`. Then a user -would set the desired behavior to `local` by doing `export HELM_ACTIVE_HELP=local` in their shell. - -For simplicity, when in `cmd.ValidArgsFunction(...)` or a flag's completion function, the program should read the -Active Help configuration using the `cobra.GetActiveHelpConfig(cmd)` function and select what Active Help messages -should or should not be added (instead of reading the environment variable directly). - -For example: -```go -ValidArgsFunction: func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { - activeHelpLevel := cobra.GetActiveHelpConfig(cmd) - - var comps []string - if len(args) == 0 { - if activeHelpLevel != "off" { - comps = cobra.AppendActiveHelp(comps, "You must choose a name for the repo you are adding") - } - } else if len(args) == 1 { - if activeHelpLevel != "off" { - comps = cobra.AppendActiveHelp(comps, "You must specify the URL for the repo you are adding") - } - } else { - if activeHelpLevel == "local" { - comps = cobra.AppendActiveHelp(comps, "This command does not take any more arguments") - } - } - return comps, cobra.ShellCompDirectiveNoFileComp -}, -``` -**Note 1**: If the `_ACTIVE_HELP` environment variable is set to the string "0", Cobra will automatically disable all Active Help output (even if some output was specified by the program using the `cobra.AppendActiveHelp(...)` function). Using "0" can simplify your code in situations where you want to blindly disable Active Help without having to call `cobra.GetActiveHelpConfig(cmd)` explicitly. - -**Note 2**: If a user wants to disable Active Help for every single program based on Cobra, she can set the environment variable `COBRA_ACTIVE_HELP` to "0". In this case `cobra.GetActiveHelpConfig(cmd)` will return "0" no matter what the variable `_ACTIVE_HELP` is set to. - -**Note 3**: If the user does not set `_ACTIVE_HELP` or `COBRA_ACTIVE_HELP` (which will be a common case), the default value for the Active Help configuration returned by `cobra.GetActiveHelpConfig(cmd)` will be the empty string. -## Active Help with Cobra's default completion command - -Cobra provides a default `completion` command for programs that wish to use it. -When using the default `completion` command, Active Help is configurable in the same -fashion as described above using environment variables. You may wish to document this in more -details for your users. - -## Debugging Active Help - -Debugging your Active Help code is done in the same way as debugging your dynamic completion code, which is with Cobra's hidden `__complete` command. Please refer to [debugging shell completion](shell_completions.md#debugging) for details. - -When debugging with the `__complete` command, if you want to specify different Active Help configurations, you should use the active help environment variable. That variable is named `_ACTIVE_HELP` where any `-` is replaced by an `_`. For example, we can test deactivating some Active Help as shown below: -``` -$ HELM_ACTIVE_HELP=1 bin/helm __complete install wordpress bitnami/h -bitnami/haproxy -bitnami/harbor -_activeHelp_ WARNING: cannot re-use a name that is still in use -:0 -Completion ended with directive: ShellCompDirectiveDefault - -$ HELM_ACTIVE_HELP=0 bin/helm __complete install wordpress bitnami/h -bitnami/haproxy -bitnami/harbor -:0 -Completion ended with directive: ShellCompDirectiveDefault -``` diff --git a/vendor/github.com/spf13/cobra/args.go b/vendor/github.com/spf13/cobra/args.go index e79ec33..ed1e70c 100644 --- a/vendor/github.com/spf13/cobra/args.go +++ b/vendor/github.com/spf13/cobra/args.go @@ -52,9 +52,9 @@ func OnlyValidArgs(cmd *Command, args []string) error { if len(cmd.ValidArgs) > 0 { // Remove any description that may be included in ValidArgs. // A description is following a tab character. - var validArgs []string + validArgs := make([]string, 0, len(cmd.ValidArgs)) for _, v := range cmd.ValidArgs { - validArgs = append(validArgs, strings.Split(v, "\t")[0]) + validArgs = append(validArgs, strings.SplitN(v, "\t", 2)[0]) } for _, v := range args { if !stringInSlice(v, validArgs) { diff --git a/vendor/github.com/spf13/cobra/bash_completions.go b/vendor/github.com/spf13/cobra/bash_completions.go index 10c7884..f4d198c 100644 --- a/vendor/github.com/spf13/cobra/bash_completions.go +++ b/vendor/github.com/spf13/cobra/bash_completions.go @@ -85,7 +85,7 @@ __%[1]s_handle_go_custom_completion() local out requestComp lastParam lastChar comp directive args # Prepare the command to request completions for the program. - # Calling ${words[0]} instead of directly %[1]s allows to handle aliases + # Calling ${words[0]} instead of directly %[1]s allows handling aliases args=("${words[@]:1}") # Disable ActiveHelp which is not supported for bash completion v1 requestComp="%[8]s=0 ${words[0]} %[2]s ${args[*]}" @@ -597,19 +597,16 @@ func writeRequiredFlag(buf io.StringWriter, cmd *Command) { if nonCompletableFlag(flag) { return } - for key := range flag.Annotations { - switch key { - case BashCompOneRequiredFlag: - format := " must_have_one_flag+=(\"--%s" - if flag.Value.Type() != "bool" { - format += "=" - } - format += cbn - WriteStringAndCheck(buf, fmt.Sprintf(format, flag.Name)) - - if len(flag.Shorthand) > 0 { - WriteStringAndCheck(buf, fmt.Sprintf(" must_have_one_flag+=(\"-%s"+cbn, flag.Shorthand)) - } + if _, ok := flag.Annotations[BashCompOneRequiredFlag]; ok { + format := " must_have_one_flag+=(\"--%s" + if flag.Value.Type() != "bool" { + format += "=" + } + format += cbn + WriteStringAndCheck(buf, fmt.Sprintf(format, flag.Name)) + + if len(flag.Shorthand) > 0 { + WriteStringAndCheck(buf, fmt.Sprintf(" must_have_one_flag+=(\"-%s"+cbn, flag.Shorthand)) } } }) @@ -621,7 +618,7 @@ func writeRequiredNouns(buf io.StringWriter, cmd *Command) { for _, value := range cmd.ValidArgs { // Remove any description that may be included following a tab character. // Descriptions are not supported by bash completion. - value = strings.Split(value, "\t")[0] + value = strings.SplitN(value, "\t", 2)[0] WriteStringAndCheck(buf, fmt.Sprintf(" must_have_one_noun+=(%q)\n", value)) } if cmd.ValidArgsFunction != nil { diff --git a/vendor/github.com/spf13/cobra/bash_completions.md b/vendor/github.com/spf13/cobra/bash_completions.md deleted file mode 100644 index 52919b2..0000000 --- a/vendor/github.com/spf13/cobra/bash_completions.md +++ /dev/null @@ -1,93 +0,0 @@ -# Generating Bash Completions For Your cobra.Command - -Please refer to [Shell Completions](shell_completions.md) for details. - -## Bash legacy dynamic completions - -For backward compatibility, Cobra still supports its legacy dynamic completion solution (described below). Unlike the `ValidArgsFunction` solution, the legacy solution will only work for Bash shell-completion and not for other shells. This legacy solution can be used along-side `ValidArgsFunction` and `RegisterFlagCompletionFunc()`, as long as both solutions are not used for the same command. This provides a path to gradually migrate from the legacy solution to the new solution. - -**Note**: Cobra's default `completion` command uses bash completion V2. If you are currently using Cobra's legacy dynamic completion solution, you should not use the default `completion` command but continue using your own. - -The legacy solution allows you to inject bash functions into the bash completion script. Those bash functions are responsible for providing the completion choices for your own completions. - -Some code that works in kubernetes: - -```bash -const ( - bash_completion_func = `__kubectl_parse_get() -{ - local kubectl_output out - if kubectl_output=$(kubectl get --no-headers "$1" 2>/dev/null); then - out=($(echo "${kubectl_output}" | awk '{print $1}')) - COMPREPLY=( $( compgen -W "${out[*]}" -- "$cur" ) ) - fi -} - -__kubectl_get_resource() -{ - if [[ ${#nouns[@]} -eq 0 ]]; then - return 1 - fi - __kubectl_parse_get ${nouns[${#nouns[@]} -1]} - if [[ $? -eq 0 ]]; then - return 0 - fi -} - -__kubectl_custom_func() { - case ${last_command} in - kubectl_get | kubectl_describe | kubectl_delete | kubectl_stop) - __kubectl_get_resource - return - ;; - *) - ;; - esac -} -`) -``` - -And then I set that in my command definition: - -```go -cmds := &cobra.Command{ - Use: "kubectl", - Short: "kubectl controls the Kubernetes cluster manager", - Long: `kubectl controls the Kubernetes cluster manager. - -Find more information at https://github.com/GoogleCloudPlatform/kubernetes.`, - Run: runHelp, - BashCompletionFunction: bash_completion_func, -} -``` - -The `BashCompletionFunction` option is really only valid/useful on the root command. Doing the above will cause `__kubectl_custom_func()` (`___custom_func()`) to be called when the built in processor was unable to find a solution. In the case of kubernetes a valid command might look something like `kubectl get pod [mypod]`. If you type `kubectl get pod [tab][tab]` the `__kubectl_customc_func()` will run because the cobra.Command only understood "kubectl" and "get." `__kubectl_custom_func()` will see that the cobra.Command is "kubectl_get" and will thus call another helper `__kubectl_get_resource()`. `__kubectl_get_resource` will look at the 'nouns' collected. In our example the only noun will be `pod`. So it will call `__kubectl_parse_get pod`. `__kubectl_parse_get` will actually call out to kubernetes and get any pods. It will then set `COMPREPLY` to valid pods! - -Similarly, for flags: - -```go - annotation := make(map[string][]string) - annotation[cobra.BashCompCustom] = []string{"__kubectl_get_namespaces"} - - flag := &pflag.Flag{ - Name: "namespace", - Usage: usage, - Annotations: annotation, - } - cmd.Flags().AddFlag(flag) -``` - -In addition add the `__kubectl_get_namespaces` implementation in the `BashCompletionFunction` -value, e.g.: - -```bash -__kubectl_get_namespaces() -{ - local template - template="{{ range .items }}{{ .metadata.name }} {{ end }}" - local kubectl_out - if kubectl_out=$(kubectl get -o template --template="${template}" namespace 2>/dev/null); then - COMPREPLY=( $( compgen -W "${kubectl_out}[*]" -- "$cur" ) ) - fi -} -``` diff --git a/vendor/github.com/spf13/cobra/bash_completionsV2.go b/vendor/github.com/spf13/cobra/bash_completionsV2.go index 19b0956..1cce5c3 100644 --- a/vendor/github.com/spf13/cobra/bash_completionsV2.go +++ b/vendor/github.com/spf13/cobra/bash_completionsV2.go @@ -57,7 +57,7 @@ __%[1]s_get_completion_results() { local requestComp lastParam lastChar args # Prepare the command to request completions for the program. - # Calling ${words[0]} instead of directly %[1]s allows to handle aliases + # Calling ${words[0]} instead of directly %[1]s allows handling aliases args=("${words[@]:1}") requestComp="${words[0]} %[2]s ${args[*]}" diff --git a/vendor/github.com/spf13/cobra/cobra.go b/vendor/github.com/spf13/cobra/cobra.go index b07b44a..e0b0947 100644 --- a/vendor/github.com/spf13/cobra/cobra.go +++ b/vendor/github.com/spf13/cobra/cobra.go @@ -43,12 +43,13 @@ var initializers []func() var finalizers []func() const ( - defaultPrefixMatching = false - defaultCommandSorting = true - defaultCaseInsensitive = false + defaultPrefixMatching = false + defaultCommandSorting = true + defaultCaseInsensitive = false + defaultTraverseRunHooks = false ) -// EnablePrefixMatching allows to set automatic prefix matching. Automatic prefix matching can be a dangerous thing +// EnablePrefixMatching allows setting automatic prefix matching. Automatic prefix matching can be a dangerous thing // to automatically enable in CLI tools. // Set this to true to enable it. var EnablePrefixMatching = defaultPrefixMatching @@ -60,6 +61,10 @@ var EnableCommandSorting = defaultCommandSorting // EnableCaseInsensitive allows case-insensitive commands names. (case sensitive by default) var EnableCaseInsensitive = defaultCaseInsensitive +// EnableTraverseRunHooks executes persistent pre-run and post-run hooks from all parents. +// By default this is disabled, which means only the first run hook to be found is executed. +var EnableTraverseRunHooks = defaultTraverseRunHooks + // MousetrapHelpText enables an information splash screen on Windows // if the CLI is started from explorer.exe. // To disable the mousetrap, just set this variable to blank string (""). @@ -188,8 +193,6 @@ func ld(s, t string, ignoreCase bool) int { d := make([][]int, len(s)+1) for i := range d { d[i] = make([]int, len(t)+1) - } - for i := range d { d[i][0] = i } for j := range d[0] { diff --git a/vendor/github.com/spf13/cobra/command.go b/vendor/github.com/spf13/cobra/command.go index 01f7c6f..54748fc 100644 --- a/vendor/github.com/spf13/cobra/command.go +++ b/vendor/github.com/spf13/cobra/command.go @@ -30,7 +30,10 @@ import ( flag "github.com/spf13/pflag" ) -const FlagSetByCobraAnnotation = "cobra_annotation_flag_set_by_cobra" +const ( + FlagSetByCobraAnnotation = "cobra_annotation_flag_set_by_cobra" + CommandDisplayNameAnnotation = "cobra_annotation_command_display_name" +) // FParseErrWhitelist configures Flag parse errors to be ignored type FParseErrWhitelist flag.ParseErrorsWhitelist @@ -99,7 +102,7 @@ type Command struct { Deprecated string // Annotations are key/value pairs that can be used by applications to identify or - // group commands. + // group commands or set special options. Annotations map[string]string // Version defines the version for this command. If this value is non-empty and the command does not @@ -115,6 +118,8 @@ type Command struct { // * PostRun() // * PersistentPostRun() // All functions get the same args, the arguments after the command name. + // The *PreRun and *PostRun functions will only be executed if the Run function of the current + // command has been declared. // // PersistentPreRun: children of this command will inherit and execute. PersistentPreRun func(cmd *Command, args []string) @@ -149,8 +154,10 @@ type Command struct { // pflags contains persistent flags. pflags *flag.FlagSet // lflags contains local flags. + // This field does not represent internal state, it's used as a cache to optimise LocalFlags function call lflags *flag.FlagSet // iflags contains inherited flags. + // This field does not represent internal state, it's used as a cache to optimise InheritedFlags function call iflags *flag.FlagSet // parentsPflags is all persistent flags of cmd's parents. parentsPflags *flag.FlagSet @@ -181,6 +188,9 @@ type Command struct { // versionTemplate is the version template defined by user. versionTemplate string + // errPrefix is the error message prefix defined by user. + errPrefix string + // inReader is a reader defined by the user that replaces stdin inReader io.Reader // outWriter is a writer defined by the user that replaces stdout @@ -346,6 +356,11 @@ func (c *Command) SetVersionTemplate(s string) { c.versionTemplate = s } +// SetErrPrefix sets error message prefix to be used. Application can use it to set custom prefix. +func (c *Command) SetErrPrefix(s string) { + c.errPrefix = s +} + // SetGlobalNormalizationFunc sets a normalization function to all flag sets and also to child commands. // The user should not have a cyclic dependency on commands. func (c *Command) SetGlobalNormalizationFunc(n func(f *flag.FlagSet, name string) flag.NormalizedName) { @@ -595,6 +610,18 @@ func (c *Command) VersionTemplate() string { ` } +// ErrPrefix return error message prefix for the command +func (c *Command) ErrPrefix() string { + if c.errPrefix != "" { + return c.errPrefix + } + + if c.HasParent() { + return c.parent.ErrPrefix() + } + return "Error:" +} + func hasNoOptDefVal(name string, fs *flag.FlagSet) bool { flag := fs.Lookup(name) if flag == nil { @@ -681,7 +708,7 @@ Loop: // This is not a flag or a flag value. Check to see if it matches what we're looking for, and if so, // return the args, excluding the one at this position. if s == x { - ret := []string{} + ret := make([]string, 0, len(args)-1) ret = append(ret, args[:pos]...) ret = append(ret, args[pos+1:]...) return ret @@ -729,14 +756,14 @@ func (c *Command) findSuggestions(arg string) string { if c.SuggestionsMinimumDistance <= 0 { c.SuggestionsMinimumDistance = 2 } - suggestionsString := "" + var sb strings.Builder if suggestions := c.SuggestionsFor(arg); len(suggestions) > 0 { - suggestionsString += "\n\nDid you mean this?\n" + sb.WriteString("\n\nDid you mean this?\n") for _, s := range suggestions { - suggestionsString += fmt.Sprintf("\t%v\n", s) + _, _ = fmt.Fprintf(&sb, "\t%v\n", s) } } - return suggestionsString + return sb.String() } func (c *Command) findNext(next string) *Command { @@ -752,7 +779,9 @@ func (c *Command) findNext(next string) *Command { } if len(matches) == 1 { - return matches[0] + // Temporarily disable gosec G602, which produces a false positive. + // See https://github.com/securego/gosec/issues/1005. + return matches[0] // #nosec G602 } return nil @@ -846,7 +875,7 @@ func (c *Command) ArgsLenAtDash() int { func (c *Command) execute(a []string) (err error) { if c == nil { - return fmt.Errorf("Called Execute() on a nil Command") + return fmt.Errorf("called Execute() on a nil Command") } if len(c.Deprecated) > 0 { @@ -910,15 +939,31 @@ func (c *Command) execute(a []string) (err error) { return err } + parents := make([]*Command, 0, 5) for p := c; p != nil; p = p.Parent() { + if EnableTraverseRunHooks { + // When EnableTraverseRunHooks is set: + // - Execute all persistent pre-runs from the root parent till this command. + // - Execute all persistent post-runs from this command till the root parent. + parents = append([]*Command{p}, parents...) + } else { + // Otherwise, execute only the first found persistent hook. + parents = append(parents, p) + } + } + for _, p := range parents { if p.PersistentPreRunE != nil { if err := p.PersistentPreRunE(c, argWoFlags); err != nil { return err } - break + if !EnableTraverseRunHooks { + break + } } else if p.PersistentPreRun != nil { p.PersistentPreRun(c, argWoFlags) - break + if !EnableTraverseRunHooks { + break + } } } if c.PreRunE != nil { @@ -955,10 +1000,14 @@ func (c *Command) execute(a []string) (err error) { if err := p.PersistentPostRunE(c, argWoFlags); err != nil { return err } - break + if !EnableTraverseRunHooks { + break + } } else if p.PersistentPostRun != nil { p.PersistentPostRun(c, argWoFlags) - break + if !EnableTraverseRunHooks { + break + } } } @@ -1048,7 +1097,7 @@ func (c *Command) ExecuteC() (cmd *Command, err error) { c = cmd } if !c.SilenceErrors { - c.PrintErrln("Error:", err.Error()) + c.PrintErrln(c.ErrPrefix(), err.Error()) c.PrintErrf("Run '%v --help' for usage.\n", c.CommandPath()) } return c, err @@ -1077,7 +1126,7 @@ func (c *Command) ExecuteC() (cmd *Command, err error) { // If root command has SilenceErrors flagged, // all subcommands should respect it if !cmd.SilenceErrors && !c.SilenceErrors { - c.PrintErrln("Error:", err.Error()) + c.PrintErrln(cmd.ErrPrefix(), err.Error()) } // If root command has SilenceUsage flagged, @@ -1140,10 +1189,11 @@ func (c *Command) InitDefaultHelpFlag() { c.mergePersistentFlags() if c.Flags().Lookup("help") == nil { usage := "help for " - if c.Name() == "" { + name := c.displayName() + if name == "" { usage += "this command" } else { - usage += c.Name() + usage += name } c.Flags().BoolP("help", "h", false, usage) _ = c.Flags().SetAnnotation("help", FlagSetByCobraAnnotation, []string{"true"}) @@ -1189,7 +1239,7 @@ func (c *Command) InitDefaultHelpCmd() { Use: "help [command]", Short: "Help about any command", Long: `Help provides help for any command in the application. -Simply type ` + c.Name() + ` help [path to command] for full details.`, +Simply type ` + c.displayName() + ` help [path to command] for full details.`, ValidArgsFunction: func(c *Command, args []string, toComplete string) ([]string, ShellCompDirective) { var completions []string cmd, _, e := c.Root().Find(args) @@ -1380,16 +1430,24 @@ func (c *Command) CommandPath() string { if c.HasParent() { return c.Parent().CommandPath() + " " + c.Name() } + return c.displayName() +} + +func (c *Command) displayName() string { + if displayName, ok := c.Annotations[CommandDisplayNameAnnotation]; ok { + return displayName + } return c.Name() } // UseLine puts out the full usage for a given command (including parents). func (c *Command) UseLine() string { var useline string + use := strings.Replace(c.Use, c.Name(), c.displayName(), 1) if c.HasParent() { - useline = c.parent.CommandPath() + " " + c.Use + useline = c.parent.CommandPath() + " " + use } else { - useline = c.Use + useline = use } if c.DisableFlagsInUseLine { return useline @@ -1591,7 +1649,7 @@ func (c *Command) GlobalNormalizationFunc() func(f *flag.FlagSet, name string) f // to this command (local and persistent declared here and by all parents). func (c *Command) Flags() *flag.FlagSet { if c.flags == nil { - c.flags = flag.NewFlagSet(c.Name(), flag.ContinueOnError) + c.flags = flag.NewFlagSet(c.displayName(), flag.ContinueOnError) if c.flagErrorBuf == nil { c.flagErrorBuf = new(bytes.Buffer) } @@ -1602,10 +1660,11 @@ func (c *Command) Flags() *flag.FlagSet { } // LocalNonPersistentFlags are flags specific to this command which will NOT persist to subcommands. +// This function does not modify the flags of the current command, it's purpose is to return the current state. func (c *Command) LocalNonPersistentFlags() *flag.FlagSet { persistentFlags := c.PersistentFlags() - out := flag.NewFlagSet(c.Name(), flag.ContinueOnError) + out := flag.NewFlagSet(c.displayName(), flag.ContinueOnError) c.LocalFlags().VisitAll(func(f *flag.Flag) { if persistentFlags.Lookup(f.Name) == nil { out.AddFlag(f) @@ -1615,11 +1674,12 @@ func (c *Command) LocalNonPersistentFlags() *flag.FlagSet { } // LocalFlags returns the local FlagSet specifically set in the current command. +// This function does not modify the flags of the current command, it's purpose is to return the current state. func (c *Command) LocalFlags() *flag.FlagSet { c.mergePersistentFlags() if c.lflags == nil { - c.lflags = flag.NewFlagSet(c.Name(), flag.ContinueOnError) + c.lflags = flag.NewFlagSet(c.displayName(), flag.ContinueOnError) if c.flagErrorBuf == nil { c.flagErrorBuf = new(bytes.Buffer) } @@ -1642,11 +1702,12 @@ func (c *Command) LocalFlags() *flag.FlagSet { } // InheritedFlags returns all flags which were inherited from parent commands. +// This function does not modify the flags of the current command, it's purpose is to return the current state. func (c *Command) InheritedFlags() *flag.FlagSet { c.mergePersistentFlags() if c.iflags == nil { - c.iflags = flag.NewFlagSet(c.Name(), flag.ContinueOnError) + c.iflags = flag.NewFlagSet(c.displayName(), flag.ContinueOnError) if c.flagErrorBuf == nil { c.flagErrorBuf = new(bytes.Buffer) } @@ -1667,6 +1728,7 @@ func (c *Command) InheritedFlags() *flag.FlagSet { } // NonInheritedFlags returns all flags which were not inherited from parent commands. +// This function does not modify the flags of the current command, it's purpose is to return the current state. func (c *Command) NonInheritedFlags() *flag.FlagSet { return c.LocalFlags() } @@ -1674,7 +1736,7 @@ func (c *Command) NonInheritedFlags() *flag.FlagSet { // PersistentFlags returns the persistent FlagSet specifically set in the current command. func (c *Command) PersistentFlags() *flag.FlagSet { if c.pflags == nil { - c.pflags = flag.NewFlagSet(c.Name(), flag.ContinueOnError) + c.pflags = flag.NewFlagSet(c.displayName(), flag.ContinueOnError) if c.flagErrorBuf == nil { c.flagErrorBuf = new(bytes.Buffer) } @@ -1687,9 +1749,9 @@ func (c *Command) PersistentFlags() *flag.FlagSet { func (c *Command) ResetFlags() { c.flagErrorBuf = new(bytes.Buffer) c.flagErrorBuf.Reset() - c.flags = flag.NewFlagSet(c.Name(), flag.ContinueOnError) + c.flags = flag.NewFlagSet(c.displayName(), flag.ContinueOnError) c.flags.SetOutput(c.flagErrorBuf) - c.pflags = flag.NewFlagSet(c.Name(), flag.ContinueOnError) + c.pflags = flag.NewFlagSet(c.displayName(), flag.ContinueOnError) c.pflags.SetOutput(c.flagErrorBuf) c.lflags = nil @@ -1806,7 +1868,7 @@ func (c *Command) mergePersistentFlags() { // If c.parentsPflags == nil, it makes new. func (c *Command) updateParentsPflags() { if c.parentsPflags == nil { - c.parentsPflags = flag.NewFlagSet(c.Name(), flag.ContinueOnError) + c.parentsPflags = flag.NewFlagSet(c.displayName(), flag.ContinueOnError) c.parentsPflags.SetOutput(c.flagErrorBuf) c.parentsPflags.SortFlags = false } diff --git a/vendor/github.com/spf13/cobra/completions.go b/vendor/github.com/spf13/cobra/completions.go index ee38c4d..c0c08b0 100644 --- a/vendor/github.com/spf13/cobra/completions.go +++ b/vendor/github.com/spf13/cobra/completions.go @@ -17,6 +17,8 @@ package cobra import ( "fmt" "os" + "regexp" + "strconv" "strings" "sync" @@ -145,6 +147,20 @@ func (c *Command) RegisterFlagCompletionFunc(flagName string, f func(cmd *Comman return nil } +// GetFlagCompletionFunc returns the completion function for the given flag of the command, if available. +func (c *Command) GetFlagCompletionFunc(flagName string) (func(*Command, []string, string) ([]string, ShellCompDirective), bool) { + flag := c.Flag(flagName) + if flag == nil { + return nil, false + } + + flagCompletionMutex.RLock() + defer flagCompletionMutex.RUnlock() + + completionFunc, exists := flagCompletionFunctions[flag] + return completionFunc, exists +} + // Returns a string listing the different directive enabled in the specified parameter func (d ShellCompDirective) string() string { var directives []string @@ -197,24 +213,29 @@ func (c *Command) initCompleteCmd(args []string) { // 2- Even without completions, we need to print the directive } - noDescriptions := (cmd.CalledAs() == ShellCompNoDescRequestCmd) + noDescriptions := cmd.CalledAs() == ShellCompNoDescRequestCmd + if !noDescriptions { + if doDescriptions, err := strconv.ParseBool(getEnvConfig(cmd, configEnvVarSuffixDescriptions)); err == nil { + noDescriptions = !doDescriptions + } + } + noActiveHelp := GetActiveHelpConfig(finalCmd) == activeHelpGlobalDisable + out := finalCmd.OutOrStdout() for _, comp := range completions { - if GetActiveHelpConfig(finalCmd) == activeHelpGlobalDisable { - // Remove all activeHelp entries in this case - if strings.HasPrefix(comp, activeHelpMarker) { - continue - } + if noActiveHelp && strings.HasPrefix(comp, activeHelpMarker) { + // Remove all activeHelp entries if it's disabled. + continue } if noDescriptions { // Remove any description that may be included following a tab character. - comp = strings.Split(comp, "\t")[0] + comp = strings.SplitN(comp, "\t", 2)[0] } // Make sure we only write the first line to the output. // This is needed if a description contains a linebreak. // Otherwise the shell scripts will interpret the other lines as new flags // and could therefore provide a wrong completion. - comp = strings.Split(comp, "\n")[0] + comp = strings.SplitN(comp, "\n", 2)[0] // Finally trim the completion. This is especially important to get rid // of a trailing tab when there are no description following it. @@ -223,14 +244,14 @@ func (c *Command) initCompleteCmd(args []string) { // although there is no description). comp = strings.TrimSpace(comp) - // Print each possible completion to stdout for the completion script to consume. - fmt.Fprintln(finalCmd.OutOrStdout(), comp) + // Print each possible completion to the output for the completion script to consume. + fmt.Fprintln(out, comp) } // As the last printout, print the completion directive for the completion script to parse. // The directive integer must be that last character following a single colon (:). // The completion script expects : - fmt.Fprintf(finalCmd.OutOrStdout(), ":%d\n", directive) + fmt.Fprintf(out, ":%d\n", directive) // Print some helpful info to stderr for the user to understand. // Output from stderr must be ignored by the completion script. @@ -277,15 +298,19 @@ func (c *Command) getCompletions(args []string) (*Command, []string, ShellCompDi } if err != nil { // Unable to find the real command. E.g., someInvalidCmd - return c, []string{}, ShellCompDirectiveDefault, fmt.Errorf("Unable to find a command for arguments: %v", trimmedArgs) + return c, []string{}, ShellCompDirectiveDefault, fmt.Errorf("unable to find a command for arguments: %v", trimmedArgs) } finalCmd.ctx = c.ctx // These flags are normally added when `execute()` is called on `finalCmd`, // however, when doing completion, we don't call `finalCmd.execute()`. - // Let's add the --help and --version flag ourselves. - finalCmd.InitDefaultHelpFlag() - finalCmd.InitDefaultVersionFlag() + // Let's add the --help and --version flag ourselves but only if the finalCmd + // has not disabled flag parsing; if flag parsing is disabled, it is up to the + // finalCmd itself to handle the completion of *all* flags. + if !finalCmd.DisableFlagParsing { + finalCmd.InitDefaultHelpFlag() + finalCmd.InitDefaultVersionFlag() + } // Check if we are doing flag value completion before parsing the flags. // This is important because if we are completing a flag value, we need to also @@ -389,6 +414,11 @@ func (c *Command) getCompletions(args []string) (*Command, []string, ShellCompDi finalCmd.InheritedFlags().VisitAll(func(flag *pflag.Flag) { doCompleteFlags(flag) }) + // Try to complete non-inherited flags even if DisableFlagParsing==true. + // This allows programs to tell Cobra about flags for completion even + // if the actual parsing of flags is not done by Cobra. + // For instance, Helm uses this to provide flag name completion for + // some of its plugins. finalCmd.NonInheritedFlags().VisitAll(func(flag *pflag.Flag) { doCompleteFlags(flag) }) @@ -876,3 +906,34 @@ func CompError(msg string) { func CompErrorln(msg string) { CompError(fmt.Sprintf("%s\n", msg)) } + +// These values should not be changed: users will be using them explicitly. +const ( + configEnvVarGlobalPrefix = "COBRA" + configEnvVarSuffixDescriptions = "COMPLETION_DESCRIPTIONS" +) + +var configEnvVarPrefixSubstRegexp = regexp.MustCompile(`[^A-Z0-9_]`) + +// configEnvVar returns the name of the program-specific configuration environment +// variable. It has the format _ where is the name of the +// root command in upper case, with all non-ASCII-alphanumeric characters replaced by `_`. +func configEnvVar(name, suffix string) string { + // This format should not be changed: users will be using it explicitly. + v := strings.ToUpper(fmt.Sprintf("%s_%s", name, suffix)) + v = configEnvVarPrefixSubstRegexp.ReplaceAllString(v, "_") + return v +} + +// getEnvConfig returns the value of the configuration environment variable +// _ where is the name of the root command in upper +// case, with all non-ASCII-alphanumeric characters replaced by `_`. +// If the value is empty or not set, the value of the environment variable +// COBRA_ is returned instead. +func getEnvConfig(cmd *Command, suffix string) string { + v := os.Getenv(configEnvVar(cmd.Root().Name(), suffix)) + if v == "" { + v = os.Getenv(configEnvVar(configEnvVarGlobalPrefix, suffix)) + } + return v +} diff --git a/vendor/github.com/spf13/cobra/fish_completions.go b/vendor/github.com/spf13/cobra/fish_completions.go index 12ca0d2..12d61b6 100644 --- a/vendor/github.com/spf13/cobra/fish_completions.go +++ b/vendor/github.com/spf13/cobra/fish_completions.go @@ -113,7 +113,7 @@ function __%[1]s_clear_perform_completion_once_result __%[1]s_debug "" __%[1]s_debug "========= clearing previously set __%[1]s_perform_completion_once_result variable ==========" set --erase __%[1]s_perform_completion_once_result - __%[1]s_debug "Succesfully erased the variable __%[1]s_perform_completion_once_result" + __%[1]s_debug "Successfully erased the variable __%[1]s_perform_completion_once_result" end function __%[1]s_requires_order_preservation diff --git a/vendor/github.com/spf13/cobra/fish_completions.md b/vendor/github.com/spf13/cobra/fish_completions.md deleted file mode 100644 index 19b2ed1..0000000 --- a/vendor/github.com/spf13/cobra/fish_completions.md +++ /dev/null @@ -1,4 +0,0 @@ -## Generating Fish Completions For Your cobra.Command - -Please refer to [Shell Completions](shell_completions.md) for details. - diff --git a/vendor/github.com/spf13/cobra/flag_groups.go b/vendor/github.com/spf13/cobra/flag_groups.go index b35fde1..560612f 100644 --- a/vendor/github.com/spf13/cobra/flag_groups.go +++ b/vendor/github.com/spf13/cobra/flag_groups.go @@ -23,8 +23,9 @@ import ( ) const ( - requiredAsGroup = "cobra_annotation_required_if_others_set" - mutuallyExclusive = "cobra_annotation_mutually_exclusive" + requiredAsGroupAnnotation = "cobra_annotation_required_if_others_set" + oneRequiredAnnotation = "cobra_annotation_one_required" + mutuallyExclusiveAnnotation = "cobra_annotation_mutually_exclusive" ) // MarkFlagsRequiredTogether marks the given flags with annotations so that Cobra errors @@ -36,7 +37,23 @@ func (c *Command) MarkFlagsRequiredTogether(flagNames ...string) { if f == nil { panic(fmt.Sprintf("Failed to find flag %q and mark it as being required in a flag group", v)) } - if err := c.Flags().SetAnnotation(v, requiredAsGroup, append(f.Annotations[requiredAsGroup], strings.Join(flagNames, " "))); err != nil { + if err := c.Flags().SetAnnotation(v, requiredAsGroupAnnotation, append(f.Annotations[requiredAsGroupAnnotation], strings.Join(flagNames, " "))); err != nil { + // Only errs if the flag isn't found. + panic(err) + } + } +} + +// MarkFlagsOneRequired marks the given flags with annotations so that Cobra errors +// if the command is invoked without at least one flag from the given set of flags. +func (c *Command) MarkFlagsOneRequired(flagNames ...string) { + c.mergePersistentFlags() + for _, v := range flagNames { + f := c.Flags().Lookup(v) + if f == nil { + panic(fmt.Sprintf("Failed to find flag %q and mark it as being in a one-required flag group", v)) + } + if err := c.Flags().SetAnnotation(v, oneRequiredAnnotation, append(f.Annotations[oneRequiredAnnotation], strings.Join(flagNames, " "))); err != nil { // Only errs if the flag isn't found. panic(err) } @@ -53,13 +70,13 @@ func (c *Command) MarkFlagsMutuallyExclusive(flagNames ...string) { panic(fmt.Sprintf("Failed to find flag %q and mark it as being in a mutually exclusive flag group", v)) } // Each time this is called is a single new entry; this allows it to be a member of multiple groups if needed. - if err := c.Flags().SetAnnotation(v, mutuallyExclusive, append(f.Annotations[mutuallyExclusive], strings.Join(flagNames, " "))); err != nil { + if err := c.Flags().SetAnnotation(v, mutuallyExclusiveAnnotation, append(f.Annotations[mutuallyExclusiveAnnotation], strings.Join(flagNames, " "))); err != nil { panic(err) } } } -// ValidateFlagGroups validates the mutuallyExclusive/requiredAsGroup logic and returns the +// ValidateFlagGroups validates the mutuallyExclusive/oneRequired/requiredAsGroup logic and returns the // first error encountered. func (c *Command) ValidateFlagGroups() error { if c.DisableFlagParsing { @@ -71,15 +88,20 @@ func (c *Command) ValidateFlagGroups() error { // groupStatus format is the list of flags as a unique ID, // then a map of each flag name and whether it is set or not. groupStatus := map[string]map[string]bool{} + oneRequiredGroupStatus := map[string]map[string]bool{} mutuallyExclusiveGroupStatus := map[string]map[string]bool{} flags.VisitAll(func(pflag *flag.Flag) { - processFlagForGroupAnnotation(flags, pflag, requiredAsGroup, groupStatus) - processFlagForGroupAnnotation(flags, pflag, mutuallyExclusive, mutuallyExclusiveGroupStatus) + processFlagForGroupAnnotation(flags, pflag, requiredAsGroupAnnotation, groupStatus) + processFlagForGroupAnnotation(flags, pflag, oneRequiredAnnotation, oneRequiredGroupStatus) + processFlagForGroupAnnotation(flags, pflag, mutuallyExclusiveAnnotation, mutuallyExclusiveGroupStatus) }) if err := validateRequiredFlagGroups(groupStatus); err != nil { return err } + if err := validateOneRequiredFlagGroups(oneRequiredGroupStatus); err != nil { + return err + } if err := validateExclusiveFlagGroups(mutuallyExclusiveGroupStatus); err != nil { return err } @@ -108,7 +130,7 @@ func processFlagForGroupAnnotation(flags *flag.FlagSet, pflag *flag.Flag, annota continue } - groupStatus[group] = map[string]bool{} + groupStatus[group] = make(map[string]bool, len(flagnames)) for _, name := range flagnames { groupStatus[group][name] = false } @@ -142,6 +164,27 @@ func validateRequiredFlagGroups(data map[string]map[string]bool) error { return nil } +func validateOneRequiredFlagGroups(data map[string]map[string]bool) error { + keys := sortedKeys(data) + for _, flagList := range keys { + flagnameAndStatus := data[flagList] + var set []string + for flagname, isSet := range flagnameAndStatus { + if isSet { + set = append(set, flagname) + } + } + if len(set) >= 1 { + continue + } + + // Sort values, so they can be tested/scripted against consistently. + sort.Strings(set) + return fmt.Errorf("at least one of the flags in the group [%v] is required", flagList) + } + return nil +} + func validateExclusiveFlagGroups(data map[string]map[string]bool) error { keys := sortedKeys(data) for _, flagList := range keys { @@ -176,6 +219,7 @@ func sortedKeys(m map[string]map[string]bool) []string { // enforceFlagGroupsForCompletion will do the following: // - when a flag in a group is present, other flags in the group will be marked required +// - when none of the flags in a one-required group are present, all flags in the group will be marked required // - when a flag in a mutually exclusive group is present, other flags in the group will be marked as hidden // This allows the standard completion logic to behave appropriately for flag groups func (c *Command) enforceFlagGroupsForCompletion() { @@ -185,10 +229,12 @@ func (c *Command) enforceFlagGroupsForCompletion() { flags := c.Flags() groupStatus := map[string]map[string]bool{} + oneRequiredGroupStatus := map[string]map[string]bool{} mutuallyExclusiveGroupStatus := map[string]map[string]bool{} c.Flags().VisitAll(func(pflag *flag.Flag) { - processFlagForGroupAnnotation(flags, pflag, requiredAsGroup, groupStatus) - processFlagForGroupAnnotation(flags, pflag, mutuallyExclusive, mutuallyExclusiveGroupStatus) + processFlagForGroupAnnotation(flags, pflag, requiredAsGroupAnnotation, groupStatus) + processFlagForGroupAnnotation(flags, pflag, oneRequiredAnnotation, oneRequiredGroupStatus) + processFlagForGroupAnnotation(flags, pflag, mutuallyExclusiveAnnotation, mutuallyExclusiveGroupStatus) }) // If a flag that is part of a group is present, we make all the other flags @@ -204,6 +250,26 @@ func (c *Command) enforceFlagGroupsForCompletion() { } } + // If none of the flags of a one-required group are present, we make all the flags + // of that group required so that the shell completion suggests them automatically + for flagList, flagnameAndStatus := range oneRequiredGroupStatus { + isSet := false + + for _, isSet = range flagnameAndStatus { + if isSet { + break + } + } + + // None of the flags of the group are set, mark all flags in the group + // as required + if !isSet { + for _, fName := range strings.Split(flagList, " ") { + _ = c.MarkFlagRequired(fName) + } + } + } + // If a flag that is mutually exclusive to others is present, we hide the other // flags of that group so the shell completion does not suggest them for flagList, flagnameAndStatus := range mutuallyExclusiveGroupStatus { diff --git a/vendor/github.com/spf13/cobra/powershell_completions.go b/vendor/github.com/spf13/cobra/powershell_completions.go index 177d275..a830b7b 100644 --- a/vendor/github.com/spf13/cobra/powershell_completions.go +++ b/vendor/github.com/spf13/cobra/powershell_completions.go @@ -28,8 +28,8 @@ import ( func genPowerShellComp(buf io.StringWriter, name string, includeDesc bool) { // Variables should not contain a '-' or ':' character nameForVar := name - nameForVar = strings.Replace(nameForVar, "-", "_", -1) - nameForVar = strings.Replace(nameForVar, ":", "_", -1) + nameForVar = strings.ReplaceAll(nameForVar, "-", "_") + nameForVar = strings.ReplaceAll(nameForVar, ":", "_") compCmd := ShellCompRequestCmd if !includeDesc { @@ -47,7 +47,7 @@ filter __%[1]s_escapeStringWithSpecialChars { `+" $_ -replace '\\s|#|@|\\$|;|,|''|\\{|\\}|\\(|\\)|\"|`|\\||<|>|&','`$&'"+` } -[scriptblock]$__%[2]sCompleterBlock = { +[scriptblock]${__%[2]sCompleterBlock} = { param( $WordToComplete, $CommandAst, @@ -122,7 +122,7 @@ filter __%[1]s_escapeStringWithSpecialChars { __%[1]s_debug "Calling $RequestComp" # First disable ActiveHelp which is not supported for Powershell - $env:%[10]s=0 + ${env:%[10]s}=0 #call the command store the output in $out and redirect stderr and stdout to null # $Out is an array contains each line per element @@ -279,7 +279,7 @@ filter __%[1]s_escapeStringWithSpecialChars { } } -Register-ArgumentCompleter -CommandName '%[1]s' -ScriptBlock $__%[2]sCompleterBlock +Register-ArgumentCompleter -CommandName '%[1]s' -ScriptBlock ${__%[2]sCompleterBlock} `, name, nameForVar, compCmd, ShellCompDirectiveError, ShellCompDirectiveNoSpace, ShellCompDirectiveNoFileComp, ShellCompDirectiveFilterFileExt, ShellCompDirectiveFilterDirs, ShellCompDirectiveKeepOrder, activeHelpEnvVar(name))) diff --git a/vendor/github.com/spf13/cobra/powershell_completions.md b/vendor/github.com/spf13/cobra/powershell_completions.md deleted file mode 100644 index c449f1e..0000000 --- a/vendor/github.com/spf13/cobra/powershell_completions.md +++ /dev/null @@ -1,3 +0,0 @@ -# Generating PowerShell Completions For Your Own cobra.Command - -Please refer to [Shell Completions](shell_completions.md#powershell-completions) for details. diff --git a/vendor/github.com/spf13/cobra/projects_using_cobra.md b/vendor/github.com/spf13/cobra/projects_using_cobra.md deleted file mode 100644 index 8a291eb..0000000 --- a/vendor/github.com/spf13/cobra/projects_using_cobra.md +++ /dev/null @@ -1,64 +0,0 @@ -## Projects using Cobra - -- [Allero](https://github.com/allero-io/allero) -- [Arewefastyet](https://benchmark.vitess.io) -- [Arduino CLI](https://github.com/arduino/arduino-cli) -- [Bleve](https://blevesearch.com/) -- [Cilium](https://cilium.io/) -- [CloudQuery](https://github.com/cloudquery/cloudquery) -- [CockroachDB](https://www.cockroachlabs.com/) -- [Constellation](https://github.com/edgelesssys/constellation) -- [Cosmos SDK](https://github.com/cosmos/cosmos-sdk) -- [Datree](https://github.com/datreeio/datree) -- [Delve](https://github.com/derekparker/delve) -- [Docker (distribution)](https://github.com/docker/distribution) -- [Etcd](https://etcd.io/) -- [Gardener](https://github.com/gardener/gardenctl) -- [Giant Swarm's gsctl](https://github.com/giantswarm/gsctl) -- [Git Bump](https://github.com/erdaltsksn/git-bump) -- [GitHub CLI](https://github.com/cli/cli) -- [GitHub Labeler](https://github.com/erdaltsksn/gh-label) -- [Golangci-lint](https://golangci-lint.run) -- [GopherJS](https://github.com/gopherjs/gopherjs) -- [GoReleaser](https://goreleaser.com) -- [Helm](https://helm.sh) -- [Hugo](https://gohugo.io) -- [Infracost](https://github.com/infracost/infracost) -- [Istio](https://istio.io) -- [Kool](https://github.com/kool-dev/kool) -- [Kubernetes](https://kubernetes.io/) -- [Kubescape](https://github.com/kubescape/kubescape) -- [KubeVirt](https://github.com/kubevirt/kubevirt) -- [Linkerd](https://linkerd.io/) -- [Mattermost-server](https://github.com/mattermost/mattermost-server) -- [Mercure](https://mercure.rocks/) -- [Meroxa CLI](https://github.com/meroxa/cli) -- [Metal Stack CLI](https://github.com/metal-stack/metalctl) -- [Moby (former Docker)](https://github.com/moby/moby) -- [Moldy](https://github.com/Moldy-Community/moldy) -- [Multi-gitter](https://github.com/lindell/multi-gitter) -- [Nanobox](https://github.com/nanobox-io/nanobox)/[Nanopack](https://github.com/nanopack) -- [nFPM](https://nfpm.goreleaser.com) -- [Okteto](https://github.com/okteto/okteto) -- [OpenShift](https://www.openshift.com/) -- [Ory Hydra](https://github.com/ory/hydra) -- [Ory Kratos](https://github.com/ory/kratos) -- [Pixie](https://github.com/pixie-io/pixie) -- [Polygon Edge](https://github.com/0xPolygon/polygon-edge) -- [Pouch](https://github.com/alibaba/pouch) -- [ProjectAtomic (enterprise)](https://www.projectatomic.io/) -- [Prototool](https://github.com/uber/prototool) -- [Pulumi](https://www.pulumi.com) -- [QRcp](https://github.com/claudiodangelis/qrcp) -- [Random](https://github.com/erdaltsksn/random) -- [Rclone](https://rclone.org/) -- [Scaleway CLI](https://github.com/scaleway/scaleway-cli) -- [Sia](https://github.com/SiaFoundation/siad) -- [Skaffold](https://skaffold.dev/) -- [Tendermint](https://github.com/tendermint/tendermint) -- [Twitch CLI](https://github.com/twitchdev/twitch-cli) -- [UpCloud CLI (`upctl`)](https://github.com/UpCloudLtd/upcloud-cli) -- [Vitess](https://vitess.io) -- VMware's [Tanzu Community Edition](https://github.com/vmware-tanzu/community-edition) & [Tanzu Framework](https://github.com/vmware-tanzu/tanzu-framework) -- [Werf](https://werf.io/) -- [ZITADEL](https://github.com/zitadel/zitadel) diff --git a/vendor/github.com/spf13/cobra/shell_completions.md b/vendor/github.com/spf13/cobra/shell_completions.md deleted file mode 100644 index 065c062..0000000 --- a/vendor/github.com/spf13/cobra/shell_completions.md +++ /dev/null @@ -1,576 +0,0 @@ -# Generating shell completions - -Cobra can generate shell completions for multiple shells. -The currently supported shells are: -- Bash -- Zsh -- fish -- PowerShell - -Cobra will automatically provide your program with a fully functional `completion` command, -similarly to how it provides the `help` command. - -## Creating your own completion command - -If you do not wish to use the default `completion` command, you can choose to -provide your own, which will take precedence over the default one. (This also provides -backwards-compatibility with programs that already have their own `completion` command.) - -If you are using the `cobra-cli` generator, -which can be found at [spf13/cobra-cli](https://github.com/spf13/cobra-cli), -you can create a completion command by running - -```bash -cobra-cli add completion -``` -and then modifying the generated `cmd/completion.go` file to look something like this -(writing the shell script to stdout allows the most flexible use): - -```go -var completionCmd = &cobra.Command{ - Use: "completion [bash|zsh|fish|powershell]", - Short: "Generate completion script", - Long: fmt.Sprintf(`To load completions: - -Bash: - - $ source <(%[1]s completion bash) - - # To load completions for each session, execute once: - # Linux: - $ %[1]s completion bash > /etc/bash_completion.d/%[1]s - # macOS: - $ %[1]s completion bash > $(brew --prefix)/etc/bash_completion.d/%[1]s - -Zsh: - - # If shell completion is not already enabled in your environment, - # you will need to enable it. You can execute the following once: - - $ echo "autoload -U compinit; compinit" >> ~/.zshrc - - # To load completions for each session, execute once: - $ %[1]s completion zsh > "${fpath[1]}/_%[1]s" - - # You will need to start a new shell for this setup to take effect. - -fish: - - $ %[1]s completion fish | source - - # To load completions for each session, execute once: - $ %[1]s completion fish > ~/.config/fish/completions/%[1]s.fish - -PowerShell: - - PS> %[1]s completion powershell | Out-String | Invoke-Expression - - # To load completions for every new session, run: - PS> %[1]s completion powershell > %[1]s.ps1 - # and source this file from your PowerShell profile. -`,cmd.Root().Name()), - DisableFlagsInUseLine: true, - ValidArgs: []string{"bash", "zsh", "fish", "powershell"}, - Args: cobra.MatchAll(cobra.ExactArgs(1), cobra.OnlyValidArgs), - Run: func(cmd *cobra.Command, args []string) { - switch args[0] { - case "bash": - cmd.Root().GenBashCompletion(os.Stdout) - case "zsh": - cmd.Root().GenZshCompletion(os.Stdout) - case "fish": - cmd.Root().GenFishCompletion(os.Stdout, true) - case "powershell": - cmd.Root().GenPowerShellCompletionWithDesc(os.Stdout) - } - }, -} -``` - -**Note:** The cobra generator may include messages printed to stdout, for example, if the config file is loaded; this will break the auto-completion script so must be removed. - -## Adapting the default completion command - -Cobra provides a few options for the default `completion` command. To configure such options you must set -the `CompletionOptions` field on the *root* command. - -To tell Cobra *not* to provide the default `completion` command: -``` -rootCmd.CompletionOptions.DisableDefaultCmd = true -``` - -To tell Cobra to mark the default `completion` command as *hidden*: -``` -rootCmd.CompletionOptions.HiddenDefaultCmd = true -``` - -To tell Cobra *not* to provide the user with the `--no-descriptions` flag to the completion sub-commands: -``` -rootCmd.CompletionOptions.DisableNoDescFlag = true -``` - -To tell Cobra to completely disable descriptions for completions: -``` -rootCmd.CompletionOptions.DisableDescriptions = true -``` - -# Customizing completions - -The generated completion scripts will automatically handle completing commands and flags. However, you can make your completions much more powerful by providing information to complete your program's nouns and flag values. - -## Completion of nouns - -### Static completion of nouns - -Cobra allows you to provide a pre-defined list of completion choices for your nouns using the `ValidArgs` field. -For example, if you want `kubectl get [tab][tab]` to show a list of valid "nouns" you have to set them. -Some simplified code from `kubectl get` looks like: - -```go -validArgs = []string{ "pod", "node", "service", "replicationcontroller" } - -cmd := &cobra.Command{ - Use: "get [(-o|--output=)json|yaml|template|...] (RESOURCE [NAME] | RESOURCE/NAME ...)", - Short: "Display one or many resources", - Long: get_long, - Example: get_example, - Run: func(cmd *cobra.Command, args []string) { - cobra.CheckErr(RunGet(f, out, cmd, args)) - }, - ValidArgs: validArgs, -} -``` - -Notice we put the `ValidArgs` field on the `get` sub-command. Doing so will give results like: - -```bash -$ kubectl get [tab][tab] -node pod replicationcontroller service -``` - -#### Aliases for nouns - -If your nouns have aliases, you can define them alongside `ValidArgs` using `ArgAliases`: - -```go -argAliases = []string { "pods", "nodes", "services", "svc", "replicationcontrollers", "rc" } - -cmd := &cobra.Command{ - ... - ValidArgs: validArgs, - ArgAliases: argAliases -} -``` - -The aliases are shown to the user on tab completion only if no completions were found within sub-commands or `ValidArgs`. - -### Dynamic completion of nouns - -In some cases it is not possible to provide a list of completions in advance. Instead, the list of completions must be determined at execution-time. In a similar fashion as for static completions, you can use the `ValidArgsFunction` field to provide a Go function that Cobra will execute when it needs the list of completion choices for the nouns of a command. Note that either `ValidArgs` or `ValidArgsFunction` can be used for a single cobra command, but not both. -Simplified code from `helm status` looks like: - -```go -cmd := &cobra.Command{ - Use: "status RELEASE_NAME", - Short: "Display the status of the named release", - Long: status_long, - RunE: func(cmd *cobra.Command, args []string) { - RunGet(args[0]) - }, - ValidArgsFunction: func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { - if len(args) != 0 { - return nil, cobra.ShellCompDirectiveNoFileComp - } - return getReleasesFromCluster(toComplete), cobra.ShellCompDirectiveNoFileComp - }, -} -``` -Where `getReleasesFromCluster()` is a Go function that obtains the list of current Helm releases running on the Kubernetes cluster. -Notice we put the `ValidArgsFunction` on the `status` sub-command. Let's assume the Helm releases on the cluster are: `harbor`, `notary`, `rook` and `thanos` then this dynamic completion will give results like: - -```bash -$ helm status [tab][tab] -harbor notary rook thanos -``` -You may have noticed the use of `cobra.ShellCompDirective`. These directives are bit fields allowing to control some shell completion behaviors for your particular completion. You can combine them with the bit-or operator such as `cobra.ShellCompDirectiveNoSpace | cobra.ShellCompDirectiveNoFileComp` -```go -// Indicates that the shell will perform its default behavior after completions -// have been provided (this implies none of the other directives). -ShellCompDirectiveDefault - -// Indicates an error occurred and completions should be ignored. -ShellCompDirectiveError - -// Indicates that the shell should not add a space after the completion, -// even if there is a single completion provided. -ShellCompDirectiveNoSpace - -// Indicates that the shell should not provide file completion even when -// no completion is provided. -ShellCompDirectiveNoFileComp - -// Indicates that the returned completions should be used as file extension filters. -// For example, to complete only files of the form *.json or *.yaml: -// return []string{"yaml", "json"}, ShellCompDirectiveFilterFileExt -// For flags, using MarkFlagFilename() and MarkPersistentFlagFilename() -// is a shortcut to using this directive explicitly. -// -ShellCompDirectiveFilterFileExt - -// Indicates that only directory names should be provided in file completion. -// For example: -// return nil, ShellCompDirectiveFilterDirs -// For flags, using MarkFlagDirname() is a shortcut to using this directive explicitly. -// -// To request directory names within another directory, the returned completions -// should specify a single directory name within which to search. For example, -// to complete directories within "themes/": -// return []string{"themes"}, ShellCompDirectiveFilterDirs -// -ShellCompDirectiveFilterDirs - -// ShellCompDirectiveKeepOrder indicates that the shell should preserve the order -// in which the completions are provided -ShellCompDirectiveKeepOrder -``` - -***Note***: When using the `ValidArgsFunction`, Cobra will call your registered function after having parsed all flags and arguments provided in the command-line. You therefore don't need to do this parsing yourself. For example, when a user calls `helm status --namespace my-rook-ns [tab][tab]`, Cobra will call your registered `ValidArgsFunction` after having parsed the `--namespace` flag, as it would have done when calling the `RunE` function. - -#### Debugging - -Cobra achieves dynamic completion through the use of a hidden command called by the completion script. To debug your Go completion code, you can call this hidden command directly: -```bash -$ helm __complete status har -harbor -:4 -Completion ended with directive: ShellCompDirectiveNoFileComp # This is on stderr -``` -***Important:*** If the noun to complete is empty (when the user has not yet typed any letters of that noun), you must pass an empty parameter to the `__complete` command: -```bash -$ helm __complete status "" -harbor -notary -rook -thanos -:4 -Completion ended with directive: ShellCompDirectiveNoFileComp # This is on stderr -``` -Calling the `__complete` command directly allows you to run the Go debugger to troubleshoot your code. You can also add printouts to your code; Cobra provides the following functions to use for printouts in Go completion code: -```go -// Prints to the completion script debug file (if BASH_COMP_DEBUG_FILE -// is set to a file path) and optionally prints to stderr. -cobra.CompDebug(msg string, printToStdErr bool) { -cobra.CompDebugln(msg string, printToStdErr bool) - -// Prints to the completion script debug file (if BASH_COMP_DEBUG_FILE -// is set to a file path) and to stderr. -cobra.CompError(msg string) -cobra.CompErrorln(msg string) -``` -***Important:*** You should **not** leave traces that print directly to stdout in your completion code as they will be interpreted as completion choices by the completion script. Instead, use the cobra-provided debugging traces functions mentioned above. - -## Completions for flags - -### Mark flags as required - -Most of the time completions will only show sub-commands. But if a flag is required to make a sub-command work, you probably want it to show up when the user types [tab][tab]. You can mark a flag as 'Required' like so: - -```go -cmd.MarkFlagRequired("pod") -cmd.MarkFlagRequired("container") -``` - -and you'll get something like - -```bash -$ kubectl exec [tab][tab] --c --container= -p --pod= -``` - -### Specify dynamic flag completion - -As for nouns, Cobra provides a way of defining dynamic completion of flags. To provide a Go function that Cobra will execute when it needs the list of completion choices for a flag, you must register the function using the `command.RegisterFlagCompletionFunc()` function. - -```go -flagName := "output" -cmd.RegisterFlagCompletionFunc(flagName, func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { - return []string{"json", "table", "yaml"}, cobra.ShellCompDirectiveDefault -}) -``` -Notice that calling `RegisterFlagCompletionFunc()` is done through the `command` with which the flag is associated. In our example this dynamic completion will give results like so: - -```bash -$ helm status --output [tab][tab] -json table yaml -``` - -#### Debugging - -You can also easily debug your Go completion code for flags: -```bash -$ helm __complete status --output "" -json -table -yaml -:4 -Completion ended with directive: ShellCompDirectiveNoFileComp # This is on stderr -``` -***Important:*** You should **not** leave traces that print to stdout in your completion code as they will be interpreted as completion choices by the completion script. Instead, use the cobra-provided debugging traces functions mentioned further above. - -### Specify valid filename extensions for flags that take a filename - -To limit completions of flag values to file names with certain extensions you can either use the different `MarkFlagFilename()` functions or a combination of `RegisterFlagCompletionFunc()` and `ShellCompDirectiveFilterFileExt`, like so: -```go -flagName := "output" -cmd.MarkFlagFilename(flagName, "yaml", "json") -``` -or -```go -flagName := "output" -cmd.RegisterFlagCompletionFunc(flagName, func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { - return []string{"yaml", "json"}, ShellCompDirectiveFilterFileExt}) -``` - -### Limit flag completions to directory names - -To limit completions of flag values to directory names you can either use the `MarkFlagDirname()` functions or a combination of `RegisterFlagCompletionFunc()` and `ShellCompDirectiveFilterDirs`, like so: -```go -flagName := "output" -cmd.MarkFlagDirname(flagName) -``` -or -```go -flagName := "output" -cmd.RegisterFlagCompletionFunc(flagName, func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { - return nil, cobra.ShellCompDirectiveFilterDirs -}) -``` -To limit completions of flag values to directory names *within another directory* you can use a combination of `RegisterFlagCompletionFunc()` and `ShellCompDirectiveFilterDirs` like so: -```go -flagName := "output" -cmd.RegisterFlagCompletionFunc(flagName, func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { - return []string{"themes"}, cobra.ShellCompDirectiveFilterDirs -}) -``` -### Descriptions for completions - -Cobra provides support for completion descriptions. Such descriptions are supported for each shell -(however, for bash, it is only available in the [completion V2 version](#bash-completion-v2)). -For commands and flags, Cobra will provide the descriptions automatically, based on usage information. -For example, using zsh: -``` -$ helm s[tab] -search -- search for a keyword in charts -show -- show information of a chart -status -- displays the status of the named release -``` -while using fish: -``` -$ helm s[tab] -search (search for a keyword in charts) show (show information of a chart) status (displays the status of the named release) -``` - -Cobra allows you to add descriptions to your own completions. Simply add the description text after each completion, following a `\t` separator. This technique applies to completions returned by `ValidArgs`, `ValidArgsFunction` and `RegisterFlagCompletionFunc()`. For example: -```go -ValidArgsFunction: func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { - return []string{"harbor\tAn image registry", "thanos\tLong-term metrics"}, cobra.ShellCompDirectiveNoFileComp -}} -``` -or -```go -ValidArgs: []string{"bash\tCompletions for bash", "zsh\tCompletions for zsh"} -``` - -If you don't want to show descriptions in the completions, you can add `--no-descriptions` to the default `completion` command to disable them, like: - -```bash -$ source <(helm completion bash) -$ helm completion [tab][tab] -bash (generate autocompletion script for bash) powershell (generate autocompletion script for powershell) -fish (generate autocompletion script for fish) zsh (generate autocompletion script for zsh) - -$ source <(helm completion bash --no-descriptions) -$ helm completion [tab][tab] -bash fish powershell zsh -``` -## Bash completions - -### Dependencies - -The bash completion script generated by Cobra requires the `bash_completion` package. You should update the help text of your completion command to show how to install the `bash_completion` package ([Kubectl docs](https://kubernetes.io/docs/tasks/tools/install-kubectl/#enabling-shell-autocompletion)) - -### Aliases - -You can also configure `bash` aliases for your program and they will also support completions. - -```bash -alias aliasname=origcommand -complete -o default -F __start_origcommand aliasname - -# and now when you run `aliasname` completion will make -# suggestions as it did for `origcommand`. - -$ aliasname -completion firstcommand secondcommand -``` -### Bash legacy dynamic completions - -For backward compatibility, Cobra still supports its bash legacy dynamic completion solution. -Please refer to [Bash Completions](bash_completions.md) for details. - -### Bash completion V2 - -Cobra provides two versions for bash completion. The original bash completion (which started it all!) can be used by calling -`GenBashCompletion()` or `GenBashCompletionFile()`. - -A new V2 bash completion version is also available. This version can be used by calling `GenBashCompletionV2()` or -`GenBashCompletionFileV2()`. The V2 version does **not** support the legacy dynamic completion -(see [Bash Completions](bash_completions.md)) but instead works only with the Go dynamic completion -solution described in this document. -Unless your program already uses the legacy dynamic completion solution, it is recommended that you use the bash -completion V2 solution which provides the following extra features: -- Supports completion descriptions (like the other shells) -- Small completion script of less than 300 lines (v1 generates scripts of thousands of lines; `kubectl` for example has a bash v1 completion script of over 13K lines) -- Streamlined user experience thanks to a completion behavior aligned with the other shells - -`Bash` completion V2 supports descriptions for completions. When calling `GenBashCompletionV2()` or `GenBashCompletionFileV2()` -you must provide these functions with a parameter indicating if the completions should be annotated with a description; Cobra -will provide the description automatically based on usage information. You can choose to make this option configurable by -your users. - -``` -# With descriptions -$ helm s[tab][tab] -search (search for a keyword in charts) status (display the status of the named release) -show (show information of a chart) - -# Without descriptions -$ helm s[tab][tab] -search show status -``` -**Note**: Cobra's default `completion` command uses bash completion V2. If for some reason you need to use bash completion V1, you will need to implement your own `completion` command. -## Zsh completions - -Cobra supports native zsh completion generated from the root `cobra.Command`. -The generated completion script should be put somewhere in your `$fpath` and be named -`_`. You will need to start a new shell for the completions to become available. - -Zsh supports descriptions for completions. Cobra will provide the description automatically, -based on usage information. Cobra provides a way to completely disable such descriptions by -using `GenZshCompletionNoDesc()` or `GenZshCompletionFileNoDesc()`. You can choose to make -this a configurable option to your users. -``` -# With descriptions -$ helm s[tab] -search -- search for a keyword in charts -show -- show information of a chart -status -- displays the status of the named release - -# Without descriptions -$ helm s[tab] -search show status -``` -*Note*: Because of backward-compatibility requirements, we were forced to have a different API to disable completion descriptions between `zsh` and `fish`. - -### Limitations - -* Custom completions implemented in Bash scripting (legacy) are not supported and will be ignored for `zsh` (including the use of the `BashCompCustom` flag annotation). - * You should instead use `ValidArgsFunction` and `RegisterFlagCompletionFunc()` which are portable to the different shells (`bash`, `zsh`, `fish`, `powershell`). -* The function `MarkFlagCustom()` is not supported and will be ignored for `zsh`. - * You should instead use `RegisterFlagCompletionFunc()`. - -### Zsh completions standardization - -Cobra 1.1 standardized its zsh completion support to align it with its other shell completions. Although the API was kept backward-compatible, some small changes in behavior were introduced. -Please refer to [Zsh Completions](zsh_completions.md) for details. - -## fish completions - -Cobra supports native fish completions generated from the root `cobra.Command`. You can use the `command.GenFishCompletion()` or `command.GenFishCompletionFile()` functions. You must provide these functions with a parameter indicating if the completions should be annotated with a description; Cobra will provide the description automatically based on usage information. You can choose to make this option configurable by your users. -``` -# With descriptions -$ helm s[tab] -search (search for a keyword in charts) show (show information of a chart) status (displays the status of the named release) - -# Without descriptions -$ helm s[tab] -search show status -``` -*Note*: Because of backward-compatibility requirements, we were forced to have a different API to disable completion descriptions between `zsh` and `fish`. - -### Limitations - -* Custom completions implemented in bash scripting (legacy) are not supported and will be ignored for `fish` (including the use of the `BashCompCustom` flag annotation). - * You should instead use `ValidArgsFunction` and `RegisterFlagCompletionFunc()` which are portable to the different shells (`bash`, `zsh`, `fish`, `powershell`). -* The function `MarkFlagCustom()` is not supported and will be ignored for `fish`. - * You should instead use `RegisterFlagCompletionFunc()`. -* The following flag completion annotations are not supported and will be ignored for `fish`: - * `BashCompFilenameExt` (filtering by file extension) - * `BashCompSubdirsInDir` (filtering by directory) -* The functions corresponding to the above annotations are consequently not supported and will be ignored for `fish`: - * `MarkFlagFilename()` and `MarkPersistentFlagFilename()` (filtering by file extension) - * `MarkFlagDirname()` and `MarkPersistentFlagDirname()` (filtering by directory) -* Similarly, the following completion directives are not supported and will be ignored for `fish`: - * `ShellCompDirectiveFilterFileExt` (filtering by file extension) - * `ShellCompDirectiveFilterDirs` (filtering by directory) - -## PowerShell completions - -Cobra supports native PowerShell completions generated from the root `cobra.Command`. You can use the `command.GenPowerShellCompletion()` or `command.GenPowerShellCompletionFile()` functions. To include descriptions use `command.GenPowerShellCompletionWithDesc()` and `command.GenPowerShellCompletionFileWithDesc()`. Cobra will provide the description automatically based on usage information. You can choose to make this option configurable by your users. - -The script is designed to support all three PowerShell completion modes: - -* TabCompleteNext (default windows style - on each key press the next option is displayed) -* Complete (works like bash) -* MenuComplete (works like zsh) - -You set the mode with `Set-PSReadLineKeyHandler -Key Tab -Function `. Descriptions are only displayed when using the `Complete` or `MenuComplete` mode. - -Users need PowerShell version 5.0 or above, which comes with Windows 10 and can be downloaded separately for Windows 7 or 8.1. They can then write the completions to a file and source this file from their PowerShell profile, which is referenced by the `$Profile` environment variable. See `Get-Help about_Profiles` for more info about PowerShell profiles. - -``` -# With descriptions and Mode 'Complete' -$ helm s[tab] -search (search for a keyword in charts) show (show information of a chart) status (displays the status of the named release) - -# With descriptions and Mode 'MenuComplete' The description of the current selected value will be displayed below the suggestions. -$ helm s[tab] -search show status - -search for a keyword in charts - -# Without descriptions -$ helm s[tab] -search show status -``` -### Aliases - -You can also configure `powershell` aliases for your program and they will also support completions. - -``` -$ sal aliasname origcommand -$ Register-ArgumentCompleter -CommandName 'aliasname' -ScriptBlock $__origcommandCompleterBlock - -# and now when you run `aliasname` completion will make -# suggestions as it did for `origcommand`. - -$ aliasname -completion firstcommand secondcommand -``` -The name of the completer block variable is of the form `$__CompleterBlock` where every `-` and `:` in the program name have been replaced with `_`, to respect powershell naming syntax. - -### Limitations - -* Custom completions implemented in bash scripting (legacy) are not supported and will be ignored for `powershell` (including the use of the `BashCompCustom` flag annotation). - * You should instead use `ValidArgsFunction` and `RegisterFlagCompletionFunc()` which are portable to the different shells (`bash`, `zsh`, `fish`, `powershell`). -* The function `MarkFlagCustom()` is not supported and will be ignored for `powershell`. - * You should instead use `RegisterFlagCompletionFunc()`. -* The following flag completion annotations are not supported and will be ignored for `powershell`: - * `BashCompFilenameExt` (filtering by file extension) - * `BashCompSubdirsInDir` (filtering by directory) -* The functions corresponding to the above annotations are consequently not supported and will be ignored for `powershell`: - * `MarkFlagFilename()` and `MarkPersistentFlagFilename()` (filtering by file extension) - * `MarkFlagDirname()` and `MarkPersistentFlagDirname()` (filtering by directory) -* Similarly, the following completion directives are not supported and will be ignored for `powershell`: - * `ShellCompDirectiveFilterFileExt` (filtering by file extension) - * `ShellCompDirectiveFilterDirs` (filtering by directory) diff --git a/vendor/github.com/spf13/cobra/user_guide.md b/vendor/github.com/spf13/cobra/user_guide.md deleted file mode 100644 index 85201d8..0000000 --- a/vendor/github.com/spf13/cobra/user_guide.md +++ /dev/null @@ -1,726 +0,0 @@ -# User Guide - -While you are welcome to provide your own organization, typically a Cobra-based -application will follow the following organizational structure: - -``` - ▾ appName/ - ▾ cmd/ - add.go - your.go - commands.go - here.go - main.go -``` - -In a Cobra app, typically the main.go file is very bare. It serves one purpose: initializing Cobra. - -```go -package main - -import ( - "{pathToYourApp}/cmd" -) - -func main() { - cmd.Execute() -} -``` - -## Using the Cobra Generator - -Cobra-CLI is its own program that will create your application and add any -commands you want. It's the easiest way to incorporate Cobra into your application. - -For complete details on using the Cobra generator, please refer to [The Cobra-CLI Generator README](https://github.com/spf13/cobra-cli/blob/main/README.md) - -## Using the Cobra Library - -To manually implement Cobra you need to create a bare main.go file and a rootCmd file. -You will optionally provide additional commands as you see fit. - -### Create rootCmd - -Cobra doesn't require any special constructors. Simply create your commands. - -Ideally you place this in app/cmd/root.go: - -```go -var rootCmd = &cobra.Command{ - Use: "hugo", - Short: "Hugo is a very fast static site generator", - Long: `A Fast and Flexible Static Site Generator built with - love by spf13 and friends in Go. - Complete documentation is available at https://gohugo.io/documentation/`, - Run: func(cmd *cobra.Command, args []string) { - // Do Stuff Here - }, -} - -func Execute() { - if err := rootCmd.Execute(); err != nil { - fmt.Fprintln(os.Stderr, err) - os.Exit(1) - } -} -``` - -You will additionally define flags and handle configuration in your init() function. - -For example cmd/root.go: - -```go -package cmd - -import ( - "fmt" - "os" - - "github.com/spf13/cobra" - "github.com/spf13/viper" -) - -var ( - // Used for flags. - cfgFile string - userLicense string - - rootCmd = &cobra.Command{ - Use: "cobra-cli", - Short: "A generator for Cobra based Applications", - Long: `Cobra is a CLI library for Go that empowers applications. -This application is a tool to generate the needed files -to quickly create a Cobra application.`, - } -) - -// Execute executes the root command. -func Execute() error { - return rootCmd.Execute() -} - -func init() { - cobra.OnInitialize(initConfig) - - rootCmd.PersistentFlags().StringVar(&cfgFile, "config", "", "config file (default is $HOME/.cobra.yaml)") - rootCmd.PersistentFlags().StringP("author", "a", "YOUR NAME", "author name for copyright attribution") - rootCmd.PersistentFlags().StringVarP(&userLicense, "license", "l", "", "name of license for the project") - rootCmd.PersistentFlags().Bool("viper", true, "use Viper for configuration") - viper.BindPFlag("author", rootCmd.PersistentFlags().Lookup("author")) - viper.BindPFlag("useViper", rootCmd.PersistentFlags().Lookup("viper")) - viper.SetDefault("author", "NAME HERE ") - viper.SetDefault("license", "apache") - - rootCmd.AddCommand(addCmd) - rootCmd.AddCommand(initCmd) -} - -func initConfig() { - if cfgFile != "" { - // Use config file from the flag. - viper.SetConfigFile(cfgFile) - } else { - // Find home directory. - home, err := os.UserHomeDir() - cobra.CheckErr(err) - - // Search config in home directory with name ".cobra" (without extension). - viper.AddConfigPath(home) - viper.SetConfigType("yaml") - viper.SetConfigName(".cobra") - } - - viper.AutomaticEnv() - - if err := viper.ReadInConfig(); err == nil { - fmt.Println("Using config file:", viper.ConfigFileUsed()) - } -} -``` - -### Create your main.go - -With the root command you need to have your main function execute it. -Execute should be run on the root for clarity, though it can be called on any command. - -In a Cobra app, typically the main.go file is very bare. It serves one purpose: to initialize Cobra. - -```go -package main - -import ( - "{pathToYourApp}/cmd" -) - -func main() { - cmd.Execute() -} -``` - -### Create additional commands - -Additional commands can be defined and typically are each given their own file -inside of the cmd/ directory. - -If you wanted to create a version command you would create cmd/version.go and -populate it with the following: - -```go -package cmd - -import ( - "fmt" - - "github.com/spf13/cobra" -) - -func init() { - rootCmd.AddCommand(versionCmd) -} - -var versionCmd = &cobra.Command{ - Use: "version", - Short: "Print the version number of Hugo", - Long: `All software has versions. This is Hugo's`, - Run: func(cmd *cobra.Command, args []string) { - fmt.Println("Hugo Static Site Generator v0.9 -- HEAD") - }, -} -``` - -### Organizing subcommands - -A command may have subcommands which in turn may have other subcommands. This is achieved by using -`AddCommand`. In some cases, especially in larger applications, each subcommand may be defined in -its own go package. - -The suggested approach is for the parent command to use `AddCommand` to add its most immediate -subcommands. For example, consider the following directory structure: - -```text -├── cmd -│   ├── root.go -│   └── sub1 -│   ├── sub1.go -│   └── sub2 -│   ├── leafA.go -│   ├── leafB.go -│   └── sub2.go -└── main.go -``` - -In this case: - -* The `init` function of `root.go` adds the command defined in `sub1.go` to the root command. -* The `init` function of `sub1.go` adds the command defined in `sub2.go` to the sub1 command. -* The `init` function of `sub2.go` adds the commands defined in `leafA.go` and `leafB.go` to the - sub2 command. - -This approach ensures the subcommands are always included at compile time while avoiding cyclic -references. - -### Returning and handling errors - -If you wish to return an error to the caller of a command, `RunE` can be used. - -```go -package cmd - -import ( - "fmt" - - "github.com/spf13/cobra" -) - -func init() { - rootCmd.AddCommand(tryCmd) -} - -var tryCmd = &cobra.Command{ - Use: "try", - Short: "Try and possibly fail at something", - RunE: func(cmd *cobra.Command, args []string) error { - if err := someFunc(); err != nil { - return err - } - return nil - }, -} -``` - -The error can then be caught at the execute function call. - -## Working with Flags - -Flags provide modifiers to control how the action command operates. - -### Assign flags to a command - -Since the flags are defined and used in different locations, we need to -define a variable outside with the correct scope to assign the flag to -work with. - -```go -var Verbose bool -var Source string -``` - -There are two different approaches to assign a flag. - -### Persistent Flags - -A flag can be 'persistent', meaning that this flag will be available to the -command it's assigned to as well as every command under that command. For -global flags, assign a flag as a persistent flag on the root. - -```go -rootCmd.PersistentFlags().BoolVarP(&Verbose, "verbose", "v", false, "verbose output") -``` - -### Local Flags - -A flag can also be assigned locally, which will only apply to that specific command. - -```go -localCmd.Flags().StringVarP(&Source, "source", "s", "", "Source directory to read from") -``` - -### Local Flag on Parent Commands - -By default, Cobra only parses local flags on the target command, and any local flags on -parent commands are ignored. By enabling `Command.TraverseChildren`, Cobra will -parse local flags on each command before executing the target command. - -```go -command := cobra.Command{ - Use: "print [OPTIONS] [COMMANDS]", - TraverseChildren: true, -} -``` - -### Bind Flags with Config - -You can also bind your flags with [viper](https://github.com/spf13/viper): -```go -var author string - -func init() { - rootCmd.PersistentFlags().StringVar(&author, "author", "YOUR NAME", "Author name for copyright attribution") - viper.BindPFlag("author", rootCmd.PersistentFlags().Lookup("author")) -} -``` - -In this example, the persistent flag `author` is bound with `viper`. -**Note**: the variable `author` will not be set to the value from config, -when the `--author` flag is provided by user. - -More in [viper documentation](https://github.com/spf13/viper#working-with-flags). - -### Required flags - -Flags are optional by default. If instead you wish your command to report an error -when a flag has not been set, mark it as required: -```go -rootCmd.Flags().StringVarP(&Region, "region", "r", "", "AWS region (required)") -rootCmd.MarkFlagRequired("region") -``` - -Or, for persistent flags: -```go -rootCmd.PersistentFlags().StringVarP(&Region, "region", "r", "", "AWS region (required)") -rootCmd.MarkPersistentFlagRequired("region") -``` - -### Flag Groups - -If you have different flags that must be provided together (e.g. if they provide the `--username` flag they MUST provide the `--password` flag as well) then -Cobra can enforce that requirement: -```go -rootCmd.Flags().StringVarP(&u, "username", "u", "", "Username (required if password is set)") -rootCmd.Flags().StringVarP(&pw, "password", "p", "", "Password (required if username is set)") -rootCmd.MarkFlagsRequiredTogether("username", "password") -``` - -You can also prevent different flags from being provided together if they represent mutually -exclusive options such as specifying an output format as either `--json` or `--yaml` but never both: -```go -rootCmd.Flags().BoolVar(&ofJson, "json", false, "Output in JSON") -rootCmd.Flags().BoolVar(&ofYaml, "yaml", false, "Output in YAML") -rootCmd.MarkFlagsMutuallyExclusive("json", "yaml") -``` - -In both of these cases: - - both local and persistent flags can be used - - **NOTE:** the group is only enforced on commands where every flag is defined - - a flag may appear in multiple groups - - a group may contain any number of flags - -## Positional and Custom Arguments - -Validation of positional arguments can be specified using the `Args` field of `Command`. -The following validators are built in: - -- Number of arguments: - - `NoArgs` - report an error if there are any positional args. - - `ArbitraryArgs` - accept any number of args. - - `MinimumNArgs(int)` - report an error if less than N positional args are provided. - - `MaximumNArgs(int)` - report an error if more than N positional args are provided. - - `ExactArgs(int)` - report an error if there are not exactly N positional args. - - `RangeArgs(min, max)` - report an error if the number of args is not between `min` and `max`. -- Content of the arguments: - - `OnlyValidArgs` - report an error if there are any positional args not specified in the `ValidArgs` field of `Command`, which can optionally be set to a list of valid values for positional args. - -If `Args` is undefined or `nil`, it defaults to `ArbitraryArgs`. - -Moreover, `MatchAll(pargs ...PositionalArgs)` enables combining existing checks with arbitrary other checks. -For instance, if you want to report an error if there are not exactly N positional args OR if there are any positional -args that are not in the `ValidArgs` field of `Command`, you can call `MatchAll` on `ExactArgs` and `OnlyValidArgs`, as -shown below: - -```go -var cmd = &cobra.Command{ - Short: "hello", - Args: cobra.MatchAll(cobra.ExactArgs(2), cobra.OnlyValidArgs), - Run: func(cmd *cobra.Command, args []string) { - fmt.Println("Hello, World!") - }, -} -``` - -It is possible to set any custom validator that satisfies `func(cmd *cobra.Command, args []string) error`. -For example: - -```go -var cmd = &cobra.Command{ - Short: "hello", - Args: func(cmd *cobra.Command, args []string) error { - // Optionally run one of the validators provided by cobra - if err := cobra.MinimumNArgs(1)(cmd, args); err != nil { - return err - } - // Run the custom validation logic - if myapp.IsValidColor(args[0]) { - return nil - } - return fmt.Errorf("invalid color specified: %s", args[0]) - }, - Run: func(cmd *cobra.Command, args []string) { - fmt.Println("Hello, World!") - }, -} -``` - -## Example - -In the example below, we have defined three commands. Two are at the top level -and one (cmdTimes) is a child of one of the top commands. In this case the root -is not executable, meaning that a subcommand is required. This is accomplished -by not providing a 'Run' for the 'rootCmd'. - -We have only defined one flag for a single command. - -More documentation about flags is available at https://github.com/spf13/pflag - -```go -package main - -import ( - "fmt" - "strings" - - "github.com/spf13/cobra" -) - -func main() { - var echoTimes int - - var cmdPrint = &cobra.Command{ - Use: "print [string to print]", - Short: "Print anything to the screen", - Long: `print is for printing anything back to the screen. -For many years people have printed back to the screen.`, - Args: cobra.MinimumNArgs(1), - Run: func(cmd *cobra.Command, args []string) { - fmt.Println("Print: " + strings.Join(args, " ")) - }, - } - - var cmdEcho = &cobra.Command{ - Use: "echo [string to echo]", - Short: "Echo anything to the screen", - Long: `echo is for echoing anything back. -Echo works a lot like print, except it has a child command.`, - Args: cobra.MinimumNArgs(1), - Run: func(cmd *cobra.Command, args []string) { - fmt.Println("Echo: " + strings.Join(args, " ")) - }, - } - - var cmdTimes = &cobra.Command{ - Use: "times [string to echo]", - Short: "Echo anything to the screen more times", - Long: `echo things multiple times back to the user by providing -a count and a string.`, - Args: cobra.MinimumNArgs(1), - Run: func(cmd *cobra.Command, args []string) { - for i := 0; i < echoTimes; i++ { - fmt.Println("Echo: " + strings.Join(args, " ")) - } - }, - } - - cmdTimes.Flags().IntVarP(&echoTimes, "times", "t", 1, "times to echo the input") - - var rootCmd = &cobra.Command{Use: "app"} - rootCmd.AddCommand(cmdPrint, cmdEcho) - cmdEcho.AddCommand(cmdTimes) - rootCmd.Execute() -} -``` - -For a more complete example of a larger application, please checkout [Hugo](https://gohugo.io/). - -## Help Command - -Cobra automatically adds a help command to your application when you have subcommands. -This will be called when a user runs 'app help'. Additionally, help will also -support all other commands as input. Say, for instance, you have a command called -'create' without any additional configuration; Cobra will work when 'app help -create' is called. Every command will automatically have the '--help' flag added. - -### Example - -The following output is automatically generated by Cobra. Nothing beyond the -command and flag definitions are needed. - - $ cobra-cli help - - Cobra is a CLI library for Go that empowers applications. - This application is a tool to generate the needed files - to quickly create a Cobra application. - - Usage: - cobra-cli [command] - - Available Commands: - add Add a command to a Cobra Application - completion Generate the autocompletion script for the specified shell - help Help about any command - init Initialize a Cobra Application - - Flags: - -a, --author string author name for copyright attribution (default "YOUR NAME") - --config string config file (default is $HOME/.cobra.yaml) - -h, --help help for cobra-cli - -l, --license string name of license for the project - --viper use Viper for configuration - - Use "cobra-cli [command] --help" for more information about a command. - - -Help is just a command like any other. There is no special logic or behavior -around it. In fact, you can provide your own if you want. - -### Grouping commands in help - -Cobra supports grouping of available commands in the help output. To group commands, each group must be explicitly -defined using `AddGroup()` on the parent command. Then a subcommand can be added to a group using the `GroupID` element -of that subcommand. The groups will appear in the help output in the same order as they are defined using different -calls to `AddGroup()`. If you use the generated `help` or `completion` commands, you can set their group ids using -`SetHelpCommandGroupId()` and `SetCompletionCommandGroupId()` on the root command, respectively. - -### Defining your own help - -You can provide your own Help command or your own template for the default command to use -with the following functions: - -```go -cmd.SetHelpCommand(cmd *Command) -cmd.SetHelpFunc(f func(*Command, []string)) -cmd.SetHelpTemplate(s string) -``` - -The latter two will also apply to any children commands. - -## Usage Message - -When the user provides an invalid flag or invalid command, Cobra responds by -showing the user the 'usage'. - -### Example -You may recognize this from the help above. That's because the default help -embeds the usage as part of its output. - - $ cobra-cli --invalid - Error: unknown flag: --invalid - Usage: - cobra-cli [command] - - Available Commands: - add Add a command to a Cobra Application - completion Generate the autocompletion script for the specified shell - help Help about any command - init Initialize a Cobra Application - - Flags: - -a, --author string author name for copyright attribution (default "YOUR NAME") - --config string config file (default is $HOME/.cobra.yaml) - -h, --help help for cobra-cli - -l, --license string name of license for the project - --viper use Viper for configuration - - Use "cobra [command] --help" for more information about a command. - -### Defining your own usage -You can provide your own usage function or template for Cobra to use. -Like help, the function and template are overridable through public methods: - -```go -cmd.SetUsageFunc(f func(*Command) error) -cmd.SetUsageTemplate(s string) -``` - -## Version Flag - -Cobra adds a top-level '--version' flag if the Version field is set on the root command. -Running an application with the '--version' flag will print the version to stdout using -the version template. The template can be customized using the -`cmd.SetVersionTemplate(s string)` function. - -## PreRun and PostRun Hooks - -It is possible to run functions before or after the main `Run` function of your command. The `PersistentPreRun` and `PreRun` functions will be executed before `Run`. `PersistentPostRun` and `PostRun` will be executed after `Run`. The `Persistent*Run` functions will be inherited by children if they do not declare their own. These functions are run in the following order: - -- `PersistentPreRun` -- `PreRun` -- `Run` -- `PostRun` -- `PersistentPostRun` - -An example of two commands which use all of these features is below. When the subcommand is executed, it will run the root command's `PersistentPreRun` but not the root command's `PersistentPostRun`: - -```go -package main - -import ( - "fmt" - - "github.com/spf13/cobra" -) - -func main() { - - var rootCmd = &cobra.Command{ - Use: "root [sub]", - Short: "My root command", - PersistentPreRun: func(cmd *cobra.Command, args []string) { - fmt.Printf("Inside rootCmd PersistentPreRun with args: %v\n", args) - }, - PreRun: func(cmd *cobra.Command, args []string) { - fmt.Printf("Inside rootCmd PreRun with args: %v\n", args) - }, - Run: func(cmd *cobra.Command, args []string) { - fmt.Printf("Inside rootCmd Run with args: %v\n", args) - }, - PostRun: func(cmd *cobra.Command, args []string) { - fmt.Printf("Inside rootCmd PostRun with args: %v\n", args) - }, - PersistentPostRun: func(cmd *cobra.Command, args []string) { - fmt.Printf("Inside rootCmd PersistentPostRun with args: %v\n", args) - }, - } - - var subCmd = &cobra.Command{ - Use: "sub [no options!]", - Short: "My subcommand", - PreRun: func(cmd *cobra.Command, args []string) { - fmt.Printf("Inside subCmd PreRun with args: %v\n", args) - }, - Run: func(cmd *cobra.Command, args []string) { - fmt.Printf("Inside subCmd Run with args: %v\n", args) - }, - PostRun: func(cmd *cobra.Command, args []string) { - fmt.Printf("Inside subCmd PostRun with args: %v\n", args) - }, - PersistentPostRun: func(cmd *cobra.Command, args []string) { - fmt.Printf("Inside subCmd PersistentPostRun with args: %v\n", args) - }, - } - - rootCmd.AddCommand(subCmd) - - rootCmd.SetArgs([]string{""}) - rootCmd.Execute() - fmt.Println() - rootCmd.SetArgs([]string{"sub", "arg1", "arg2"}) - rootCmd.Execute() -} -``` - -Output: -``` -Inside rootCmd PersistentPreRun with args: [] -Inside rootCmd PreRun with args: [] -Inside rootCmd Run with args: [] -Inside rootCmd PostRun with args: [] -Inside rootCmd PersistentPostRun with args: [] - -Inside rootCmd PersistentPreRun with args: [arg1 arg2] -Inside subCmd PreRun with args: [arg1 arg2] -Inside subCmd Run with args: [arg1 arg2] -Inside subCmd PostRun with args: [arg1 arg2] -Inside subCmd PersistentPostRun with args: [arg1 arg2] -``` - -## Suggestions when "unknown command" happens - -Cobra will print automatic suggestions when "unknown command" errors happen. This allows Cobra to behave similarly to the `git` command when a typo happens. For example: - -``` -$ hugo srever -Error: unknown command "srever" for "hugo" - -Did you mean this? - server - -Run 'hugo --help' for usage. -``` - -Suggestions are automatically generated based on existing subcommands and use an implementation of [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance). Every registered command that matches a minimum distance of 2 (ignoring case) will be displayed as a suggestion. - -If you need to disable suggestions or tweak the string distance in your command, use: - -```go -command.DisableSuggestions = true -``` - -or - -```go -command.SuggestionsMinimumDistance = 1 -``` - -You can also explicitly set names for which a given command will be suggested using the `SuggestFor` attribute. This allows suggestions for strings that are not close in terms of string distance, but make sense in your set of commands but for which -you don't want aliases. Example: - -``` -$ kubectl remove -Error: unknown command "remove" for "kubectl" - -Did you mean this? - delete - -Run 'kubectl help' for usage. -``` - -## Generating documentation for your command - -Cobra can generate documentation based on subcommands, flags, etc. Read more about it in the [docs generation documentation](doc/README.md). - -## Generating shell completions - -Cobra can generate a shell-completion file for the following shells: bash, zsh, fish, PowerShell. If you add more information to your commands, these completions can be amazingly powerful and flexible. Read more about it in [Shell Completions](shell_completions.md). - -## Providing Active Help - -Cobra makes use of the shell-completion system to define a framework allowing you to provide Active Help to your users. Active Help are messages (hints, warnings, etc) printed as the program is being used. Read more about it in [Active Help](active_help.md). diff --git a/vendor/github.com/spf13/cobra/zsh_completions.md b/vendor/github.com/spf13/cobra/zsh_completions.md deleted file mode 100644 index 7cff617..0000000 --- a/vendor/github.com/spf13/cobra/zsh_completions.md +++ /dev/null @@ -1,48 +0,0 @@ -## Generating Zsh Completion For Your cobra.Command - -Please refer to [Shell Completions](shell_completions.md) for details. - -## Zsh completions standardization - -Cobra 1.1 standardized its zsh completion support to align it with its other shell completions. Although the API was kept backwards-compatible, some small changes in behavior were introduced. - -### Deprecation summary - -See further below for more details on these deprecations. - -* `cmd.MarkZshCompPositionalArgumentFile(pos, []string{})` is no longer needed. It is therefore **deprecated** and silently ignored. -* `cmd.MarkZshCompPositionalArgumentFile(pos, glob[])` is **deprecated** and silently ignored. - * Instead use `ValidArgsFunction` with `ShellCompDirectiveFilterFileExt`. -* `cmd.MarkZshCompPositionalArgumentWords()` is **deprecated** and silently ignored. - * Instead use `ValidArgsFunction`. - -### Behavioral changes - -**Noun completion** -|Old behavior|New behavior| -|---|---| -|No file completion by default (opposite of bash)|File completion by default; use `ValidArgsFunction` with `ShellCompDirectiveNoFileComp` to turn off file completion on a per-argument basis| -|Completion of flag names without the `-` prefix having been typed|Flag names are only completed if the user has typed the first `-`| -`cmd.MarkZshCompPositionalArgumentFile(pos, []string{})` used to turn on file completion on a per-argument position basis|File completion for all arguments by default; `cmd.MarkZshCompPositionalArgumentFile()` is **deprecated** and silently ignored| -|`cmd.MarkZshCompPositionalArgumentFile(pos, glob[])` used to turn on file completion **with glob filtering** on a per-argument position basis (zsh-specific)|`cmd.MarkZshCompPositionalArgumentFile()` is **deprecated** and silently ignored; use `ValidArgsFunction` with `ShellCompDirectiveFilterFileExt` for file **extension** filtering (not full glob filtering)| -|`cmd.MarkZshCompPositionalArgumentWords(pos, words[])` used to provide completion choices on a per-argument position basis (zsh-specific)|`cmd.MarkZshCompPositionalArgumentWords()` is **deprecated** and silently ignored; use `ValidArgsFunction` to achieve the same behavior| - -**Flag-value completion** - -|Old behavior|New behavior| -|---|---| -|No file completion by default (opposite of bash)|File completion by default; use `RegisterFlagCompletionFunc()` with `ShellCompDirectiveNoFileComp` to turn off file completion| -|`cmd.MarkFlagFilename(flag, []string{})` and similar used to turn on file completion|File completion by default; `cmd.MarkFlagFilename(flag, []string{})` no longer needed in this context and silently ignored| -|`cmd.MarkFlagFilename(flag, glob[])` used to turn on file completion **with glob filtering** (syntax of `[]string{"*.yaml", "*.yml"}` incompatible with bash)|Will continue to work, however, support for bash syntax is added and should be used instead so as to work for all shells (`[]string{"yaml", "yml"}`)| -|`cmd.MarkFlagDirname(flag)` only completes directories (zsh-specific)|Has been added for all shells| -|Completion of a flag name does not repeat, unless flag is of type `*Array` or `*Slice` (not supported by bash)|Retained for `zsh` and added to `fish`| -|Completion of a flag name does not provide the `=` form (unlike bash)|Retained for `zsh` and added to `fish`| - -**Improvements** - -* Custom completion support (`ValidArgsFunction` and `RegisterFlagCompletionFunc()`) -* File completion by default if no other completions found -* Handling of required flags -* File extension filtering no longer mutually exclusive with bash usage -* Completion of directory names *within* another directory -* Support for `=` form of flags diff --git a/vendor/go.etcd.io/bbolt/.go-version b/vendor/go.etcd.io/bbolt/.go-version new file mode 100644 index 0000000..013173a --- /dev/null +++ b/vendor/go.etcd.io/bbolt/.go-version @@ -0,0 +1 @@ +1.22.6 diff --git a/vendor/go.etcd.io/bbolt/Makefile b/vendor/go.etcd.io/bbolt/Makefile index 18154c6..2140779 100644 --- a/vendor/go.etcd.io/bbolt/Makefile +++ b/vendor/go.etcd.io/bbolt/Makefile @@ -41,6 +41,15 @@ coverage: TEST_FREELIST_TYPE=array go test -v -timeout 30m \ -coverprofile cover-freelist-array.out -covermode atomic +BOLT_CMD=bbolt + +build: + go build -o bin/${BOLT_CMD} ./cmd/${BOLT_CMD} + +.PHONY: clean +clean: # Clean binaries + rm -f ./bin/${BOLT_CMD} + .PHONY: gofail-enable gofail-enable: install-gofail gofail enable . @@ -61,3 +70,7 @@ test-failpoint: @echo "[failpoint] array freelist test" TEST_FREELIST_TYPE=array go test -v ${TESTFLAGS} -timeout 30m ./tests/failpoint +.PHONY: test-robustness # Running robustness tests requires root permission +test-robustness: + go test -v ${TESTFLAGS} ./tests/dmflakey -test.root + go test -v ${TESTFLAGS} ./tests/robustness -test.root diff --git a/vendor/go.etcd.io/bbolt/README.md b/vendor/go.etcd.io/bbolt/README.md index 2be669a..495a93e 100644 --- a/vendor/go.etcd.io/bbolt/README.md +++ b/vendor/go.etcd.io/bbolt/README.md @@ -421,10 +421,19 @@ Prev() Move to the previous key. ``` Each of those functions has a return signature of `(key []byte, value []byte)`. -When you have iterated to the end of the cursor then `Next()` will return a -`nil` key. You must seek to a position using `First()`, `Last()`, or `Seek()` -before calling `Next()` or `Prev()`. If you do not seek to a position then -these functions will return a `nil` key. +You must seek to a position using `First()`, `Last()`, or `Seek()` before calling +`Next()` or `Prev()`. If you do not seek to a position then these functions will +return a `nil` key. + +When you have iterated to the end of the cursor, then `Next()` will return a +`nil` key and the cursor still points to the last element if present. When you +have iterated to the beginning of the cursor, then `Prev()` will return a `nil` +key and the cursor still points to the first element if present. + +If you remove key/value pairs during iteration, the cursor may automatically +move to the next position if present in current node each time removing a key. +When you call `c.Next()` after removing a key, it may skip one key/value pair. +Refer to [pull/611](https://github.com/etcd-io/bbolt/pull/611) to get more detailed info. During iteration, if the key is non-`nil` but the value is `nil`, that means the key refers to a bucket rather than a value. Use `Bucket.Bucket()` to @@ -850,6 +859,12 @@ Here are a few things to note when evaluating and using Bolt: to grow. However, it's important to note that deleting large chunks of data will not allow you to reclaim that space on disk. +* Removing key/values pairs in a bucket during iteration on the bucket using + cursor may not work properly. Each time when removing a key/value pair, the + cursor may automatically move to the next position if present. When users + call `c.Next()` after removing a key, it may skip one key/value pair. + Refer to https://github.com/etcd-io/bbolt/pull/611 for more detailed info. + For more information on page allocation, [see this comment][page-allocation]. [page-allocation]: https://github.com/boltdb/bolt/issues/308#issuecomment-74811638 diff --git a/vendor/go.etcd.io/bbolt/bolt_openbsd.go b/vendor/go.etcd.io/bbolt/bolt_openbsd.go index d7f5035..bf47aa1 100644 --- a/vendor/go.etcd.io/bbolt/bolt_openbsd.go +++ b/vendor/go.etcd.io/bbolt/bolt_openbsd.go @@ -1,22 +1,11 @@ package bbolt import ( - "syscall" - "unsafe" -) - -const ( - msAsync = 1 << iota // perform asynchronous writes - msSync // perform synchronous writes - msInvalidate // invalidate cached data + "golang.org/x/sys/unix" ) func msync(db *DB) error { - _, _, errno := syscall.Syscall(syscall.SYS_MSYNC, uintptr(unsafe.Pointer(db.data)), uintptr(db.datasz), msInvalidate) - if errno != 0 { - return errno - } - return nil + return unix.Msync(db.data[:db.datasz], unix.MS_INVALIDATE) } func fdatasync(db *DB) error { diff --git a/vendor/go.etcd.io/bbolt/bucket.go b/vendor/go.etcd.io/bbolt/bucket.go index 054467a..f3533d3 100644 --- a/vendor/go.etcd.io/bbolt/bucket.go +++ b/vendor/go.etcd.io/bbolt/bucket.go @@ -162,12 +162,17 @@ func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) { return nil, ErrBucketNameRequired } + // Insert into node. + // Tip: Use a new variable `newKey` instead of reusing the existing `key` to prevent + // it from being marked as leaking, and accordingly cannot be allocated on stack. + newKey := cloneBytes(key) + // Move cursor to correct position. c := b.Cursor() - k, _, flags := c.seek(key) + k, _, flags := c.seek(newKey) // Return an error if there is an existing key. - if bytes.Equal(key, k) { + if bytes.Equal(newKey, k) { if (flags & bucketLeafFlag) != 0 { return nil, ErrBucketExists } @@ -182,16 +187,14 @@ func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) { } var value = bucket.write() - // Insert into node. - key = cloneBytes(key) - c.node().put(key, key, value, 0, bucketLeafFlag) + c.node().put(newKey, newKey, value, 0, bucketLeafFlag) // Since subbuckets are not allowed on inline buckets, we need to // dereference the inline page, if it exists. This will cause the bucket // to be treated as a regular, non-inline bucket for the rest of the tx. b.page = nil - return b.Bucket(key), nil + return b.Bucket(newKey), nil } // CreateBucketIfNotExists creates a new bucket if it doesn't already exist and returns a reference to it. @@ -288,18 +291,23 @@ func (b *Bucket) Put(key []byte, value []byte) error { return ErrValueTooLarge } + // Insert into node. + // Tip: Use a new variable `newKey` instead of reusing the existing `key` to prevent + // it from being marked as leaking, and accordingly cannot be allocated on stack. + newKey := cloneBytes(key) + // Move cursor to correct position. c := b.Cursor() - k, _, flags := c.seek(key) + k, _, flags := c.seek(newKey) // Return an error if there is an existing key with a bucket value. - if bytes.Equal(key, k) && (flags&bucketLeafFlag) != 0 { + if bytes.Equal(newKey, k) && (flags&bucketLeafFlag) != 0 { return ErrIncompatibleValue } - // Insert into node. - key = cloneBytes(key) - c.node().put(key, key, value, 0, 0) + // gofail: var beforeBucketPut struct{} + + c.node().put(newKey, newKey, value, 0, 0) return nil } diff --git a/vendor/go.etcd.io/bbolt/cursor.go b/vendor/go.etcd.io/bbolt/cursor.go index 5dafb0c..bbfd92a 100644 --- a/vendor/go.etcd.io/bbolt/cursor.go +++ b/vendor/go.etcd.io/bbolt/cursor.go @@ -71,7 +71,7 @@ func (c *Cursor) Last() (key []byte, value []byte) { // If this is an empty page (calling Delete may result in empty pages) // we call prev to find the last page that is not empty - for len(c.stack) > 0 && c.stack[len(c.stack)-1].count() == 0 { + for len(c.stack) > 1 && c.stack[len(c.stack)-1].count() == 0 { c.prev() } @@ -254,6 +254,15 @@ func (c *Cursor) prev() (key []byte, value []byte, flags uint32) { elem.index-- break } + // If we've hit the beginning, we should stop moving the cursor, + // and stay at the first element, so that users can continue to + // iterate over the elements in reverse direction by calling `Next`. + // We should return nil in such case. + // Refer to https://github.com/etcd-io/bbolt/issues/733 + if len(c.stack) == 1 { + c.first() + return nil, nil, 0 + } c.stack = c.stack[:i] } diff --git a/vendor/go.etcd.io/bbolt/db.go b/vendor/go.etcd.io/bbolt/db.go index c942212..822798e 100644 --- a/vendor/go.etcd.io/bbolt/db.go +++ b/vendor/go.etcd.io/bbolt/db.go @@ -57,6 +57,12 @@ const ( // All data access is performed through transactions which can be obtained through the DB. // All the functions on DB will return a ErrDatabaseNotOpen if accessed before Open() is called. type DB struct { + // Put `stats` at the first field to ensure it's 64-bit aligned. Note that + // the first word in an allocated struct can be relied upon to be 64-bit + // aligned. Refer to https://pkg.go.dev/sync/atomic#pkg-note-BUG. Also + // refer to discussion in https://github.com/etcd-io/bbolt/issues/577. + stats Stats + // When enabled, the database will perform a Check() after every commit. // A panic is issued if the database is in an inconsistent state. This // flag has a large performance impact so it should only be used for @@ -147,7 +153,6 @@ type DB struct { opened bool rwtx *Tx txs []*Tx - stats Stats freelist *freelist freelistLoad sync.Once @@ -424,7 +429,7 @@ func (db *DB) hasSyncedFreelist() bool { // mmap opens the underlying memory-mapped file and initializes the meta references. // minsz is the minimum size that the new mmap can be. -func (db *DB) mmap(minsz int) error { +func (db *DB) mmap(minsz int) (err error) { db.mmaplock.Lock() defer db.mmaplock.Unlock() @@ -459,17 +464,27 @@ func (db *DB) mmap(minsz int) error { } // Unmap existing data before continuing. - if err := db.munmap(); err != nil { + if err = db.munmap(); err != nil { return err } // Memory-map the data file as a byte slice. // gofail: var mapError string // return errors.New(mapError) - if err := mmap(db, size); err != nil { + if err = mmap(db, size); err != nil { return err } + // Perform unmmap on any error to reset all data fields: + // dataref, data, datasz, meta0 and meta1. + defer func() { + if err != nil { + if unmapErr := db.munmap(); unmapErr != nil { + err = fmt.Errorf("%w; rollback unmap also failed: %v", err, unmapErr) + } + } + }() + if db.Mlock { // Don't allow swapping of data file if err := db.mlock(fileSize); err != nil { @@ -509,7 +524,7 @@ func (db *DB) munmap() error { // gofail: var unmapError string // return errors.New(unmapError) if err := munmap(db); err != nil { - return fmt.Errorf("unmap error: " + err.Error()) + return fmt.Errorf("unmap error: %v", err.Error()) } return nil @@ -553,15 +568,19 @@ func (db *DB) mmapSize(size int) (int, error) { } func (db *DB) munlock(fileSize int) error { + // gofail: var munlockError string + // return errors.New(munlockError) if err := munlock(db, fileSize); err != nil { - return fmt.Errorf("munlock error: " + err.Error()) + return fmt.Errorf("munlock error: %v", err.Error()) } return nil } func (db *DB) mlock(fileSize int) error { + // gofail: var mlockError string + // return errors.New(mlockError) if err := mlock(db, fileSize); err != nil { - return fmt.Errorf("mlock error: " + err.Error()) + return fmt.Errorf("mlock error: %v", err.Error()) } return nil } @@ -649,9 +668,10 @@ func (db *DB) close() error { // Clear ops. db.ops.writeAt = nil + var errs []error // Close the mmap. if err := db.munmap(); err != nil { - return err + errs = append(errs, err) } // Close file handles. @@ -660,18 +680,22 @@ func (db *DB) close() error { if !db.readOnly { // Unlock the file. if err := funlock(db); err != nil { - return fmt.Errorf("bolt.Close(): funlock error: %w", err) + errs = append(errs, fmt.Errorf("bolt.Close(): funlock error: %w", err)) } } // Close the file descriptor. if err := db.file.Close(); err != nil { - return fmt.Errorf("db file close: %s", err) + errs = append(errs, fmt.Errorf("db file close: %w", err)) } db.file = nil } db.path = "" + + if len(errs) > 0 { + return errs[0] + } return nil } @@ -1135,6 +1159,8 @@ func (db *DB) grow(sz int) error { // https://github.com/boltdb/bolt/issues/284 if !db.NoGrowSync && !db.readOnly { if runtime.GOOS != "windows" { + // gofail: var resizeFileError string + // return errors.New(resizeFileError) if err := db.file.Truncate(int64(sz)); err != nil { return fmt.Errorf("file resize error: %s", err) } @@ -1263,6 +1289,12 @@ var DefaultOptions = &Options{ // Stats represents statistics about the database. type Stats struct { + // Put `TxStats` at the first field to ensure it's 64-bit aligned. Note + // that the first word in an allocated struct can be relied upon to be + // 64-bit aligned. Refer to https://pkg.go.dev/sync/atomic#pkg-note-BUG. + // Also refer to discussion in https://github.com/etcd-io/bbolt/issues/577. + TxStats TxStats // global, ongoing stats. + // Freelist stats FreePageN int // total number of free pages on the freelist PendingPageN int // total number of pending pages on the freelist @@ -1272,8 +1304,6 @@ type Stats struct { // Transaction stats TxN int // total number of started read transactions OpenTxN int // number of currently open read transactions - - TxStats TxStats // global, ongoing stats. } // Sub calculates and returns the difference between two sets of database stats. diff --git a/vendor/go.etcd.io/bbolt/freelist.go b/vendor/go.etcd.io/bbolt/freelist.go index 50f2d0e..dffc7bc 100644 --- a/vendor/go.etcd.io/bbolt/freelist.go +++ b/vendor/go.etcd.io/bbolt/freelist.go @@ -252,6 +252,14 @@ func (f *freelist) rollback(txid txid) { } // Remove pages from pending list and mark as free if allocated by txid. delete(f.pending, txid) + + // Remove pgids which are allocated by this txid + for pgid, tid := range f.allocs { + if tid == txid { + delete(f.allocs, pgid) + } + } + f.mergeSpans(m) } @@ -282,9 +290,8 @@ func (f *freelist) read(p *page) { if count == 0 { f.ids = nil } else { - var ids []pgid - data := unsafeIndex(unsafe.Pointer(p), unsafe.Sizeof(*p), unsafe.Sizeof(ids[0]), idx) - unsafeSlice(unsafe.Pointer(&ids), data, count) + data := unsafeIndex(unsafe.Pointer(p), unsafe.Sizeof(*p), unsafe.Sizeof(pgid(0)), idx) + ids := unsafe.Slice((*pgid)(data), count) // copy the ids, so we don't modify on the freelist page directly idsCopy := make([]pgid, count) @@ -322,15 +329,13 @@ func (f *freelist) write(p *page) error { p.count = uint16(l) } else if l < 0xFFFF { p.count = uint16(l) - var ids []pgid data := unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) - unsafeSlice(unsafe.Pointer(&ids), data, l) + ids := unsafe.Slice((*pgid)(data), l) f.copyall(ids) } else { p.count = 0xFFFF - var ids []pgid data := unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) - unsafeSlice(unsafe.Pointer(&ids), data, l+1) + ids := unsafe.Slice((*pgid)(data), l+1) ids[0] = pgid(l) f.copyall(ids[1:]) } diff --git a/vendor/go.etcd.io/bbolt/page.go b/vendor/go.etcd.io/bbolt/page.go index 379645c..bb081b0 100644 --- a/vendor/go.etcd.io/bbolt/page.go +++ b/vendor/go.etcd.io/bbolt/page.go @@ -74,9 +74,8 @@ func (p *page) leafPageElements() []leafPageElement { if p.count == 0 { return nil } - var elems []leafPageElement data := unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) - unsafeSlice(unsafe.Pointer(&elems), data, int(p.count)) + elems := unsafe.Slice((*leafPageElement)(data), int(p.count)) return elems } @@ -91,9 +90,8 @@ func (p *page) branchPageElements() []branchPageElement { if p.count == 0 { return nil } - var elems []branchPageElement data := unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) - unsafeSlice(unsafe.Pointer(&elems), data, int(p.count)) + elems := unsafe.Slice((*branchPageElement)(data), int(p.count)) return elems } diff --git a/vendor/go.etcd.io/bbolt/tx.go b/vendor/go.etcd.io/bbolt/tx.go index 2fac8c0..766395d 100644 --- a/vendor/go.etcd.io/bbolt/tx.go +++ b/vendor/go.etcd.io/bbolt/tx.go @@ -1,6 +1,7 @@ package bbolt import ( + "errors" "fmt" "io" "os" @@ -185,6 +186,10 @@ func (tx *Tx) Commit() error { // If the high water mark has moved up then attempt to grow the database. if tx.meta.pgid > opgid { + _ = errors.New("") + // gofail: var lackOfDiskSpace string + // tx.rollback() + // return errors.New(lackOfDiskSpace) if err := tx.db.grow(int(tx.meta.pgid+1) * tx.db.pageSize); err != nil { tx.rollback() return err @@ -470,6 +475,7 @@ func (tx *Tx) write() error { // Ignore file sync if flag is set on DB. if !tx.db.NoSync || IgnoreNoSync { + // gofail: var beforeSyncDataPages struct{} if err := fdatasync(tx.db); err != nil { return err } @@ -507,6 +513,7 @@ func (tx *Tx) writeMeta() error { return err } if !tx.db.NoSync || IgnoreNoSync { + // gofail: var beforeSyncMetaPage struct{} if err := fdatasync(tx.db); err != nil { return err } diff --git a/vendor/go.etcd.io/bbolt/unsafe.go b/vendor/go.etcd.io/bbolt/unsafe.go index c0e5037..7745d32 100644 --- a/vendor/go.etcd.io/bbolt/unsafe.go +++ b/vendor/go.etcd.io/bbolt/unsafe.go @@ -1,7 +1,6 @@ package bbolt import ( - "reflect" "unsafe" ) @@ -26,14 +25,3 @@ func unsafeByteSlice(base unsafe.Pointer, offset uintptr, i, j int) []byte { // all), so this is believed to be correct. return (*[maxAllocSize]byte)(unsafeAdd(base, offset))[i:j:j] } - -// unsafeSlice modifies the data, len, and cap of a slice variable pointed to by -// the slice parameter. This helper should be used over other direct -// manipulation of reflect.SliceHeader to prevent misuse, namely, converting -// from reflect.SliceHeader to a Go slice type. -func unsafeSlice(slice, data unsafe.Pointer, len int) { - s := (*reflect.SliceHeader)(slice) - s.Data = uintptr(data) - s.Cap = len - s.Len = len -} diff --git a/vendor/go.uber.org/mock/mockgen/deprecated.go b/vendor/go.uber.org/mock/mockgen/deprecated.go new file mode 100644 index 0000000..0b45a2e --- /dev/null +++ b/vendor/go.uber.org/mock/mockgen/deprecated.go @@ -0,0 +1,41 @@ +package main + +import ( + "flag" + "log" + "os" +) + +const ( + deprecatedFlagProgOnly = "prog_only" + deprecatedFlagExecOnly = "exec_only" +) + +var ( + _ = flag.Bool("prog_only", false, "DEPRECATED (reflect mode) Only generate the reflection program; write it to stdout and exit.") + _ = flag.String("exec_only", "", "DEPRECATED (reflect mode) If set, execute this reflection program.") +) + +// notifyAboutDeprecatedFlags prints a warning message for a deprecated flags if they are set. +func notifyAboutDeprecatedFlags() { + const resetColorPostfix = "\033[0m" + logger := initWarningLogger() + + flag.Visit(func(f *flag.Flag) { + switch f.Name { + case deprecatedFlagProgOnly: + logger.Println("The -prog_only flag is deprecated and has no effect.", resetColorPostfix) + case deprecatedFlagExecOnly: + logger.Println("The -exec_only flag is deprecated and has no effect.", resetColorPostfix) + } + }) +} + +func initWarningLogger() *log.Logger { + const ( + yellowColor = "\033[33m" + warningPrefix = yellowColor + "WARNING: " + ) + + return log.New(os.Stdout, warningPrefix, log.Ldate|log.Ltime) +} diff --git a/vendor/go.uber.org/mock/mockgen/generic_go118.go b/vendor/go.uber.org/mock/mockgen/generic.go similarity index 84% rename from vendor/go.uber.org/mock/mockgen/generic_go118.go rename to vendor/go.uber.org/mock/mockgen/generic.go index b7b4494..c2289c2 100644 --- a/vendor/go.uber.org/mock/mockgen/generic_go118.go +++ b/vendor/go.uber.org/mock/mockgen/generic.go @@ -5,9 +5,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -//go:build go1.18 -// +build go1.18 - package main import ( @@ -15,7 +12,6 @@ import ( "fmt" "go/ast" "go/token" - "strings" "go.uber.org/mock/mockgen/model" ) @@ -67,29 +63,6 @@ func (p *fileParser) parseGenericType(pkg string, typ ast.Expr, tps map[string]m return nil, nil } -func getIdentTypeParams(decl any) string { - if decl == nil { - return "" - } - ts, ok := decl.(*ast.TypeSpec) - if !ok { - return "" - } - if ts.TypeParams == nil || len(ts.TypeParams.List) == 0 { - return "" - } - var sb strings.Builder - sb.WriteString("[") - for i, v := range ts.TypeParams.List { - if i != 0 { - sb.WriteString(", ") - } - sb.WriteString(v.Names[0].Name) - } - sb.WriteString("]") - return sb.String() -} - func (p *fileParser) parseGenericMethod(field *ast.Field, it *namedInterface, iface *model.Interface, pkg string, tps map[string]model.Type) ([]*model.Method, error) { var indices []ast.Expr var typ ast.Expr diff --git a/vendor/go.uber.org/mock/mockgen/generic_notgo118.go b/vendor/go.uber.org/mock/mockgen/generic_notgo118.go deleted file mode 100644 index 8a779c8..0000000 --- a/vendor/go.uber.org/mock/mockgen/generic_notgo118.go +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2022 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//go:build !go1.18 -// +build !go1.18 - -package main - -import ( - "fmt" - "go/ast" - - "go.uber.org/mock/mockgen/model" -) - -func getTypeSpecTypeParams(ts *ast.TypeSpec) []*ast.Field { - return nil -} - -func (p *fileParser) parseGenericType(pkg string, typ ast.Expr, tps map[string]model.Type) (model.Type, error) { - return nil, nil -} - -func getIdentTypeParams(decl any) string { - return "" -} - -func (p *fileParser) parseGenericMethod(field *ast.Field, it *namedInterface, iface *model.Interface, pkg string, tps map[string]model.Type) ([]*model.Method, error) { - return nil, fmt.Errorf("don't know how to mock method of type %T", field.Type) -} diff --git a/vendor/go.uber.org/mock/mockgen/gob.go b/vendor/go.uber.org/mock/mockgen/gob.go new file mode 100644 index 0000000..b5ab066 --- /dev/null +++ b/vendor/go.uber.org/mock/mockgen/gob.go @@ -0,0 +1,21 @@ +package main + +import ( + "encoding/gob" + "os" + + "go.uber.org/mock/mockgen/model" +) + +func gobMode(path string) (*model.Package, error) { + in, err := os.Open(path) + if err != nil { + return nil, err + } + defer in.Close() + var pkg model.Package + if err := gob.NewDecoder(in).Decode(&pkg); err != nil { + return nil, err + } + return &pkg, nil +} diff --git a/vendor/go.uber.org/mock/mockgen/mockgen.go b/vendor/go.uber.org/mock/mockgen/mockgen.go index df7d85f..b5365de 100644 --- a/vendor/go.uber.org/mock/mockgen/mockgen.go +++ b/vendor/go.uber.org/mock/mockgen/mockgen.go @@ -59,14 +59,17 @@ var ( mockNames = flag.String("mock_names", "", "Comma-separated interfaceName=mockName pairs of explicit mock names to use. Mock names default to 'Mock'+ interfaceName suffix.") packageOut = flag.String("package", "", "Package of the generated code; defaults to the package of the input with a 'mock_' prefix.") selfPackage = flag.String("self_package", "", "The full package import path for the generated code. The purpose of this flag is to prevent import cycles in the generated code by trying to include its own package. This can happen if the mock's package is set to one of its inputs (usually the main one) and the output is stdio so mockgen cannot detect the final output package. Setting this flag will then tell mockgen which import to exclude.") + writeCmdComment = flag.Bool("write_command_comment", true, "Writes the command used as a comment if true.") writePkgComment = flag.Bool("write_package_comment", true, "Writes package documentation comment (godoc) if true.") - writeSourceComment = flag.Bool("write_source_comment", true, "Writes original file (source mode) or interface names (reflect mode) comment if true.") + writeSourceComment = flag.Bool("write_source_comment", true, "Writes original file (source mode) or interface names (package mode) comment if true.") writeGenerateDirective = flag.Bool("write_generate_directive", false, "Add //go:generate directive to regenerate the mock") copyrightFile = flag.String("copyright_file", "", "Copyright file used to add copyright header") + buildConstraint = flag.String("build_constraint", "", "If non-empty, added as //go:build ") typed = flag.Bool("typed", false, "Generate Type-safe 'Return', 'Do', 'DoAndReturn' function") imports = flag.String("imports", "", "(source mode) Comma-separated name=path pairs of explicit imports to use.") auxFiles = flag.String("aux_files", "", "(source mode) Comma-separated pkg=path pairs of auxiliary Go source files.") - excludeInterfaces = flag.String("exclude_interfaces", "", "Comma-separated names of interfaces to be excluded") + excludeInterfaces = flag.String("exclude_interfaces", "", "(source mode) Comma-separated names of interfaces to be excluded") + modelGob = flag.String("model_gob", "", "Skip package/source loading entirely and use the gob encoded model.Package at the given path") debugParser = flag.Bool("debug_parser", false, "Print out parser results only.") showVersion = flag.Bool("version", false, "Print version.") @@ -76,6 +79,8 @@ func main() { flag.Usage = usage flag.Parse() + notifyAboutDeprecatedFlags() + if *showVersion { printVersion() return @@ -84,7 +89,9 @@ func main() { var pkg *model.Package var err error var packageName string - if *source != "" { + if *modelGob != "" { + pkg, err = gobMode(*modelGob) + } else if *source != "" { pkg, err = sourceMode(*source) } else { if flag.NArg() != 2 { @@ -103,7 +110,8 @@ func main() { log.Fatalf("Parse package name failed: %v", err) } } - pkg, err = reflectMode(packageName, interfaces) + parser := packageModeParser{} + pkg, err = parser.parsePackage(packageName, interfaces) } if err != nil { log.Fatalf("Loading input failed: %v", err) @@ -116,7 +124,7 @@ func main() { outputPackageName := *packageOut if outputPackageName == "" { - // pkg.Name in reflect mode is the base name of the import path, + // pkg.Name in package mode is the base name of the import path, // which might have characters that are illegal to have in package names. outputPackageName = "mock_" + sanitize(pkg.Name) } @@ -142,7 +150,9 @@ func main() { } } - g := new(generator) + g := &generator{ + buildConstraint: *buildConstraint, + } if *source != "" { g.filename = *source } else { @@ -225,20 +235,21 @@ func usage() { flag.PrintDefaults() } -const usageText = `mockgen has two modes of operation: source and reflect. +const usageText = `mockgen has two modes of operation: source and package. Source mode generates mock interfaces from a source file. It is enabled by using the -source flag. Other flags that -may be useful in this mode are -imports and -aux_files. +may be useful in this mode are -imports, -aux_files and -exclude_interfaces. Example: mockgen -source=foo.go [other options] -Reflect mode generates mock interfaces by building a program -that uses reflection to understand interfaces. It is enabled -by passing two non-flag arguments: an import path, and a -comma-separated list of symbols. +Package mode works by specifying the package and interface names. +It is enabled by passing two non-flag arguments: an import path, and a +comma-separated list of symbols. +You can use "." to refer to the current path's package. Example: mockgen database/sql/driver Conn,Driver + mockgen . SomeInterface ` @@ -250,12 +261,13 @@ type generator struct { destination string // may be empty srcPackage, srcInterfaces string // may be empty copyrightHeader string + buildConstraint string // may be empty packageMap map[string]string // map from import path to package name } func (g *generator) p(format string, args ...any) { - fmt.Fprintf(&g.buf, g.indent+format+"\n", args...) + _, _ = fmt.Fprintf(&g.buf, g.indent+format+"\n", args...) } func (g *generator) in() { @@ -305,6 +317,12 @@ func (g *generator) Generate(pkg *model.Package, outputPkgName string, outputPac g.p("") } + if g.buildConstraint != "" { + g.p("//go:build %s", g.buildConstraint) + // https://pkg.go.dev/cmd/go#hdr-Build_constraints:~:text=a%20build%20constraint%20should%20be%20followed%20by%20a%20blank%20line + g.p("") + } + g.p("// Code generated by MockGen. DO NOT EDIT.") if *writeSourceComment { if g.filename != "" { @@ -313,16 +331,18 @@ func (g *generator) Generate(pkg *model.Package, outputPkgName string, outputPac g.p("// Source: %v (interfaces: %v)", g.srcPackage, g.srcInterfaces) } } - g.p("//") - g.p("// Generated by this command:") - g.p("//") - // only log the name of the executable, not the full path - name := filepath.Base(os.Args[0]) - if runtime.GOOS == "windows" { - name = strings.TrimSuffix(name, ".exe") + if *writeCmdComment { + g.p("//") + g.p("// Generated by this command:") + g.p("//") + // only log the name of the executable, not the full path + name := filepath.Base(os.Args[0]) + if runtime.GOOS == "windows" { + name = strings.TrimSuffix(name, ".exe") + } + g.p("//\t%v", strings.Join(append([]string{name}, os.Args[1:]...), " ")) + g.p("//") } - g.p("//\t%v", strings.Join(append([]string{name}, os.Args[1:]...), " ")) - g.p("//") // Get all required imports, and generate unique names for them all. im := pkg.Imports() @@ -392,11 +412,13 @@ func (g *generator) Generate(pkg *model.Package, outputPkgName string, outputPac localNames[pkgName] = true } - if *writePkgComment { - // Ensure there's an empty line before the package to follow the recommendations: - // https://github.com/golang/go/wiki/CodeReviewComments#package-comments - g.p("") + // Ensure there is an empty line between “generated by” block and + // package documentation comments to follow the recommendations: + // https://go.dev/wiki/CodeReviewComments#package-comments + // That is, “generated by” should not be a package comment. + g.p("") + if *writePkgComment { g.p("// Package %v is a generated GoMock package.", outputPkgName) } g.p("package %v", outputPkgName) @@ -472,6 +494,7 @@ func (g *generator) GenerateMockInterface(intf *model.Interface, outputPackagePa g.in() g.p("ctrl *gomock.Controller") g.p("recorder *%vMockRecorder%v", mockType, shortTp) + g.p("isgomock struct{}") g.out() g.p("}") g.p("") @@ -816,7 +839,7 @@ func createPackageMap(importPaths []string) map[string]string { } pkgMap := make(map[string]string) b := bytes.NewBuffer(nil) - args := []string{"list", "-json"} + args := []string{"list", "-json=ImportPath,Name"} args = append(args, importPaths...) cmd := exec.Command("go", args...) cmd.Stdout = b diff --git a/vendor/go.uber.org/mock/mockgen/package_mode.go b/vendor/go.uber.org/mock/mockgen/package_mode.go new file mode 100644 index 0000000..abc9c7a --- /dev/null +++ b/vendor/go.uber.org/mock/mockgen/package_mode.go @@ -0,0 +1,358 @@ +package main + +import ( + "errors" + "flag" + "fmt" + "go/types" + "strings" + + "go.uber.org/mock/mockgen/model" + "golang.org/x/tools/go/packages" +) + +var ( + buildFlags = flag.String("build_flags", "", "(package mode) Additional flags for go build.") +) + +type packageModeParser struct { + pkgName string +} + +func (p *packageModeParser) parsePackage(packageName string, ifaces []string) (*model.Package, error) { + p.pkgName = packageName + + pkg, err := p.loadPackage(packageName) + if err != nil { + return nil, fmt.Errorf("load package: %w", err) + } + + interfaces, err := p.extractInterfacesFromPackage(pkg, ifaces) + if err != nil { + return nil, fmt.Errorf("extract interfaces from package: %w", err) + } + + return &model.Package{ + Name: pkg.Types.Name(), + PkgPath: packageName, + Interfaces: interfaces, + }, nil +} + +func (p *packageModeParser) loadPackage(packageName string) (*packages.Package, error) { + var buildFlagsSet []string + if *buildFlags != "" { + buildFlagsSet = strings.Split(*buildFlags, " ") + } + + cfg := &packages.Config{ + Mode: packages.NeedDeps | packages.NeedImports | packages.NeedTypes | packages.NeedTypesInfo | packages.NeedEmbedFiles, + BuildFlags: buildFlagsSet, + } + pkgs, err := packages.Load(cfg, packageName) + if err != nil { + return nil, fmt.Errorf("load packages: %w", err) + } + + if len(pkgs) != 1 { + return nil, fmt.Errorf("packages length must be 1: %d", len(pkgs)) + } + + if len(pkgs[0].Errors) > 0 { + errs := make([]error, len(pkgs[0].Errors)) + for i, err := range pkgs[0].Errors { + errs[i] = err + } + + return nil, errors.Join(errs...) + } + + return pkgs[0], nil +} + +func (p *packageModeParser) extractInterfacesFromPackage(pkg *packages.Package, ifaces []string) ([]*model.Interface, error) { + interfaces := make([]*model.Interface, len(ifaces)) + for i, iface := range ifaces { + obj := pkg.Types.Scope().Lookup(iface) + if obj == nil { + return nil, fmt.Errorf("interface %s does not exist", iface) + } + + modelIface, err := p.parseInterface(obj) + if err != nil { + return nil, newParseTypeError("parse interface", obj.Name(), err) + } + + interfaces[i] = modelIface + } + + return interfaces, nil +} + +func (p *packageModeParser) parseInterface(obj types.Object) (*model.Interface, error) { + named, ok := types.Unalias(obj.Type()).(*types.Named) + if !ok { + return nil, fmt.Errorf("%s is not an interface. it is a %T", obj.Name(), obj.Type().Underlying()) + } + + iface, ok := named.Underlying().(*types.Interface) + if !ok { + return nil, fmt.Errorf("%s is not an interface. it is a %T", obj.Name(), obj.Type().Underlying()) + } + + if p.isConstraint(iface) { + return nil, fmt.Errorf("interface %s is a constraint", obj.Name()) + } + + methods := make([]*model.Method, iface.NumMethods()) + for i := range iface.NumMethods() { + method := iface.Method(i) + typedMethod, ok := method.Type().(*types.Signature) + if !ok { + return nil, fmt.Errorf("method %s is not a signature", method.Name()) + } + + modelFunc, err := p.parseFunc(typedMethod) + if err != nil { + return nil, newParseTypeError("parse method", typedMethod.String(), err) + } + + methods[i] = &model.Method{ + Name: method.Name(), + In: modelFunc.In, + Out: modelFunc.Out, + Variadic: modelFunc.Variadic, + } + } + + if named.TypeParams() == nil { + return &model.Interface{Name: obj.Name(), Methods: methods}, nil + } + + typeParams := make([]*model.Parameter, named.TypeParams().Len()) + for i := range named.TypeParams().Len() { + param := named.TypeParams().At(i) + typeParam, err := p.parseConstraint(param) + if err != nil { + return nil, newParseTypeError("parse type parameter", param.String(), err) + } + + typeParams[i] = &model.Parameter{Name: param.Obj().Name(), Type: typeParam} + } + + return &model.Interface{Name: obj.Name(), Methods: methods, TypeParams: typeParams}, nil +} + +func (o *packageModeParser) isConstraint(t *types.Interface) bool { + for i := range t.NumEmbeddeds() { + embed := t.EmbeddedType(i) + if _, ok := embed.Underlying().(*types.Interface); !ok { + return true + } + } + + return false +} + +func (p *packageModeParser) parseType(t types.Type) (model.Type, error) { + switch t := t.(type) { + case *types.Array: + elementType, err := p.parseType(t.Elem()) + if err != nil { + return nil, newParseTypeError("parse array type", t.Elem().String(), err) + } + return &model.ArrayType{Len: int(t.Len()), Type: elementType}, nil + case *types.Slice: + elementType, err := p.parseType(t.Elem()) + if err != nil { + return nil, newParseTypeError("parse slice type", t.Elem().String(), err) + } + + return &model.ArrayType{Len: -1, Type: elementType}, nil + case *types.Chan: + var dir model.ChanDir + switch t.Dir() { + case types.RecvOnly: + dir = model.RecvDir + case types.SendOnly: + dir = model.SendDir + } + + chanType, err := p.parseType(t.Elem()) + if err != nil { + return nil, newParseTypeError("parse chan type", t.Elem().String(), err) + } + + return &model.ChanType{Dir: dir, Type: chanType}, nil + case *types.Signature: + sig, err := p.parseFunc(t) + if err != nil { + return nil, newParseTypeError("parse signature", t.String(), err) + } + + return sig, nil + case *types.Named, *types.Alias: + object := t.(interface{ Obj() *types.TypeName }) + var pkg string + if object.Obj().Pkg() != nil { + pkg = object.Obj().Pkg().Path() + } + + // TypeArgs method not available for aliases in go1.22 + genericType, ok := t.(interface{ TypeArgs() *types.TypeList }) + if !ok || genericType.TypeArgs() == nil { + return &model.NamedType{ + Package: pkg, + Type: object.Obj().Name(), + }, nil + } + + typeParams := &model.TypeParametersType{TypeParameters: make([]model.Type, genericType.TypeArgs().Len())} + for i := range genericType.TypeArgs().Len() { + typeParam := genericType.TypeArgs().At(i) + typedParam, err := p.parseType(typeParam) + if err != nil { + return nil, newParseTypeError("parse type parameter", typeParam.String(), err) + } + + typeParams.TypeParameters[i] = typedParam + } + + return &model.NamedType{ + Package: pkg, + Type: object.Obj().Name(), + TypeParams: typeParams, + }, nil + case *types.Interface: + if t.Empty() { + return model.PredeclaredType("any"), nil + } + + return nil, fmt.Errorf("cannot handle non-empty unnamed interfaces") + case *types.Map: + key, err := p.parseType(t.Key()) + if err != nil { + return nil, newParseTypeError("parse map key", t.Key().String(), err) + } + value, err := p.parseType(t.Elem()) + if err != nil { + return nil, newParseTypeError("parse map value", t.Elem().String(), err) + } + + return &model.MapType{Key: key, Value: value}, nil + case *types.Pointer: + valueType, err := p.parseType(t.Elem()) + if err != nil { + return nil, newParseTypeError("parse pointer type", t.Elem().String(), err) + } + + return &model.PointerType{Type: valueType}, nil + case *types.Struct: + if t.NumFields() > 0 { + return nil, fmt.Errorf("cannot handle non-empty unnamed structs") + } + + return model.PredeclaredType("struct{}"), nil + case *types.Basic: + return model.PredeclaredType(t.Name()), nil + case *types.Tuple: + panic("tuple field") // TODO + case *types.TypeParam: + return &model.NamedType{Type: t.Obj().Name()}, nil + default: + panic("unknown type") // TODO + } +} + +func (p *packageModeParser) parseFunc(sig *types.Signature) (*model.FuncType, error) { + var variadic *model.Parameter + params := make([]*model.Parameter, 0, sig.Params().Len()) + for i := range sig.Params().Len() { + param := sig.Params().At(i) + + isVariadicParam := i == sig.Params().Len()-1 && sig.Variadic() + parseType := param.Type() + if isVariadicParam { + sliceType, ok := param.Type().(*types.Slice) + if !ok { + return nil, newParseTypeError("variadic parameter is not a slice", param.String(), nil) + } + + parseType = sliceType.Elem() + } + + paramType, err := p.parseType(parseType) + if err != nil { + return nil, newParseTypeError("parse parameter type", parseType.String(), err) + } + + modelParameter := &model.Parameter{Type: paramType, Name: param.Name()} + + if isVariadicParam { + variadic = modelParameter + } else { + params = append(params, modelParameter) + } + } + + if len(params) == 0 { + params = nil + } + + results := make([]*model.Parameter, sig.Results().Len()) + for i := range sig.Results().Len() { + result := sig.Results().At(i) + + resultType, err := p.parseType(result.Type()) + if err != nil { + return nil, newParseTypeError("parse result type", result.Type().String(), err) + } + + results[i] = &model.Parameter{Type: resultType, Name: result.Name()} + } + + if len(results) == 0 { + results = nil + } + + return &model.FuncType{ + In: params, + Out: results, + Variadic: variadic, + }, nil +} + +func (p *packageModeParser) parseConstraint(t *types.TypeParam) (model.Type, error) { + if t == nil { + return nil, fmt.Errorf("nil type param") + } + + typeParam, err := p.parseType(t.Constraint()) + if err != nil { + return nil, newParseTypeError("parse constraint type", t.Constraint().String(), err) + } + + return typeParam, nil +} + +type parseTypeError struct { + message string + typeString string + error error +} + +func newParseTypeError(message string, typeString string, error error) *parseTypeError { + return &parseTypeError{typeString: typeString, error: error, message: message} +} + +func (p parseTypeError) Error() string { + if p.error != nil { + return fmt.Sprintf("%s: error parsing %s: %s", p.message, p.typeString, p.error) + } + + return fmt.Sprintf("%s: error parsing type %s", p.message, p.typeString) +} + +func (p parseTypeError) Unwrap() error { + return p.error +} diff --git a/vendor/go.uber.org/mock/mockgen/reflect.go b/vendor/go.uber.org/mock/mockgen/reflect.go deleted file mode 100644 index ca80ebb..0000000 --- a/vendor/go.uber.org/mock/mockgen/reflect.go +++ /dev/null @@ -1,256 +0,0 @@ -// Copyright 2012 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -// This file contains the model construction by reflection. - -import ( - "bytes" - "encoding/gob" - "flag" - "fmt" - "go/build" - "io" - "log" - "os" - "os/exec" - "path/filepath" - "runtime" - "strings" - "text/template" - - "go.uber.org/mock/mockgen/model" -) - -var ( - progOnly = flag.Bool("prog_only", false, "(reflect mode) Only generate the reflection program; write it to stdout and exit.") - execOnly = flag.String("exec_only", "", "(reflect mode) If set, execute this reflection program.") - buildFlags = flag.String("build_flags", "", "(reflect mode) Additional flags for go build.") -) - -// reflectMode generates mocks via reflection on an interface. -func reflectMode(importPath string, symbols []string) (*model.Package, error) { - if *execOnly != "" { - return run(*execOnly) - } - - program, err := writeProgram(importPath, symbols) - if err != nil { - return nil, err - } - - if *progOnly { - if _, err := os.Stdout.Write(program); err != nil { - return nil, err - } - os.Exit(0) - } - - wd, _ := os.Getwd() - - // Try to run the reflection program in the current working directory. - if p, err := runInDir(program, wd); err == nil { - return p, nil - } - - // Try to run the program in the same directory as the input package. - if p, err := build.Import(importPath, wd, build.FindOnly); err == nil { - dir := p.Dir - if p, err := runInDir(program, dir); err == nil { - return p, nil - } - } - - // Try to run it in a standard temp directory. - return runInDir(program, "") -} - -func writeProgram(importPath string, symbols []string) ([]byte, error) { - var program bytes.Buffer - data := reflectData{ - ImportPath: importPath, - Symbols: symbols, - } - if err := reflectProgram.Execute(&program, &data); err != nil { - return nil, err - } - return program.Bytes(), nil -} - -// run the given program and parse the output as a model.Package. -func run(program string) (*model.Package, error) { - f, err := os.CreateTemp("", "") - if err != nil { - return nil, err - } - - filename := f.Name() - defer os.Remove(filename) - if err := f.Close(); err != nil { - return nil, err - } - - // Run the program. - cmd := exec.Command(program, "-output", filename) - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - if err := cmd.Run(); err != nil { - return nil, err - } - - f, err = os.Open(filename) - if err != nil { - return nil, err - } - - // Process output. - var pkg model.Package - if err := gob.NewDecoder(f).Decode(&pkg); err != nil { - return nil, err - } - - if err := f.Close(); err != nil { - return nil, err - } - - return &pkg, nil -} - -// runInDir writes the given program into the given dir, runs it there, and -// parses the output as a model.Package. -func runInDir(program []byte, dir string) (*model.Package, error) { - // We use TempDir instead of TempFile so we can control the filename. - tmpDir, err := os.MkdirTemp(dir, "gomock_reflect_") - if err != nil { - return nil, err - } - defer func() { - if err := os.RemoveAll(tmpDir); err != nil { - log.Printf("failed to remove temp directory: %s", err) - } - }() - const progSource = "prog.go" - var progBinary = "prog.bin" - if runtime.GOOS == "windows" { - // Windows won't execute a program unless it has a ".exe" suffix. - progBinary += ".exe" - } - - if err := os.WriteFile(filepath.Join(tmpDir, progSource), program, 0600); err != nil { - return nil, err - } - - cmdArgs := []string{} - cmdArgs = append(cmdArgs, "build") - if *buildFlags != "" { - cmdArgs = append(cmdArgs, strings.Split(*buildFlags, " ")...) - } - cmdArgs = append(cmdArgs, "-o", progBinary, progSource) - - // Build the program. - buf := bytes.NewBuffer(nil) - cmd := exec.Command("go", cmdArgs...) - cmd.Dir = tmpDir - cmd.Stdout = os.Stdout - cmd.Stderr = io.MultiWriter(os.Stderr, buf) - if err := cmd.Run(); err != nil { - sErr := buf.String() - if strings.Contains(sErr, `cannot find package "."`) && - strings.Contains(sErr, "go.uber.org/mock/mockgen/model") { - fmt.Fprint(os.Stderr, "Please reference the steps in the README to fix this error:\n\thttps://go.uber.org/mock#reflect-vendoring-error.\n") - return nil, err - } - return nil, err - } - - return run(filepath.Join(tmpDir, progBinary)) -} - -type reflectData struct { - ImportPath string - Symbols []string -} - -// This program reflects on an interface value, and prints the -// gob encoding of a model.Package to standard output. -// JSON doesn't work because of the model.Type interface. -var reflectProgram = template.Must(template.New("program").Parse(` -// Code generated by MockGen. DO NOT EDIT. -package main - -import ( - "encoding/gob" - "flag" - "fmt" - "os" - "path" - "reflect" - - "go.uber.org/mock/mockgen/model" - - pkg_ {{printf "%q" .ImportPath}} -) - -var output = flag.String("output", "", "The output file name, or empty to use stdout.") - -func main() { - flag.Parse() - - its := []struct{ - sym string - typ reflect.Type - }{ - {{range .Symbols}} - { {{printf "%q" .}}, reflect.TypeOf((*pkg_.{{.}})(nil)).Elem()}, - {{end}} - } - pkg := &model.Package{ - // NOTE: This behaves contrary to documented behaviour if the - // package name is not the final component of the import path. - // The reflect package doesn't expose the package name, though. - Name: path.Base({{printf "%q" .ImportPath}}), - } - - for _, it := range its { - intf, err := model.InterfaceFromInterfaceType(it.typ) - if err != nil { - fmt.Fprintf(os.Stderr, "Reflection: %v\n", err) - os.Exit(1) - } - intf.Name = it.sym - pkg.Interfaces = append(pkg.Interfaces, intf) - } - - outfile := os.Stdout - if len(*output) != 0 { - var err error - outfile, err = os.Create(*output) - if err != nil { - fmt.Fprintf(os.Stderr, "failed to open output file %q", *output) - } - defer func() { - if err := outfile.Close(); err != nil { - fmt.Fprintf(os.Stderr, "failed to close output file %q", *output) - os.Exit(1) - } - }() - } - - if err := gob.NewEncoder(outfile).Encode(pkg); err != nil { - fmt.Fprintf(os.Stderr, "gob encode: %v\n", err) - os.Exit(1) - } -} -`)) diff --git a/vendor/golang.org/x/crypto/LICENSE b/vendor/golang.org/x/crypto/LICENSE index 6a66aea..2a7cf70 100644 --- a/vendor/golang.org/x/crypto/LICENSE +++ b/vendor/golang.org/x/crypto/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. +Copyright 2009 The Go Authors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer. copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Google Inc. nor the names of its + * Neither the name of Google LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s b/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s index 9ae8206..f75162e 100644 --- a/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s +++ b/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s @@ -1,722 +1,4517 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run blake2bAVX2_amd64_asm.go -out ../../blake2bAVX2_amd64.s -pkg blake2b. DO NOT EDIT. //go:build amd64 && gc && !purego #include "textflag.h" -DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 -DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b -DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b -DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1 -GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1 -DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f -DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b -DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179 -GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403 -DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403 -DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302 -DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 -DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b -GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b -DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1 -GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1 -DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f -GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b -DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179 -GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403 -DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16 - -#define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39 -#define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93 -#define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e -#define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93 -#define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39 - -#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \ - VPADDQ m0, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFD $-79, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPSHUFB c40, Y1, Y1; \ - VPADDQ m1, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFB c48, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPADDQ Y1, Y1, t; \ - VPSRLQ $63, Y1, Y1; \ - VPXOR t, Y1, Y1; \ - VPERMQ_0x39_Y1_Y1; \ - VPERMQ_0x4E_Y2_Y2; \ - VPERMQ_0x93_Y3_Y3; \ - VPADDQ m2, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFD $-79, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPSHUFB c40, Y1, Y1; \ - VPADDQ m3, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFB c48, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPADDQ Y1, Y1, t; \ - VPSRLQ $63, Y1, Y1; \ - VPXOR t, Y1, Y1; \ - VPERMQ_0x39_Y3_Y3; \ - VPERMQ_0x4E_Y2_Y2; \ - VPERMQ_0x93_Y1_Y1 - -#define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E -#define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26 -#define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E -#define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36 -#define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E - -#define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n -#define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n -#define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n -#define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n -#define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n - -#define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01 -#define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01 -#define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01 -#define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01 -#define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01 - -#define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01 - -#define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8 -#define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01 - -// load msg: Y12 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \ - VMOVQ_SI_X12(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X12(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y12, Y12 - -// load msg: Y13 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \ - VMOVQ_SI_X13(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X13(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y13, Y13 - -// load msg: Y14 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \ - VMOVQ_SI_X14(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X14(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y14, Y14 - -// load msg: Y15 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \ - VMOVQ_SI_X15(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X15(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \ - VMOVQ_SI_X12_0; \ - VMOVQ_SI_X11(4*8); \ - VPINSRQ_1_SI_X12(2*8); \ - VPINSRQ_1_SI_X11(6*8); \ - VINSERTI128 $1, X11, Y12, Y12; \ - LOAD_MSG_AVX2_Y13(1, 3, 5, 7); \ - LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \ - LOAD_MSG_AVX2_Y15(9, 11, 13, 15) - -#define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \ - LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \ - LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \ - VMOVQ_SI_X11(11*8); \ - VPSHUFD $0x4E, 0*8(SI), X14; \ - VPINSRQ_1_SI_X11(5*8); \ - VINSERTI128 $1, X11, Y14, Y14; \ - LOAD_MSG_AVX2_Y15(12, 2, 7, 3) - -#define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \ - VMOVQ_SI_X11(5*8); \ - VMOVDQU 11*8(SI), X12; \ - VPINSRQ_1_SI_X11(15*8); \ - VINSERTI128 $1, X11, Y12, Y12; \ - VMOVQ_SI_X13(8*8); \ - VMOVQ_SI_X11(2*8); \ - VPINSRQ_1_SI_X13_0; \ - VPINSRQ_1_SI_X11(13*8); \ - VINSERTI128 $1, X11, Y13, Y13; \ - LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \ - LOAD_MSG_AVX2_Y15(14, 6, 1, 4) - -#define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \ - LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \ - LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \ - LOAD_MSG_AVX2_Y14(2, 5, 4, 15); \ - VMOVQ_SI_X15(6*8); \ - VMOVQ_SI_X11_0; \ - VPINSRQ_1_SI_X15(10*8); \ - VPINSRQ_1_SI_X11(8*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \ - LOAD_MSG_AVX2_Y12(9, 5, 2, 10); \ - VMOVQ_SI_X13_0; \ - VMOVQ_SI_X11(4*8); \ - VPINSRQ_1_SI_X13(7*8); \ - VPINSRQ_1_SI_X11(15*8); \ - VINSERTI128 $1, X11, Y13, Y13; \ - LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \ - LOAD_MSG_AVX2_Y15(1, 12, 8, 13) - -#define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \ - VMOVQ_SI_X12(2*8); \ - VMOVQ_SI_X11_0; \ - VPINSRQ_1_SI_X12(6*8); \ - VPINSRQ_1_SI_X11(8*8); \ - VINSERTI128 $1, X11, Y12, Y12; \ - LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \ - LOAD_MSG_AVX2_Y14(4, 7, 15, 1); \ - LOAD_MSG_AVX2_Y15(13, 5, 14, 9) - -#define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \ - LOAD_MSG_AVX2_Y12(12, 1, 14, 4); \ - LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \ - VMOVQ_SI_X14_0; \ - VPSHUFD $0x4E, 8*8(SI), X11; \ - VPINSRQ_1_SI_X14(6*8); \ - VINSERTI128 $1, X11, Y14, Y14; \ - LOAD_MSG_AVX2_Y15(7, 3, 2, 11) - -#define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \ - LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \ - LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \ - LOAD_MSG_AVX2_Y14(5, 15, 8, 2); \ - VMOVQ_SI_X15_0; \ - VMOVQ_SI_X11(6*8); \ - VPINSRQ_1_SI_X15(4*8); \ - VPINSRQ_1_SI_X11(10*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \ - VMOVQ_SI_X12(6*8); \ - VMOVQ_SI_X11(11*8); \ - VPINSRQ_1_SI_X12(14*8); \ - VPINSRQ_1_SI_X11_0; \ - VINSERTI128 $1, X11, Y12, Y12; \ - LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \ - VMOVQ_SI_X11(1*8); \ - VMOVDQU 12*8(SI), X14; \ - VPINSRQ_1_SI_X11(10*8); \ - VINSERTI128 $1, X11, Y14, Y14; \ - VMOVQ_SI_X15(2*8); \ - VMOVDQU 4*8(SI), X11; \ - VPINSRQ_1_SI_X15(7*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \ - LOAD_MSG_AVX2_Y12(10, 8, 7, 1); \ - VMOVQ_SI_X13(2*8); \ - VPSHUFD $0x4E, 5*8(SI), X11; \ - VPINSRQ_1_SI_X13(4*8); \ - VINSERTI128 $1, X11, Y13, Y13; \ - LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \ - VMOVQ_SI_X15(11*8); \ - VMOVQ_SI_X11(12*8); \ - VPINSRQ_1_SI_X15(14*8); \ - VPINSRQ_1_SI_X11_0; \ - VINSERTI128 $1, X11, Y15, Y15 - // func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) -TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment - MOVQ h+0(FP), AX - MOVQ c+8(FP), BX - MOVQ flag+16(FP), CX - MOVQ blocks_base+24(FP), SI - MOVQ blocks_len+32(FP), DI - - MOVQ SP, DX - ADDQ $31, DX - ANDQ $~31, DX - - MOVQ CX, 16(DX) - XORQ CX, CX - MOVQ CX, 24(DX) - - VMOVDQU ·AVX2_c40<>(SB), Y4 - VMOVDQU ·AVX2_c48<>(SB), Y5 - - VMOVDQU 0(AX), Y8 +// Requires: AVX, AVX2 +TEXT ·hashBlocksAVX2(SB), NOSPLIT, $320-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVQ flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DI + MOVQ SP, DX + ADDQ $+31, DX + ANDQ $-32, DX + MOVQ CX, 16(DX) + XORQ CX, CX + MOVQ CX, 24(DX) + VMOVDQU ·AVX2_c40<>+0(SB), Y4 + VMOVDQU ·AVX2_c48<>+0(SB), Y5 + VMOVDQU (AX), Y8 VMOVDQU 32(AX), Y9 - VMOVDQU ·AVX2_iv0<>(SB), Y6 - VMOVDQU ·AVX2_iv1<>(SB), Y7 - - MOVQ 0(BX), R8 - MOVQ 8(BX), R9 - MOVQ R9, 8(DX) + VMOVDQU ·AVX2_iv0<>+0(SB), Y6 + VMOVDQU ·AVX2_iv1<>+0(SB), Y7 + MOVQ (BX), R8 + MOVQ 8(BX), R9 + MOVQ R9, 8(DX) loop: - ADDQ $128, R8 - MOVQ R8, 0(DX) - CMPQ R8, $128 + ADDQ $0x80, R8 + MOVQ R8, (DX) + CMPQ R8, $0x80 JGE noinc INCQ R9 MOVQ R9, 8(DX) noinc: - VMOVDQA Y8, Y0 - VMOVDQA Y9, Y1 - VMOVDQA Y6, Y2 - VPXOR 0(DX), Y7, Y3 - - LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() - VMOVDQA Y12, 32(DX) - VMOVDQA Y13, 64(DX) - VMOVDQA Y14, 96(DX) - VMOVDQA Y15, 128(DX) - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() - VMOVDQA Y12, 160(DX) - VMOVDQA Y13, 192(DX) - VMOVDQA Y14, 224(DX) - VMOVDQA Y15, 256(DX) - - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - - ROUND_AVX2(32(DX), 64(DX), 96(DX), 128(DX), Y10, Y4, Y5) - ROUND_AVX2(160(DX), 192(DX), 224(DX), 256(DX), Y10, Y4, Y5) - - VPXOR Y0, Y8, Y8 - VPXOR Y1, Y9, Y9 - VPXOR Y2, Y8, Y8 - VPXOR Y3, Y9, Y9 - - LEAQ 128(SI), SI - SUBQ $128, DI - JNE loop - - MOVQ R8, 0(BX) - MOVQ R9, 8(BX) - - VMOVDQU Y8, 0(AX) - VMOVDQU Y9, 32(AX) + VMOVDQA Y8, Y0 + VMOVDQA Y9, Y1 + VMOVDQA Y6, Y2 + VPXOR (DX), Y7, Y3 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x26 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x30 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x28 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x38 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x70 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VMOVDQA Y12, 32(DX) + VMOVDQA Y13, 64(DX) + VMOVDQA Y14, 96(DX) + VMOVDQA Y15, 128(DX) + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x48 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x78 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x30 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x58 + VPSHUFD $0x4e, (SI), X14 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x28 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VMOVDQA Y12, 160(DX) + VMOVDQA Y13, 192(DX) + VMOVDQA Y14, 224(DX) + VMOVDQA Y15, 256(DX) + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x28 + VMOVDQU 88(SI), X12 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x2e + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x48 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x20 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x58 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x70 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x1e + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x40 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x2e + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x30 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x40 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x60 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x1e + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x40 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x58 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x78 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x08 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x70 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x48 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x70 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x20 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x36 + VPSHUFD $0x4e, 64(SI), X11 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x30 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x58 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x48 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x40 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x10 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x3e + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x30 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x58 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x1e + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x78 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x18 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x40 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x08 + VMOVDQU 96(SI), X14 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x10 + VMOVDQU 32(SI), X11 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x38 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x08 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x10 + VPSHUFD $0x4e, 40(SI), X11 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x20 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x78 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x18 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x1e + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + VPADDQ 32(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 64(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ 96(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 128(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + VPADDQ 160(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 192(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ 224(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 256(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + VPXOR Y0, Y8, Y8 + VPXOR Y1, Y9, Y9 + VPXOR Y2, Y8, Y8 + VPXOR Y3, Y9, Y9 + LEAQ 128(SI), SI + SUBQ $0x80, DI + JNE loop + MOVQ R8, (BX) + MOVQ R9, 8(BX) + VMOVDQU Y8, (AX) + VMOVDQU Y9, 32(AX) VZEROUPPER - RET -#define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA -#define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB -#define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF -#define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD -#define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE - -#define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7 -#define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF -#define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7 -#define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF -#define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7 -#define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7 -#define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF -#define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF - -#define SHUFFLE_AVX() \ - VMOVDQA X6, X13; \ - VMOVDQA X2, X14; \ - VMOVDQA X4, X6; \ - VPUNPCKLQDQ_X13_X13_X15; \ - VMOVDQA X5, X4; \ - VMOVDQA X6, X5; \ - VPUNPCKHQDQ_X15_X7_X6; \ - VPUNPCKLQDQ_X7_X7_X15; \ - VPUNPCKHQDQ_X15_X13_X7; \ - VPUNPCKLQDQ_X3_X3_X15; \ - VPUNPCKHQDQ_X15_X2_X2; \ - VPUNPCKLQDQ_X14_X14_X15; \ - VPUNPCKHQDQ_X15_X3_X3; \ - -#define SHUFFLE_AVX_INV() \ - VMOVDQA X2, X13; \ - VMOVDQA X4, X14; \ - VPUNPCKLQDQ_X2_X2_X15; \ - VMOVDQA X5, X4; \ - VPUNPCKHQDQ_X15_X3_X2; \ - VMOVDQA X14, X5; \ - VPUNPCKLQDQ_X3_X3_X15; \ - VMOVDQA X6, X14; \ - VPUNPCKHQDQ_X15_X13_X3; \ - VPUNPCKLQDQ_X7_X7_X15; \ - VPUNPCKHQDQ_X15_X6_X6; \ - VPUNPCKLQDQ_X14_X14_X15; \ - VPUNPCKHQDQ_X15_X7_X7; \ - -#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \ - VPADDQ m0, v0, v0; \ - VPADDQ v2, v0, v0; \ - VPADDQ m1, v1, v1; \ - VPADDQ v3, v1, v1; \ - VPXOR v0, v6, v6; \ - VPXOR v1, v7, v7; \ - VPSHUFD $-79, v6, v6; \ - VPSHUFD $-79, v7, v7; \ - VPADDQ v6, v4, v4; \ - VPADDQ v7, v5, v5; \ - VPXOR v4, v2, v2; \ - VPXOR v5, v3, v3; \ - VPSHUFB c40, v2, v2; \ - VPSHUFB c40, v3, v3; \ - VPADDQ m2, v0, v0; \ - VPADDQ v2, v0, v0; \ - VPADDQ m3, v1, v1; \ - VPADDQ v3, v1, v1; \ - VPXOR v0, v6, v6; \ - VPXOR v1, v7, v7; \ - VPSHUFB c48, v6, v6; \ - VPSHUFB c48, v7, v7; \ - VPADDQ v6, v4, v4; \ - VPADDQ v7, v5, v5; \ - VPXOR v4, v2, v2; \ - VPXOR v5, v3, v3; \ - VPADDQ v2, v2, t0; \ - VPSRLQ $63, v2, v2; \ - VPXOR t0, v2, v2; \ - VPADDQ v3, v3, t0; \ - VPSRLQ $63, v3, v3; \ - VPXOR t0, v3, v3 - -// load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7) -// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0 -#define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \ - VMOVQ_SI_X12(i0*8); \ - VMOVQ_SI_X13(i2*8); \ - VMOVQ_SI_X14(i4*8); \ - VMOVQ_SI_X15(i6*8); \ - VPINSRQ_1_SI_X12(i1*8); \ - VPINSRQ_1_SI_X13(i3*8); \ - VPINSRQ_1_SI_X14(i5*8); \ - VPINSRQ_1_SI_X15(i7*8) - -// load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7) -#define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \ - VMOVQ_SI_X12_0; \ - VMOVQ_SI_X13(4*8); \ - VMOVQ_SI_X14(1*8); \ - VMOVQ_SI_X15(5*8); \ - VPINSRQ_1_SI_X12(2*8); \ - VPINSRQ_1_SI_X13(6*8); \ - VPINSRQ_1_SI_X14(3*8); \ - VPINSRQ_1_SI_X15(7*8) - -// load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3) -#define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \ - VPSHUFD $0x4E, 0*8(SI), X12; \ - VMOVQ_SI_X13(11*8); \ - VMOVQ_SI_X14(12*8); \ - VMOVQ_SI_X15(7*8); \ - VPINSRQ_1_SI_X13(5*8); \ - VPINSRQ_1_SI_X14(2*8); \ - VPINSRQ_1_SI_X15(3*8) - -// load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13) -#define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \ - VMOVDQU 11*8(SI), X12; \ - VMOVQ_SI_X13(5*8); \ - VMOVQ_SI_X14(8*8); \ - VMOVQ_SI_X15(2*8); \ - VPINSRQ_1_SI_X13(15*8); \ - VPINSRQ_1_SI_X14_0; \ - VPINSRQ_1_SI_X15(13*8) - -// load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8) -#define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \ - VMOVQ_SI_X12(2*8); \ - VMOVQ_SI_X13(4*8); \ - VMOVQ_SI_X14(6*8); \ - VMOVQ_SI_X15_0; \ - VPINSRQ_1_SI_X12(5*8); \ - VPINSRQ_1_SI_X13(15*8); \ - VPINSRQ_1_SI_X14(10*8); \ - VPINSRQ_1_SI_X15(8*8) +DATA ·AVX2_c40<>+0(SB)/8, $0x0201000706050403 +DATA ·AVX2_c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +DATA ·AVX2_c40<>+16(SB)/8, $0x0201000706050403 +DATA ·AVX2_c40<>+24(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·AVX2_c40<>(SB), RODATA|NOPTR, $32 -// load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15) -#define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \ - VMOVQ_SI_X12(9*8); \ - VMOVQ_SI_X13(2*8); \ - VMOVQ_SI_X14_0; \ - VMOVQ_SI_X15(4*8); \ - VPINSRQ_1_SI_X12(5*8); \ - VPINSRQ_1_SI_X13(10*8); \ - VPINSRQ_1_SI_X14(7*8); \ - VPINSRQ_1_SI_X15(15*8) +DATA ·AVX2_c48<>+0(SB)/8, $0x0100070605040302 +DATA ·AVX2_c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +DATA ·AVX2_c48<>+16(SB)/8, $0x0100070605040302 +DATA ·AVX2_c48<>+24(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·AVX2_c48<>(SB), RODATA|NOPTR, $32 -// load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3) -#define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \ - VMOVQ_SI_X12(2*8); \ - VMOVQ_SI_X13_0; \ - VMOVQ_SI_X14(12*8); \ - VMOVQ_SI_X15(11*8); \ - VPINSRQ_1_SI_X12(6*8); \ - VPINSRQ_1_SI_X13(8*8); \ - VPINSRQ_1_SI_X14(10*8); \ - VPINSRQ_1_SI_X15(3*8) +DATA ·AVX2_iv0<>+0(SB)/8, $0x6a09e667f3bcc908 +DATA ·AVX2_iv0<>+8(SB)/8, $0xbb67ae8584caa73b +DATA ·AVX2_iv0<>+16(SB)/8, $0x3c6ef372fe94f82b +DATA ·AVX2_iv0<>+24(SB)/8, $0xa54ff53a5f1d36f1 +GLOBL ·AVX2_iv0<>(SB), RODATA|NOPTR, $32 -// load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11) -#define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \ - MOVQ 0*8(SI), X12; \ - VPSHUFD $0x4E, 8*8(SI), X13; \ - MOVQ 7*8(SI), X14; \ - MOVQ 2*8(SI), X15; \ - VPINSRQ_1_SI_X12(6*8); \ - VPINSRQ_1_SI_X14(3*8); \ - VPINSRQ_1_SI_X15(11*8) - -// load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8) -#define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \ - MOVQ 6*8(SI), X12; \ - MOVQ 11*8(SI), X13; \ - MOVQ 15*8(SI), X14; \ - MOVQ 3*8(SI), X15; \ - VPINSRQ_1_SI_X12(14*8); \ - VPINSRQ_1_SI_X13_0; \ - VPINSRQ_1_SI_X14(9*8); \ - VPINSRQ_1_SI_X15(8*8) - -// load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10) -#define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \ - MOVQ 5*8(SI), X12; \ - MOVQ 8*8(SI), X13; \ - MOVQ 0*8(SI), X14; \ - MOVQ 6*8(SI), X15; \ - VPINSRQ_1_SI_X12(15*8); \ - VPINSRQ_1_SI_X13(2*8); \ - VPINSRQ_1_SI_X14(4*8); \ - VPINSRQ_1_SI_X15(10*8) - -// load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5) -#define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \ - VMOVDQU 12*8(SI), X12; \ - MOVQ 1*8(SI), X13; \ - MOVQ 2*8(SI), X14; \ - VPINSRQ_1_SI_X13(10*8); \ - VPINSRQ_1_SI_X14(7*8); \ - VMOVDQU 4*8(SI), X15 - -// load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0) -#define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \ - MOVQ 15*8(SI), X12; \ - MOVQ 3*8(SI), X13; \ - MOVQ 11*8(SI), X14; \ - MOVQ 12*8(SI), X15; \ - VPINSRQ_1_SI_X12(9*8); \ - VPINSRQ_1_SI_X13(13*8); \ - VPINSRQ_1_SI_X14(14*8); \ - VPINSRQ_1_SI_X15_0 +DATA ·AVX2_iv1<>+0(SB)/8, $0x510e527fade682d1 +DATA ·AVX2_iv1<>+8(SB)/8, $0x9b05688c2b3e6c1f +DATA ·AVX2_iv1<>+16(SB)/8, $0x1f83d9abfb41bd6b +DATA ·AVX2_iv1<>+24(SB)/8, $0x5be0cd19137e2179 +GLOBL ·AVX2_iv1<>(SB), RODATA|NOPTR, $32 // func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) -TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment - MOVQ h+0(FP), AX - MOVQ c+8(FP), BX - MOVQ flag+16(FP), CX - MOVQ blocks_base+24(FP), SI - MOVQ blocks_len+32(FP), DI - - MOVQ SP, R10 - ADDQ $15, R10 - ANDQ $~15, R10 - - VMOVDQU ·AVX_c40<>(SB), X0 - VMOVDQU ·AVX_c48<>(SB), X1 +// Requires: AVX, SSE2 +TEXT ·hashBlocksAVX(SB), NOSPLIT, $288-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVQ flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DI + MOVQ SP, R10 + ADDQ $0x0f, R10 + ANDQ $-16, R10 + VMOVDQU ·AVX_c40<>+0(SB), X0 + VMOVDQU ·AVX_c48<>+0(SB), X1 VMOVDQA X0, X8 VMOVDQA X1, X9 - - VMOVDQU ·AVX_iv3<>(SB), X0 - VMOVDQA X0, 0(R10) - XORQ CX, 0(R10) // 0(R10) = ·AVX_iv3 ^ (CX || 0) - - VMOVDQU 0(AX), X10 + VMOVDQU ·AVX_iv3<>+0(SB), X0 + VMOVDQA X0, (R10) + XORQ CX, (R10) + VMOVDQU (AX), X10 VMOVDQU 16(AX), X11 VMOVDQU 32(AX), X2 VMOVDQU 48(AX), X3 - - MOVQ 0(BX), R8 - MOVQ 8(BX), R9 + MOVQ (BX), R8 + MOVQ 8(BX), R9 loop: - ADDQ $128, R8 - CMPQ R8, $128 + ADDQ $0x80, R8 + CMPQ R8, $0x80 JGE noinc INCQ R9 noinc: - VMOVQ_R8_X15 - VPINSRQ_1_R9_X15 - + BYTE $0xc4 + BYTE $0x41 + BYTE $0xf9 + BYTE $0x6e + BYTE $0xf8 + BYTE $0xc4 + BYTE $0x43 + BYTE $0x81 + BYTE $0x22 + BYTE $0xf9 + BYTE $0x01 VMOVDQA X10, X0 VMOVDQA X11, X1 - VMOVDQU ·AVX_iv0<>(SB), X4 - VMOVDQU ·AVX_iv1<>(SB), X5 - VMOVDQU ·AVX_iv2<>(SB), X6 - + VMOVDQU ·AVX_iv0<>+0(SB), X4 + VMOVDQU ·AVX_iv1<>+0(SB), X5 + VMOVDQU ·AVX_iv2<>+0(SB), X6 VPXOR X15, X6, X6 - VMOVDQA 0(R10), X7 - - LOAD_MSG_AVX_0_2_4_6_1_3_5_7() + VMOVDQA (R10), X7 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x26 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x28 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x38 + BYTE $0x01 VMOVDQA X12, 16(R10) VMOVDQA X13, 32(R10) VMOVDQA X14, 48(R10) VMOVDQA X15, 64(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15) + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x78 + BYTE $0x01 VMOVDQA X12, 80(R10) VMOVDQA X13, 96(R10) VMOVDQA X14, 112(R10) VMOVDQA X15, 128(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6) + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x78 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x68 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x30 + BYTE $0x01 VMOVDQA X12, 144(R10) VMOVDQA X13, 160(R10) VMOVDQA X14, 176(R10) VMOVDQA X15, 192(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_1_0_11_5_12_2_7_3() + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VPSHUFD $0x4e, (SI), X12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x18 + BYTE $0x01 VMOVDQA X12, 208(R10) VMOVDQA X13, 224(R10) VMOVDQA X14, 240(R10) VMOVDQA X15, 256(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_11_12_5_15_8_0_2_13() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_2_5_4_15_6_10_0_8() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_9_5_2_10_0_7_4_15() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_2_6_0_8_12_10_11_3() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_0_6_9_8_7_3_2_11() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_5_15_8_2_0_4_6_10() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_6_14_11_0_15_9_3_8() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_12_13_1_10_2_7_4_5() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_15_9_3_13_11_14_12_0() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X15, X8, X9) - SHUFFLE_AVX() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X15, X8, X9) - SHUFFLE_AVX_INV() - - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X15, X8, X9) - SHUFFLE_AVX() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X15, X8, X9) - SHUFFLE_AVX_INV() - + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + VMOVDQU 88(SI), X12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x36 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x68 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x20 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x70 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x3e + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x40 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x36 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x78 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x40 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x60 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x68 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x2e + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x58 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x18 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x78 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x70 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x48 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x50 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + MOVQ (SI), X12 + VPSHUFD $0x4e, 64(SI), X13 + MOVQ 56(SI), X14 + MOVQ 16(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x58 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x48 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + MOVQ 40(SI), X12 + MOVQ 64(SI), X13 + MOVQ (SI), X14 + MOVQ 48(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x50 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + MOVQ 48(SI), X12 + MOVQ 88(SI), X13 + MOVQ 120(SI), X14 + MOVQ 24(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x2e + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x40 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VMOVDQU 96(SI), X12 + MOVQ 8(SI), X13 + MOVQ 16(SI), X14 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x38 + BYTE $0x01 + VMOVDQU 32(SI), X15 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x30 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x28 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + MOVQ 120(SI), X12 + MOVQ 24(SI), X13 + MOVQ 88(SI), X14 + MOVQ 96(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x68 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x3e + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + VPADDQ 16(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 32(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 48(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 64(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VPADDQ 80(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 96(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 112(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 128(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + VPADDQ 144(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 160(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 176(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 192(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VPADDQ 208(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 224(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 240(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 256(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff VMOVDQU 32(AX), X14 VMOVDQU 48(AX), X15 VPXOR X0, X10, X10 @@ -729,16 +4524,36 @@ noinc: VPXOR X7, X15, X3 VMOVDQU X2, 32(AX) VMOVDQU X3, 48(AX) + LEAQ 128(SI), SI + SUBQ $0x80, DI + JNE loop + VMOVDQU X10, (AX) + VMOVDQU X11, 16(AX) + MOVQ R8, (BX) + MOVQ R9, 8(BX) + VZEROUPPER + RET - LEAQ 128(SI), SI - SUBQ $128, DI - JNE loop +DATA ·AVX_c40<>+0(SB)/8, $0x0201000706050403 +DATA ·AVX_c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·AVX_c40<>(SB), RODATA|NOPTR, $16 - VMOVDQU X10, 0(AX) - VMOVDQU X11, 16(AX) +DATA ·AVX_c48<>+0(SB)/8, $0x0100070605040302 +DATA ·AVX_c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·AVX_c48<>(SB), RODATA|NOPTR, $16 - MOVQ R8, 0(BX) - MOVQ R9, 8(BX) - VZEROUPPER +DATA ·AVX_iv3<>+0(SB)/8, $0x1f83d9abfb41bd6b +DATA ·AVX_iv3<>+8(SB)/8, $0x5be0cd19137e2179 +GLOBL ·AVX_iv3<>(SB), RODATA|NOPTR, $16 - RET +DATA ·AVX_iv0<>+0(SB)/8, $0x6a09e667f3bcc908 +DATA ·AVX_iv0<>+8(SB)/8, $0xbb67ae8584caa73b +GLOBL ·AVX_iv0<>(SB), RODATA|NOPTR, $16 + +DATA ·AVX_iv1<>+0(SB)/8, $0x3c6ef372fe94f82b +DATA ·AVX_iv1<>+8(SB)/8, $0xa54ff53a5f1d36f1 +GLOBL ·AVX_iv1<>(SB), RODATA|NOPTR, $16 + +DATA ·AVX_iv2<>+0(SB)/8, $0x510e527fade682d1 +DATA ·AVX_iv2<>+8(SB)/8, $0x9b05688c2b3e6c1f +GLOBL ·AVX_iv2<>(SB), RODATA|NOPTR, $16 diff --git a/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s b/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s index adfac00..9a0ce21 100644 --- a/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s +++ b/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s @@ -1,278 +1,1441 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run blake2b_amd64_asm.go -out ../../blake2b_amd64.s -pkg blake2b. DO NOT EDIT. //go:build amd64 && gc && !purego #include "textflag.h" -DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 -DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b -GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16 - -DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b -DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1 -GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16 - -DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1 -DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f -GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16 - -DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b -DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179 -GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16 - -DATA ·c40<>+0x00(SB)/8, $0x0201000706050403 -DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·c40<>(SB), (NOPTR+RODATA), $16 - -DATA ·c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·c48<>(SB), (NOPTR+RODATA), $16 - -#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v6, t1; \ - PUNPCKLQDQ v6, t2; \ - PUNPCKHQDQ v7, v6; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ v7, t2; \ - MOVO t1, v7; \ - MOVO v2, t1; \ - PUNPCKHQDQ t2, v7; \ - PUNPCKLQDQ v3, t2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v3 - -#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v2, t1; \ - PUNPCKLQDQ v2, t2; \ - PUNPCKHQDQ v3, v2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ v3, t2; \ - MOVO t1, v3; \ - MOVO v6, t1; \ - PUNPCKHQDQ t2, v3; \ - PUNPCKLQDQ v7, t2; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v7 - -#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \ - PADDQ m0, v0; \ - PADDQ m1, v1; \ - PADDQ v2, v0; \ - PADDQ v3, v1; \ - PXOR v0, v6; \ - PXOR v1, v7; \ - PSHUFD $0xB1, v6, v6; \ - PSHUFD $0xB1, v7, v7; \ - PADDQ v6, v4; \ - PADDQ v7, v5; \ - PXOR v4, v2; \ - PXOR v5, v3; \ - PSHUFB c40, v2; \ - PSHUFB c40, v3; \ - PADDQ m2, v0; \ - PADDQ m3, v1; \ - PADDQ v2, v0; \ - PADDQ v3, v1; \ - PXOR v0, v6; \ - PXOR v1, v7; \ - PSHUFB c48, v6; \ - PSHUFB c48, v7; \ - PADDQ v6, v4; \ - PADDQ v7, v5; \ - PXOR v4, v2; \ - PXOR v5, v3; \ - MOVOU v2, t0; \ - PADDQ v2, t0; \ - PSRLQ $63, v2; \ - PXOR t0, v2; \ - MOVOU v3, t0; \ - PADDQ v3, t0; \ - PSRLQ $63, v3; \ - PXOR t0, v3 - -#define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \ - MOVQ i0*8(src), m0; \ - PINSRQ $1, i1*8(src), m0; \ - MOVQ i2*8(src), m1; \ - PINSRQ $1, i3*8(src), m1; \ - MOVQ i4*8(src), m2; \ - PINSRQ $1, i5*8(src), m2; \ - MOVQ i6*8(src), m3; \ - PINSRQ $1, i7*8(src), m3 - // func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) -TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment - MOVQ h+0(FP), AX - MOVQ c+8(FP), BX - MOVQ flag+16(FP), CX - MOVQ blocks_base+24(FP), SI - MOVQ blocks_len+32(FP), DI - - MOVQ SP, R10 - ADDQ $15, R10 - ANDQ $~15, R10 - - MOVOU ·iv3<>(SB), X0 - MOVO X0, 0(R10) - XORQ CX, 0(R10) // 0(R10) = ·iv3 ^ (CX || 0) - - MOVOU ·c40<>(SB), X13 - MOVOU ·c48<>(SB), X14 - - MOVOU 0(AX), X12 +// Requires: SSE2, SSE4.1, SSSE3 +TEXT ·hashBlocksSSE4(SB), NOSPLIT, $288-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVQ flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DI + MOVQ SP, R10 + ADDQ $0x0f, R10 + ANDQ $-16, R10 + MOVOU ·iv3<>+0(SB), X0 + MOVO X0, (R10) + XORQ CX, (R10) + MOVOU ·c40<>+0(SB), X13 + MOVOU ·c48<>+0(SB), X14 + MOVOU (AX), X12 MOVOU 16(AX), X15 - - MOVQ 0(BX), R8 - MOVQ 8(BX), R9 + MOVQ (BX), R8 + MOVQ 8(BX), R9 loop: - ADDQ $128, R8 - CMPQ R8, $128 + ADDQ $0x80, R8 + CMPQ R8, $0x80 JGE noinc INCQ R9 noinc: - MOVQ R8, X8 - PINSRQ $1, R9, X8 - - MOVO X12, X0 - MOVO X15, X1 - MOVOU 32(AX), X2 - MOVOU 48(AX), X3 - MOVOU ·iv0<>(SB), X4 - MOVOU ·iv1<>(SB), X5 - MOVOU ·iv2<>(SB), X6 - - PXOR X8, X6 - MOVO 0(R10), X7 - - LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7) - MOVO X8, 16(R10) - MOVO X9, 32(R10) - MOVO X10, 48(R10) - MOVO X11, 64(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15) - MOVO X8, 80(R10) - MOVO X9, 96(R10) - MOVO X10, 112(R10) - MOVO X11, 128(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6) - MOVO X8, 144(R10) - MOVO X9, 160(R10) - MOVO X10, 176(R10) - MOVO X11, 192(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3) - MOVO X8, 208(R10) - MOVO X9, 224(R10) - MOVO X10, 240(R10) - MOVO X11, 256(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + MOVQ R8, X8 + PINSRQ $0x01, R9, X8 + MOVO X12, X0 + MOVO X15, X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU ·iv0<>+0(SB), X4 + MOVOU ·iv1<>+0(SB), X5 + MOVOU ·iv2<>+0(SB), X6 + PXOR X8, X6 + MOVO (R10), X7 + MOVQ (SI), X8 + PINSRQ $0x01, 16(SI), X8 + MOVQ 32(SI), X9 + PINSRQ $0x01, 48(SI), X9 + MOVQ 8(SI), X10 + PINSRQ $0x01, 24(SI), X10 + MOVQ 40(SI), X11 + PINSRQ $0x01, 56(SI), X11 + MOVO X8, 16(R10) + MOVO X9, 32(R10) + MOVO X10, 48(R10) + MOVO X11, 64(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 64(SI), X8 + PINSRQ $0x01, 80(SI), X8 + MOVQ 96(SI), X9 + PINSRQ $0x01, 112(SI), X9 + MOVQ 72(SI), X10 + PINSRQ $0x01, 88(SI), X10 + MOVQ 104(SI), X11 + PINSRQ $0x01, 120(SI), X11 + MOVO X8, 80(R10) + MOVO X9, 96(R10) + MOVO X10, 112(R10) + MOVO X11, 128(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 112(SI), X8 + PINSRQ $0x01, 32(SI), X8 + MOVQ 72(SI), X9 + PINSRQ $0x01, 104(SI), X9 + MOVQ 80(SI), X10 + PINSRQ $0x01, 64(SI), X10 + MOVQ 120(SI), X11 + PINSRQ $0x01, 48(SI), X11 + MOVO X8, 144(R10) + MOVO X9, 160(R10) + MOVO X10, 176(R10) + MOVO X11, 192(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 8(SI), X8 + PINSRQ $0x01, (SI), X8 + MOVQ 88(SI), X9 + PINSRQ $0x01, 40(SI), X9 + MOVQ 96(SI), X10 + PINSRQ $0x01, 16(SI), X10 + MOVQ 56(SI), X11 + PINSRQ $0x01, 24(SI), X11 + MOVO X8, 208(R10) + MOVO X9, 224(R10) + MOVO X10, 240(R10) + MOVO X11, 256(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 88(SI), X8 + PINSRQ $0x01, 96(SI), X8 + MOVQ 40(SI), X9 + PINSRQ $0x01, 120(SI), X9 + MOVQ 64(SI), X10 + PINSRQ $0x01, (SI), X10 + MOVQ 16(SI), X11 + PINSRQ $0x01, 104(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 80(SI), X8 + PINSRQ $0x01, 24(SI), X8 + MOVQ 56(SI), X9 + PINSRQ $0x01, 72(SI), X9 + MOVQ 112(SI), X10 + PINSRQ $0x01, 48(SI), X10 + MOVQ 8(SI), X11 + PINSRQ $0x01, 32(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 56(SI), X8 + PINSRQ $0x01, 24(SI), X8 + MOVQ 104(SI), X9 + PINSRQ $0x01, 88(SI), X9 + MOVQ 72(SI), X10 + PINSRQ $0x01, 8(SI), X10 + MOVQ 96(SI), X11 + PINSRQ $0x01, 112(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 16(SI), X8 + PINSRQ $0x01, 40(SI), X8 + MOVQ 32(SI), X9 + PINSRQ $0x01, 120(SI), X9 + MOVQ 48(SI), X10 + PINSRQ $0x01, 80(SI), X10 + MOVQ (SI), X11 + PINSRQ $0x01, 64(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 72(SI), X8 + PINSRQ $0x01, 40(SI), X8 + MOVQ 16(SI), X9 + PINSRQ $0x01, 80(SI), X9 + MOVQ (SI), X10 + PINSRQ $0x01, 56(SI), X10 + MOVQ 32(SI), X11 + PINSRQ $0x01, 120(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 112(SI), X8 + PINSRQ $0x01, 88(SI), X8 + MOVQ 48(SI), X9 + PINSRQ $0x01, 24(SI), X9 + MOVQ 8(SI), X10 + PINSRQ $0x01, 96(SI), X10 + MOVQ 64(SI), X11 + PINSRQ $0x01, 104(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 16(SI), X8 + PINSRQ $0x01, 48(SI), X8 + MOVQ (SI), X9 + PINSRQ $0x01, 64(SI), X9 + MOVQ 96(SI), X10 + PINSRQ $0x01, 80(SI), X10 + MOVQ 88(SI), X11 + PINSRQ $0x01, 24(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 32(SI), X8 + PINSRQ $0x01, 56(SI), X8 + MOVQ 120(SI), X9 + PINSRQ $0x01, 8(SI), X9 + MOVQ 104(SI), X10 + PINSRQ $0x01, 40(SI), X10 + MOVQ 112(SI), X11 + PINSRQ $0x01, 72(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 96(SI), X8 + PINSRQ $0x01, 8(SI), X8 + MOVQ 112(SI), X9 + PINSRQ $0x01, 32(SI), X9 + MOVQ 40(SI), X10 + PINSRQ $0x01, 120(SI), X10 + MOVQ 104(SI), X11 + PINSRQ $0x01, 80(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ (SI), X8 + PINSRQ $0x01, 48(SI), X8 + MOVQ 72(SI), X9 + PINSRQ $0x01, 64(SI), X9 + MOVQ 56(SI), X10 + PINSRQ $0x01, 24(SI), X10 + MOVQ 16(SI), X11 + PINSRQ $0x01, 88(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 104(SI), X8 + PINSRQ $0x01, 56(SI), X8 + MOVQ 96(SI), X9 + PINSRQ $0x01, 24(SI), X9 + MOVQ 88(SI), X10 + PINSRQ $0x01, 112(SI), X10 + MOVQ 8(SI), X11 + PINSRQ $0x01, 72(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 40(SI), X8 + PINSRQ $0x01, 120(SI), X8 + MOVQ 64(SI), X9 + PINSRQ $0x01, 16(SI), X9 + MOVQ (SI), X10 + PINSRQ $0x01, 32(SI), X10 + MOVQ 48(SI), X11 + PINSRQ $0x01, 80(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 48(SI), X8 + PINSRQ $0x01, 112(SI), X8 + MOVQ 88(SI), X9 + PINSRQ $0x01, (SI), X9 + MOVQ 120(SI), X10 + PINSRQ $0x01, 72(SI), X10 + MOVQ 24(SI), X11 + PINSRQ $0x01, 64(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 96(SI), X8 + PINSRQ $0x01, 104(SI), X8 + MOVQ 8(SI), X9 + PINSRQ $0x01, 80(SI), X9 + MOVQ 16(SI), X10 + PINSRQ $0x01, 56(SI), X10 + MOVQ 32(SI), X11 + PINSRQ $0x01, 40(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 80(SI), X8 + PINSRQ $0x01, 64(SI), X8 + MOVQ 56(SI), X9 + PINSRQ $0x01, 8(SI), X9 + MOVQ 16(SI), X10 + PINSRQ $0x01, 32(SI), X10 + MOVQ 48(SI), X11 + PINSRQ $0x01, 40(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 120(SI), X8 + PINSRQ $0x01, 72(SI), X8 + MOVQ 24(SI), X9 + PINSRQ $0x01, 104(SI), X9 + MOVQ 88(SI), X10 + PINSRQ $0x01, 112(SI), X10 + MOVQ 96(SI), X11 + PINSRQ $0x01, (SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + PADDQ 16(R10), X0 + PADDQ 32(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 48(R10), X0 + PADDQ 64(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + PADDQ 80(R10), X0 + PADDQ 96(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 112(R10), X0 + PADDQ 128(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + PADDQ 144(R10), X0 + PADDQ 160(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 176(R10), X0 + PADDQ 192(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + PADDQ 208(R10), X0 + PADDQ 224(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 240(R10), X0 + PADDQ 256(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU 32(AX), X10 + MOVOU 48(AX), X11 + PXOR X0, X12 + PXOR X1, X15 + PXOR X2, X10 + PXOR X3, X11 + PXOR X4, X12 + PXOR X5, X15 + PXOR X6, X10 + PXOR X7, X11 + MOVOU X10, 32(AX) + MOVOU X11, 48(AX) + LEAQ 128(SI), SI + SUBQ $0x80, DI + JNE loop + MOVOU X12, (AX) + MOVOU X15, 16(AX) + MOVQ R8, (BX) + MOVQ R9, 8(BX) + RET - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) +DATA ·iv3<>+0(SB)/8, $0x1f83d9abfb41bd6b +DATA ·iv3<>+8(SB)/8, $0x5be0cd19137e2179 +GLOBL ·iv3<>(SB), RODATA|NOPTR, $16 - MOVOU 32(AX), X10 - MOVOU 48(AX), X11 - PXOR X0, X12 - PXOR X1, X15 - PXOR X2, X10 - PXOR X3, X11 - PXOR X4, X12 - PXOR X5, X15 - PXOR X6, X10 - PXOR X7, X11 - MOVOU X10, 32(AX) - MOVOU X11, 48(AX) +DATA ·c40<>+0(SB)/8, $0x0201000706050403 +DATA ·c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·c40<>(SB), RODATA|NOPTR, $16 - LEAQ 128(SI), SI - SUBQ $128, DI - JNE loop +DATA ·c48<>+0(SB)/8, $0x0100070605040302 +DATA ·c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·c48<>(SB), RODATA|NOPTR, $16 - MOVOU X12, 0(AX) - MOVOU X15, 16(AX) +DATA ·iv0<>+0(SB)/8, $0x6a09e667f3bcc908 +DATA ·iv0<>+8(SB)/8, $0xbb67ae8584caa73b +GLOBL ·iv0<>(SB), RODATA|NOPTR, $16 - MOVQ R8, 0(BX) - MOVQ R9, 8(BX) +DATA ·iv1<>+0(SB)/8, $0x3c6ef372fe94f82b +DATA ·iv1<>+8(SB)/8, $0xa54ff53a5f1d36f1 +GLOBL ·iv1<>(SB), RODATA|NOPTR, $16 - RET +DATA ·iv2<>+0(SB)/8, $0x510e527fade682d1 +DATA ·iv2<>+8(SB)/8, $0x9b05688c2b3e6c1f +GLOBL ·iv2<>(SB), RODATA|NOPTR, $16 diff --git a/vendor/golang.org/x/crypto/blake2s/blake2s.go b/vendor/golang.org/x/crypto/blake2s/blake2s.go index e3f46aa..c25d07d 100644 --- a/vendor/golang.org/x/crypto/blake2s/blake2s.go +++ b/vendor/golang.org/x/crypto/blake2s/blake2s.go @@ -16,9 +16,10 @@ // // BLAKE2X is a construction to compute hash values larger than 32 bytes. It // can produce hash values between 0 and 65535 bytes. -package blake2s // import "golang.org/x/crypto/blake2s" +package blake2s import ( + "crypto" "encoding/binary" "errors" "hash" @@ -55,6 +56,13 @@ func Sum256(data []byte) [Size]byte { // and BinaryUnmarshaler for state (de)serialization as documented by hash.Hash. func New256(key []byte) (hash.Hash, error) { return newDigest(Size, key) } +func init() { + crypto.RegisterHash(crypto.BLAKE2s_256, func() hash.Hash { + h, _ := New256(nil) + return h + }) +} + // New128 returns a new hash.Hash computing the BLAKE2s-128 checksum given a // non-empty key. Note that a 128-bit digest is too small to be secure as a // cryptographic hash and should only be used as a MAC, thus the key argument diff --git a/vendor/golang.org/x/crypto/blake2s/blake2s_amd64.s b/vendor/golang.org/x/crypto/blake2s/blake2s_amd64.s index fe4b818..57d510f 100644 --- a/vendor/golang.org/x/crypto/blake2s/blake2s_amd64.s +++ b/vendor/golang.org/x/crypto/blake2s/blake2s_amd64.s @@ -1,432 +1,2173 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run blake2s_amd64_asm.go -out ../blake2s_amd64.s -pkg blake2s. DO NOT EDIT. //go:build amd64 && gc && !purego #include "textflag.h" -DATA iv0<>+0x00(SB)/4, $0x6a09e667 -DATA iv0<>+0x04(SB)/4, $0xbb67ae85 -DATA iv0<>+0x08(SB)/4, $0x3c6ef372 -DATA iv0<>+0x0c(SB)/4, $0xa54ff53a -GLOBL iv0<>(SB), (NOPTR+RODATA), $16 - -DATA iv1<>+0x00(SB)/4, $0x510e527f -DATA iv1<>+0x04(SB)/4, $0x9b05688c -DATA iv1<>+0x08(SB)/4, $0x1f83d9ab -DATA iv1<>+0x0c(SB)/4, $0x5be0cd19 -GLOBL iv1<>(SB), (NOPTR+RODATA), $16 - -DATA rol16<>+0x00(SB)/8, $0x0504070601000302 -DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A -GLOBL rol16<>(SB), (NOPTR+RODATA), $16 - -DATA rol8<>+0x00(SB)/8, $0x0407060500030201 -DATA rol8<>+0x08(SB)/8, $0x0C0F0E0D080B0A09 -GLOBL rol8<>(SB), (NOPTR+RODATA), $16 - -DATA counter<>+0x00(SB)/8, $0x40 -DATA counter<>+0x08(SB)/8, $0x0 -GLOBL counter<>(SB), (NOPTR+RODATA), $16 - -#define ROTL_SSE2(n, t, v) \ - MOVO v, t; \ - PSLLL $n, t; \ - PSRLL $(32-n), v; \ - PXOR t, v - -#define ROTL_SSSE3(c, v) \ - PSHUFB c, v - -#define ROUND_SSE2(v0, v1, v2, v3, m0, m1, m2, m3, t) \ - PADDL m0, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSE2(16, t, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(20, t, v1); \ - PADDL m1, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSE2(24, t, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(25, t, v1); \ - PSHUFL $0x39, v1, v1; \ - PSHUFL $0x4E, v2, v2; \ - PSHUFL $0x93, v3, v3; \ - PADDL m2, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSE2(16, t, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(20, t, v1); \ - PADDL m3, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSE2(24, t, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(25, t, v1); \ - PSHUFL $0x39, v3, v3; \ - PSHUFL $0x4E, v2, v2; \ - PSHUFL $0x93, v1, v1 - -#define ROUND_SSSE3(v0, v1, v2, v3, m0, m1, m2, m3, t, c16, c8) \ - PADDL m0, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSSE3(c16, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(20, t, v1); \ - PADDL m1, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSSE3(c8, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(25, t, v1); \ - PSHUFL $0x39, v1, v1; \ - PSHUFL $0x4E, v2, v2; \ - PSHUFL $0x93, v3, v3; \ - PADDL m2, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSSE3(c16, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(20, t, v1); \ - PADDL m3, v0; \ - PADDL v1, v0; \ - PXOR v0, v3; \ - ROTL_SSSE3(c8, v3); \ - PADDL v3, v2; \ - PXOR v2, v1; \ - ROTL_SSE2(25, t, v1); \ - PSHUFL $0x39, v3, v3; \ - PSHUFL $0x4E, v2, v2; \ - PSHUFL $0x93, v1, v1 - - -#define LOAD_MSG_SSE4(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15) \ - MOVL i0*4(src), m0; \ - PINSRD $1, i1*4(src), m0; \ - PINSRD $2, i2*4(src), m0; \ - PINSRD $3, i3*4(src), m0; \ - MOVL i4*4(src), m1; \ - PINSRD $1, i5*4(src), m1; \ - PINSRD $2, i6*4(src), m1; \ - PINSRD $3, i7*4(src), m1; \ - MOVL i8*4(src), m2; \ - PINSRD $1, i9*4(src), m2; \ - PINSRD $2, i10*4(src), m2; \ - PINSRD $3, i11*4(src), m2; \ - MOVL i12*4(src), m3; \ - PINSRD $1, i13*4(src), m3; \ - PINSRD $2, i14*4(src), m3; \ - PINSRD $3, i15*4(src), m3 +// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) +// Requires: SSE2 +TEXT ·hashBlocksSSE2(SB), $672-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVL flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DX + MOVQ SP, BP + ADDQ $0x0f, BP + ANDQ $-16, BP + MOVQ (BX), R9 + MOVQ R9, (BP) + MOVQ CX, 8(BP) + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU iv0<>+0(SB), X2 + MOVOU iv1<>+0(SB), X3 + MOVOU counter<>+0(SB), X12 + MOVOU rol16<>+0(SB), X13 + MOVOU rol8<>+0(SB), X14 + MOVO (BP), X15 -#define PRECOMPUTE_MSG(dst, off, src, R8, R9, R10, R11, R12, R13, R14, R15) \ - MOVQ 0*4(src), R8; \ - MOVQ 2*4(src), R9; \ - MOVQ 4*4(src), R10; \ - MOVQ 6*4(src), R11; \ - MOVQ 8*4(src), R12; \ - MOVQ 10*4(src), R13; \ - MOVQ 12*4(src), R14; \ - MOVQ 14*4(src), R15; \ - \ - MOVL R8, 0*4+off+0(dst); \ - MOVL R8, 9*4+off+64(dst); \ - MOVL R8, 5*4+off+128(dst); \ - MOVL R8, 14*4+off+192(dst); \ - MOVL R8, 4*4+off+256(dst); \ - MOVL R8, 2*4+off+320(dst); \ - MOVL R8, 8*4+off+384(dst); \ - MOVL R8, 12*4+off+448(dst); \ - MOVL R8, 3*4+off+512(dst); \ - MOVL R8, 15*4+off+576(dst); \ - SHRQ $32, R8; \ - MOVL R8, 4*4+off+0(dst); \ - MOVL R8, 8*4+off+64(dst); \ - MOVL R8, 14*4+off+128(dst); \ - MOVL R8, 5*4+off+192(dst); \ - MOVL R8, 12*4+off+256(dst); \ - MOVL R8, 11*4+off+320(dst); \ - MOVL R8, 1*4+off+384(dst); \ - MOVL R8, 6*4+off+448(dst); \ - MOVL R8, 10*4+off+512(dst); \ - MOVL R8, 3*4+off+576(dst); \ - \ - MOVL R9, 1*4+off+0(dst); \ - MOVL R9, 13*4+off+64(dst); \ - MOVL R9, 6*4+off+128(dst); \ - MOVL R9, 8*4+off+192(dst); \ - MOVL R9, 2*4+off+256(dst); \ - MOVL R9, 0*4+off+320(dst); \ - MOVL R9, 14*4+off+384(dst); \ - MOVL R9, 11*4+off+448(dst); \ - MOVL R9, 12*4+off+512(dst); \ - MOVL R9, 4*4+off+576(dst); \ - SHRQ $32, R9; \ - MOVL R9, 5*4+off+0(dst); \ - MOVL R9, 15*4+off+64(dst); \ - MOVL R9, 9*4+off+128(dst); \ - MOVL R9, 1*4+off+192(dst); \ - MOVL R9, 11*4+off+256(dst); \ - MOVL R9, 7*4+off+320(dst); \ - MOVL R9, 13*4+off+384(dst); \ - MOVL R9, 3*4+off+448(dst); \ - MOVL R9, 6*4+off+512(dst); \ - MOVL R9, 10*4+off+576(dst); \ - \ - MOVL R10, 2*4+off+0(dst); \ - MOVL R10, 1*4+off+64(dst); \ - MOVL R10, 15*4+off+128(dst); \ - MOVL R10, 10*4+off+192(dst); \ - MOVL R10, 6*4+off+256(dst); \ - MOVL R10, 8*4+off+320(dst); \ - MOVL R10, 3*4+off+384(dst); \ - MOVL R10, 13*4+off+448(dst); \ - MOVL R10, 14*4+off+512(dst); \ - MOVL R10, 5*4+off+576(dst); \ - SHRQ $32, R10; \ - MOVL R10, 6*4+off+0(dst); \ - MOVL R10, 11*4+off+64(dst); \ - MOVL R10, 2*4+off+128(dst); \ - MOVL R10, 9*4+off+192(dst); \ - MOVL R10, 1*4+off+256(dst); \ - MOVL R10, 13*4+off+320(dst); \ - MOVL R10, 4*4+off+384(dst); \ - MOVL R10, 8*4+off+448(dst); \ - MOVL R10, 15*4+off+512(dst); \ - MOVL R10, 7*4+off+576(dst); \ - \ - MOVL R11, 3*4+off+0(dst); \ - MOVL R11, 7*4+off+64(dst); \ - MOVL R11, 13*4+off+128(dst); \ - MOVL R11, 12*4+off+192(dst); \ - MOVL R11, 10*4+off+256(dst); \ - MOVL R11, 1*4+off+320(dst); \ - MOVL R11, 9*4+off+384(dst); \ - MOVL R11, 14*4+off+448(dst); \ - MOVL R11, 0*4+off+512(dst); \ - MOVL R11, 6*4+off+576(dst); \ - SHRQ $32, R11; \ - MOVL R11, 7*4+off+0(dst); \ - MOVL R11, 14*4+off+64(dst); \ - MOVL R11, 10*4+off+128(dst); \ - MOVL R11, 0*4+off+192(dst); \ - MOVL R11, 5*4+off+256(dst); \ - MOVL R11, 9*4+off+320(dst); \ - MOVL R11, 12*4+off+384(dst); \ - MOVL R11, 1*4+off+448(dst); \ - MOVL R11, 13*4+off+512(dst); \ - MOVL R11, 2*4+off+576(dst); \ - \ - MOVL R12, 8*4+off+0(dst); \ - MOVL R12, 5*4+off+64(dst); \ - MOVL R12, 4*4+off+128(dst); \ - MOVL R12, 15*4+off+192(dst); \ - MOVL R12, 14*4+off+256(dst); \ - MOVL R12, 3*4+off+320(dst); \ - MOVL R12, 11*4+off+384(dst); \ - MOVL R12, 10*4+off+448(dst); \ - MOVL R12, 7*4+off+512(dst); \ - MOVL R12, 1*4+off+576(dst); \ - SHRQ $32, R12; \ - MOVL R12, 12*4+off+0(dst); \ - MOVL R12, 2*4+off+64(dst); \ - MOVL R12, 11*4+off+128(dst); \ - MOVL R12, 4*4+off+192(dst); \ - MOVL R12, 0*4+off+256(dst); \ - MOVL R12, 15*4+off+320(dst); \ - MOVL R12, 10*4+off+384(dst); \ - MOVL R12, 7*4+off+448(dst); \ - MOVL R12, 5*4+off+512(dst); \ - MOVL R12, 9*4+off+576(dst); \ - \ - MOVL R13, 9*4+off+0(dst); \ - MOVL R13, 4*4+off+64(dst); \ - MOVL R13, 8*4+off+128(dst); \ - MOVL R13, 13*4+off+192(dst); \ - MOVL R13, 3*4+off+256(dst); \ - MOVL R13, 5*4+off+320(dst); \ - MOVL R13, 7*4+off+384(dst); \ - MOVL R13, 15*4+off+448(dst); \ - MOVL R13, 11*4+off+512(dst); \ - MOVL R13, 0*4+off+576(dst); \ - SHRQ $32, R13; \ - MOVL R13, 13*4+off+0(dst); \ - MOVL R13, 10*4+off+64(dst); \ - MOVL R13, 0*4+off+128(dst); \ - MOVL R13, 3*4+off+192(dst); \ - MOVL R13, 9*4+off+256(dst); \ - MOVL R13, 6*4+off+320(dst); \ - MOVL R13, 15*4+off+384(dst); \ - MOVL R13, 4*4+off+448(dst); \ - MOVL R13, 2*4+off+512(dst); \ - MOVL R13, 12*4+off+576(dst); \ - \ - MOVL R14, 10*4+off+0(dst); \ - MOVL R14, 12*4+off+64(dst); \ - MOVL R14, 1*4+off+128(dst); \ - MOVL R14, 6*4+off+192(dst); \ - MOVL R14, 13*4+off+256(dst); \ - MOVL R14, 4*4+off+320(dst); \ - MOVL R14, 0*4+off+384(dst); \ - MOVL R14, 2*4+off+448(dst); \ - MOVL R14, 8*4+off+512(dst); \ - MOVL R14, 14*4+off+576(dst); \ - SHRQ $32, R14; \ - MOVL R14, 14*4+off+0(dst); \ - MOVL R14, 3*4+off+64(dst); \ - MOVL R14, 7*4+off+128(dst); \ - MOVL R14, 2*4+off+192(dst); \ - MOVL R14, 15*4+off+256(dst); \ - MOVL R14, 12*4+off+320(dst); \ - MOVL R14, 6*4+off+384(dst); \ - MOVL R14, 0*4+off+448(dst); \ - MOVL R14, 9*4+off+512(dst); \ - MOVL R14, 11*4+off+576(dst); \ - \ - MOVL R15, 11*4+off+0(dst); \ - MOVL R15, 0*4+off+64(dst); \ - MOVL R15, 12*4+off+128(dst); \ - MOVL R15, 7*4+off+192(dst); \ - MOVL R15, 8*4+off+256(dst); \ - MOVL R15, 14*4+off+320(dst); \ - MOVL R15, 2*4+off+384(dst); \ - MOVL R15, 5*4+off+448(dst); \ - MOVL R15, 1*4+off+512(dst); \ - MOVL R15, 13*4+off+576(dst); \ - SHRQ $32, R15; \ - MOVL R15, 15*4+off+0(dst); \ - MOVL R15, 6*4+off+64(dst); \ - MOVL R15, 3*4+off+128(dst); \ - MOVL R15, 11*4+off+192(dst); \ - MOVL R15, 7*4+off+256(dst); \ - MOVL R15, 10*4+off+320(dst); \ - MOVL R15, 5*4+off+384(dst); \ - MOVL R15, 9*4+off+448(dst); \ - MOVL R15, 4*4+off+512(dst); \ - MOVL R15, 8*4+off+576(dst) +loop: + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X3, X7 + PADDQ X12, X15 + PXOR X15, X7 + MOVQ (SI), R8 + MOVQ 8(SI), R9 + MOVQ 16(SI), R10 + MOVQ 24(SI), R11 + MOVQ 32(SI), R12 + MOVQ 40(SI), R13 + MOVQ 48(SI), R14 + MOVQ 56(SI), R15 + MOVL R8, 16(BP) + MOVL R8, 116(BP) + MOVL R8, 164(BP) + MOVL R8, 264(BP) + MOVL R8, 288(BP) + MOVL R8, 344(BP) + MOVL R8, 432(BP) + MOVL R8, 512(BP) + MOVL R8, 540(BP) + MOVL R8, 652(BP) + SHRQ $0x20, R8 + MOVL R8, 32(BP) + MOVL R8, 112(BP) + MOVL R8, 200(BP) + MOVL R8, 228(BP) + MOVL R8, 320(BP) + MOVL R8, 380(BP) + MOVL R8, 404(BP) + MOVL R8, 488(BP) + MOVL R8, 568(BP) + MOVL R8, 604(BP) + MOVL R9, 20(BP) + MOVL R9, 132(BP) + MOVL R9, 168(BP) + MOVL R9, 240(BP) + MOVL R9, 280(BP) + MOVL R9, 336(BP) + MOVL R9, 456(BP) + MOVL R9, 508(BP) + MOVL R9, 576(BP) + MOVL R9, 608(BP) + SHRQ $0x20, R9 + MOVL R9, 36(BP) + MOVL R9, 140(BP) + MOVL R9, 180(BP) + MOVL R9, 212(BP) + MOVL R9, 316(BP) + MOVL R9, 364(BP) + MOVL R9, 452(BP) + MOVL R9, 476(BP) + MOVL R9, 552(BP) + MOVL R9, 632(BP) + MOVL R10, 24(BP) + MOVL R10, 84(BP) + MOVL R10, 204(BP) + MOVL R10, 248(BP) + MOVL R10, 296(BP) + MOVL R10, 368(BP) + MOVL R10, 412(BP) + MOVL R10, 516(BP) + MOVL R10, 584(BP) + MOVL R10, 612(BP) + SHRQ $0x20, R10 + MOVL R10, 40(BP) + MOVL R10, 124(BP) + MOVL R10, 152(BP) + MOVL R10, 244(BP) + MOVL R10, 276(BP) + MOVL R10, 388(BP) + MOVL R10, 416(BP) + MOVL R10, 496(BP) + MOVL R10, 588(BP) + MOVL R10, 620(BP) + MOVL R11, 28(BP) + MOVL R11, 108(BP) + MOVL R11, 196(BP) + MOVL R11, 256(BP) + MOVL R11, 312(BP) + MOVL R11, 340(BP) + MOVL R11, 436(BP) + MOVL R11, 520(BP) + MOVL R11, 528(BP) + MOVL R11, 616(BP) + SHRQ $0x20, R11 + MOVL R11, 44(BP) + MOVL R11, 136(BP) + MOVL R11, 184(BP) + MOVL R11, 208(BP) + MOVL R11, 292(BP) + MOVL R11, 372(BP) + MOVL R11, 448(BP) + MOVL R11, 468(BP) + MOVL R11, 580(BP) + MOVL R11, 600(BP) + MOVL R12, 48(BP) + MOVL R12, 100(BP) + MOVL R12, 160(BP) + MOVL R12, 268(BP) + MOVL R12, 328(BP) + MOVL R12, 348(BP) + MOVL R12, 444(BP) + MOVL R12, 504(BP) + MOVL R12, 556(BP) + MOVL R12, 596(BP) + SHRQ $0x20, R12 + MOVL R12, 64(BP) + MOVL R12, 88(BP) + MOVL R12, 188(BP) + MOVL R12, 224(BP) + MOVL R12, 272(BP) + MOVL R12, 396(BP) + MOVL R12, 440(BP) + MOVL R12, 492(BP) + MOVL R12, 548(BP) + MOVL R12, 628(BP) + MOVL R13, 52(BP) + MOVL R13, 96(BP) + MOVL R13, 176(BP) + MOVL R13, 260(BP) + MOVL R13, 284(BP) + MOVL R13, 356(BP) + MOVL R13, 428(BP) + MOVL R13, 524(BP) + MOVL R13, 572(BP) + MOVL R13, 592(BP) + SHRQ $0x20, R13 + MOVL R13, 68(BP) + MOVL R13, 120(BP) + MOVL R13, 144(BP) + MOVL R13, 220(BP) + MOVL R13, 308(BP) + MOVL R13, 360(BP) + MOVL R13, 460(BP) + MOVL R13, 480(BP) + MOVL R13, 536(BP) + MOVL R13, 640(BP) + MOVL R14, 56(BP) + MOVL R14, 128(BP) + MOVL R14, 148(BP) + MOVL R14, 232(BP) + MOVL R14, 324(BP) + MOVL R14, 352(BP) + MOVL R14, 400(BP) + MOVL R14, 472(BP) + MOVL R14, 560(BP) + MOVL R14, 648(BP) + SHRQ $0x20, R14 + MOVL R14, 72(BP) + MOVL R14, 92(BP) + MOVL R14, 172(BP) + MOVL R14, 216(BP) + MOVL R14, 332(BP) + MOVL R14, 384(BP) + MOVL R14, 424(BP) + MOVL R14, 464(BP) + MOVL R14, 564(BP) + MOVL R14, 636(BP) + MOVL R15, 60(BP) + MOVL R15, 80(BP) + MOVL R15, 192(BP) + MOVL R15, 236(BP) + MOVL R15, 304(BP) + MOVL R15, 392(BP) + MOVL R15, 408(BP) + MOVL R15, 484(BP) + MOVL R15, 532(BP) + MOVL R15, 644(BP) + SHRQ $0x20, R15 + MOVL R15, 76(BP) + MOVL R15, 104(BP) + MOVL R15, 156(BP) + MOVL R15, 252(BP) + MOVL R15, 300(BP) + MOVL R15, 376(BP) + MOVL R15, 420(BP) + MOVL R15, 500(BP) + MOVL R15, 544(BP) + MOVL R15, 624(BP) + PADDL 16(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 32(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 48(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 64(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 80(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 96(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 112(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 128(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 144(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 160(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 176(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 192(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 208(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 224(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 240(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 256(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 272(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 288(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 304(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 320(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 336(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 352(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 368(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 384(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 400(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 416(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 432(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 448(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 464(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 480(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 496(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 512(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 528(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 544(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 560(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 576(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 592(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 608(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 624(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x10, X8 + PSRLL $0x10, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 640(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + MOVO X7, X8 + PSLLL $0x18, X8 + PSRLL $0x08, X7 + PXOR X8, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X0 + PXOR X7, X1 + LEAQ 64(SI), SI + SUBQ $0x40, DX + JNE loop + MOVO X15, (BP) + MOVQ (BP), R9 + MOVQ R9, (BX) + MOVOU X0, (AX) + MOVOU X1, 16(AX) + RET -#define BLAKE2s_SSE2() \ - PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15); \ - ROUND_SSE2(X4, X5, X6, X7, 16(BP), 32(BP), 48(BP), 64(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+64(BP), 32+64(BP), 48+64(BP), 64+64(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+128(BP), 32+128(BP), 48+128(BP), 64+128(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+192(BP), 32+192(BP), 48+192(BP), 64+192(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+256(BP), 32+256(BP), 48+256(BP), 64+256(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+320(BP), 32+320(BP), 48+320(BP), 64+320(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+384(BP), 32+384(BP), 48+384(BP), 64+384(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+448(BP), 32+448(BP), 48+448(BP), 64+448(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+512(BP), 32+512(BP), 48+512(BP), 64+512(BP), X8); \ - ROUND_SSE2(X4, X5, X6, X7, 16+576(BP), 32+576(BP), 48+576(BP), 64+576(BP), X8) +DATA iv0<>+0(SB)/4, $0x6a09e667 +DATA iv0<>+4(SB)/4, $0xbb67ae85 +DATA iv0<>+8(SB)/4, $0x3c6ef372 +DATA iv0<>+12(SB)/4, $0xa54ff53a +GLOBL iv0<>(SB), RODATA|NOPTR, $16 -#define BLAKE2s_SSSE3() \ - PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15); \ - ROUND_SSSE3(X4, X5, X6, X7, 16(BP), 32(BP), 48(BP), 64(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+64(BP), 32+64(BP), 48+64(BP), 64+64(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+128(BP), 32+128(BP), 48+128(BP), 64+128(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+192(BP), 32+192(BP), 48+192(BP), 64+192(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+256(BP), 32+256(BP), 48+256(BP), 64+256(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+320(BP), 32+320(BP), 48+320(BP), 64+320(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+384(BP), 32+384(BP), 48+384(BP), 64+384(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+448(BP), 32+448(BP), 48+448(BP), 64+448(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+512(BP), 32+512(BP), 48+512(BP), 64+512(BP), X8, X13, X14); \ - ROUND_SSSE3(X4, X5, X6, X7, 16+576(BP), 32+576(BP), 48+576(BP), 64+576(BP), X8, X13, X14) +DATA iv1<>+0(SB)/4, $0x510e527f +DATA iv1<>+4(SB)/4, $0x9b05688c +DATA iv1<>+8(SB)/4, $0x1f83d9ab +DATA iv1<>+12(SB)/4, $0x5be0cd19 +GLOBL iv1<>(SB), RODATA|NOPTR, $16 -#define BLAKE2s_SSE4() \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ - LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0); \ - ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) +DATA counter<>+0(SB)/8, $0x0000000000000040 +DATA counter<>+8(SB)/8, $0x0000000000000000 +GLOBL counter<>(SB), RODATA|NOPTR, $16 -#define HASH_BLOCKS(h, c, flag, blocks_base, blocks_len, BLAKE2s_FUNC) \ - MOVQ h, AX; \ - MOVQ c, BX; \ - MOVL flag, CX; \ - MOVQ blocks_base, SI; \ - MOVQ blocks_len, DX; \ - \ - MOVQ SP, BP; \ - ADDQ $15, BP; \ - ANDQ $~15, BP; \ - \ - MOVQ 0(BX), R9; \ - MOVQ R9, 0(BP); \ - MOVQ CX, 8(BP); \ - \ - MOVOU 0(AX), X0; \ - MOVOU 16(AX), X1; \ - MOVOU iv0<>(SB), X2; \ - MOVOU iv1<>(SB), X3 \ - \ - MOVOU counter<>(SB), X12; \ - MOVOU rol16<>(SB), X13; \ - MOVOU rol8<>(SB), X14; \ - MOVO 0(BP), X15; \ - \ - loop: \ - MOVO X0, X4; \ - MOVO X1, X5; \ - MOVO X2, X6; \ - MOVO X3, X7; \ - \ - PADDQ X12, X15; \ - PXOR X15, X7; \ - \ - BLAKE2s_FUNC(); \ - \ - PXOR X4, X0; \ - PXOR X5, X1; \ - PXOR X6, X0; \ - PXOR X7, X1; \ - \ - LEAQ 64(SI), SI; \ - SUBQ $64, DX; \ - JNE loop; \ - \ - MOVO X15, 0(BP); \ - MOVQ 0(BP), R9; \ - MOVQ R9, 0(BX); \ - \ - MOVOU X0, 0(AX); \ - MOVOU X1, 16(AX) +DATA rol16<>+0(SB)/8, $0x0504070601000302 +DATA rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a +GLOBL rol16<>(SB), RODATA|NOPTR, $16 -// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) -TEXT ·hashBlocksSSE2(SB), 0, $672-48 // frame = 656 + 16 byte alignment - HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSE2) - RET +DATA rol8<>+0(SB)/8, $0x0407060500030201 +DATA rol8<>+8(SB)/8, $0x0c0f0e0d080b0a09 +GLOBL rol8<>(SB), RODATA|NOPTR, $16 // func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) -TEXT ·hashBlocksSSSE3(SB), 0, $672-48 // frame = 656 + 16 byte alignment - HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSSE3) +// Requires: SSE2, SSSE3 +TEXT ·hashBlocksSSSE3(SB), $672-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVL flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DX + MOVQ SP, BP + ADDQ $0x0f, BP + ANDQ $-16, BP + MOVQ (BX), R9 + MOVQ R9, (BP) + MOVQ CX, 8(BP) + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU iv0<>+0(SB), X2 + MOVOU iv1<>+0(SB), X3 + MOVOU counter<>+0(SB), X12 + MOVOU rol16<>+0(SB), X13 + MOVOU rol8<>+0(SB), X14 + MOVO (BP), X15 + +loop: + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X3, X7 + PADDQ X12, X15 + PXOR X15, X7 + MOVQ (SI), R8 + MOVQ 8(SI), R9 + MOVQ 16(SI), R10 + MOVQ 24(SI), R11 + MOVQ 32(SI), R12 + MOVQ 40(SI), R13 + MOVQ 48(SI), R14 + MOVQ 56(SI), R15 + MOVL R8, 16(BP) + MOVL R8, 116(BP) + MOVL R8, 164(BP) + MOVL R8, 264(BP) + MOVL R8, 288(BP) + MOVL R8, 344(BP) + MOVL R8, 432(BP) + MOVL R8, 512(BP) + MOVL R8, 540(BP) + MOVL R8, 652(BP) + SHRQ $0x20, R8 + MOVL R8, 32(BP) + MOVL R8, 112(BP) + MOVL R8, 200(BP) + MOVL R8, 228(BP) + MOVL R8, 320(BP) + MOVL R8, 380(BP) + MOVL R8, 404(BP) + MOVL R8, 488(BP) + MOVL R8, 568(BP) + MOVL R8, 604(BP) + MOVL R9, 20(BP) + MOVL R9, 132(BP) + MOVL R9, 168(BP) + MOVL R9, 240(BP) + MOVL R9, 280(BP) + MOVL R9, 336(BP) + MOVL R9, 456(BP) + MOVL R9, 508(BP) + MOVL R9, 576(BP) + MOVL R9, 608(BP) + SHRQ $0x20, R9 + MOVL R9, 36(BP) + MOVL R9, 140(BP) + MOVL R9, 180(BP) + MOVL R9, 212(BP) + MOVL R9, 316(BP) + MOVL R9, 364(BP) + MOVL R9, 452(BP) + MOVL R9, 476(BP) + MOVL R9, 552(BP) + MOVL R9, 632(BP) + MOVL R10, 24(BP) + MOVL R10, 84(BP) + MOVL R10, 204(BP) + MOVL R10, 248(BP) + MOVL R10, 296(BP) + MOVL R10, 368(BP) + MOVL R10, 412(BP) + MOVL R10, 516(BP) + MOVL R10, 584(BP) + MOVL R10, 612(BP) + SHRQ $0x20, R10 + MOVL R10, 40(BP) + MOVL R10, 124(BP) + MOVL R10, 152(BP) + MOVL R10, 244(BP) + MOVL R10, 276(BP) + MOVL R10, 388(BP) + MOVL R10, 416(BP) + MOVL R10, 496(BP) + MOVL R10, 588(BP) + MOVL R10, 620(BP) + MOVL R11, 28(BP) + MOVL R11, 108(BP) + MOVL R11, 196(BP) + MOVL R11, 256(BP) + MOVL R11, 312(BP) + MOVL R11, 340(BP) + MOVL R11, 436(BP) + MOVL R11, 520(BP) + MOVL R11, 528(BP) + MOVL R11, 616(BP) + SHRQ $0x20, R11 + MOVL R11, 44(BP) + MOVL R11, 136(BP) + MOVL R11, 184(BP) + MOVL R11, 208(BP) + MOVL R11, 292(BP) + MOVL R11, 372(BP) + MOVL R11, 448(BP) + MOVL R11, 468(BP) + MOVL R11, 580(BP) + MOVL R11, 600(BP) + MOVL R12, 48(BP) + MOVL R12, 100(BP) + MOVL R12, 160(BP) + MOVL R12, 268(BP) + MOVL R12, 328(BP) + MOVL R12, 348(BP) + MOVL R12, 444(BP) + MOVL R12, 504(BP) + MOVL R12, 556(BP) + MOVL R12, 596(BP) + SHRQ $0x20, R12 + MOVL R12, 64(BP) + MOVL R12, 88(BP) + MOVL R12, 188(BP) + MOVL R12, 224(BP) + MOVL R12, 272(BP) + MOVL R12, 396(BP) + MOVL R12, 440(BP) + MOVL R12, 492(BP) + MOVL R12, 548(BP) + MOVL R12, 628(BP) + MOVL R13, 52(BP) + MOVL R13, 96(BP) + MOVL R13, 176(BP) + MOVL R13, 260(BP) + MOVL R13, 284(BP) + MOVL R13, 356(BP) + MOVL R13, 428(BP) + MOVL R13, 524(BP) + MOVL R13, 572(BP) + MOVL R13, 592(BP) + SHRQ $0x20, R13 + MOVL R13, 68(BP) + MOVL R13, 120(BP) + MOVL R13, 144(BP) + MOVL R13, 220(BP) + MOVL R13, 308(BP) + MOVL R13, 360(BP) + MOVL R13, 460(BP) + MOVL R13, 480(BP) + MOVL R13, 536(BP) + MOVL R13, 640(BP) + MOVL R14, 56(BP) + MOVL R14, 128(BP) + MOVL R14, 148(BP) + MOVL R14, 232(BP) + MOVL R14, 324(BP) + MOVL R14, 352(BP) + MOVL R14, 400(BP) + MOVL R14, 472(BP) + MOVL R14, 560(BP) + MOVL R14, 648(BP) + SHRQ $0x20, R14 + MOVL R14, 72(BP) + MOVL R14, 92(BP) + MOVL R14, 172(BP) + MOVL R14, 216(BP) + MOVL R14, 332(BP) + MOVL R14, 384(BP) + MOVL R14, 424(BP) + MOVL R14, 464(BP) + MOVL R14, 564(BP) + MOVL R14, 636(BP) + MOVL R15, 60(BP) + MOVL R15, 80(BP) + MOVL R15, 192(BP) + MOVL R15, 236(BP) + MOVL R15, 304(BP) + MOVL R15, 392(BP) + MOVL R15, 408(BP) + MOVL R15, 484(BP) + MOVL R15, 532(BP) + MOVL R15, 644(BP) + SHRQ $0x20, R15 + MOVL R15, 76(BP) + MOVL R15, 104(BP) + MOVL R15, 156(BP) + MOVL R15, 252(BP) + MOVL R15, 300(BP) + MOVL R15, 376(BP) + MOVL R15, 420(BP) + MOVL R15, 500(BP) + MOVL R15, 544(BP) + MOVL R15, 624(BP) + PADDL 16(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 32(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 48(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 64(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 80(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 96(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 112(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 128(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 144(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 160(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 176(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 192(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 208(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 224(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 240(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 256(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 272(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 288(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 304(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 320(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 336(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 352(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 368(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 384(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 400(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 416(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 432(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 448(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 464(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 480(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 496(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 512(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 528(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 544(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 560(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 576(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PADDL 592(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 608(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL 624(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL 640(BP), X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X0 + PXOR X7, X1 + LEAQ 64(SI), SI + SUBQ $0x40, DX + JNE loop + MOVO X15, (BP) + MOVQ (BP), R9 + MOVQ R9, (BX) + MOVOU X0, (AX) + MOVOU X1, 16(AX) RET // func hashBlocksSSE4(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) -TEXT ·hashBlocksSSE4(SB), 0, $32-48 // frame = 16 + 16 byte alignment - HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSE4) +// Requires: SSE2, SSE4.1, SSSE3 +TEXT ·hashBlocksSSE4(SB), $32-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVL flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DX + MOVQ SP, BP + ADDQ $0x0f, BP + ANDQ $-16, BP + MOVQ (BX), R9 + MOVQ R9, (BP) + MOVQ CX, 8(BP) + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU iv0<>+0(SB), X2 + MOVOU iv1<>+0(SB), X3 + MOVOU counter<>+0(SB), X12 + MOVOU rol16<>+0(SB), X13 + MOVOU rol8<>+0(SB), X14 + MOVO (BP), X15 + +loop: + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X3, X7 + PADDQ X12, X15 + PXOR X15, X7 + MOVL (SI), X8 + PINSRD $0x01, 8(SI), X8 + PINSRD $0x02, 16(SI), X8 + PINSRD $0x03, 24(SI), X8 + MOVL 4(SI), X9 + PINSRD $0x01, 12(SI), X9 + PINSRD $0x02, 20(SI), X9 + PINSRD $0x03, 28(SI), X9 + MOVL 32(SI), X10 + PINSRD $0x01, 40(SI), X10 + PINSRD $0x02, 48(SI), X10 + PINSRD $0x03, 56(SI), X10 + MOVL 36(SI), X11 + PINSRD $0x01, 44(SI), X11 + PINSRD $0x02, 52(SI), X11 + PINSRD $0x03, 60(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 56(SI), X8 + PINSRD $0x01, 16(SI), X8 + PINSRD $0x02, 36(SI), X8 + PINSRD $0x03, 52(SI), X8 + MOVL 40(SI), X9 + PINSRD $0x01, 32(SI), X9 + PINSRD $0x02, 60(SI), X9 + PINSRD $0x03, 24(SI), X9 + MOVL 4(SI), X10 + PINSRD $0x01, (SI), X10 + PINSRD $0x02, 44(SI), X10 + PINSRD $0x03, 20(SI), X10 + MOVL 48(SI), X11 + PINSRD $0x01, 8(SI), X11 + PINSRD $0x02, 28(SI), X11 + PINSRD $0x03, 12(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 44(SI), X8 + PINSRD $0x01, 48(SI), X8 + PINSRD $0x02, 20(SI), X8 + PINSRD $0x03, 60(SI), X8 + MOVL 32(SI), X9 + PINSRD $0x01, (SI), X9 + PINSRD $0x02, 8(SI), X9 + PINSRD $0x03, 52(SI), X9 + MOVL 40(SI), X10 + PINSRD $0x01, 12(SI), X10 + PINSRD $0x02, 28(SI), X10 + PINSRD $0x03, 36(SI), X10 + MOVL 56(SI), X11 + PINSRD $0x01, 24(SI), X11 + PINSRD $0x02, 4(SI), X11 + PINSRD $0x03, 16(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 28(SI), X8 + PINSRD $0x01, 12(SI), X8 + PINSRD $0x02, 52(SI), X8 + PINSRD $0x03, 44(SI), X8 + MOVL 36(SI), X9 + PINSRD $0x01, 4(SI), X9 + PINSRD $0x02, 48(SI), X9 + PINSRD $0x03, 56(SI), X9 + MOVL 8(SI), X10 + PINSRD $0x01, 20(SI), X10 + PINSRD $0x02, 16(SI), X10 + PINSRD $0x03, 60(SI), X10 + MOVL 24(SI), X11 + PINSRD $0x01, 40(SI), X11 + PINSRD $0x02, (SI), X11 + PINSRD $0x03, 32(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 36(SI), X8 + PINSRD $0x01, 20(SI), X8 + PINSRD $0x02, 8(SI), X8 + PINSRD $0x03, 40(SI), X8 + MOVL (SI), X9 + PINSRD $0x01, 28(SI), X9 + PINSRD $0x02, 16(SI), X9 + PINSRD $0x03, 60(SI), X9 + MOVL 56(SI), X10 + PINSRD $0x01, 44(SI), X10 + PINSRD $0x02, 24(SI), X10 + PINSRD $0x03, 12(SI), X10 + MOVL 4(SI), X11 + PINSRD $0x01, 48(SI), X11 + PINSRD $0x02, 32(SI), X11 + PINSRD $0x03, 52(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 8(SI), X8 + PINSRD $0x01, 24(SI), X8 + PINSRD $0x02, (SI), X8 + PINSRD $0x03, 32(SI), X8 + MOVL 48(SI), X9 + PINSRD $0x01, 40(SI), X9 + PINSRD $0x02, 44(SI), X9 + PINSRD $0x03, 12(SI), X9 + MOVL 16(SI), X10 + PINSRD $0x01, 28(SI), X10 + PINSRD $0x02, 60(SI), X10 + PINSRD $0x03, 4(SI), X10 + MOVL 52(SI), X11 + PINSRD $0x01, 20(SI), X11 + PINSRD $0x02, 56(SI), X11 + PINSRD $0x03, 36(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 48(SI), X8 + PINSRD $0x01, 4(SI), X8 + PINSRD $0x02, 56(SI), X8 + PINSRD $0x03, 16(SI), X8 + MOVL 20(SI), X9 + PINSRD $0x01, 60(SI), X9 + PINSRD $0x02, 52(SI), X9 + PINSRD $0x03, 40(SI), X9 + MOVL (SI), X10 + PINSRD $0x01, 24(SI), X10 + PINSRD $0x02, 36(SI), X10 + PINSRD $0x03, 32(SI), X10 + MOVL 28(SI), X11 + PINSRD $0x01, 12(SI), X11 + PINSRD $0x02, 8(SI), X11 + PINSRD $0x03, 44(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 52(SI), X8 + PINSRD $0x01, 28(SI), X8 + PINSRD $0x02, 48(SI), X8 + PINSRD $0x03, 12(SI), X8 + MOVL 44(SI), X9 + PINSRD $0x01, 56(SI), X9 + PINSRD $0x02, 4(SI), X9 + PINSRD $0x03, 36(SI), X9 + MOVL 20(SI), X10 + PINSRD $0x01, 60(SI), X10 + PINSRD $0x02, 32(SI), X10 + PINSRD $0x03, 8(SI), X10 + MOVL (SI), X11 + PINSRD $0x01, 16(SI), X11 + PINSRD $0x02, 24(SI), X11 + PINSRD $0x03, 40(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 24(SI), X8 + PINSRD $0x01, 56(SI), X8 + PINSRD $0x02, 44(SI), X8 + PINSRD $0x03, (SI), X8 + MOVL 60(SI), X9 + PINSRD $0x01, 36(SI), X9 + PINSRD $0x02, 12(SI), X9 + PINSRD $0x03, 32(SI), X9 + MOVL 48(SI), X10 + PINSRD $0x01, 52(SI), X10 + PINSRD $0x02, 4(SI), X10 + PINSRD $0x03, 40(SI), X10 + MOVL 8(SI), X11 + PINSRD $0x01, 28(SI), X11 + PINSRD $0x02, 16(SI), X11 + PINSRD $0x03, 20(SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + MOVL 40(SI), X8 + PINSRD $0x01, 32(SI), X8 + PINSRD $0x02, 28(SI), X8 + PINSRD $0x03, 4(SI), X8 + MOVL 8(SI), X9 + PINSRD $0x01, 16(SI), X9 + PINSRD $0x02, 24(SI), X9 + PINSRD $0x03, 20(SI), X9 + MOVL 60(SI), X10 + PINSRD $0x01, 36(SI), X10 + PINSRD $0x02, 12(SI), X10 + PINSRD $0x03, 52(SI), X10 + MOVL 44(SI), X11 + PINSRD $0x01, 56(SI), X11 + PINSRD $0x02, 48(SI), X11 + PINSRD $0x03, (SI), X11 + PADDL X8, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X9, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X5, X5 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X7, X7 + PADDL X10, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X13, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x14, X8 + PSRLL $0x0c, X5 + PXOR X8, X5 + PADDL X11, X4 + PADDL X5, X4 + PXOR X4, X7 + PSHUFB X14, X7 + PADDL X7, X6 + PXOR X6, X5 + MOVO X5, X8 + PSLLL $0x19, X8 + PSRLL $0x07, X5 + PXOR X8, X5 + PSHUFL $0x39, X7, X7 + PSHUFL $0x4e, X6, X6 + PSHUFL $0x93, X5, X5 + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X0 + PXOR X7, X1 + LEAQ 64(SI), SI + SUBQ $0x40, DX + JNE loop + MOVO X15, (BP) + MOVQ (BP), R9 + MOVQ R9, (BX) + MOVOU X0, (AX) + MOVOU X1, 16(AX) RET diff --git a/vendor/golang.org/x/crypto/blake2s/register.go b/vendor/golang.org/x/crypto/blake2s/register.go deleted file mode 100644 index 3156148..0000000 --- a/vendor/golang.org/x/crypto/blake2s/register.go +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build go1.9 - -package blake2s - -import ( - "crypto" - "hash" -) - -func init() { - newHash256 := func() hash.Hash { - h, _ := New256(nil) - return h - } - - crypto.RegisterHash(crypto.BLAKE2s_256, newHash256) -} diff --git a/vendor/golang.org/x/crypto/blowfish/cipher.go b/vendor/golang.org/x/crypto/blowfish/cipher.go index 213bf20..0898956 100644 --- a/vendor/golang.org/x/crypto/blowfish/cipher.go +++ b/vendor/golang.org/x/crypto/blowfish/cipher.go @@ -11,7 +11,7 @@ // Deprecated: any new system should use AES (from crypto/aes, if necessary in // an AEAD mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from // golang.org/x/crypto/chacha20poly1305). -package blowfish // import "golang.org/x/crypto/blowfish" +package blowfish // The code is a port of Bruce Schneier's C implementation. // See https://www.schneier.com/blowfish.html. diff --git a/vendor/golang.org/x/crypto/cast5/cast5.go b/vendor/golang.org/x/crypto/cast5/cast5.go index 425e8ee..016e902 100644 --- a/vendor/golang.org/x/crypto/cast5/cast5.go +++ b/vendor/golang.org/x/crypto/cast5/cast5.go @@ -11,7 +11,7 @@ // Deprecated: any new system should use AES (from crypto/aes, if necessary in // an AEAD mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from // golang.org/x/crypto/chacha20poly1305). -package cast5 // import "golang.org/x/crypto/cast5" +package cast5 import ( "errors" diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s index 66aebae..c672ccf 100644 --- a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s +++ b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s @@ -33,6 +33,9 @@ #define CONSTBASE R16 #define BLOCKS R17 +// for VPERMXOR +#define MASK R18 + DATA consts<>+0x00(SB)/8, $0x3320646e61707865 DATA consts<>+0x08(SB)/8, $0x6b20657479622d32 DATA consts<>+0x10(SB)/8, $0x0000000000000001 @@ -53,7 +56,11 @@ DATA consts<>+0x80(SB)/8, $0x6b2065746b206574 DATA consts<>+0x88(SB)/8, $0x6b2065746b206574 DATA consts<>+0x90(SB)/8, $0x0000000100000000 DATA consts<>+0x98(SB)/8, $0x0000000300000002 -GLOBL consts<>(SB), RODATA, $0xa0 +DATA consts<>+0xa0(SB)/8, $0x5566774411223300 +DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88 +DATA consts<>+0xb0(SB)/8, $0x6677445522330011 +DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899 +GLOBL consts<>(SB), RODATA, $0xc0 //func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32) TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 @@ -70,6 +77,9 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 MOVD $48, R10 MOVD $64, R11 SRD $6, LEN, BLOCKS + // for VPERMXOR + MOVD $consts<>+0xa0(SB), MASK + MOVD $16, R20 // V16 LXVW4X (CONSTBASE)(R0), VS48 ADD $80,CONSTBASE @@ -87,6 +97,10 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 // V28 LXVW4X (CONSTBASE)(R11), VS60 + // Load mask constants for VPERMXOR + LXVW4X (MASK)(R0), V20 + LXVW4X (MASK)(R20), V21 + // splat slot from V19 -> V26 VSPLTW $0, V19, V26 @@ -97,7 +111,7 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 MOVD $10, R14 MOVD R14, CTR - + PCALIGN $16 loop_outer_vsx: // V0, V1, V2, V3 LXVW4X (R0)(CONSTBASE), VS32 @@ -128,22 +142,17 @@ loop_outer_vsx: VSPLTISW $12, V28 VSPLTISW $8, V29 VSPLTISW $7, V30 - + PCALIGN $16 loop_vsx: VADDUWM V0, V4, V0 VADDUWM V1, V5, V1 VADDUWM V2, V6, V2 VADDUWM V3, V7, V3 - VXOR V12, V0, V12 - VXOR V13, V1, V13 - VXOR V14, V2, V14 - VXOR V15, V3, V15 - - VRLW V12, V27, V12 - VRLW V13, V27, V13 - VRLW V14, V27, V14 - VRLW V15, V27, V15 + VPERMXOR V12, V0, V21, V12 + VPERMXOR V13, V1, V21, V13 + VPERMXOR V14, V2, V21, V14 + VPERMXOR V15, V3, V21, V15 VADDUWM V8, V12, V8 VADDUWM V9, V13, V9 @@ -165,15 +174,10 @@ loop_vsx: VADDUWM V2, V6, V2 VADDUWM V3, V7, V3 - VXOR V12, V0, V12 - VXOR V13, V1, V13 - VXOR V14, V2, V14 - VXOR V15, V3, V15 - - VRLW V12, V29, V12 - VRLW V13, V29, V13 - VRLW V14, V29, V14 - VRLW V15, V29, V15 + VPERMXOR V12, V0, V20, V12 + VPERMXOR V13, V1, V20, V13 + VPERMXOR V14, V2, V20, V14 + VPERMXOR V15, V3, V20, V15 VADDUWM V8, V12, V8 VADDUWM V9, V13, V9 @@ -195,15 +199,10 @@ loop_vsx: VADDUWM V2, V7, V2 VADDUWM V3, V4, V3 - VXOR V15, V0, V15 - VXOR V12, V1, V12 - VXOR V13, V2, V13 - VXOR V14, V3, V14 - - VRLW V15, V27, V15 - VRLW V12, V27, V12 - VRLW V13, V27, V13 - VRLW V14, V27, V14 + VPERMXOR V15, V0, V21, V15 + VPERMXOR V12, V1, V21, V12 + VPERMXOR V13, V2, V21, V13 + VPERMXOR V14, V3, V21, V14 VADDUWM V10, V15, V10 VADDUWM V11, V12, V11 @@ -225,15 +224,10 @@ loop_vsx: VADDUWM V2, V7, V2 VADDUWM V3, V4, V3 - VXOR V15, V0, V15 - VXOR V12, V1, V12 - VXOR V13, V2, V13 - VXOR V14, V3, V14 - - VRLW V15, V29, V15 - VRLW V12, V29, V12 - VRLW V13, V29, V13 - VRLW V14, V29, V14 + VPERMXOR V15, V0, V20, V15 + VPERMXOR V12, V1, V20, V12 + VPERMXOR V13, V2, V20, V13 + VPERMXOR V14, V3, V20, V14 VADDUWM V10, V15, V10 VADDUWM V11, V12, V11 @@ -249,48 +243,48 @@ loop_vsx: VRLW V6, V30, V6 VRLW V7, V30, V7 VRLW V4, V30, V4 - BC 16, LT, loop_vsx + BDNZ loop_vsx VADDUWM V12, V26, V12 - WORD $0x13600F8C // VMRGEW V0, V1, V27 - WORD $0x13821F8C // VMRGEW V2, V3, V28 + VMRGEW V0, V1, V27 + VMRGEW V2, V3, V28 - WORD $0x10000E8C // VMRGOW V0, V1, V0 - WORD $0x10421E8C // VMRGOW V2, V3, V2 + VMRGOW V0, V1, V0 + VMRGOW V2, V3, V2 - WORD $0x13A42F8C // VMRGEW V4, V5, V29 - WORD $0x13C63F8C // VMRGEW V6, V7, V30 + VMRGEW V4, V5, V29 + VMRGEW V6, V7, V30 XXPERMDI VS32, VS34, $0, VS33 XXPERMDI VS32, VS34, $3, VS35 XXPERMDI VS59, VS60, $0, VS32 XXPERMDI VS59, VS60, $3, VS34 - WORD $0x10842E8C // VMRGOW V4, V5, V4 - WORD $0x10C63E8C // VMRGOW V6, V7, V6 + VMRGOW V4, V5, V4 + VMRGOW V6, V7, V6 - WORD $0x13684F8C // VMRGEW V8, V9, V27 - WORD $0x138A5F8C // VMRGEW V10, V11, V28 + VMRGEW V8, V9, V27 + VMRGEW V10, V11, V28 XXPERMDI VS36, VS38, $0, VS37 XXPERMDI VS36, VS38, $3, VS39 XXPERMDI VS61, VS62, $0, VS36 XXPERMDI VS61, VS62, $3, VS38 - WORD $0x11084E8C // VMRGOW V8, V9, V8 - WORD $0x114A5E8C // VMRGOW V10, V11, V10 + VMRGOW V8, V9, V8 + VMRGOW V10, V11, V10 - WORD $0x13AC6F8C // VMRGEW V12, V13, V29 - WORD $0x13CE7F8C // VMRGEW V14, V15, V30 + VMRGEW V12, V13, V29 + VMRGEW V14, V15, V30 XXPERMDI VS40, VS42, $0, VS41 XXPERMDI VS40, VS42, $3, VS43 XXPERMDI VS59, VS60, $0, VS40 XXPERMDI VS59, VS60, $3, VS42 - WORD $0x118C6E8C // VMRGOW V12, V13, V12 - WORD $0x11CE7E8C // VMRGOW V14, V15, V14 + VMRGOW V12, V13, V12 + VMRGOW V14, V15, V14 VSPLTISW $4, V27 VADDUWM V26, V27, V26 @@ -431,7 +425,7 @@ tail_vsx: ADD $-1, R11, R12 ADD $-1, INP ADD $-1, OUT - + PCALIGN $16 looptail_vsx: // Copying the result to OUT // in bytes. @@ -439,7 +433,7 @@ looptail_vsx: MOVBZU 1(INP), TMP XOR KEY, TMP, KEY MOVBU KEY, 1(OUT) - BC 16, LT, looptail_vsx + BDNZ looptail_vsx // Clear the stack values STXVW4X VS48, (R11)(R0) diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go index 93da732..8cf5d81 100644 --- a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go @@ -5,7 +5,7 @@ // Package chacha20poly1305 implements the ChaCha20-Poly1305 AEAD and its // extended nonce variant XChaCha20-Poly1305, as specified in RFC 8439 and // draft-irtf-cfrg-xchacha-01. -package chacha20poly1305 // import "golang.org/x/crypto/chacha20poly1305" +package chacha20poly1305 import ( "crypto/cipher" diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s index 731d2ac..fd5ee84 100644 --- a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s @@ -1,2715 +1,9762 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare. +// Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT. //go:build gc && !purego #include "textflag.h" -// General register allocation -#define oup DI -#define inp SI -#define inl BX -#define adp CX // free to reuse, after we hash the additional data -#define keyp R8 // free to reuse, when we copy the key to stack -#define itr2 R9 // general iterator -#define itr1 CX // general iterator -#define acc0 R10 -#define acc1 R11 -#define acc2 R12 -#define t0 R13 -#define t1 R14 -#define t2 R15 -#define t3 R8 -// Register and stack allocation for the SSE code -#define rStore (0*16)(BP) -#define sStore (1*16)(BP) -#define state1Store (2*16)(BP) -#define state2Store (3*16)(BP) -#define tmpStore (4*16)(BP) -#define ctr0Store (5*16)(BP) -#define ctr1Store (6*16)(BP) -#define ctr2Store (7*16)(BP) -#define ctr3Store (8*16)(BP) -#define A0 X0 -#define A1 X1 -#define A2 X2 -#define B0 X3 -#define B1 X4 -#define B2 X5 -#define C0 X6 -#define C1 X7 -#define C2 X8 -#define D0 X9 -#define D1 X10 -#define D2 X11 -#define T0 X12 -#define T1 X13 -#define T2 X14 -#define T3 X15 -#define A3 T0 -#define B3 T1 -#define C3 T2 -#define D3 T3 -// Register and stack allocation for the AVX2 code -#define rsStoreAVX2 (0*32)(BP) -#define state1StoreAVX2 (1*32)(BP) -#define state2StoreAVX2 (2*32)(BP) -#define ctr0StoreAVX2 (3*32)(BP) -#define ctr1StoreAVX2 (4*32)(BP) -#define ctr2StoreAVX2 (5*32)(BP) -#define ctr3StoreAVX2 (6*32)(BP) -#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack -#define AA0 Y0 -#define AA1 Y5 -#define AA2 Y6 -#define AA3 Y7 -#define BB0 Y14 -#define BB1 Y9 -#define BB2 Y10 -#define BB3 Y11 -#define CC0 Y12 -#define CC1 Y13 -#define CC2 Y8 -#define CC3 Y15 -#define DD0 Y4 -#define DD1 Y1 -#define DD2 Y2 -#define DD3 Y3 -#define TT0 DD3 -#define TT1 AA3 -#define TT2 BB3 -#define TT3 CC3 -// ChaCha20 constants -DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865 -DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e -DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32 -DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574 -DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865 -DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e -DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32 -DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574 -// <<< 16 with PSHUFB -DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302 -DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A -DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302 -DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A -// <<< 8 with PSHUFB -DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003 -DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B -DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003 -DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B - -DATA ·avx2InitMask<>+0x00(SB)/8, $0x0 -DATA ·avx2InitMask<>+0x08(SB)/8, $0x0 -DATA ·avx2InitMask<>+0x10(SB)/8, $0x1 -DATA ·avx2InitMask<>+0x18(SB)/8, $0x0 - -DATA ·avx2IncMask<>+0x00(SB)/8, $0x2 -DATA ·avx2IncMask<>+0x08(SB)/8, $0x0 -DATA ·avx2IncMask<>+0x10(SB)/8, $0x2 -DATA ·avx2IncMask<>+0x18(SB)/8, $0x0 -// Poly1305 key clamp -DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF -DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC -DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF -DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF - -DATA ·sseIncMask<>+0x00(SB)/8, $0x1 -DATA ·sseIncMask<>+0x08(SB)/8, $0x0 -// To load/store the last < 16 bytes in a buffer -DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff -DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff -DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff -DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff -DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff -DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff -DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff -DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff -DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff -DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff -DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff -DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff -DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff -DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff - -GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32 -GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32 -GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32 -GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16 -GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32 -GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32 -GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32 -GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240 -// No PALIGNR in Go ASM yet (but VPALIGNR is present). -#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3 -#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4 -#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5 -#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13 -#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6 -#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7 -#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8 -#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14 -#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9 -#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10 -#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11 -#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15 -#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3 -#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4 -#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5 -#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13 -#define shiftC0Right shiftC0Left -#define shiftC1Right shiftC1Left -#define shiftC2Right shiftC2Left -#define shiftC3Right shiftC3Left -#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9 -#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10 -#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11 -#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15 - -// Some macros - -// ROL rotates the uint32s in register R left by N bits, using temporary T. -#define ROL(N, R, T) \ - MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R - -// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. -#ifdef GOAMD64_v2 -#define ROL16(R, T) PSHUFB ·rol16<>(SB), R -#else -#define ROL16(R, T) ROL(16, R, T) -#endif - -// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. -#ifdef GOAMD64_v2 -#define ROL8(R, T) PSHUFB ·rol8<>(SB), R -#else -#define ROL8(R, T) ROL(8, R, T) -#endif - -#define chachaQR(A, B, C, D, T) \ - PADDD B, A; PXOR A, D; ROL16(D, T) \ - PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \ - PADDD B, A; PXOR A, D; ROL8(D, T) \ - PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B - -#define chachaQR_AVX2(A, B, C, D, T) \ - VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \ - VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \ - VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \ - VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B - -#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2 -#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2 -#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX -#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3 -#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2 - -#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2 -#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3 -#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3 - -#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage -#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage -// ---------------------------------------------------------------------------- + +// func polyHashADInternal<>() TEXT polyHashADInternal<>(SB), NOSPLIT, $0 - // adp points to beginning of additional data - // itr2 holds ad length - XORQ acc0, acc0 - XORQ acc1, acc1 - XORQ acc2, acc2 - CMPQ itr2, $13 - JNE hashADLoop - -openFastTLSAD: - // Special treatment for the TLS case of 13 bytes - MOVQ (adp), acc0 - MOVQ 5(adp), acc1 - SHRQ $24, acc1 - MOVQ $1, acc2 - polyMul + // Hack: Must declare #define macros inside of a function due to Avo constraints + // ROL rotates the uint32s in register R left by N bits, using temporary T. + #define ROL(N, R, T) \ + MOVO R, T; \ + PSLLL $(N), T; \ + PSRLL $(32-(N)), R; \ + PXOR T, R + + // ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. + #ifdef GOAMD64_v2 + #define ROL8(R, T) PSHUFB ·rol8<>(SB), R + #else + #define ROL8(R, T) ROL(8, R, T) + #endif + + // ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. + #ifdef GOAMD64_v2 + #define ROL16(R, T) PSHUFB ·rol16<>(SB), R + #else + #define ROL16(R, T) ROL(16, R, T) + #endif + XORQ R10, R10 + XORQ R11, R11 + XORQ R12, R12 + CMPQ R9, $0x0d + JNE hashADLoop + MOVQ (CX), R10 + MOVQ 5(CX), R11 + SHRQ $0x18, R11 + MOVQ $0x00000001, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 RET hashADLoop: // Hash in 16 byte chunks - CMPQ itr2, $16 - JB hashADTail - polyAdd(0(adp)) - LEAQ (1*16)(adp), adp - SUBQ $16, itr2 - polyMul - JMP hashADLoop + CMPQ R9, $0x10 + JB hashADTail + ADDQ (CX), R10 + ADCQ 8(CX), R11 + ADCQ $0x01, R12 + LEAQ 16(CX), CX + SUBQ $0x10, R9 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + JMP hashADLoop hashADTail: - CMPQ itr2, $0 + CMPQ R9, $0x00 JE hashADDone // Hash last < 16 byte tail - XORQ t0, t0 - XORQ t1, t1 - XORQ t2, t2 - ADDQ itr2, adp + XORQ R13, R13 + XORQ R14, R14 + XORQ R15, R15 + ADDQ R9, CX hashADTailLoop: - SHLQ $8, t0, t1 - SHLQ $8, t0 - MOVB -1(adp), t2 - XORQ t2, t0 - DECQ adp - DECQ itr2 - JNE hashADTailLoop - -hashADTailFinish: - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul - - // Finished AD + SHLQ $0x08, R13, R14 + SHLQ $0x08, R13 + MOVB -1(CX), R15 + XORQ R15, R13 + DECQ CX + DECQ R9 + JNE hashADTailLoop + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + hashADDone: RET -// ---------------------------------------------------------------------------- -// func chacha20Poly1305Open(dst, key, src, ad []byte) bool -TEXT ·chacha20Poly1305Open(SB), 0, $288-97 +// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool +// Requires: AVX, AVX2, BMI2, CMOV, SSE2 +TEXT ·chacha20Poly1305Open(SB), $288-97 // For aligned stack access MOVQ SP, BP - ADDQ $32, BP + ADDQ $0x20, BP ANDQ $-32, BP - MOVQ dst+0(FP), oup - MOVQ key+24(FP), keyp - MOVQ src+48(FP), inp - MOVQ src_len+56(FP), inl - MOVQ ad+72(FP), adp + MOVQ dst_base+0(FP), DI + MOVQ key_base+24(FP), R8 + MOVQ src_base+48(FP), SI + MOVQ src_len+56(FP), BX + MOVQ ad_base+72(FP), CX // Check for AVX2 support - CMPB ·useAVX2(SB), $1 + CMPB ·useAVX2+0(SB), $0x01 JE chacha20Poly1305Open_AVX2 // Special optimization, for very short buffers - CMPQ inl, $128 - JBE openSSE128 // About 16% faster + CMPQ BX, $0x80 + JBE openSSE128 // For long buffers, prepare the poly key first - MOVOU ·chacha20Constants<>(SB), A0 - MOVOU (1*16)(keyp), B0 - MOVOU (2*16)(keyp), C0 - MOVOU (3*16)(keyp), D0 - MOVO D0, T1 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 + MOVO X9, X13 // Store state on stack for future use - MOVO B0, state1Store - MOVO C0, state2Store - MOVO D0, ctr3Store - MOVQ $10, itr2 + MOVO X3, 32(BP) + MOVO X6, 48(BP) + MOVO X9, 128(BP) + MOVQ $0x0000000a, R9 openSSEPreparePolyKey: - chachaQR(A0, B0, C0, D0, T0) - shiftB0Left; shiftC0Left; shiftD0Left - chachaQR(A0, B0, C0, D0, T0) - shiftB0Right; shiftC0Right; shiftD0Right - DECQ itr2 - JNE openSSEPreparePolyKey + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + DECQ R9 + JNE openSSEPreparePolyKey // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0 + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL 32(BP), X3 // Clamp and store the key - PAND ·polyClampMask<>(SB), A0 - MOVO A0, rStore; MOVO B0, sStore + PAND ·polyClampMask<>+0(SB), X0 + MOVO X0, (BP) + MOVO X3, 16(BP) // Hash AAD - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openSSEMainLoop: - CMPQ inl, $256 + CMPQ BX, $0x00000100 JB openSSEMainLoopDone // Load state, increment counter blocks - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) - // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 - MOVQ $4, itr1 - MOVQ inp, itr2 + // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash + // 2 blocks, and for the remaining 4 only 1 block - for a total of 16 + MOVQ $0x00000004, CX + MOVQ SI, R9 openSSEInternalLoop: - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyAdd(0(itr2)) - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - polyMulStage1 - polyMulStage2 - LEAQ (2*8)(itr2), itr2 - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - polyMulStage3 - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyMulReduceStage - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - DECQ itr1 - JGE openSSEInternalLoop - - polyAdd(0(itr2)) - polyMul - LEAQ (2*8)(itr2), itr2 - - CMPQ itr1, $-6 - JG openSSEInternalLoop + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + LEAQ 16(R9), R9 + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + DECQ CX + JGE openSSEInternalLoop + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + CMPQ CX, $-6 + JG openSSEInternalLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X6 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 80(BP), X9 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 // Load - xor - store - MOVO D3, tmpStore - MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup) - MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup) - MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup) - MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup) - MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup) - MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup) - MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup) - MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup) - MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup) - MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup) - MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup) - MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup) - MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup) - MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup) - MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup) - MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup) - LEAQ 256(inp), inp - LEAQ 256(oup), oup - SUBQ $256, inl + MOVO X15, 64(BP) + MOVOU (SI), X15 + PXOR X15, X0 + MOVOU X0, (DI) + MOVOU 16(SI), X15 + PXOR X15, X3 + MOVOU X3, 16(DI) + MOVOU 32(SI), X15 + PXOR X15, X6 + MOVOU X6, 32(DI) + MOVOU 48(SI), X15 + PXOR X15, X9 + MOVOU X9, 48(DI) + MOVOU 64(SI), X9 + PXOR X9, X1 + MOVOU X1, 64(DI) + MOVOU 80(SI), X9 + PXOR X9, X4 + MOVOU X4, 80(DI) + MOVOU 96(SI), X9 + PXOR X9, X7 + MOVOU X7, 96(DI) + MOVOU 112(SI), X9 + PXOR X9, X10 + MOVOU X10, 112(DI) + MOVOU 128(SI), X9 + PXOR X9, X2 + MOVOU X2, 128(DI) + MOVOU 144(SI), X9 + PXOR X9, X5 + MOVOU X5, 144(DI) + MOVOU 160(SI), X9 + PXOR X9, X8 + MOVOU X8, 160(DI) + MOVOU 176(SI), X9 + PXOR X9, X11 + MOVOU X11, 176(DI) + MOVOU 192(SI), X9 + PXOR X9, X12 + MOVOU X12, 192(DI) + MOVOU 208(SI), X9 + PXOR X9, X13 + MOVOU X13, 208(DI) + MOVOU 224(SI), X9 + PXOR X9, X14 + MOVOU X14, 224(DI) + MOVOU 240(SI), X9 + PXOR 64(BP), X9 + MOVOU X9, 240(DI) + LEAQ 256(SI), SI + LEAQ 256(DI), DI + SUBQ $0x00000100, BX JMP openSSEMainLoop openSSEMainLoopDone: // Handle the various tail sizes efficiently - TESTQ inl, inl + TESTQ BX, BX JE openSSEFinalize - CMPQ inl, $64 + CMPQ BX, $0x40 JBE openSSETail64 - CMPQ inl, $128 + CMPQ BX, $0x80 JBE openSSETail128 - CMPQ inl, $192 + CMPQ BX, $0xc0 JBE openSSETail192 JMP openSSETail256 openSSEFinalize: // Hash in the PT, AAD lengths - ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2 - polyMul + ADDQ ad_len+80(FP), R10 + ADCQ src_len+56(FP), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Final reduce - MOVQ acc0, t0 - MOVQ acc1, t1 - MOVQ acc2, t2 - SUBQ $-5, acc0 - SBBQ $-1, acc1 - SBBQ $3, acc2 - CMOVQCS t0, acc0 - CMOVQCS t1, acc1 - CMOVQCS t2, acc2 + MOVQ R10, R13 + MOVQ R11, R14 + MOVQ R12, R15 + SUBQ $-5, R10 + SBBQ $-1, R11 + SBBQ $0x03, R12 + CMOVQCS R13, R10 + CMOVQCS R14, R11 + CMOVQCS R15, R12 // Add in the "s" part of the key - ADDQ 0+sStore, acc0 - ADCQ 8+sStore, acc1 + ADDQ 16(BP), R10 + ADCQ 24(BP), R11 // Finally, constant time compare to the tag at the end of the message XORQ AX, AX - MOVQ $1, DX - XORQ (0*8)(inp), acc0 - XORQ (1*8)(inp), acc1 - ORQ acc1, acc0 + MOVQ $0x00000001, DX + XORQ (SI), R10 + XORQ 8(SI), R11 + ORQ R11, R10 CMOVQEQ DX, AX // Return true iff tags are equal MOVB AX, ret+96(FP) RET -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 129 bytes openSSE128: - // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks - MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 - MOVQ $10, itr2 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X3, X13 + MOVO X6, X14 + MOVO X10, X15 + MOVQ $0x0000000a, R9 openSSE128InnerCipherLoop: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftB1Left; shiftB2Left - shiftC0Left; shiftC1Left; shiftC2Left - shiftD0Left; shiftD1Left; shiftD2Left - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftB1Right; shiftB2Right - shiftC0Right; shiftC1Right; shiftC2Right - shiftD0Right; shiftD1Right; shiftD2Right - DECQ itr2 - JNE openSSE128InnerCipherLoop + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + DECQ R9 + JNE openSSE128InnerCipherLoop // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 - PADDL T2, C1; PADDL T2, C2 - PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL X13, X3 + PADDL X13, X4 + PADDL X13, X5 + PADDL X14, X7 + PADDL X14, X8 + PADDL X15, X10 + PADDL ·sseIncMask<>+0(SB), X15 + PADDL X15, X11 // Clamp and store the key - PAND ·polyClampMask<>(SB), A0 - MOVOU A0, rStore; MOVOU B0, sStore + PAND ·polyClampMask<>+0(SB), X0 + MOVOU X0, (BP) + MOVOU X3, 16(BP) // Hash - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openSSE128Open: - CMPQ inl, $16 + CMPQ BX, $0x10 JB openSSETail16 - SUBQ $16, inl + SUBQ $0x10, BX // Load for hashing - polyAdd(0(inp)) + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 // Load for decryption - MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup - polyMul + MOVOU (SI), X12 + PXOR X12, X1 + MOVOU X1, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Shift the stream "left" - MOVO B1, A1 - MOVO C1, B1 - MOVO D1, C1 - MOVO A2, D1 - MOVO B2, A2 - MOVO C2, B2 - MOVO D2, C2 + MOVO X4, X1 + MOVO X7, X4 + MOVO X10, X7 + MOVO X2, X10 + MOVO X5, X2 + MOVO X8, X5 + MOVO X11, X8 JMP openSSE128Open openSSETail16: - TESTQ inl, inl + TESTQ BX, BX JE openSSEFinalize // We can safely load the CT from the end, because it is padded with the MAC - MOVQ inl, itr2 - SHLQ $4, itr2 - LEAQ ·andMask<>(SB), t0 - MOVOU (inp), T0 - ADDQ inl, inp - PAND -16(t0)(itr2*1), T0 - MOVO T0, 0+tmpStore - MOVQ T0, t0 - MOVQ 8+tmpStore, t1 - PXOR A1, T0 + MOVQ BX, R9 + SHLQ $0x04, R9 + LEAQ ·andMask<>+0(SB), R13 + MOVOU (SI), X12 + ADDQ BX, SI + PAND -16(R13)(R9*1), X12 + MOVO X12, 64(BP) + MOVQ X12, R13 + MOVQ 72(BP), R14 + PXOR X1, X12 // We can only store one byte at a time, since plaintext can be shorter than 16 bytes openSSETail16Store: - MOVQ T0, t3 - MOVB t3, (oup) - PSRLDQ $1, T0 - INCQ oup - DECQ inl + MOVQ X12, R8 + MOVB R8, (DI) + PSRLDQ $0x01, X12 + INCQ DI + DECQ BX JNE openSSETail16Store - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 JMP openSSEFinalize -// ---------------------------------------------------------------------------- -// Special optimization for the last 64 bytes of ciphertext openSSETail64: - // Need to decrypt up to 64 bytes - prepare single block - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store - XORQ itr2, itr2 - MOVQ inl, itr1 - CMPQ itr1, $16 - JB openSSETail64LoopB + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 80(BP) + XORQ R9, R9 + MOVQ BX, CX + CMPQ CX, $0x10 + JB openSSETail64LoopB openSSETail64LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMul - SUBQ $16, itr1 + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + SUBQ $0x10, CX openSSETail64LoopB: - ADDQ $16, itr2 - chachaQR(A0, B0, C0, D0, T0) - shiftB0Left; shiftC0Left; shiftD0Left - chachaQR(A0, B0, C0, D0, T0) - shiftB0Right; shiftC0Right; shiftD0Right - - CMPQ itr1, $16 - JAE openSSETail64LoopA - - CMPQ itr2, $160 - JNE openSSETail64LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0 + ADDQ $0x10, R9 + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + CMPQ CX, $0x10 + JAE openSSETail64LoopA + CMPQ R9, $0xa0 + JNE openSSETail64LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL 32(BP), X3 + PADDL 48(BP), X6 + PADDL 80(BP), X9 openSSETail64DecLoop: - CMPQ inl, $16 + CMPQ BX, $0x10 JB openSSETail64DecLoopDone - SUBQ $16, inl - MOVOU (inp), T0 - PXOR T0, A0 - MOVOU A0, (oup) - LEAQ 16(inp), inp - LEAQ 16(oup), oup - MOVO B0, A0 - MOVO C0, B0 - MOVO D0, C0 + SUBQ $0x10, BX + MOVOU (SI), X12 + PXOR X12, X0 + MOVOU X0, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + MOVO X3, X0 + MOVO X6, X3 + MOVO X9, X6 JMP openSSETail64DecLoop openSSETail64DecLoopDone: - MOVO A0, A1 + MOVO X0, X1 JMP openSSETail16 -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of ciphertext openSSETail128: - // Need to decrypt up to 128 bytes - prepare two blocks - MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store - MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store - XORQ itr2, itr2 - MOVQ inl, itr1 - ANDQ $-16, itr1 + MOVO ·chacha20Constants<>+0(SB), X1 + MOVO 32(BP), X4 + MOVO 48(BP), X7 + MOVO 128(BP), X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 80(BP) + MOVO X1, X0 + MOVO X4, X3 + MOVO X7, X6 + MOVO X10, X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 96(BP) + XORQ R9, R9 + MOVQ BX, CX + ANDQ $-16, CX openSSETail128LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMul + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openSSETail128LoopB: - ADDQ $16, itr2 - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - - CMPQ itr2, itr1 - JB openSSETail128LoopA - - CMPQ itr2, $160 - JNE openSSETail128LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 - PADDL state1Store, B0; PADDL state1Store, B1 - PADDL state2Store, C0; PADDL state2Store, C1 - PADDL ctr1Store, D0; PADDL ctr0Store, D1 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 - MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) - - SUBQ $64, inl - LEAQ 64(inp), inp - LEAQ 64(oup), oup - JMP openSSETail64DecLoop - -// ---------------------------------------------------------------------------- -// Special optimization for the last 192 bytes of ciphertext + ADDQ $0x10, R9 + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + CMPQ R9, CX + JB openSSETail128LoopA + CMPQ R9, $0xa0 + JNE openSSETail128LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 96(BP), X9 + PADDL 80(BP), X10 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X1 + PXOR X13, X4 + PXOR X14, X7 + PXOR X15, X10 + MOVOU X1, (DI) + MOVOU X4, 16(DI) + MOVOU X7, 32(DI) + MOVOU X10, 48(DI) + SUBQ $0x40, BX + LEAQ 64(SI), SI + LEAQ 64(DI), DI + JMP openSSETail64DecLoop + openSSETail192: - // Need to decrypt up to 192 bytes - prepare three blocks - MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store - MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store - MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store - - MOVQ inl, itr1 - MOVQ $160, itr2 - CMPQ itr1, $160 - CMOVQGT itr2, itr1 - ANDQ $-16, itr1 - XORQ itr2, itr2 + MOVO ·chacha20Constants<>+0(SB), X2 + MOVO 32(BP), X5 + MOVO 48(BP), X8 + MOVO 128(BP), X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X11, 80(BP) + MOVO X2, X1 + MOVO X5, X4 + MOVO X8, X7 + MOVO X11, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 96(BP) + MOVO X1, X0 + MOVO X4, X3 + MOVO X7, X6 + MOVO X10, X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 112(BP) + MOVQ BX, CX + MOVQ $0x000000a0, R9 + CMPQ CX, $0xa0 + CMOVQGT R9, CX + ANDQ $-16, CX + XORQ R9, R9 openSSLTail192LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMul + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openSSLTail192LoopB: - ADDQ $16, itr2 - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - shiftB2Left; shiftC2Left; shiftD2Left - - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - shiftB2Right; shiftC2Right; shiftD2Right - - CMPQ itr2, itr1 - JB openSSLTail192LoopA - - CMPQ itr2, $160 - JNE openSSLTail192LoopB - - CMPQ inl, $176 - JB openSSLTail192Store - - polyAdd(160(inp)) - polyMul - - CMPQ inl, $192 - JB openSSLTail192Store - - polyAdd(176(inp)) - polyMul + ADDQ $0x10, R9 + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + CMPQ R9, CX + JB openSSLTail192LoopA + CMPQ R9, $0xa0 + JNE openSSLTail192LoopB + CMPQ BX, $0xb0 + JB openSSLTail192Store + ADDQ 160(SI), R10 + ADCQ 168(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + CMPQ BX, $0xc0 + JB openSSLTail192Store + ADDQ 176(SI), R10 + ADCQ 184(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openSSLTail192Store: - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 - PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 - PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2 - MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup) - - MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 - PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - - SUBQ $128, inl - LEAQ 128(inp), inp - LEAQ 128(oup), oup - JMP openSSETail64DecLoop - -// ---------------------------------------------------------------------------- -// Special optimization for the last 256 bytes of ciphertext + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 32(BP), X5 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 48(BP), X8 + PADDL 112(BP), X9 + PADDL 96(BP), X10 + PADDL 80(BP), X11 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X2 + PXOR X13, X5 + PXOR X14, X8 + PXOR X15, X11 + MOVOU X2, (DI) + MOVOU X5, 16(DI) + MOVOU X8, 32(DI) + MOVOU X11, 48(DI) + MOVOU 64(SI), X12 + MOVOU 80(SI), X13 + MOVOU 96(SI), X14 + MOVOU 112(SI), X15 + PXOR X12, X1 + PXOR X13, X4 + PXOR X14, X7 + PXOR X15, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + SUBQ $0x80, BX + LEAQ 128(SI), SI + LEAQ 128(DI), DI + JMP openSSETail64DecLoop + openSSETail256: - // Need to decrypt up to 256 bytes - prepare four blocks - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store - XORQ itr2, itr2 + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) + XORQ R9, R9 openSSETail256Loop: - // This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication - polyAdd(0(inp)(itr2*1)) - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - polyMulStage1 - polyMulStage2 - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyMulStage3 - polyMulReduceStage - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - ADDQ $2*8, itr2 - CMPQ itr2, $160 - JB openSSETail256Loop - MOVQ inl, itr1 - ANDQ $-16, itr1 + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + ADDQ $0x10, R9 + CMPQ R9, $0xa0 + JB openSSETail256Loop + MOVQ BX, CX + ANDQ $-16, CX openSSETail256HashLoop: - polyAdd(0(inp)(itr2*1)) - polyMul - ADDQ $2*8, itr2 - CMPQ itr2, itr1 - JB openSSETail256HashLoop + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ $0x10, R9 + CMPQ R9, CX + JB openSSETail256HashLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 - MOVO D3, tmpStore + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X6 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 80(BP), X9 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 + MOVO X15, 64(BP) // Load - xor - store - MOVOU (0*16)(inp), D3; PXOR D3, A0 - MOVOU (1*16)(inp), D3; PXOR D3, B0 - MOVOU (2*16)(inp), D3; PXOR D3, C0 - MOVOU (3*16)(inp), D3; PXOR D3, D0 - MOVOU A0, (0*16)(oup) - MOVOU B0, (1*16)(oup) - MOVOU C0, (2*16)(oup) - MOVOU D0, (3*16)(oup) - MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 - PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 - PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 - MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) - LEAQ 192(inp), inp - LEAQ 192(oup), oup - SUBQ $192, inl - MOVO A3, A0 - MOVO B3, B0 - MOVO C3, C0 - MOVO tmpStore, D0 - - JMP openSSETail64DecLoop - -// ---------------------------------------------------------------------------- -// ------------------------- AVX2 Code ---------------------------------------- + MOVOU (SI), X15 + PXOR X15, X0 + MOVOU 16(SI), X15 + PXOR X15, X3 + MOVOU 32(SI), X15 + PXOR X15, X6 + MOVOU 48(SI), X15 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVOU 64(SI), X0 + MOVOU 80(SI), X3 + MOVOU 96(SI), X6 + MOVOU 112(SI), X9 + PXOR X0, X1 + PXOR X3, X4 + PXOR X6, X7 + PXOR X9, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + MOVOU 128(SI), X0 + MOVOU 144(SI), X3 + MOVOU 160(SI), X6 + MOVOU 176(SI), X9 + PXOR X0, X2 + PXOR X3, X5 + PXOR X6, X8 + PXOR X9, X11 + MOVOU X2, 128(DI) + MOVOU X5, 144(DI) + MOVOU X8, 160(DI) + MOVOU X11, 176(DI) + LEAQ 192(SI), SI + LEAQ 192(DI), DI + SUBQ $0xc0, BX + MOVO X12, X0 + MOVO X13, X3 + MOVO X14, X6 + MOVO 64(BP), X9 + JMP openSSETail64DecLoop + chacha20Poly1305Open_AVX2: VZEROUPPER - VMOVDQU ·chacha20Constants<>(SB), AA0 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 - BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 - VPADDD ·avx2InitMask<>(SB), DD0, DD0 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x70 + BYTE $0x10 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x20 + BYTE $0xc4 + BYTE $0xc2 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x30 + VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimization, for very short buffers - CMPQ inl, $192 + CMPQ BX, $0xc0 JBE openAVX2192 - CMPQ inl, $320 + CMPQ BX, $0x00000140 JBE openAVX2320 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream - VMOVDQA BB0, state1StoreAVX2 - VMOVDQA CC0, state2StoreAVX2 - VMOVDQA DD0, ctr3StoreAVX2 - MOVQ $10, itr2 + VMOVDQA Y14, 32(BP) + VMOVDQA Y12, 64(BP) + VMOVDQA Y4, 192(BP) + MOVQ $0x0000000a, R9 openAVX2PreparePolyKey: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 - DECQ itr2 - JNE openAVX2PreparePolyKey - - VPADDD ·chacha20Constants<>(SB), AA0, AA0 - VPADDD state1StoreAVX2, BB0, BB0 - VPADDD state2StoreAVX2, CC0, CC0 - VPADDD ctr3StoreAVX2, DD0, DD0 - - VPERM2I128 $0x02, AA0, BB0, TT0 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + DECQ R9 + JNE openAVX2PreparePolyKey + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD 32(BP), Y14, Y14 + VPADDD 64(BP), Y12, Y12 + VPADDD 192(BP), Y4, Y4 + VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for the first 64 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 // Hash AD + first 64 bytes - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) - XORQ itr1, itr1 + XORQ CX, CX openAVX2InitialHash64: - polyAdd(0(inp)(itr1*1)) - polyMulAVX2 - ADDQ $16, itr1 - CMPQ itr1, $64 - JNE openAVX2InitialHash64 + ADDQ (SI)(CX*1), R10 + ADCQ 8(SI)(CX*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ $0x10, CX + CMPQ CX, $0x40 + JNE openAVX2InitialHash64 // Decrypt the first 64 bytes - VPXOR (0*32)(inp), AA0, AA0 - VPXOR (1*32)(inp), BB0, BB0 - VMOVDQU AA0, (0*32)(oup) - VMOVDQU BB0, (1*32)(oup) - LEAQ (2*32)(inp), inp - LEAQ (2*32)(oup), oup - SUBQ $64, inl + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y14, Y14 + VMOVDQU Y0, (DI) + VMOVDQU Y14, 32(DI) + LEAQ 64(SI), SI + LEAQ 64(DI), DI + SUBQ $0x40, BX openAVX2MainLoop: - CMPQ inl, $512 + CMPQ BX, $0x00000200 JB openAVX2MainLoopDone // Load state, increment counter blocks, store the incremented counters - VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - XORQ itr1, itr1 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + XORQ CX, CX openAVX2InternalLoop: - // Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications - // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext - polyAdd(0*8(inp)(itr1*1)) - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage1_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulStage2_AVX2 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyMulStage3_AVX2 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - polyAdd(2*8(inp)(itr1*1)) - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage1_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage2_AVX2 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage3_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulReduceStage - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(4*8(inp)(itr1*1)) - LEAQ (6*8)(itr1), itr1 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage1_AVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - polyMulStage2_AVX2 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage3_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - CMPQ itr1, $480 + ADDQ (SI)(CX*1), R10 + ADCQ 8(SI)(CX*1), R11 + ADCQ $0x01, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + ADDQ 16(SI)(CX*1), R10 + ADCQ 24(SI)(CX*1), R11 + ADCQ $0x01, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 32(SI)(CX*1), R10 + ADCQ 40(SI)(CX*1), R11 + ADCQ $0x01, R12 + LEAQ 48(CX), CX + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + CMPQ CX, $0x000001e0 JNE openAVX2InternalLoop - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) // We only hashed 480 of the 512 bytes available - hash the remaining 32 here - polyAdd(480(inp)) - polyMulAVX2 - VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 - VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 - VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + ADDQ 480(SI), R10 + ADCQ 488(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) // and here - polyAdd(496(inp)) - polyMulAVX2 - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 - VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) - LEAQ (32*16)(inp), inp - LEAQ (32*16)(oup), oup - SUBQ $(32*16), inl + ADDQ 496(SI), R10 + ADCQ 504(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + VPXOR 384(SI), Y0, Y0 + VPXOR 416(SI), Y14, Y14 + VPXOR 448(SI), Y12, Y12 + VPXOR 480(SI), Y4, Y4 + VMOVDQU Y0, 384(DI) + VMOVDQU Y14, 416(DI) + VMOVDQU Y12, 448(DI) + VMOVDQU Y4, 480(DI) + LEAQ 512(SI), SI + LEAQ 512(DI), DI + SUBQ $0x00000200, BX JMP openAVX2MainLoop openAVX2MainLoopDone: // Handle the various tail sizes efficiently - TESTQ inl, inl + TESTQ BX, BX JE openSSEFinalize - CMPQ inl, $128 + CMPQ BX, $0x80 JBE openAVX2Tail128 - CMPQ inl, $256 + CMPQ BX, $0x00000100 JBE openAVX2Tail256 - CMPQ inl, $384 + CMPQ BX, $0x00000180 JBE openAVX2Tail384 JMP openAVX2Tail512 -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 193 bytes openAVX2192: - // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks - VMOVDQA AA0, AA1 - VMOVDQA BB0, BB1 - VMOVDQA CC0, CC1 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2 - VMOVDQA BB0, BB2 - VMOVDQA CC0, CC2 - VMOVDQA DD0, DD2 - VMOVDQA DD1, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VMOVDQA Y4, Y2 + VMOVDQA Y1, Y15 + MOVQ $0x0000000a, R9 openAVX2192InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ R9 JNE openAVX2192InnerCipherLoop - VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 - VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 - VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 - VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, TT0 + VPADDD Y6, Y0, Y0 + VPADDD Y6, Y5, Y5 + VPADDD Y10, Y14, Y14 + VPADDD Y10, Y9, Y9 + VPADDD Y8, Y12, Y12 + VPADDD Y8, Y13, Y13 + VPADDD Y2, Y4, Y4 + VPADDD Y15, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 192 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 openAVX2ShortOpen: // Hash - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openAVX2ShortOpenLoop: - CMPQ inl, $32 + CMPQ BX, $0x20 JB openAVX2ShortTail32 - SUBQ $32, inl + SUBQ $0x20, BX // Load for hashing - polyAdd(0*8(inp)) - polyMulAVX2 - polyAdd(2*8(inp)) - polyMulAVX2 + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(SI), R10 + ADCQ 24(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Load for decryption - VPXOR (inp), AA0, AA0 - VMOVDQU AA0, (oup) - LEAQ (1*32)(inp), inp - LEAQ (1*32)(oup), oup + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI + LEAQ 32(DI), DI // Shift stream left - VMOVDQA BB0, AA0 - VMOVDQA CC0, BB0 - VMOVDQA DD0, CC0 - VMOVDQA AA1, DD0 - VMOVDQA BB1, AA1 - VMOVDQA CC1, BB1 - VMOVDQA DD1, CC1 - VMOVDQA AA2, DD1 - VMOVDQA BB2, AA2 + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 + VMOVDQA Y5, Y4 + VMOVDQA Y9, Y5 + VMOVDQA Y13, Y9 + VMOVDQA Y1, Y13 + VMOVDQA Y6, Y1 + VMOVDQA Y10, Y6 JMP openAVX2ShortOpenLoop openAVX2ShortTail32: - CMPQ inl, $16 - VMOVDQA A0, A1 + CMPQ BX, $0x10 + VMOVDQA X0, X1 JB openAVX2ShortDone - - SUBQ $16, inl + SUBQ $0x10, BX // Load for hashing - polyAdd(0*8(inp)) - polyMulAVX2 + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Load for decryption - VPXOR (inp), A0, T0 - VMOVDQU T0, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup - VPERM2I128 $0x11, AA0, AA0, AA0 - VMOVDQA A0, A1 + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 openAVX2ShortDone: VZEROUPPER JMP openSSETail16 -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 321 bytes openAVX2320: - // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks - VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y14, Y7 + VMOVDQA Y12, Y11 + VMOVDQA Y4, Y15 + MOVQ $0x0000000a, R9 openAVX2320InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + DECQ R9 JNE openAVX2320InnerCipherLoop - - VMOVDQA ·chacha20Constants<>(SB), TT0 - VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 - VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 - VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 - VMOVDQA ·avx2IncMask<>(SB), TT0 - VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD2, DD2 + VMOVDQA ·chacha20Constants<>+0(SB), Y3 + VPADDD Y3, Y0, Y0 + VPADDD Y3, Y5, Y5 + VPADDD Y3, Y6, Y6 + VPADDD Y7, Y14, Y14 + VPADDD Y7, Y9, Y9 + VPADDD Y7, Y10, Y10 + VPADDD Y11, Y12, Y12 + VPADDD Y11, Y13, Y13 + VPADDD Y11, Y8, Y8 + VMOVDQA ·avx2IncMask<>+0(SB), Y3 + VPADDD Y15, Y4, Y4 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y1, Y1 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y2, Y2 // Clamp and store poly key - VPERM2I128 $0x02, AA0, BB0, TT0 - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 320 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 - VPERM2I128 $0x02, AA2, BB2, CC1 - VPERM2I128 $0x02, CC2, DD2, DD1 - VPERM2I128 $0x13, AA2, BB2, AA2 - VPERM2I128 $0x13, CC2, DD2, BB2 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 + VPERM2I128 $0x02, Y6, Y10, Y13 + VPERM2I128 $0x02, Y8, Y2, Y1 + VPERM2I128 $0x13, Y6, Y10, Y6 + VPERM2I128 $0x13, Y8, Y2, Y10 JMP openAVX2ShortOpen -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of ciphertext openAVX2Tail128: // Need to decrypt up to 128 bytes - prepare two blocks - VMOVDQA ·chacha20Constants<>(SB), AA1 - VMOVDQA state1StoreAVX2, BB1 - VMOVDQA state2StoreAVX2, CC1 - VMOVDQA ctr3StoreAVX2, DD1 - VPADDD ·avx2IncMask<>(SB), DD1, DD1 - VMOVDQA DD1, DD0 - - XORQ itr2, itr2 - MOVQ inl, itr1 - ANDQ $-16, itr1 - TESTQ itr1, itr1 - JE openAVX2Tail128LoopB + VMOVDQA ·chacha20Constants<>+0(SB), Y5 + VMOVDQA 32(BP), Y9 + VMOVDQA 64(BP), Y13 + VMOVDQA 192(BP), Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y1 + VMOVDQA Y1, Y4 + XORQ R9, R9 + MOVQ BX, CX + ANDQ $-16, CX + TESTQ CX, CX + JE openAVX2Tail128LoopB openAVX2Tail128LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMulAVX2 + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openAVX2Tail128LoopB: - ADDQ $16, itr2 - chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD1, DD1, DD1 - CMPQ itr2, itr1 - JB openAVX2Tail128LoopA - CMPQ itr2, $160 - JNE openAVX2Tail128LoopB - - VPADDD ·chacha20Constants<>(SB), AA1, AA1 - VPADDD state1StoreAVX2, BB1, BB1 - VPADDD state2StoreAVX2, CC1, CC1 - VPADDD DD0, DD1, DD1 - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + ADDQ $0x10, R9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + CMPQ R9, CX + JB openAVX2Tail128LoopA + CMPQ R9, $0xa0 + JNE openAVX2Tail128LoopB + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y13, Y13 + VPADDD Y4, Y1, Y1 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 openAVX2TailLoop: - CMPQ inl, $32 + CMPQ BX, $0x20 JB openAVX2Tail - SUBQ $32, inl + SUBQ $0x20, BX // Load for decryption - VPXOR (inp), AA0, AA0 - VMOVDQU AA0, (oup) - LEAQ (1*32)(inp), inp - LEAQ (1*32)(oup), oup - VMOVDQA BB0, AA0 - VMOVDQA CC0, BB0 - VMOVDQA DD0, CC0 + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI + LEAQ 32(DI), DI + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 JMP openAVX2TailLoop openAVX2Tail: - CMPQ inl, $16 - VMOVDQA A0, A1 + CMPQ BX, $0x10 + VMOVDQA X0, X1 JB openAVX2TailDone - SUBQ $16, inl + SUBQ $0x10, BX // Load for decryption - VPXOR (inp), A0, T0 - VMOVDQU T0, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup - VPERM2I128 $0x11, AA0, AA0, AA0 - VMOVDQA A0, A1 + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 openAVX2TailDone: VZEROUPPER JMP openSSETail16 -// ---------------------------------------------------------------------------- -// Special optimization for the last 256 bytes of ciphertext openAVX2Tail256: - // Need to decrypt up to 256 bytes - prepare four blocks - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA DD0, TT1 - VMOVDQA DD1, TT2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 // Compute the number of iterations that will hash data - MOVQ inl, tmpStoreAVX2 - MOVQ inl, itr1 - SUBQ $128, itr1 - SHRQ $4, itr1 - MOVQ $10, itr2 - CMPQ itr1, $10 - CMOVQGT itr2, itr1 - MOVQ inp, inl - XORQ itr2, itr2 + MOVQ BX, 224(BP) + MOVQ BX, CX + SUBQ $0x80, CX + SHRQ $0x04, CX + MOVQ $0x0000000a, R9 + CMPQ CX, $0x0a + CMOVQGT R9, CX + MOVQ SI, BX + XORQ R9, R9 openAVX2Tail256LoopA: - polyAdd(0(inl)) - polyMulAVX2 - LEAQ 16(inl), inl + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX - // Perform ChaCha rounds, while hashing the remaining input openAVX2Tail256LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - INCQ itr2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - CMPQ itr2, itr1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + INCQ R9 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + CMPQ R9, CX JB openAVX2Tail256LoopA + CMPQ R9, $0x0a + JNE openAVX2Tail256LoopB + MOVQ BX, R9 + SUBQ SI, BX + MOVQ BX, CX + MOVQ 224(BP), BX - CMPQ itr2, $10 - JNE openAVX2Tail256LoopB - - MOVQ inl, itr2 - SUBQ inp, inl - MOVQ inl, itr1 - MOVQ tmpStoreAVX2, inl - - // Hash the remainder of data (if any) openAVX2Tail256Hash: - ADDQ $16, itr1 - CMPQ itr1, inl - JGT openAVX2Tail256HashEnd - polyAdd (0(itr2)) - polyMulAVX2 - LEAQ 16(itr2), itr2 - JMP openAVX2Tail256Hash - -// Store 128 bytes safely, then go to store loop + ADDQ $0x10, CX + CMPQ CX, BX + JGT openAVX2Tail256HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + JMP openAVX2Tail256Hash + openAVX2Tail256HashEnd: - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 - VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2 - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - - VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2 - VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup) - LEAQ (4*32)(inp), inp - LEAQ (4*32)(oup), oup - SUBQ $4*32, inl - - JMP openAVX2TailLoop - -// ---------------------------------------------------------------------------- -// Special optimization for the last 384 bytes of ciphertext + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y6 + VPERM2I128 $0x02, Y12, Y4, Y10 + VPERM2I128 $0x13, Y0, Y14, Y8 + VPERM2I128 $0x13, Y12, Y4, Y2 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR (SI), Y6, Y6 + VPXOR 32(SI), Y10, Y10 + VPXOR 64(SI), Y8, Y8 + VPXOR 96(SI), Y2, Y2 + VMOVDQU Y6, (DI) + VMOVDQU Y10, 32(DI) + VMOVDQU Y8, 64(DI) + VMOVDQU Y2, 96(DI) + LEAQ 128(SI), SI + LEAQ 128(DI), DI + SUBQ $0x80, BX + JMP openAVX2TailLoop + openAVX2Tail384: // Need to decrypt up to 384 bytes - prepare six blocks - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA DD0, ctr0StoreAVX2 - VMOVDQA DD1, ctr1StoreAVX2 - VMOVDQA DD2, ctr2StoreAVX2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) // Compute the number of iterations that will hash two blocks of data - MOVQ inl, tmpStoreAVX2 - MOVQ inl, itr1 - SUBQ $256, itr1 - SHRQ $4, itr1 - ADDQ $6, itr1 - MOVQ $10, itr2 - CMPQ itr1, $10 - CMOVQGT itr2, itr1 - MOVQ inp, inl - XORQ itr2, itr2 - - // Perform ChaCha rounds, while hashing the remaining input + MOVQ BX, 224(BP) + MOVQ BX, CX + SUBQ $0x00000100, CX + SHRQ $0x04, CX + ADDQ $0x06, CX + MOVQ $0x0000000a, R9 + CMPQ CX, $0x0a + CMOVQGT R9, CX + MOVQ SI, BX + XORQ R9, R9 + openAVX2Tail384LoopB: - polyAdd(0(inl)) - polyMulAVX2 - LEAQ 16(inl), inl + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX openAVX2Tail384LoopA: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - polyAdd(0(inl)) - polyMulAVX2 - LEAQ 16(inl), inl - INCQ itr2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - - CMPQ itr2, itr1 - JB openAVX2Tail384LoopB - - CMPQ itr2, $10 - JNE openAVX2Tail384LoopA - - MOVQ inl, itr2 - SUBQ inp, inl - MOVQ inl, itr1 - MOVQ tmpStoreAVX2, inl + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX + INCQ R9 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + CMPQ R9, CX + JB openAVX2Tail384LoopB + CMPQ R9, $0x0a + JNE openAVX2Tail384LoopA + MOVQ BX, R9 + SUBQ SI, BX + MOVQ BX, CX + MOVQ 224(BP), BX openAVX2Tail384Hash: - ADDQ $16, itr1 - CMPQ itr1, inl - JGT openAVX2Tail384HashEnd - polyAdd(0(itr2)) - polyMulAVX2 - LEAQ 16(itr2), itr2 - JMP openAVX2Tail384Hash - -// Store 256 bytes safely, then go to store loop + ADDQ $0x10, CX + CMPQ CX, BX + JGT openAVX2Tail384HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + JMP openAVX2Tail384Hash + openAVX2Tail384HashEnd: - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2 - VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3 - VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 - VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3 - VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 - VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - LEAQ (8*32)(inp), inp - LEAQ (8*32)(oup), oup - SUBQ $8*32, inl + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y3 + VPERM2I128 $0x02, Y13, Y1, Y7 + VPERM2I128 $0x13, Y5, Y9, Y11 + VPERM2I128 $0x13, Y13, Y1, Y15 + VPXOR 128(SI), Y3, Y3 + VPXOR 160(SI), Y7, Y7 + VPXOR 192(SI), Y11, Y11 + VPXOR 224(SI), Y15, Y15 + VMOVDQU Y3, 128(DI) + VMOVDQU Y7, 160(DI) + VMOVDQU Y11, 192(DI) + VMOVDQU Y15, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + LEAQ 256(SI), SI + LEAQ 256(DI), DI + SUBQ $0x00000100, BX JMP openAVX2TailLoop -// ---------------------------------------------------------------------------- -// Special optimization for the last 512 bytes of ciphertext openAVX2Tail512: - VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - XORQ itr1, itr1 - MOVQ inp, itr2 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + XORQ CX, CX + MOVQ SI, R9 openAVX2Tail512LoopB: - polyAdd(0(itr2)) - polyMulAVX2 - LEAQ (2*8)(itr2), itr2 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 openAVX2Tail512LoopA: - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyAdd(0*8(itr2)) - polyMulAVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(2*8(itr2)) - polyMulAVX2 - LEAQ (4*8)(itr2), itr2 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - INCQ itr1 - CMPQ itr1, $4 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 16(R9), R10 + ADCQ 24(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(R9), R9 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + INCQ CX + CMPQ CX, $0x04 JLT openAVX2Tail512LoopB - - CMPQ itr1, $10 - JNE openAVX2Tail512LoopA - - MOVQ inl, itr1 - SUBQ $384, itr1 - ANDQ $-16, itr1 + CMPQ CX, $0x0a + JNE openAVX2Tail512LoopA + MOVQ BX, CX + SUBQ $0x00000180, CX + ANDQ $-16, CX openAVX2Tail512HashLoop: - TESTQ itr1, itr1 + TESTQ CX, CX JE openAVX2Tail512HashEnd - polyAdd(0(itr2)) - polyMulAVX2 - LEAQ 16(itr2), itr2 - SUBQ $16, itr1 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + SUBQ $0x10, CX JMP openAVX2Tail512HashLoop openAVX2Tail512HashEnd: - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 - VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 - VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 - VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - - LEAQ (12*32)(inp), inp - LEAQ (12*32)(oup), oup - SUBQ $12*32, inl - - JMP openAVX2TailLoop - -// ---------------------------------------------------------------------------- -// ---------------------------------------------------------------------------- -// func chacha20Poly1305Seal(dst, key, src, ad []byte) -TEXT ·chacha20Poly1305Seal(SB), 0, $288-96 - // For aligned stack access + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + LEAQ 384(SI), SI + LEAQ 384(DI), DI + SUBQ $0x00000180, BX + JMP openAVX2TailLoop + +DATA ·chacha20Constants<>+0(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574 +DATA ·chacha20Constants<>+16(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574 +GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32 + +DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff +DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc +DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff +DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff +GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32 + +DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001 +DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000 +GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16 + +DATA ·andMask<>+0(SB)/8, $0x00000000000000ff +DATA ·andMask<>+8(SB)/8, $0x0000000000000000 +DATA ·andMask<>+16(SB)/8, $0x000000000000ffff +DATA ·andMask<>+24(SB)/8, $0x0000000000000000 +DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+40(SB)/8, $0x0000000000000000 +DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+56(SB)/8, $0x0000000000000000 +DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+72(SB)/8, $0x0000000000000000 +DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+88(SB)/8, $0x0000000000000000 +DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff +DATA ·andMask<>+104(SB)/8, $0x0000000000000000 +DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+120(SB)/8, $0x0000000000000000 +DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+136(SB)/8, $0x00000000000000ff +DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+152(SB)/8, $0x000000000000ffff +DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff +GLOBL ·andMask<>(SB), RODATA|NOPTR, $240 + +DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000 +DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000 +DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001 +DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000 +GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32 + +DATA ·rol16<>+0(SB)/8, $0x0504070601000302 +DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a +DATA ·rol16<>+16(SB)/8, $0x0504070601000302 +DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a +GLOBL ·rol16<>(SB), RODATA|NOPTR, $32 + +DATA ·rol8<>+0(SB)/8, $0x0605040702010003 +DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b +DATA ·rol8<>+16(SB)/8, $0x0605040702010003 +DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b +GLOBL ·rol8<>(SB), RODATA|NOPTR, $32 + +DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002 +DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000 +DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002 +DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000 +GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32 + +// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte) +// Requires: AVX, AVX2, BMI2, CMOV, SSE2 +TEXT ·chacha20Poly1305Seal(SB), $288-96 MOVQ SP, BP - ADDQ $32, BP + ADDQ $0x20, BP ANDQ $-32, BP - MOVQ dst+0(FP), oup - MOVQ key+24(FP), keyp - MOVQ src+48(FP), inp - MOVQ src_len+56(FP), inl - MOVQ ad+72(FP), adp - - CMPB ·useAVX2(SB), $1 + MOVQ dst_base+0(FP), DI + MOVQ key_base+24(FP), R8 + MOVQ src_base+48(FP), SI + MOVQ src_len+56(FP), BX + MOVQ ad_base+72(FP), CX + CMPB ·useAVX2+0(SB), $0x01 JE chacha20Poly1305Seal_AVX2 // Special optimization, for very short buffers - CMPQ inl, $128 - JBE sealSSE128 // About 15% faster + CMPQ BX, $0x80 + JBE sealSSE128 // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration - MOVOU ·chacha20Constants<>(SB), A0 - MOVOU (1*16)(keyp), B0 - MOVOU (2*16)(keyp), C0 - MOVOU (3*16)(keyp), D0 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 // Store state on stack for future use - MOVO B0, state1Store - MOVO C0, state2Store + MOVO X3, 32(BP) + MOVO X6, 48(BP) // Load state, increment counter blocks - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store - MOVQ $10, itr2 + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) + MOVQ $0x0000000a, R9 sealSSEIntroLoop: - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - DECQ itr2 - JNE sealSSEIntroLoop + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + DECQ R9 + JNE sealSSEIntroLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 // Clamp and store the key - PAND ·polyClampMask<>(SB), A0 - MOVO A0, rStore - MOVO B0, sStore + PAND ·polyClampMask<>+0(SB), X0 + MOVO X0, (BP) + MOVO X3, 16(BP) // Hash AAD - MOVQ ad_len+80(FP), itr2 - CALL polyHashADInternal<>(SB) - - MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 - PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 - MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) - MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 - PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 - MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup) - - MOVQ $128, itr1 - SUBQ $128, inl - LEAQ 128(inp), inp - - MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1 - - CMPQ inl, $64 - JBE sealSSE128SealHash - - MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 - PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 - MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup) - - ADDQ $64, itr1 - SUBQ $64, inl - LEAQ 64(inp), inp - - MOVQ $2, itr1 - MOVQ $8, itr2 - - CMPQ inl, $64 - JBE sealSSETail64 - CMPQ inl, $128 - JBE sealSSETail128 - CMPQ inl, $192 - JBE sealSSETail192 + MOVQ ad_len+80(FP), R9 + CALL polyHashADInternal<>(SB) + MOVOU (SI), X0 + MOVOU 16(SI), X3 + MOVOU 32(SI), X6 + MOVOU 48(SI), X9 + PXOR X0, X1 + PXOR X3, X4 + PXOR X6, X7 + PXOR X9, X10 + MOVOU X1, (DI) + MOVOU X4, 16(DI) + MOVOU X7, 32(DI) + MOVOU X10, 48(DI) + MOVOU 64(SI), X0 + MOVOU 80(SI), X3 + MOVOU 96(SI), X6 + MOVOU 112(SI), X9 + PXOR X0, X2 + PXOR X3, X5 + PXOR X6, X8 + PXOR X9, X11 + MOVOU X2, 64(DI) + MOVOU X5, 80(DI) + MOVOU X8, 96(DI) + MOVOU X11, 112(DI) + MOVQ $0x00000080, CX + SUBQ $0x80, BX + LEAQ 128(SI), SI + MOVO X12, X1 + MOVO X13, X4 + MOVO X14, X7 + MOVO X15, X10 + CMPQ BX, $0x40 + JBE sealSSE128SealHash + MOVOU (SI), X0 + MOVOU 16(SI), X3 + MOVOU 32(SI), X6 + MOVOU 48(SI), X9 + PXOR X0, X12 + PXOR X3, X13 + PXOR X6, X14 + PXOR X9, X15 + MOVOU X12, 128(DI) + MOVOU X13, 144(DI) + MOVOU X14, 160(DI) + MOVOU X15, 176(DI) + ADDQ $0x40, CX + SUBQ $0x40, BX + LEAQ 64(SI), SI + MOVQ $0x00000002, CX + MOVQ $0x00000008, R9 + CMPQ BX, $0x40 + JBE sealSSETail64 + CMPQ BX, $0x80 + JBE sealSSETail128 + CMPQ BX, $0xc0 + JBE sealSSETail192 sealSSEMainLoop: // Load state, increment counter blocks - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) sealSSEInnerLoop: - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyAdd(0(oup)) - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - polyMulStage1 - polyMulStage2 - LEAQ (2*8)(oup), oup - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - polyMulStage3 - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyMulReduceStage - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - DECQ itr2 - JGE sealSSEInnerLoop - polyAdd(0(oup)) - polyMul - LEAQ (2*8)(oup), oup - DECQ itr1 - JG sealSSEInnerLoop + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + LEAQ 16(DI), DI + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + DECQ R9 + JGE sealSSEInnerLoop + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + DECQ CX + JG sealSSEInnerLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 - MOVO D3, tmpStore + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X6 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 80(BP), X9 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 + MOVO X15, 64(BP) // Load - xor - store - MOVOU (0*16)(inp), D3; PXOR D3, A0 - MOVOU (1*16)(inp), D3; PXOR D3, B0 - MOVOU (2*16)(inp), D3; PXOR D3, C0 - MOVOU (3*16)(inp), D3; PXOR D3, D0 - MOVOU A0, (0*16)(oup) - MOVOU B0, (1*16)(oup) - MOVOU C0, (2*16)(oup) - MOVOU D0, (3*16)(oup) - MOVO tmpStore, D3 - - MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 - PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 - PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 - MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) - ADDQ $192, inp - MOVQ $192, itr1 - SUBQ $192, inl - MOVO A3, A1 - MOVO B3, B1 - MOVO C3, C1 - MOVO D3, D1 - CMPQ inl, $64 + MOVOU (SI), X15 + PXOR X15, X0 + MOVOU 16(SI), X15 + PXOR X15, X3 + MOVOU 32(SI), X15 + PXOR X15, X6 + MOVOU 48(SI), X15 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVO 64(BP), X15 + MOVOU 64(SI), X0 + MOVOU 80(SI), X3 + MOVOU 96(SI), X6 + MOVOU 112(SI), X9 + PXOR X0, X1 + PXOR X3, X4 + PXOR X6, X7 + PXOR X9, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + MOVOU 128(SI), X0 + MOVOU 144(SI), X3 + MOVOU 160(SI), X6 + MOVOU 176(SI), X9 + PXOR X0, X2 + PXOR X3, X5 + PXOR X6, X8 + PXOR X9, X11 + MOVOU X2, 128(DI) + MOVOU X5, 144(DI) + MOVOU X8, 160(DI) + MOVOU X11, 176(DI) + ADDQ $0xc0, SI + MOVQ $0x000000c0, CX + SUBQ $0xc0, BX + MOVO X12, X1 + MOVO X13, X4 + MOVO X14, X7 + MOVO X15, X10 + CMPQ BX, $0x40 JBE sealSSE128SealHash - MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 - PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 - MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup) - LEAQ 64(inp), inp - SUBQ $64, inl - MOVQ $6, itr1 - MOVQ $4, itr2 - CMPQ inl, $192 + MOVOU (SI), X0 + MOVOU 16(SI), X3 + MOVOU 32(SI), X6 + MOVOU 48(SI), X9 + PXOR X0, X12 + PXOR X3, X13 + PXOR X6, X14 + PXOR X9, X15 + MOVOU X12, 192(DI) + MOVOU X13, 208(DI) + MOVOU X14, 224(DI) + MOVOU X15, 240(DI) + LEAQ 64(SI), SI + SUBQ $0x40, BX + MOVQ $0x00000006, CX + MOVQ $0x00000004, R9 + CMPQ BX, $0xc0 JG sealSSEMainLoop - - MOVQ inl, itr1 - TESTQ inl, inl + MOVQ BX, CX + TESTQ BX, BX JE sealSSE128SealHash - MOVQ $6, itr1 - CMPQ inl, $64 + MOVQ $0x00000006, CX + CMPQ BX, $0x40 JBE sealSSETail64 - CMPQ inl, $128 + CMPQ BX, $0x80 JBE sealSSETail128 JMP sealSSETail192 -// ---------------------------------------------------------------------------- -// Special optimization for the last 64 bytes of plaintext sealSSETail64: - // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes - MOVO ·chacha20Constants<>(SB), A1 - MOVO state1Store, B1 - MOVO state2Store, C1 - MOVO ctr3Store, D1 - PADDL ·sseIncMask<>(SB), D1 - MOVO D1, ctr0Store + MOVO ·chacha20Constants<>+0(SB), X1 + MOVO 32(BP), X4 + MOVO 48(BP), X7 + MOVO 128(BP), X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 80(BP) sealSSETail64LoopA: - // Perform ChaCha rounds, while hashing the previously encrypted ciphertext - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealSSETail64LoopB: - chachaQR(A1, B1, C1, D1, T1) - shiftB1Left; shiftC1Left; shiftD1Left - chachaQR(A1, B1, C1, D1, T1) - shiftB1Right; shiftC1Right; shiftD1Right - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup - - DECQ itr1 - JG sealSSETail64LoopA - - DECQ itr2 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x0c, X13 + PSRLL $0x14, X4 + PXOR X13, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x07, X13 + PSRLL $0x19, X4 + PXOR X13, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x0c, X13 + PSRLL $0x14, X4 + PXOR X13, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x07, X13 + PSRLL $0x19, X4 + PXOR X13, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + DECQ CX + JG sealSSETail64LoopA + DECQ R9 JGE sealSSETail64LoopB - PADDL ·chacha20Constants<>(SB), A1 - PADDL state1Store, B1 - PADDL state2Store, C1 - PADDL ctr0Store, D1 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL 32(BP), X4 + PADDL 48(BP), X7 + PADDL 80(BP), X10 + JMP sealSSE128Seal - JMP sealSSE128Seal - -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of plaintext sealSSETail128: - // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 80(BP) + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 96(BP) sealSSETail128LoopA: - // Perform ChaCha rounds, while hashing the previously encrypted ciphertext - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealSSETail128LoopB: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - - DECQ itr1 - JG sealSSETail128LoopA - - DECQ itr2 - JGE sealSSETail128LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 - PADDL state1Store, B0; PADDL state1Store, B1 - PADDL state2Store, C0; PADDL state2Store, C1 - PADDL ctr0Store, D0; PADDL ctr1Store, D1 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 - MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) - - MOVQ $64, itr1 - LEAQ 64(inp), inp - SUBQ $64, inl - - JMP sealSSE128SealHash - -// ---------------------------------------------------------------------------- -// Special optimization for the last 192 bytes of plaintext + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + DECQ CX + JG sealSSETail128LoopA + DECQ R9 + JGE sealSSETail128LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 80(BP), X9 + PADDL 96(BP), X10 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X0 + PXOR X13, X3 + PXOR X14, X6 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVQ $0x00000040, CX + LEAQ 64(SI), SI + SUBQ $0x40, BX + JMP sealSSE128SealHash + sealSSETail192: - // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 80(BP) + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 96(BP) + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X11, 112(BP) sealSSETail192LoopA: - // Perform ChaCha rounds, while hashing the previously encrypted ciphertext - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealSSETail192LoopB: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - shiftB2Left; shiftC2Left; shiftD2Left - - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup - - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - shiftB2Right; shiftC2Right; shiftD2Right - - DECQ itr1 - JG sealSSETail192LoopA - - DECQ itr2 - JGE sealSSETail192LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 - PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 - PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 - MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) - MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 - PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - - MOVO A2, A1 - MOVO B2, B1 - MOVO C2, C1 - MOVO D2, D1 - MOVQ $128, itr1 - LEAQ 128(inp), inp - SUBQ $128, inl - - JMP sealSSE128SealHash - -// ---------------------------------------------------------------------------- -// Special seal optimization for buffers smaller than 129 bytes + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + DECQ CX + JG sealSSETail192LoopA + DECQ R9 + JGE sealSSETail192LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 32(BP), X5 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 48(BP), X8 + PADDL 80(BP), X9 + PADDL 96(BP), X10 + PADDL 112(BP), X11 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X0 + PXOR X13, X3 + PXOR X14, X6 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVOU 64(SI), X12 + MOVOU 80(SI), X13 + MOVOU 96(SI), X14 + MOVOU 112(SI), X15 + PXOR X12, X1 + PXOR X13, X4 + PXOR X14, X7 + PXOR X15, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + MOVO X2, X1 + MOVO X5, X4 + MOVO X8, X7 + MOVO X11, X10 + MOVQ $0x00000080, CX + LEAQ 128(SI), SI + SUBQ $0x80, BX + JMP sealSSE128SealHash + sealSSE128: - // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks - MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 - MOVQ $10, itr2 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X3, X13 + MOVO X6, X14 + MOVO X10, X15 + MOVQ $0x0000000a, R9 sealSSE128InnerCipherLoop: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftB1Left; shiftB2Left - shiftC0Left; shiftC1Left; shiftC2Left - shiftD0Left; shiftD1Left; shiftD2Left - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftB1Right; shiftB2Right - shiftC0Right; shiftC1Right; shiftC2Right - shiftD0Right; shiftD1Right; shiftD2Right - DECQ itr2 - JNE sealSSE128InnerCipherLoop + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + DECQ R9 + JNE sealSSE128InnerCipherLoop // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 - PADDL T2, C1; PADDL T2, C2 - PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 - PAND ·polyClampMask<>(SB), A0 - MOVOU A0, rStore - MOVOU B0, sStore + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL X13, X3 + PADDL X13, X4 + PADDL X13, X5 + PADDL X14, X7 + PADDL X14, X8 + PADDL X15, X10 + PADDL ·sseIncMask<>+0(SB), X15 + PADDL X15, X11 + PAND ·polyClampMask<>+0(SB), X0 + MOVOU X0, (BP) + MOVOU X3, 16(BP) // Hash - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) - XORQ itr1, itr1 + XORQ CX, CX sealSSE128SealHash: - // itr1 holds the number of bytes encrypted but not yet hashed - CMPQ itr1, $16 - JB sealSSE128Seal - polyAdd(0(oup)) - polyMul - - SUBQ $16, itr1 - ADDQ $16, oup - - JMP sealSSE128SealHash + CMPQ CX, $0x10 + JB sealSSE128Seal + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + SUBQ $0x10, CX + ADDQ $0x10, DI + JMP sealSSE128SealHash sealSSE128Seal: - CMPQ inl, $16 + CMPQ BX, $0x10 JB sealSSETail - SUBQ $16, inl + SUBQ $0x10, BX // Load for decryption - MOVOU (inp), T0 - PXOR T0, A1 - MOVOU A1, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup + MOVOU (SI), X12 + PXOR X12, X1 + MOVOU X1, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI // Extract for hashing - MOVQ A1, t0 - PSRLDQ $8, A1 - MOVQ A1, t1 - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul + MOVQ X1, R13 + PSRLDQ $0x08, X1 + MOVQ X1, R14 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Shift the stream "left" - MOVO B1, A1 - MOVO C1, B1 - MOVO D1, C1 - MOVO A2, D1 - MOVO B2, A2 - MOVO C2, B2 - MOVO D2, C2 + MOVO X4, X1 + MOVO X7, X4 + MOVO X10, X7 + MOVO X2, X10 + MOVO X5, X2 + MOVO X8, X5 + MOVO X11, X8 JMP sealSSE128Seal sealSSETail: - TESTQ inl, inl + TESTQ BX, BX JE sealSSEFinalize // We can only load the PT one byte at a time to avoid read after end of buffer - MOVQ inl, itr2 - SHLQ $4, itr2 - LEAQ ·andMask<>(SB), t0 - MOVQ inl, itr1 - LEAQ -1(inp)(inl*1), inp - XORQ t2, t2 - XORQ t3, t3 + MOVQ BX, R9 + SHLQ $0x04, R9 + LEAQ ·andMask<>+0(SB), R13 + MOVQ BX, CX + LEAQ -1(SI)(BX*1), SI + XORQ R15, R15 + XORQ R8, R8 XORQ AX, AX sealSSETailLoadLoop: - SHLQ $8, t2, t3 - SHLQ $8, t2 - MOVB (inp), AX - XORQ AX, t2 - LEAQ -1(inp), inp - DECQ itr1 + SHLQ $0x08, R15, R8 + SHLQ $0x08, R15 + MOVB (SI), AX + XORQ AX, R15 + LEAQ -1(SI), SI + DECQ CX JNE sealSSETailLoadLoop - MOVQ t2, 0+tmpStore - MOVQ t3, 8+tmpStore - PXOR 0+tmpStore, A1 - MOVOU A1, (oup) - MOVOU -16(t0)(itr2*1), T0 - PAND T0, A1 - MOVQ A1, t0 - PSRLDQ $8, A1 - MOVQ A1, t1 - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul - - ADDQ inl, oup + MOVQ R15, 64(BP) + MOVQ R8, 72(BP) + PXOR 64(BP), X1 + MOVOU X1, (DI) + MOVOU -16(R13)(R9*1), X12 + PAND X12, X1 + MOVQ X1, R13 + PSRLDQ $0x08, X1 + MOVQ X1, R14 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ BX, DI sealSSEFinalize: // Hash in the buffer lengths - ADDQ ad_len+80(FP), acc0 - ADCQ src_len+56(FP), acc1 - ADCQ $1, acc2 - polyMul + ADDQ ad_len+80(FP), R10 + ADCQ src_len+56(FP), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Final reduce - MOVQ acc0, t0 - MOVQ acc1, t1 - MOVQ acc2, t2 - SUBQ $-5, acc0 - SBBQ $-1, acc1 - SBBQ $3, acc2 - CMOVQCS t0, acc0 - CMOVQCS t1, acc1 - CMOVQCS t2, acc2 + MOVQ R10, R13 + MOVQ R11, R14 + MOVQ R12, R15 + SUBQ $-5, R10 + SBBQ $-1, R11 + SBBQ $0x03, R12 + CMOVQCS R13, R10 + CMOVQCS R14, R11 + CMOVQCS R15, R12 // Add in the "s" part of the key - ADDQ 0+sStore, acc0 - ADCQ 8+sStore, acc1 + ADDQ 16(BP), R10 + ADCQ 24(BP), R11 // Finally store the tag at the end of the message - MOVQ acc0, (0*8)(oup) - MOVQ acc1, (1*8)(oup) + MOVQ R10, (DI) + MOVQ R11, 8(DI) RET -// ---------------------------------------------------------------------------- -// ------------------------- AVX2 Code ---------------------------------------- chacha20Poly1305Seal_AVX2: VZEROUPPER - VMOVDQU ·chacha20Constants<>(SB), AA0 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 - BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 - VPADDD ·avx2InitMask<>(SB), DD0, DD0 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x70 + BYTE $0x10 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x20 + BYTE $0xc4 + BYTE $0xc2 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x30 + VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimizations, for very short buffers - CMPQ inl, $192 - JBE seal192AVX2 // 33% faster - CMPQ inl, $320 - JBE seal320AVX2 // 17% faster + CMPQ BX, $0x000000c0 + JBE seal192AVX2 + CMPQ BX, $0x00000140 + JBE seal320AVX2 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream - VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2 - VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2 - VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2 - VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2 - VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2 - VMOVDQA DD3, ctr3StoreAVX2 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA Y14, 32(BP) + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA Y12, 64(BP) + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, 96(BP) + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y1, 128(BP) + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + MOVQ $0x0000000a, R9 sealAVX2IntroLoop: - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 - VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 - VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 - VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 - - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 - VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 - VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 - VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 - DECQ itr2 - JNE sealAVX2IntroLoop - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - - VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127 - VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key - VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95 + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y3, Y3, Y3 + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y3, Y3, Y3 + DECQ R9 + JNE sealAVX2IntroLoop + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPERM2I128 $0x02, Y0, Y14, Y4 + VPERM2I128 $0x13, Y0, Y14, Y0 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), DD0, DD0 - VMOVDQA DD0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y4, Y4 + VMOVDQA Y4, (BP) // Hash AD - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) // Can store at least 320 bytes - VPXOR (0*32)(inp), AA0, AA0 - VPXOR (1*32)(inp), CC0, CC0 - VMOVDQU AA0, (0*32)(oup) - VMOVDQU CC0, (1*32)(oup) - - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0 - VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup) - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0 - VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup) - - MOVQ $320, itr1 - SUBQ $320, inl - LEAQ 320(inp), inp - - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0 - CMPQ inl, $128 + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y12, Y12 + VMOVDQU Y0, (DI) + VMOVDQU Y12, 32(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 64(SI), Y0, Y0 + VPXOR 96(SI), Y14, Y14 + VPXOR 128(SI), Y12, Y12 + VPXOR 160(SI), Y4, Y4 + VMOVDQU Y0, 64(DI) + VMOVDQU Y14, 96(DI) + VMOVDQU Y12, 128(DI) + VMOVDQU Y4, 160(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 192(SI), Y0, Y0 + VPXOR 224(SI), Y14, Y14 + VPXOR 256(SI), Y12, Y12 + VPXOR 288(SI), Y4, Y4 + VMOVDQU Y0, 192(DI) + VMOVDQU Y14, 224(DI) + VMOVDQU Y12, 256(DI) + VMOVDQU Y4, 288(DI) + MOVQ $0x00000140, CX + SUBQ $0x00000140, BX + LEAQ 320(SI), SI + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, Y15, Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, Y15, Y3, Y4 + CMPQ BX, $0x80 JBE sealAVX2SealHash - - VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0 - VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup) - SUBQ $128, inl - LEAQ 128(inp), inp - - MOVQ $8, itr1 - MOVQ $2, itr2 - - CMPQ inl, $128 - JBE sealAVX2Tail128 - CMPQ inl, $256 - JBE sealAVX2Tail256 - CMPQ inl, $384 - JBE sealAVX2Tail384 - CMPQ inl, $512 - JBE sealAVX2Tail512 + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y14, Y14 + VPXOR 64(SI), Y12, Y12 + VPXOR 96(SI), Y4, Y4 + VMOVDQU Y0, 320(DI) + VMOVDQU Y14, 352(DI) + VMOVDQU Y12, 384(DI) + VMOVDQU Y4, 416(DI) + SUBQ $0x80, BX + LEAQ 128(SI), SI + MOVQ $0x00000008, CX + MOVQ $0x00000002, R9 + CMPQ BX, $0x80 + JBE sealAVX2Tail128 + CMPQ BX, $0x00000100 + JBE sealAVX2Tail256 + CMPQ BX, $0x00000180 + JBE sealAVX2Tail384 + CMPQ BX, $0x00000200 + JBE sealAVX2Tail512 // We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 - VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 - VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 - VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 - - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 - VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 - VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 - VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - - SUBQ $16, oup // Adjust the pointer - MOVQ $9, itr1 - JMP sealAVX2InternalLoopStart + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y3, Y3, Y3 + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + SUBQ $0x10, DI + MOVQ $0x00000009, CX + JMP sealAVX2InternalLoopStart sealAVX2MainLoop: - // Load state, increment counter blocks, store the incremented counters - VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - MOVQ $10, itr1 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + MOVQ $0x0000000a, CX sealAVX2InternalLoop: - polyAdd(0*8(oup)) - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage1_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulStage2_AVX2 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyMulStage3_AVX2 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 sealAVX2InternalLoopStart: - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - polyAdd(2*8(oup)) - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage1_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage2_AVX2 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage3_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulReduceStage - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(4*8(oup)) - LEAQ (6*8)(oup), oup - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage1_AVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - polyMulStage2_AVX2 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage3_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - DECQ itr1 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 32(DI), R10 + ADCQ 40(DI), R11 + ADCQ $0x01, R12 + LEAQ 48(DI), DI + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + DECQ CX JNE sealAVX2InternalLoop - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) // We only hashed 480 of the 512 bytes available - hash the remaining 32 here - polyAdd(0*8(oup)) - polyMulAVX2 - LEAQ (4*8)(oup), oup - VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 - VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 - VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) // and here - polyAdd(-2*8(oup)) - polyMulAVX2 - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 - VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) - LEAQ (32*16)(inp), inp - SUBQ $(32*16), inl - CMPQ inl, $512 + ADDQ -16(DI), R10 + ADCQ -8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + VPXOR 384(SI), Y0, Y0 + VPXOR 416(SI), Y14, Y14 + VPXOR 448(SI), Y12, Y12 + VPXOR 480(SI), Y4, Y4 + VMOVDQU Y0, 384(DI) + VMOVDQU Y14, 416(DI) + VMOVDQU Y12, 448(DI) + VMOVDQU Y4, 480(DI) + LEAQ 512(SI), SI + SUBQ $0x00000200, BX + CMPQ BX, $0x00000200 JG sealAVX2MainLoop // Tail can only hash 480 bytes - polyAdd(0*8(oup)) - polyMulAVX2 - polyAdd(2*8(oup)) - polyMulAVX2 - LEAQ 32(oup), oup - - MOVQ $10, itr1 - MOVQ $0, itr2 - CMPQ inl, $128 - JBE sealAVX2Tail128 - CMPQ inl, $256 - JBE sealAVX2Tail256 - CMPQ inl, $384 - JBE sealAVX2Tail384 - JMP sealAVX2Tail512 - -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 193 bytes + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + MOVQ $0x0000000a, CX + MOVQ $0x00000000, R9 + CMPQ BX, $0x80 + JBE sealAVX2Tail128 + CMPQ BX, $0x00000100 + JBE sealAVX2Tail256 + CMPQ BX, $0x00000180 + JBE sealAVX2Tail384 + JMP sealAVX2Tail512 + seal192AVX2: - // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks - VMOVDQA AA0, AA1 - VMOVDQA BB0, BB1 - VMOVDQA CC0, CC1 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2 - VMOVDQA BB0, BB2 - VMOVDQA CC0, CC2 - VMOVDQA DD0, DD2 - VMOVDQA DD1, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VMOVDQA Y4, Y2 + VMOVDQA Y1, Y15 + MOVQ $0x0000000a, R9 sealAVX2192InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ R9 JNE sealAVX2192InnerCipherLoop - VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 - VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 - VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 - VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, TT0 + VPADDD Y6, Y0, Y0 + VPADDD Y6, Y5, Y5 + VPADDD Y10, Y14, Y14 + VPADDD Y10, Y9, Y9 + VPADDD Y8, Y12, Y12 + VPADDD Y8, Y13, Y13 + VPADDD Y2, Y4, Y4 + VPADDD Y15, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 192 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 sealAVX2ShortSeal: // Hash aad - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) - XORQ itr1, itr1 + XORQ CX, CX sealAVX2SealHash: // itr1 holds the number of bytes encrypted but not yet hashed - CMPQ itr1, $16 - JB sealAVX2ShortSealLoop - polyAdd(0(oup)) - polyMul - SUBQ $16, itr1 - ADDQ $16, oup - JMP sealAVX2SealHash + CMPQ CX, $0x10 + JB sealAVX2ShortSealLoop + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + SUBQ $0x10, CX + ADDQ $0x10, DI + JMP sealAVX2SealHash sealAVX2ShortSealLoop: - CMPQ inl, $32 + CMPQ BX, $0x20 JB sealAVX2ShortTail32 - SUBQ $32, inl + SUBQ $0x20, BX // Load for encryption - VPXOR (inp), AA0, AA0 - VMOVDQU AA0, (oup) - LEAQ (1*32)(inp), inp + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI // Now can hash - polyAdd(0*8(oup)) - polyMulAVX2 - polyAdd(2*8(oup)) - polyMulAVX2 - LEAQ (1*32)(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI // Shift stream left - VMOVDQA BB0, AA0 - VMOVDQA CC0, BB0 - VMOVDQA DD0, CC0 - VMOVDQA AA1, DD0 - VMOVDQA BB1, AA1 - VMOVDQA CC1, BB1 - VMOVDQA DD1, CC1 - VMOVDQA AA2, DD1 - VMOVDQA BB2, AA2 + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 + VMOVDQA Y5, Y4 + VMOVDQA Y9, Y5 + VMOVDQA Y13, Y9 + VMOVDQA Y1, Y13 + VMOVDQA Y6, Y1 + VMOVDQA Y10, Y6 JMP sealAVX2ShortSealLoop sealAVX2ShortTail32: - CMPQ inl, $16 - VMOVDQA A0, A1 + CMPQ BX, $0x10 + VMOVDQA X0, X1 JB sealAVX2ShortDone - - SUBQ $16, inl + SUBQ $0x10, BX // Load for encryption - VPXOR (inp), A0, T0 - VMOVDQU T0, (oup) - LEAQ (1*16)(inp), inp + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI // Hash - polyAdd(0*8(oup)) - polyMulAVX2 - LEAQ (1*16)(oup), oup - VPERM2I128 $0x11, AA0, AA0, AA0 - VMOVDQA A0, A1 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 sealAVX2ShortDone: VZEROUPPER JMP sealSSETail -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 321 bytes seal320AVX2: - // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks - VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y14, Y7 + VMOVDQA Y12, Y11 + VMOVDQA Y4, Y15 + MOVQ $0x0000000a, R9 sealAVX2320InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + DECQ R9 JNE sealAVX2320InnerCipherLoop - - VMOVDQA ·chacha20Constants<>(SB), TT0 - VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 - VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 - VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 - VMOVDQA ·avx2IncMask<>(SB), TT0 - VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD2, DD2 + VMOVDQA ·chacha20Constants<>+0(SB), Y3 + VPADDD Y3, Y0, Y0 + VPADDD Y3, Y5, Y5 + VPADDD Y3, Y6, Y6 + VPADDD Y7, Y14, Y14 + VPADDD Y7, Y9, Y9 + VPADDD Y7, Y10, Y10 + VPADDD Y11, Y12, Y12 + VPADDD Y11, Y13, Y13 + VPADDD Y11, Y8, Y8 + VMOVDQA ·avx2IncMask<>+0(SB), Y3 + VPADDD Y15, Y4, Y4 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y1, Y1 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y2, Y2 // Clamp and store poly key - VPERM2I128 $0x02, AA0, BB0, TT0 - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 320 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 - VPERM2I128 $0x02, AA2, BB2, CC1 - VPERM2I128 $0x02, CC2, DD2, DD1 - VPERM2I128 $0x13, AA2, BB2, AA2 - VPERM2I128 $0x13, CC2, DD2, BB2 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 + VPERM2I128 $0x02, Y6, Y10, Y13 + VPERM2I128 $0x02, Y8, Y2, Y1 + VPERM2I128 $0x13, Y6, Y10, Y6 + VPERM2I128 $0x13, Y8, Y2, Y10 JMP sealAVX2ShortSeal -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of ciphertext sealAVX2Tail128: - // Need to decrypt up to 128 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0 - VMOVDQA state1StoreAVX2, BB0 - VMOVDQA state2StoreAVX2, CC0 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VMOVDQA DD0, DD1 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA 32(BP), Y14 + VMOVDQA 64(BP), Y12 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VMOVDQA Y4, Y1 sealAVX2Tail128LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail128LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - polyAdd(0(oup)) - polyMul - VPALIGNR $4, BB0, BB0, BB0 - VPALIGNR $8, CC0, CC0, CC0 - VPALIGNR $12, DD0, DD0, DD0 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - polyAdd(16(oup)) - polyMul - LEAQ 32(oup), oup - VPALIGNR $12, BB0, BB0, BB0 - VPALIGNR $8, CC0, CC0, CC0 - VPALIGNR $4, DD0, DD0, DD0 - DECQ itr1 - JG sealAVX2Tail128LoopA - DECQ itr2 - JGE sealAVX2Tail128LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA1 - VPADDD state1StoreAVX2, BB0, BB1 - VPADDD state2StoreAVX2, CC0, CC1 - VPADDD DD1, DD0, DD1 - - VPERM2I128 $0x02, AA1, BB1, AA0 - VPERM2I128 $0x02, CC1, DD1, BB0 - VPERM2I128 $0x13, AA1, BB1, CC0 - VPERM2I128 $0x13, CC1, DD1, DD0 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + DECQ CX + JG sealAVX2Tail128LoopA + DECQ R9 + JGE sealAVX2Tail128LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y5 + VPADDD 32(BP), Y14, Y9 + VPADDD 64(BP), Y12, Y13 + VPADDD Y1, Y4, Y1 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 JMP sealAVX2ShortSealLoop -// ---------------------------------------------------------------------------- -// Special optimization for the last 256 bytes of ciphertext sealAVX2Tail256: - // Need to decrypt up to 256 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA DD0, TT1 - VMOVDQA DD1, TT2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA ·chacha20Constants<>+0(SB), Y5 + VMOVDQA 32(BP), Y14 + VMOVDQA 32(BP), Y9 + VMOVDQA 64(BP), Y12 + VMOVDQA 64(BP), Y13 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 sealAVX2Tail256LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail256LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - polyAdd(0(oup)) - polyMul - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - polyAdd(16(oup)) - polyMul - LEAQ 32(oup), oup - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - DECQ itr1 - JG sealAVX2Tail256LoopA - DECQ itr2 - JGE sealAVX2Tail256LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 - VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, TT0 - VPERM2I128 $0x02, CC0, DD0, TT1 - VPERM2I128 $0x13, AA0, BB0, TT2 - VPERM2I128 $0x13, CC0, DD0, TT3 - VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 - VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) - MOVQ $128, itr1 - LEAQ 128(inp), inp - SUBQ $128, inl - VPERM2I128 $0x02, AA1, BB1, AA0 - VPERM2I128 $0x02, CC1, DD1, BB0 - VPERM2I128 $0x13, AA1, BB1, CC0 - VPERM2I128 $0x13, CC1, DD1, DD0 - - JMP sealAVX2SealHash - -// ---------------------------------------------------------------------------- -// Special optimization for the last 384 bytes of ciphertext + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ CX + JG sealAVX2Tail256LoopA + DECQ R9 + JGE sealAVX2Tail256LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + MOVQ $0x00000080, CX + LEAQ 128(SI), SI + SUBQ $0x80, BX + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + JMP sealAVX2SealHash + sealAVX2Tail384: - // Need to decrypt up to 384 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 + VMOVDQA Y2, Y15 sealAVX2Tail384LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail384LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - polyAdd(0(oup)) - polyMul - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - polyAdd(16(oup)) - polyMul - LEAQ 32(oup), oup - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - DECQ itr1 - JG sealAVX2Tail384LoopA - DECQ itr2 - JGE sealAVX2Tail384LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 - VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2 - VPERM2I128 $0x02, AA0, BB0, TT0 - VPERM2I128 $0x02, CC0, DD0, TT1 - VPERM2I128 $0x13, AA0, BB0, TT2 - VPERM2I128 $0x13, CC0, DD0, TT3 - VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 - VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, TT0 - VPERM2I128 $0x02, CC1, DD1, TT1 - VPERM2I128 $0x13, AA1, BB1, TT2 - VPERM2I128 $0x13, CC1, DD1, TT3 - VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 - VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) - MOVQ $256, itr1 - LEAQ 256(inp), inp - SUBQ $256, inl - VPERM2I128 $0x02, AA2, BB2, AA0 - VPERM2I128 $0x02, CC2, DD2, BB0 - VPERM2I128 $0x13, AA2, BB2, CC0 - VPERM2I128 $0x13, CC2, DD2, DD0 - - JMP sealAVX2SealHash - -// ---------------------------------------------------------------------------- -// Special optimization for the last 512 bytes of ciphertext + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + DECQ CX + JG sealAVX2Tail384LoopA + DECQ R9 + JGE sealAVX2Tail384LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPADDD Y15, Y2, Y2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y3 + VPERM2I128 $0x02, Y13, Y1, Y7 + VPERM2I128 $0x13, Y5, Y9, Y11 + VPERM2I128 $0x13, Y13, Y1, Y15 + VPXOR 128(SI), Y3, Y3 + VPXOR 160(SI), Y7, Y7 + VPXOR 192(SI), Y11, Y11 + VPXOR 224(SI), Y15, Y15 + VMOVDQU Y3, 128(DI) + VMOVDQU Y7, 160(DI) + VMOVDQU Y11, 192(DI) + VMOVDQU Y15, 224(DI) + MOVQ $0x00000100, CX + LEAQ 256(SI), SI + SUBQ $0x00000100, BX + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + JMP sealAVX2SealHash + sealAVX2Tail512: - // Need to decrypt up to 512 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) sealAVX2Tail512LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail512LoopB: - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyAdd(0*8(oup)) - polyMulAVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(2*8(oup)) - polyMulAVX2 - LEAQ (4*8)(oup), oup - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - - DECQ itr1 - JG sealAVX2Tail512LoopA - DECQ itr2 - JGE sealAVX2Tail512LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 - VPERM2I128 $0x02, AA0, BB0, CC3 - VPXOR (0*32)(inp), CC3, CC3 - VMOVDQU CC3, (0*32)(oup) - VPERM2I128 $0x02, CC0, DD0, CC3 - VPXOR (1*32)(inp), CC3, CC3 - VMOVDQU CC3, (1*32)(oup) - VPERM2I128 $0x13, AA0, BB0, CC3 - VPXOR (2*32)(inp), CC3, CC3 - VMOVDQU CC3, (2*32)(oup) - VPERM2I128 $0x13, CC0, DD0, CC3 - VPXOR (3*32)(inp), CC3, CC3 - VMOVDQU CC3, (3*32)(oup) - - VPERM2I128 $0x02, AA1, BB1, AA0 - VPERM2I128 $0x02, CC1, DD1, BB0 - VPERM2I128 $0x13, AA1, BB1, CC0 - VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) - - VPERM2I128 $0x02, AA2, BB2, AA0 - VPERM2I128 $0x02, CC2, DD2, BB0 - VPERM2I128 $0x13, AA2, BB2, CC0 - VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - - MOVQ $384, itr1 - LEAQ 384(inp), inp - SUBQ $384, inl - VPERM2I128 $0x02, AA3, BB3, AA0 - VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0 - VPERM2I128 $0x13, AA3, BB3, CC0 - VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - - JMP sealAVX2SealHash + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + DECQ CX + JG sealAVX2Tail512LoopA + DECQ R9 + JGE sealAVX2Tail512LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) + VPERM2I128 $0x02, Y0, Y14, Y15 + VPXOR (SI), Y15, Y15 + VMOVDQU Y15, (DI) + VPERM2I128 $0x02, Y12, Y4, Y15 + VPXOR 32(SI), Y15, Y15 + VMOVDQU Y15, 32(DI) + VPERM2I128 $0x13, Y0, Y14, Y15 + VPXOR 64(SI), Y15, Y15 + VMOVDQU Y15, 64(DI) + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + MOVQ $0x00000180, CX + LEAQ 384(SI), SI + SUBQ $0x00000180, BX + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + JMP sealAVX2SealHash diff --git a/vendor/golang.org/x/crypto/curve25519/curve25519.go b/vendor/golang.org/x/crypto/curve25519/curve25519.go index 00f963e..21ca3b2 100644 --- a/vendor/golang.org/x/crypto/curve25519/curve25519.go +++ b/vendor/golang.org/x/crypto/curve25519/curve25519.go @@ -6,9 +6,11 @@ // performs scalar multiplication on the elliptic curve known as Curve25519. // See RFC 7748. // -// Starting in Go 1.20, this package is a wrapper for the X25519 implementation +// This package is a wrapper for the X25519 implementation // in the crypto/ecdh package. -package curve25519 // import "golang.org/x/crypto/curve25519" +package curve25519 + +import "crypto/ecdh" // ScalarMult sets dst to the product scalar * point. // @@ -16,7 +18,13 @@ package curve25519 // import "golang.org/x/crypto/curve25519" // zeroes, irrespective of the scalar. Instead, use the X25519 function, which // will return an error. func ScalarMult(dst, scalar, point *[32]byte) { - scalarMult(dst, scalar, point) + if _, err := x25519(dst, scalar[:], point[:]); err != nil { + // The only error condition for x25519 when the inputs are 32 bytes long + // is if the output would have been the all-zero value. + for i := range dst { + dst[i] = 0 + } + } } // ScalarBaseMult sets dst to the product scalar * base where base is the @@ -25,7 +33,12 @@ func ScalarMult(dst, scalar, point *[32]byte) { // It is recommended to use the X25519 function with Basepoint instead, as // copying into fixed size arrays can lead to unexpected bugs. func ScalarBaseMult(dst, scalar *[32]byte) { - scalarBaseMult(dst, scalar) + curve := ecdh.X25519() + priv, err := curve.NewPrivateKey(scalar[:]) + if err != nil { + panic("curve25519: internal error: scalarBaseMult was not 32 bytes") + } + copy(dst[:], priv.PublicKey().Bytes()) } const ( @@ -57,3 +70,21 @@ func X25519(scalar, point []byte) ([]byte, error) { var dst [32]byte return x25519(&dst, scalar, point) } + +func x25519(dst *[32]byte, scalar, point []byte) ([]byte, error) { + curve := ecdh.X25519() + pub, err := curve.NewPublicKey(point) + if err != nil { + return nil, err + } + priv, err := curve.NewPrivateKey(scalar) + if err != nil { + return nil, err + } + out, err := priv.ECDH(pub) + if err != nil { + return nil, err + } + copy(dst[:], out) + return dst[:], nil +} diff --git a/vendor/golang.org/x/crypto/curve25519/curve25519_compat.go b/vendor/golang.org/x/crypto/curve25519/curve25519_compat.go deleted file mode 100644 index ba647e8..0000000 --- a/vendor/golang.org/x/crypto/curve25519/curve25519_compat.go +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !go1.20 - -package curve25519 - -import ( - "crypto/subtle" - "errors" - "strconv" - - "golang.org/x/crypto/curve25519/internal/field" -) - -func scalarMult(dst, scalar, point *[32]byte) { - var e [32]byte - - copy(e[:], scalar[:]) - e[0] &= 248 - e[31] &= 127 - e[31] |= 64 - - var x1, x2, z2, x3, z3, tmp0, tmp1 field.Element - x1.SetBytes(point[:]) - x2.One() - x3.Set(&x1) - z3.One() - - swap := 0 - for pos := 254; pos >= 0; pos-- { - b := e[pos/8] >> uint(pos&7) - b &= 1 - swap ^= int(b) - x2.Swap(&x3, swap) - z2.Swap(&z3, swap) - swap = int(b) - - tmp0.Subtract(&x3, &z3) - tmp1.Subtract(&x2, &z2) - x2.Add(&x2, &z2) - z2.Add(&x3, &z3) - z3.Multiply(&tmp0, &x2) - z2.Multiply(&z2, &tmp1) - tmp0.Square(&tmp1) - tmp1.Square(&x2) - x3.Add(&z3, &z2) - z2.Subtract(&z3, &z2) - x2.Multiply(&tmp1, &tmp0) - tmp1.Subtract(&tmp1, &tmp0) - z2.Square(&z2) - - z3.Mult32(&tmp1, 121666) - x3.Square(&x3) - tmp0.Add(&tmp0, &z3) - z3.Multiply(&x1, &z2) - z2.Multiply(&tmp1, &tmp0) - } - - x2.Swap(&x3, swap) - z2.Swap(&z3, swap) - - z2.Invert(&z2) - x2.Multiply(&x2, &z2) - copy(dst[:], x2.Bytes()) -} - -func scalarBaseMult(dst, scalar *[32]byte) { - checkBasepoint() - scalarMult(dst, scalar, &basePoint) -} - -func x25519(dst *[32]byte, scalar, point []byte) ([]byte, error) { - var in [32]byte - if l := len(scalar); l != 32 { - return nil, errors.New("bad scalar length: " + strconv.Itoa(l) + ", expected 32") - } - if l := len(point); l != 32 { - return nil, errors.New("bad point length: " + strconv.Itoa(l) + ", expected 32") - } - copy(in[:], scalar) - if &point[0] == &Basepoint[0] { - scalarBaseMult(dst, &in) - } else { - var base, zero [32]byte - copy(base[:], point) - scalarMult(dst, &in, &base) - if subtle.ConstantTimeCompare(dst[:], zero[:]) == 1 { - return nil, errors.New("bad input point: low order point") - } - } - return dst[:], nil -} - -func checkBasepoint() { - if subtle.ConstantTimeCompare(Basepoint, []byte{ - 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - }) != 1 { - panic("curve25519: global Basepoint value was modified") - } -} diff --git a/vendor/golang.org/x/crypto/curve25519/curve25519_go120.go b/vendor/golang.org/x/crypto/curve25519/curve25519_go120.go deleted file mode 100644 index 627df49..0000000 --- a/vendor/golang.org/x/crypto/curve25519/curve25519_go120.go +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build go1.20 - -package curve25519 - -import "crypto/ecdh" - -func x25519(dst *[32]byte, scalar, point []byte) ([]byte, error) { - curve := ecdh.X25519() - pub, err := curve.NewPublicKey(point) - if err != nil { - return nil, err - } - priv, err := curve.NewPrivateKey(scalar) - if err != nil { - return nil, err - } - out, err := priv.ECDH(pub) - if err != nil { - return nil, err - } - copy(dst[:], out) - return dst[:], nil -} - -func scalarMult(dst, scalar, point *[32]byte) { - if _, err := x25519(dst, scalar[:], point[:]); err != nil { - // The only error condition for x25519 when the inputs are 32 bytes long - // is if the output would have been the all-zero value. - for i := range dst { - dst[i] = 0 - } - } -} - -func scalarBaseMult(dst, scalar *[32]byte) { - curve := ecdh.X25519() - priv, err := curve.NewPrivateKey(scalar[:]) - if err != nil { - panic("curve25519: internal error: scalarBaseMult was not 32 bytes") - } - copy(dst[:], priv.PublicKey().Bytes()) -} diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/README b/vendor/golang.org/x/crypto/curve25519/internal/field/README deleted file mode 100644 index e25bca7..0000000 --- a/vendor/golang.org/x/crypto/curve25519/internal/field/README +++ /dev/null @@ -1,7 +0,0 @@ -This package is kept in sync with crypto/ed25519/internal/edwards25519/field in -the standard library. - -If there are any changes in the standard library that need to be synced to this -package, run sync.sh. It will not overwrite any local changes made since the -previous sync, so it's ok to land changes in this package first, and then sync -to the standard library later. diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/fe.go b/vendor/golang.org/x/crypto/curve25519/internal/field/fe.go deleted file mode 100644 index ca841ad..0000000 --- a/vendor/golang.org/x/crypto/curve25519/internal/field/fe.go +++ /dev/null @@ -1,416 +0,0 @@ -// Copyright (c) 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package field implements fast arithmetic modulo 2^255-19. -package field - -import ( - "crypto/subtle" - "encoding/binary" - "math/bits" -) - -// Element represents an element of the field GF(2^255-19). Note that this -// is not a cryptographically secure group, and should only be used to interact -// with edwards25519.Point coordinates. -// -// This type works similarly to math/big.Int, and all arguments and receivers -// are allowed to alias. -// -// The zero value is a valid zero element. -type Element struct { - // An element t represents the integer - // t.l0 + t.l1*2^51 + t.l2*2^102 + t.l3*2^153 + t.l4*2^204 - // - // Between operations, all limbs are expected to be lower than 2^52. - l0 uint64 - l1 uint64 - l2 uint64 - l3 uint64 - l4 uint64 -} - -const maskLow51Bits uint64 = (1 << 51) - 1 - -var feZero = &Element{0, 0, 0, 0, 0} - -// Zero sets v = 0, and returns v. -func (v *Element) Zero() *Element { - *v = *feZero - return v -} - -var feOne = &Element{1, 0, 0, 0, 0} - -// One sets v = 1, and returns v. -func (v *Element) One() *Element { - *v = *feOne - return v -} - -// reduce reduces v modulo 2^255 - 19 and returns it. -func (v *Element) reduce() *Element { - v.carryPropagate() - - // After the light reduction we now have a field element representation - // v < 2^255 + 2^13 * 19, but need v < 2^255 - 19. - - // If v >= 2^255 - 19, then v + 19 >= 2^255, which would overflow 2^255 - 1, - // generating a carry. That is, c will be 0 if v < 2^255 - 19, and 1 otherwise. - c := (v.l0 + 19) >> 51 - c = (v.l1 + c) >> 51 - c = (v.l2 + c) >> 51 - c = (v.l3 + c) >> 51 - c = (v.l4 + c) >> 51 - - // If v < 2^255 - 19 and c = 0, this will be a no-op. Otherwise, it's - // effectively applying the reduction identity to the carry. - v.l0 += 19 * c - - v.l1 += v.l0 >> 51 - v.l0 = v.l0 & maskLow51Bits - v.l2 += v.l1 >> 51 - v.l1 = v.l1 & maskLow51Bits - v.l3 += v.l2 >> 51 - v.l2 = v.l2 & maskLow51Bits - v.l4 += v.l3 >> 51 - v.l3 = v.l3 & maskLow51Bits - // no additional carry - v.l4 = v.l4 & maskLow51Bits - - return v -} - -// Add sets v = a + b, and returns v. -func (v *Element) Add(a, b *Element) *Element { - v.l0 = a.l0 + b.l0 - v.l1 = a.l1 + b.l1 - v.l2 = a.l2 + b.l2 - v.l3 = a.l3 + b.l3 - v.l4 = a.l4 + b.l4 - // Using the generic implementation here is actually faster than the - // assembly. Probably because the body of this function is so simple that - // the compiler can figure out better optimizations by inlining the carry - // propagation. TODO - return v.carryPropagateGeneric() -} - -// Subtract sets v = a - b, and returns v. -func (v *Element) Subtract(a, b *Element) *Element { - // We first add 2 * p, to guarantee the subtraction won't underflow, and - // then subtract b (which can be up to 2^255 + 2^13 * 19). - v.l0 = (a.l0 + 0xFFFFFFFFFFFDA) - b.l0 - v.l1 = (a.l1 + 0xFFFFFFFFFFFFE) - b.l1 - v.l2 = (a.l2 + 0xFFFFFFFFFFFFE) - b.l2 - v.l3 = (a.l3 + 0xFFFFFFFFFFFFE) - b.l3 - v.l4 = (a.l4 + 0xFFFFFFFFFFFFE) - b.l4 - return v.carryPropagate() -} - -// Negate sets v = -a, and returns v. -func (v *Element) Negate(a *Element) *Element { - return v.Subtract(feZero, a) -} - -// Invert sets v = 1/z mod p, and returns v. -// -// If z == 0, Invert returns v = 0. -func (v *Element) Invert(z *Element) *Element { - // Inversion is implemented as exponentiation with exponent p − 2. It uses the - // same sequence of 255 squarings and 11 multiplications as [Curve25519]. - var z2, z9, z11, z2_5_0, z2_10_0, z2_20_0, z2_50_0, z2_100_0, t Element - - z2.Square(z) // 2 - t.Square(&z2) // 4 - t.Square(&t) // 8 - z9.Multiply(&t, z) // 9 - z11.Multiply(&z9, &z2) // 11 - t.Square(&z11) // 22 - z2_5_0.Multiply(&t, &z9) // 31 = 2^5 - 2^0 - - t.Square(&z2_5_0) // 2^6 - 2^1 - for i := 0; i < 4; i++ { - t.Square(&t) // 2^10 - 2^5 - } - z2_10_0.Multiply(&t, &z2_5_0) // 2^10 - 2^0 - - t.Square(&z2_10_0) // 2^11 - 2^1 - for i := 0; i < 9; i++ { - t.Square(&t) // 2^20 - 2^10 - } - z2_20_0.Multiply(&t, &z2_10_0) // 2^20 - 2^0 - - t.Square(&z2_20_0) // 2^21 - 2^1 - for i := 0; i < 19; i++ { - t.Square(&t) // 2^40 - 2^20 - } - t.Multiply(&t, &z2_20_0) // 2^40 - 2^0 - - t.Square(&t) // 2^41 - 2^1 - for i := 0; i < 9; i++ { - t.Square(&t) // 2^50 - 2^10 - } - z2_50_0.Multiply(&t, &z2_10_0) // 2^50 - 2^0 - - t.Square(&z2_50_0) // 2^51 - 2^1 - for i := 0; i < 49; i++ { - t.Square(&t) // 2^100 - 2^50 - } - z2_100_0.Multiply(&t, &z2_50_0) // 2^100 - 2^0 - - t.Square(&z2_100_0) // 2^101 - 2^1 - for i := 0; i < 99; i++ { - t.Square(&t) // 2^200 - 2^100 - } - t.Multiply(&t, &z2_100_0) // 2^200 - 2^0 - - t.Square(&t) // 2^201 - 2^1 - for i := 0; i < 49; i++ { - t.Square(&t) // 2^250 - 2^50 - } - t.Multiply(&t, &z2_50_0) // 2^250 - 2^0 - - t.Square(&t) // 2^251 - 2^1 - t.Square(&t) // 2^252 - 2^2 - t.Square(&t) // 2^253 - 2^3 - t.Square(&t) // 2^254 - 2^4 - t.Square(&t) // 2^255 - 2^5 - - return v.Multiply(&t, &z11) // 2^255 - 21 -} - -// Set sets v = a, and returns v. -func (v *Element) Set(a *Element) *Element { - *v = *a - return v -} - -// SetBytes sets v to x, which must be a 32-byte little-endian encoding. -// -// Consistent with RFC 7748, the most significant bit (the high bit of the -// last byte) is ignored, and non-canonical values (2^255-19 through 2^255-1) -// are accepted. Note that this is laxer than specified by RFC 8032. -func (v *Element) SetBytes(x []byte) *Element { - if len(x) != 32 { - panic("edwards25519: invalid field element input size") - } - - // Bits 0:51 (bytes 0:8, bits 0:64, shift 0, mask 51). - v.l0 = binary.LittleEndian.Uint64(x[0:8]) - v.l0 &= maskLow51Bits - // Bits 51:102 (bytes 6:14, bits 48:112, shift 3, mask 51). - v.l1 = binary.LittleEndian.Uint64(x[6:14]) >> 3 - v.l1 &= maskLow51Bits - // Bits 102:153 (bytes 12:20, bits 96:160, shift 6, mask 51). - v.l2 = binary.LittleEndian.Uint64(x[12:20]) >> 6 - v.l2 &= maskLow51Bits - // Bits 153:204 (bytes 19:27, bits 152:216, shift 1, mask 51). - v.l3 = binary.LittleEndian.Uint64(x[19:27]) >> 1 - v.l3 &= maskLow51Bits - // Bits 204:251 (bytes 24:32, bits 192:256, shift 12, mask 51). - // Note: not bytes 25:33, shift 4, to avoid overread. - v.l4 = binary.LittleEndian.Uint64(x[24:32]) >> 12 - v.l4 &= maskLow51Bits - - return v -} - -// Bytes returns the canonical 32-byte little-endian encoding of v. -func (v *Element) Bytes() []byte { - // This function is outlined to make the allocations inline in the caller - // rather than happen on the heap. - var out [32]byte - return v.bytes(&out) -} - -func (v *Element) bytes(out *[32]byte) []byte { - t := *v - t.reduce() - - var buf [8]byte - for i, l := range [5]uint64{t.l0, t.l1, t.l2, t.l3, t.l4} { - bitsOffset := i * 51 - binary.LittleEndian.PutUint64(buf[:], l<= len(out) { - break - } - out[off] |= bb - } - } - - return out[:] -} - -// Equal returns 1 if v and u are equal, and 0 otherwise. -func (v *Element) Equal(u *Element) int { - sa, sv := u.Bytes(), v.Bytes() - return subtle.ConstantTimeCompare(sa, sv) -} - -// mask64Bits returns 0xffffffff if cond is 1, and 0 otherwise. -func mask64Bits(cond int) uint64 { return ^(uint64(cond) - 1) } - -// Select sets v to a if cond == 1, and to b if cond == 0. -func (v *Element) Select(a, b *Element, cond int) *Element { - m := mask64Bits(cond) - v.l0 = (m & a.l0) | (^m & b.l0) - v.l1 = (m & a.l1) | (^m & b.l1) - v.l2 = (m & a.l2) | (^m & b.l2) - v.l3 = (m & a.l3) | (^m & b.l3) - v.l4 = (m & a.l4) | (^m & b.l4) - return v -} - -// Swap swaps v and u if cond == 1 or leaves them unchanged if cond == 0, and returns v. -func (v *Element) Swap(u *Element, cond int) { - m := mask64Bits(cond) - t := m & (v.l0 ^ u.l0) - v.l0 ^= t - u.l0 ^= t - t = m & (v.l1 ^ u.l1) - v.l1 ^= t - u.l1 ^= t - t = m & (v.l2 ^ u.l2) - v.l2 ^= t - u.l2 ^= t - t = m & (v.l3 ^ u.l3) - v.l3 ^= t - u.l3 ^= t - t = m & (v.l4 ^ u.l4) - v.l4 ^= t - u.l4 ^= t -} - -// IsNegative returns 1 if v is negative, and 0 otherwise. -func (v *Element) IsNegative() int { - return int(v.Bytes()[0] & 1) -} - -// Absolute sets v to |u|, and returns v. -func (v *Element) Absolute(u *Element) *Element { - return v.Select(new(Element).Negate(u), u, u.IsNegative()) -} - -// Multiply sets v = x * y, and returns v. -func (v *Element) Multiply(x, y *Element) *Element { - feMul(v, x, y) - return v -} - -// Square sets v = x * x, and returns v. -func (v *Element) Square(x *Element) *Element { - feSquare(v, x) - return v -} - -// Mult32 sets v = x * y, and returns v. -func (v *Element) Mult32(x *Element, y uint32) *Element { - x0lo, x0hi := mul51(x.l0, y) - x1lo, x1hi := mul51(x.l1, y) - x2lo, x2hi := mul51(x.l2, y) - x3lo, x3hi := mul51(x.l3, y) - x4lo, x4hi := mul51(x.l4, y) - v.l0 = x0lo + 19*x4hi // carried over per the reduction identity - v.l1 = x1lo + x0hi - v.l2 = x2lo + x1hi - v.l3 = x3lo + x2hi - v.l4 = x4lo + x3hi - // The hi portions are going to be only 32 bits, plus any previous excess, - // so we can skip the carry propagation. - return v -} - -// mul51 returns lo + hi * 2⁵¹ = a * b. -func mul51(a uint64, b uint32) (lo uint64, hi uint64) { - mh, ml := bits.Mul64(a, uint64(b)) - lo = ml & maskLow51Bits - hi = (mh << 13) | (ml >> 51) - return -} - -// Pow22523 set v = x^((p-5)/8), and returns v. (p-5)/8 is 2^252-3. -func (v *Element) Pow22523(x *Element) *Element { - var t0, t1, t2 Element - - t0.Square(x) // x^2 - t1.Square(&t0) // x^4 - t1.Square(&t1) // x^8 - t1.Multiply(x, &t1) // x^9 - t0.Multiply(&t0, &t1) // x^11 - t0.Square(&t0) // x^22 - t0.Multiply(&t1, &t0) // x^31 - t1.Square(&t0) // x^62 - for i := 1; i < 5; i++ { // x^992 - t1.Square(&t1) - } - t0.Multiply(&t1, &t0) // x^1023 -> 1023 = 2^10 - 1 - t1.Square(&t0) // 2^11 - 2 - for i := 1; i < 10; i++ { // 2^20 - 2^10 - t1.Square(&t1) - } - t1.Multiply(&t1, &t0) // 2^20 - 1 - t2.Square(&t1) // 2^21 - 2 - for i := 1; i < 20; i++ { // 2^40 - 2^20 - t2.Square(&t2) - } - t1.Multiply(&t2, &t1) // 2^40 - 1 - t1.Square(&t1) // 2^41 - 2 - for i := 1; i < 10; i++ { // 2^50 - 2^10 - t1.Square(&t1) - } - t0.Multiply(&t1, &t0) // 2^50 - 1 - t1.Square(&t0) // 2^51 - 2 - for i := 1; i < 50; i++ { // 2^100 - 2^50 - t1.Square(&t1) - } - t1.Multiply(&t1, &t0) // 2^100 - 1 - t2.Square(&t1) // 2^101 - 2 - for i := 1; i < 100; i++ { // 2^200 - 2^100 - t2.Square(&t2) - } - t1.Multiply(&t2, &t1) // 2^200 - 1 - t1.Square(&t1) // 2^201 - 2 - for i := 1; i < 50; i++ { // 2^250 - 2^50 - t1.Square(&t1) - } - t0.Multiply(&t1, &t0) // 2^250 - 1 - t0.Square(&t0) // 2^251 - 2 - t0.Square(&t0) // 2^252 - 4 - return v.Multiply(&t0, x) // 2^252 - 3 -> x^(2^252-3) -} - -// sqrtM1 is 2^((p-1)/4), which squared is equal to -1 by Euler's Criterion. -var sqrtM1 = &Element{1718705420411056, 234908883556509, - 2233514472574048, 2117202627021982, 765476049583133} - -// SqrtRatio sets r to the non-negative square root of the ratio of u and v. -// -// If u/v is square, SqrtRatio returns r and 1. If u/v is not square, SqrtRatio -// sets r according to Section 4.3 of draft-irtf-cfrg-ristretto255-decaf448-00, -// and returns r and 0. -func (r *Element) SqrtRatio(u, v *Element) (rr *Element, wasSquare int) { - var a, b Element - - // r = (u * v3) * (u * v7)^((p-5)/8) - v2 := a.Square(v) - uv3 := b.Multiply(u, b.Multiply(v2, v)) - uv7 := a.Multiply(uv3, a.Square(v2)) - r.Multiply(uv3, r.Pow22523(uv7)) - - check := a.Multiply(v, a.Square(r)) // check = v * r^2 - - uNeg := b.Negate(u) - correctSignSqrt := check.Equal(u) - flippedSignSqrt := check.Equal(uNeg) - flippedSignSqrtI := check.Equal(uNeg.Multiply(uNeg, sqrtM1)) - - rPrime := b.Multiply(r, sqrtM1) // r_prime = SQRT_M1 * r - // r = CT_SELECT(r_prime IF flipped_sign_sqrt | flipped_sign_sqrt_i ELSE r) - r.Select(rPrime, r, flippedSignSqrt|flippedSignSqrtI) - - r.Absolute(r) // Choose the nonnegative square root. - return r, correctSignSqrt | flippedSignSqrt -} diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_amd64.go b/vendor/golang.org/x/crypto/curve25519/internal/field/fe_amd64.go deleted file mode 100644 index 70c5416..0000000 --- a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_amd64.go +++ /dev/null @@ -1,15 +0,0 @@ -// Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field. DO NOT EDIT. - -//go:build amd64 && gc && !purego - -package field - -// feMul sets out = a * b. It works like feMulGeneric. -// -//go:noescape -func feMul(out *Element, a *Element, b *Element) - -// feSquare sets out = a * a. It works like feSquareGeneric. -// -//go:noescape -func feSquare(out *Element, a *Element) diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_amd64.s b/vendor/golang.org/x/crypto/curve25519/internal/field/fe_amd64.s deleted file mode 100644 index 60817ac..0000000 --- a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_amd64.s +++ /dev/null @@ -1,378 +0,0 @@ -// Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field. DO NOT EDIT. - -//go:build amd64 && gc && !purego - -#include "textflag.h" - -// func feMul(out *Element, a *Element, b *Element) -TEXT ·feMul(SB), NOSPLIT, $0-24 - MOVQ a+8(FP), CX - MOVQ b+16(FP), BX - - // r0 = a0×b0 - MOVQ (CX), AX - MULQ (BX) - MOVQ AX, DI - MOVQ DX, SI - - // r0 += 19×a1×b4 - MOVQ 8(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 32(BX) - ADDQ AX, DI - ADCQ DX, SI - - // r0 += 19×a2×b3 - MOVQ 16(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 24(BX) - ADDQ AX, DI - ADCQ DX, SI - - // r0 += 19×a3×b2 - MOVQ 24(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 16(BX) - ADDQ AX, DI - ADCQ DX, SI - - // r0 += 19×a4×b1 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 8(BX) - ADDQ AX, DI - ADCQ DX, SI - - // r1 = a0×b1 - MOVQ (CX), AX - MULQ 8(BX) - MOVQ AX, R9 - MOVQ DX, R8 - - // r1 += a1×b0 - MOVQ 8(CX), AX - MULQ (BX) - ADDQ AX, R9 - ADCQ DX, R8 - - // r1 += 19×a2×b4 - MOVQ 16(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 32(BX) - ADDQ AX, R9 - ADCQ DX, R8 - - // r1 += 19×a3×b3 - MOVQ 24(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 24(BX) - ADDQ AX, R9 - ADCQ DX, R8 - - // r1 += 19×a4×b2 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 16(BX) - ADDQ AX, R9 - ADCQ DX, R8 - - // r2 = a0×b2 - MOVQ (CX), AX - MULQ 16(BX) - MOVQ AX, R11 - MOVQ DX, R10 - - // r2 += a1×b1 - MOVQ 8(CX), AX - MULQ 8(BX) - ADDQ AX, R11 - ADCQ DX, R10 - - // r2 += a2×b0 - MOVQ 16(CX), AX - MULQ (BX) - ADDQ AX, R11 - ADCQ DX, R10 - - // r2 += 19×a3×b4 - MOVQ 24(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 32(BX) - ADDQ AX, R11 - ADCQ DX, R10 - - // r2 += 19×a4×b3 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 24(BX) - ADDQ AX, R11 - ADCQ DX, R10 - - // r3 = a0×b3 - MOVQ (CX), AX - MULQ 24(BX) - MOVQ AX, R13 - MOVQ DX, R12 - - // r3 += a1×b2 - MOVQ 8(CX), AX - MULQ 16(BX) - ADDQ AX, R13 - ADCQ DX, R12 - - // r3 += a2×b1 - MOVQ 16(CX), AX - MULQ 8(BX) - ADDQ AX, R13 - ADCQ DX, R12 - - // r3 += a3×b0 - MOVQ 24(CX), AX - MULQ (BX) - ADDQ AX, R13 - ADCQ DX, R12 - - // r3 += 19×a4×b4 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 32(BX) - ADDQ AX, R13 - ADCQ DX, R12 - - // r4 = a0×b4 - MOVQ (CX), AX - MULQ 32(BX) - MOVQ AX, R15 - MOVQ DX, R14 - - // r4 += a1×b3 - MOVQ 8(CX), AX - MULQ 24(BX) - ADDQ AX, R15 - ADCQ DX, R14 - - // r4 += a2×b2 - MOVQ 16(CX), AX - MULQ 16(BX) - ADDQ AX, R15 - ADCQ DX, R14 - - // r4 += a3×b1 - MOVQ 24(CX), AX - MULQ 8(BX) - ADDQ AX, R15 - ADCQ DX, R14 - - // r4 += a4×b0 - MOVQ 32(CX), AX - MULQ (BX) - ADDQ AX, R15 - ADCQ DX, R14 - - // First reduction chain - MOVQ $0x0007ffffffffffff, AX - SHLQ $0x0d, DI, SI - SHLQ $0x0d, R9, R8 - SHLQ $0x0d, R11, R10 - SHLQ $0x0d, R13, R12 - SHLQ $0x0d, R15, R14 - ANDQ AX, DI - IMUL3Q $0x13, R14, R14 - ADDQ R14, DI - ANDQ AX, R9 - ADDQ SI, R9 - ANDQ AX, R11 - ADDQ R8, R11 - ANDQ AX, R13 - ADDQ R10, R13 - ANDQ AX, R15 - ADDQ R12, R15 - - // Second reduction chain (carryPropagate) - MOVQ DI, SI - SHRQ $0x33, SI - MOVQ R9, R8 - SHRQ $0x33, R8 - MOVQ R11, R10 - SHRQ $0x33, R10 - MOVQ R13, R12 - SHRQ $0x33, R12 - MOVQ R15, R14 - SHRQ $0x33, R14 - ANDQ AX, DI - IMUL3Q $0x13, R14, R14 - ADDQ R14, DI - ANDQ AX, R9 - ADDQ SI, R9 - ANDQ AX, R11 - ADDQ R8, R11 - ANDQ AX, R13 - ADDQ R10, R13 - ANDQ AX, R15 - ADDQ R12, R15 - - // Store output - MOVQ out+0(FP), AX - MOVQ DI, (AX) - MOVQ R9, 8(AX) - MOVQ R11, 16(AX) - MOVQ R13, 24(AX) - MOVQ R15, 32(AX) - RET - -// func feSquare(out *Element, a *Element) -TEXT ·feSquare(SB), NOSPLIT, $0-16 - MOVQ a+8(FP), CX - - // r0 = l0×l0 - MOVQ (CX), AX - MULQ (CX) - MOVQ AX, SI - MOVQ DX, BX - - // r0 += 38×l1×l4 - MOVQ 8(CX), AX - IMUL3Q $0x26, AX, AX - MULQ 32(CX) - ADDQ AX, SI - ADCQ DX, BX - - // r0 += 38×l2×l3 - MOVQ 16(CX), AX - IMUL3Q $0x26, AX, AX - MULQ 24(CX) - ADDQ AX, SI - ADCQ DX, BX - - // r1 = 2×l0×l1 - MOVQ (CX), AX - SHLQ $0x01, AX - MULQ 8(CX) - MOVQ AX, R8 - MOVQ DX, DI - - // r1 += 38×l2×l4 - MOVQ 16(CX), AX - IMUL3Q $0x26, AX, AX - MULQ 32(CX) - ADDQ AX, R8 - ADCQ DX, DI - - // r1 += 19×l3×l3 - MOVQ 24(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 24(CX) - ADDQ AX, R8 - ADCQ DX, DI - - // r2 = 2×l0×l2 - MOVQ (CX), AX - SHLQ $0x01, AX - MULQ 16(CX) - MOVQ AX, R10 - MOVQ DX, R9 - - // r2 += l1×l1 - MOVQ 8(CX), AX - MULQ 8(CX) - ADDQ AX, R10 - ADCQ DX, R9 - - // r2 += 38×l3×l4 - MOVQ 24(CX), AX - IMUL3Q $0x26, AX, AX - MULQ 32(CX) - ADDQ AX, R10 - ADCQ DX, R9 - - // r3 = 2×l0×l3 - MOVQ (CX), AX - SHLQ $0x01, AX - MULQ 24(CX) - MOVQ AX, R12 - MOVQ DX, R11 - - // r3 += 2×l1×l2 - MOVQ 8(CX), AX - IMUL3Q $0x02, AX, AX - MULQ 16(CX) - ADDQ AX, R12 - ADCQ DX, R11 - - // r3 += 19×l4×l4 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 32(CX) - ADDQ AX, R12 - ADCQ DX, R11 - - // r4 = 2×l0×l4 - MOVQ (CX), AX - SHLQ $0x01, AX - MULQ 32(CX) - MOVQ AX, R14 - MOVQ DX, R13 - - // r4 += 2×l1×l3 - MOVQ 8(CX), AX - IMUL3Q $0x02, AX, AX - MULQ 24(CX) - ADDQ AX, R14 - ADCQ DX, R13 - - // r4 += l2×l2 - MOVQ 16(CX), AX - MULQ 16(CX) - ADDQ AX, R14 - ADCQ DX, R13 - - // First reduction chain - MOVQ $0x0007ffffffffffff, AX - SHLQ $0x0d, SI, BX - SHLQ $0x0d, R8, DI - SHLQ $0x0d, R10, R9 - SHLQ $0x0d, R12, R11 - SHLQ $0x0d, R14, R13 - ANDQ AX, SI - IMUL3Q $0x13, R13, R13 - ADDQ R13, SI - ANDQ AX, R8 - ADDQ BX, R8 - ANDQ AX, R10 - ADDQ DI, R10 - ANDQ AX, R12 - ADDQ R9, R12 - ANDQ AX, R14 - ADDQ R11, R14 - - // Second reduction chain (carryPropagate) - MOVQ SI, BX - SHRQ $0x33, BX - MOVQ R8, DI - SHRQ $0x33, DI - MOVQ R10, R9 - SHRQ $0x33, R9 - MOVQ R12, R11 - SHRQ $0x33, R11 - MOVQ R14, R13 - SHRQ $0x33, R13 - ANDQ AX, SI - IMUL3Q $0x13, R13, R13 - ADDQ R13, SI - ANDQ AX, R8 - ADDQ BX, R8 - ANDQ AX, R10 - ADDQ DI, R10 - ANDQ AX, R12 - ADDQ R9, R12 - ANDQ AX, R14 - ADDQ R11, R14 - - // Store output - MOVQ out+0(FP), AX - MOVQ SI, (AX) - MOVQ R8, 8(AX) - MOVQ R10, 16(AX) - MOVQ R12, 24(AX) - MOVQ R14, 32(AX) - RET diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_amd64_noasm.go b/vendor/golang.org/x/crypto/curve25519/internal/field/fe_amd64_noasm.go deleted file mode 100644 index 9da280d..0000000 --- a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_amd64_noasm.go +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright (c) 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !amd64 || !gc || purego - -package field - -func feMul(v, x, y *Element) { feMulGeneric(v, x, y) } - -func feSquare(v, x *Element) { feSquareGeneric(v, x) } diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_arm64.go b/vendor/golang.org/x/crypto/curve25519/internal/field/fe_arm64.go deleted file mode 100644 index 075fe9b..0000000 --- a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_arm64.go +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (c) 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build arm64 && gc && !purego - -package field - -//go:noescape -func carryPropagate(v *Element) - -func (v *Element) carryPropagate() *Element { - carryPropagate(v) - return v -} diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_arm64.s b/vendor/golang.org/x/crypto/curve25519/internal/field/fe_arm64.s deleted file mode 100644 index 3126a43..0000000 --- a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_arm64.s +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build arm64 && gc && !purego - -#include "textflag.h" - -// carryPropagate works exactly like carryPropagateGeneric and uses the -// same AND, ADD, and LSR+MADD instructions emitted by the compiler, but -// avoids loading R0-R4 twice and uses LDP and STP. -// -// See https://golang.org/issues/43145 for the main compiler issue. -// -// func carryPropagate(v *Element) -TEXT ·carryPropagate(SB),NOFRAME|NOSPLIT,$0-8 - MOVD v+0(FP), R20 - - LDP 0(R20), (R0, R1) - LDP 16(R20), (R2, R3) - MOVD 32(R20), R4 - - AND $0x7ffffffffffff, R0, R10 - AND $0x7ffffffffffff, R1, R11 - AND $0x7ffffffffffff, R2, R12 - AND $0x7ffffffffffff, R3, R13 - AND $0x7ffffffffffff, R4, R14 - - ADD R0>>51, R11, R11 - ADD R1>>51, R12, R12 - ADD R2>>51, R13, R13 - ADD R3>>51, R14, R14 - // R4>>51 * 19 + R10 -> R10 - LSR $51, R4, R21 - MOVD $19, R22 - MADD R22, R10, R21, R10 - - STP (R10, R11), 0(R20) - STP (R12, R13), 16(R20) - MOVD R14, 32(R20) - - RET diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_arm64_noasm.go b/vendor/golang.org/x/crypto/curve25519/internal/field/fe_arm64_noasm.go deleted file mode 100644 index fc029ac..0000000 --- a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_arm64_noasm.go +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright (c) 2021 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !arm64 || !gc || purego - -package field - -func (v *Element) carryPropagate() *Element { - return v.carryPropagateGeneric() -} diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_generic.go b/vendor/golang.org/x/crypto/curve25519/internal/field/fe_generic.go deleted file mode 100644 index 2671217..0000000 --- a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_generic.go +++ /dev/null @@ -1,264 +0,0 @@ -// Copyright (c) 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package field - -import "math/bits" - -// uint128 holds a 128-bit number as two 64-bit limbs, for use with the -// bits.Mul64 and bits.Add64 intrinsics. -type uint128 struct { - lo, hi uint64 -} - -// mul64 returns a * b. -func mul64(a, b uint64) uint128 { - hi, lo := bits.Mul64(a, b) - return uint128{lo, hi} -} - -// addMul64 returns v + a * b. -func addMul64(v uint128, a, b uint64) uint128 { - hi, lo := bits.Mul64(a, b) - lo, c := bits.Add64(lo, v.lo, 0) - hi, _ = bits.Add64(hi, v.hi, c) - return uint128{lo, hi} -} - -// shiftRightBy51 returns a >> 51. a is assumed to be at most 115 bits. -func shiftRightBy51(a uint128) uint64 { - return (a.hi << (64 - 51)) | (a.lo >> 51) -} - -func feMulGeneric(v, a, b *Element) { - a0 := a.l0 - a1 := a.l1 - a2 := a.l2 - a3 := a.l3 - a4 := a.l4 - - b0 := b.l0 - b1 := b.l1 - b2 := b.l2 - b3 := b.l3 - b4 := b.l4 - - // Limb multiplication works like pen-and-paper columnar multiplication, but - // with 51-bit limbs instead of digits. - // - // a4 a3 a2 a1 a0 x - // b4 b3 b2 b1 b0 = - // ------------------------ - // a4b0 a3b0 a2b0 a1b0 a0b0 + - // a4b1 a3b1 a2b1 a1b1 a0b1 + - // a4b2 a3b2 a2b2 a1b2 a0b2 + - // a4b3 a3b3 a2b3 a1b3 a0b3 + - // a4b4 a3b4 a2b4 a1b4 a0b4 = - // ---------------------------------------------- - // r8 r7 r6 r5 r4 r3 r2 r1 r0 - // - // We can then use the reduction identity (a * 2²⁵⁵ + b = a * 19 + b) to - // reduce the limbs that would overflow 255 bits. r5 * 2²⁵⁵ becomes 19 * r5, - // r6 * 2³⁰⁶ becomes 19 * r6 * 2⁵¹, etc. - // - // Reduction can be carried out simultaneously to multiplication. For - // example, we do not compute r5: whenever the result of a multiplication - // belongs to r5, like a1b4, we multiply it by 19 and add the result to r0. - // - // a4b0 a3b0 a2b0 a1b0 a0b0 + - // a3b1 a2b1 a1b1 a0b1 19×a4b1 + - // a2b2 a1b2 a0b2 19×a4b2 19×a3b2 + - // a1b3 a0b3 19×a4b3 19×a3b3 19×a2b3 + - // a0b4 19×a4b4 19×a3b4 19×a2b4 19×a1b4 = - // -------------------------------------- - // r4 r3 r2 r1 r0 - // - // Finally we add up the columns into wide, overlapping limbs. - - a1_19 := a1 * 19 - a2_19 := a2 * 19 - a3_19 := a3 * 19 - a4_19 := a4 * 19 - - // r0 = a0×b0 + 19×(a1×b4 + a2×b3 + a3×b2 + a4×b1) - r0 := mul64(a0, b0) - r0 = addMul64(r0, a1_19, b4) - r0 = addMul64(r0, a2_19, b3) - r0 = addMul64(r0, a3_19, b2) - r0 = addMul64(r0, a4_19, b1) - - // r1 = a0×b1 + a1×b0 + 19×(a2×b4 + a3×b3 + a4×b2) - r1 := mul64(a0, b1) - r1 = addMul64(r1, a1, b0) - r1 = addMul64(r1, a2_19, b4) - r1 = addMul64(r1, a3_19, b3) - r1 = addMul64(r1, a4_19, b2) - - // r2 = a0×b2 + a1×b1 + a2×b0 + 19×(a3×b4 + a4×b3) - r2 := mul64(a0, b2) - r2 = addMul64(r2, a1, b1) - r2 = addMul64(r2, a2, b0) - r2 = addMul64(r2, a3_19, b4) - r2 = addMul64(r2, a4_19, b3) - - // r3 = a0×b3 + a1×b2 + a2×b1 + a3×b0 + 19×a4×b4 - r3 := mul64(a0, b3) - r3 = addMul64(r3, a1, b2) - r3 = addMul64(r3, a2, b1) - r3 = addMul64(r3, a3, b0) - r3 = addMul64(r3, a4_19, b4) - - // r4 = a0×b4 + a1×b3 + a2×b2 + a3×b1 + a4×b0 - r4 := mul64(a0, b4) - r4 = addMul64(r4, a1, b3) - r4 = addMul64(r4, a2, b2) - r4 = addMul64(r4, a3, b1) - r4 = addMul64(r4, a4, b0) - - // After the multiplication, we need to reduce (carry) the five coefficients - // to obtain a result with limbs that are at most slightly larger than 2⁵¹, - // to respect the Element invariant. - // - // Overall, the reduction works the same as carryPropagate, except with - // wider inputs: we take the carry for each coefficient by shifting it right - // by 51, and add it to the limb above it. The top carry is multiplied by 19 - // according to the reduction identity and added to the lowest limb. - // - // The largest coefficient (r0) will be at most 111 bits, which guarantees - // that all carries are at most 111 - 51 = 60 bits, which fits in a uint64. - // - // r0 = a0×b0 + 19×(a1×b4 + a2×b3 + a3×b2 + a4×b1) - // r0 < 2⁵²×2⁵² + 19×(2⁵²×2⁵² + 2⁵²×2⁵² + 2⁵²×2⁵² + 2⁵²×2⁵²) - // r0 < (1 + 19 × 4) × 2⁵² × 2⁵² - // r0 < 2⁷ × 2⁵² × 2⁵² - // r0 < 2¹¹¹ - // - // Moreover, the top coefficient (r4) is at most 107 bits, so c4 is at most - // 56 bits, and c4 * 19 is at most 61 bits, which again fits in a uint64 and - // allows us to easily apply the reduction identity. - // - // r4 = a0×b4 + a1×b3 + a2×b2 + a3×b1 + a4×b0 - // r4 < 5 × 2⁵² × 2⁵² - // r4 < 2¹⁰⁷ - // - - c0 := shiftRightBy51(r0) - c1 := shiftRightBy51(r1) - c2 := shiftRightBy51(r2) - c3 := shiftRightBy51(r3) - c4 := shiftRightBy51(r4) - - rr0 := r0.lo&maskLow51Bits + c4*19 - rr1 := r1.lo&maskLow51Bits + c0 - rr2 := r2.lo&maskLow51Bits + c1 - rr3 := r3.lo&maskLow51Bits + c2 - rr4 := r4.lo&maskLow51Bits + c3 - - // Now all coefficients fit into 64-bit registers but are still too large to - // be passed around as a Element. We therefore do one last carry chain, - // where the carries will be small enough to fit in the wiggle room above 2⁵¹. - *v = Element{rr0, rr1, rr2, rr3, rr4} - v.carryPropagate() -} - -func feSquareGeneric(v, a *Element) { - l0 := a.l0 - l1 := a.l1 - l2 := a.l2 - l3 := a.l3 - l4 := a.l4 - - // Squaring works precisely like multiplication above, but thanks to its - // symmetry we get to group a few terms together. - // - // l4 l3 l2 l1 l0 x - // l4 l3 l2 l1 l0 = - // ------------------------ - // l4l0 l3l0 l2l0 l1l0 l0l0 + - // l4l1 l3l1 l2l1 l1l1 l0l1 + - // l4l2 l3l2 l2l2 l1l2 l0l2 + - // l4l3 l3l3 l2l3 l1l3 l0l3 + - // l4l4 l3l4 l2l4 l1l4 l0l4 = - // ---------------------------------------------- - // r8 r7 r6 r5 r4 r3 r2 r1 r0 - // - // l4l0 l3l0 l2l0 l1l0 l0l0 + - // l3l1 l2l1 l1l1 l0l1 19×l4l1 + - // l2l2 l1l2 l0l2 19×l4l2 19×l3l2 + - // l1l3 l0l3 19×l4l3 19×l3l3 19×l2l3 + - // l0l4 19×l4l4 19×l3l4 19×l2l4 19×l1l4 = - // -------------------------------------- - // r4 r3 r2 r1 r0 - // - // With precomputed 2×, 19×, and 2×19× terms, we can compute each limb with - // only three Mul64 and four Add64, instead of five and eight. - - l0_2 := l0 * 2 - l1_2 := l1 * 2 - - l1_38 := l1 * 38 - l2_38 := l2 * 38 - l3_38 := l3 * 38 - - l3_19 := l3 * 19 - l4_19 := l4 * 19 - - // r0 = l0×l0 + 19×(l1×l4 + l2×l3 + l3×l2 + l4×l1) = l0×l0 + 19×2×(l1×l4 + l2×l3) - r0 := mul64(l0, l0) - r0 = addMul64(r0, l1_38, l4) - r0 = addMul64(r0, l2_38, l3) - - // r1 = l0×l1 + l1×l0 + 19×(l2×l4 + l3×l3 + l4×l2) = 2×l0×l1 + 19×2×l2×l4 + 19×l3×l3 - r1 := mul64(l0_2, l1) - r1 = addMul64(r1, l2_38, l4) - r1 = addMul64(r1, l3_19, l3) - - // r2 = l0×l2 + l1×l1 + l2×l0 + 19×(l3×l4 + l4×l3) = 2×l0×l2 + l1×l1 + 19×2×l3×l4 - r2 := mul64(l0_2, l2) - r2 = addMul64(r2, l1, l1) - r2 = addMul64(r2, l3_38, l4) - - // r3 = l0×l3 + l1×l2 + l2×l1 + l3×l0 + 19×l4×l4 = 2×l0×l3 + 2×l1×l2 + 19×l4×l4 - r3 := mul64(l0_2, l3) - r3 = addMul64(r3, l1_2, l2) - r3 = addMul64(r3, l4_19, l4) - - // r4 = l0×l4 + l1×l3 + l2×l2 + l3×l1 + l4×l0 = 2×l0×l4 + 2×l1×l3 + l2×l2 - r4 := mul64(l0_2, l4) - r4 = addMul64(r4, l1_2, l3) - r4 = addMul64(r4, l2, l2) - - c0 := shiftRightBy51(r0) - c1 := shiftRightBy51(r1) - c2 := shiftRightBy51(r2) - c3 := shiftRightBy51(r3) - c4 := shiftRightBy51(r4) - - rr0 := r0.lo&maskLow51Bits + c4*19 - rr1 := r1.lo&maskLow51Bits + c0 - rr2 := r2.lo&maskLow51Bits + c1 - rr3 := r3.lo&maskLow51Bits + c2 - rr4 := r4.lo&maskLow51Bits + c3 - - *v = Element{rr0, rr1, rr2, rr3, rr4} - v.carryPropagate() -} - -// carryPropagateGeneric brings the limbs below 52 bits by applying the reduction -// identity (a * 2²⁵⁵ + b = a * 19 + b) to the l4 carry. TODO inline -func (v *Element) carryPropagateGeneric() *Element { - c0 := v.l0 >> 51 - c1 := v.l1 >> 51 - c2 := v.l2 >> 51 - c3 := v.l3 >> 51 - c4 := v.l4 >> 51 - - v.l0 = v.l0&maskLow51Bits + c4*19 - v.l1 = v.l1&maskLow51Bits + c0 - v.l2 = v.l2&maskLow51Bits + c1 - v.l3 = v.l3&maskLow51Bits + c2 - v.l4 = v.l4&maskLow51Bits + c3 - - return v -} diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/sync.checkpoint b/vendor/golang.org/x/crypto/curve25519/internal/field/sync.checkpoint deleted file mode 100644 index e3685f9..0000000 --- a/vendor/golang.org/x/crypto/curve25519/internal/field/sync.checkpoint +++ /dev/null @@ -1 +0,0 @@ -b0c49ae9f59d233526f8934262c5bbbe14d4358d diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/sync.sh b/vendor/golang.org/x/crypto/curve25519/internal/field/sync.sh deleted file mode 100644 index 1ba22a8..0000000 --- a/vendor/golang.org/x/crypto/curve25519/internal/field/sync.sh +++ /dev/null @@ -1,19 +0,0 @@ -#! /bin/bash -set -euo pipefail - -cd "$(git rev-parse --show-toplevel)" - -STD_PATH=src/crypto/ed25519/internal/edwards25519/field -LOCAL_PATH=curve25519/internal/field -LAST_SYNC_REF=$(cat $LOCAL_PATH/sync.checkpoint) - -git fetch https://go.googlesource.com/go master - -if git diff --quiet $LAST_SYNC_REF:$STD_PATH FETCH_HEAD:$STD_PATH; then - echo "No changes." -else - NEW_REF=$(git rev-parse FETCH_HEAD | tee $LOCAL_PATH/sync.checkpoint) - echo "Applying changes from $LAST_SYNC_REF to $NEW_REF..." - git diff $LAST_SYNC_REF:$STD_PATH FETCH_HEAD:$STD_PATH | \ - git apply -3 --directory=$LOCAL_PATH -fi diff --git a/vendor/golang.org/x/crypto/hkdf/hkdf.go b/vendor/golang.org/x/crypto/hkdf/hkdf.go index f4ded5f..3bee662 100644 --- a/vendor/golang.org/x/crypto/hkdf/hkdf.go +++ b/vendor/golang.org/x/crypto/hkdf/hkdf.go @@ -8,7 +8,7 @@ // HKDF is a cryptographic key derivation function (KDF) with the goal of // expanding limited input keying material into one or more cryptographically // strong secret keys. -package hkdf // import "golang.org/x/crypto/hkdf" +package hkdf import ( "crypto/hmac" diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s index e0d3c64..1337573 100644 --- a/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s +++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s @@ -1,108 +1,93 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run sum_amd64_asm.go -out ../sum_amd64.s -pkg poly1305. DO NOT EDIT. //go:build gc && !purego -#include "textflag.h" - -#define POLY1305_ADD(msg, h0, h1, h2) \ - ADDQ 0(msg), h0; \ - ADCQ 8(msg), h1; \ - ADCQ $1, h2; \ - LEAQ 16(msg), msg - -#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \ - MOVQ r0, AX; \ - MULQ h0; \ - MOVQ AX, t0; \ - MOVQ DX, t1; \ - MOVQ r0, AX; \ - MULQ h1; \ - ADDQ AX, t1; \ - ADCQ $0, DX; \ - MOVQ r0, t2; \ - IMULQ h2, t2; \ - ADDQ DX, t2; \ - \ - MOVQ r1, AX; \ - MULQ h0; \ - ADDQ AX, t1; \ - ADCQ $0, DX; \ - MOVQ DX, h0; \ - MOVQ r1, t3; \ - IMULQ h2, t3; \ - MOVQ r1, AX; \ - MULQ h1; \ - ADDQ AX, t2; \ - ADCQ DX, t3; \ - ADDQ h0, t2; \ - ADCQ $0, t3; \ - \ - MOVQ t0, h0; \ - MOVQ t1, h1; \ - MOVQ t2, h2; \ - ANDQ $3, h2; \ - MOVQ t2, t0; \ - ANDQ $0xFFFFFFFFFFFFFFFC, t0; \ - ADDQ t0, h0; \ - ADCQ t3, h1; \ - ADCQ $0, h2; \ - SHRQ $2, t3, t2; \ - SHRQ $2, t3; \ - ADDQ t2, h0; \ - ADCQ t3, h1; \ - ADCQ $0, h2 - -// func update(state *[7]uint64, msg []byte) +// func update(state *macState, msg []byte) TEXT ·update(SB), $0-32 MOVQ state+0(FP), DI MOVQ msg_base+8(FP), SI MOVQ msg_len+16(FP), R15 - - MOVQ 0(DI), R8 // h0 - MOVQ 8(DI), R9 // h1 - MOVQ 16(DI), R10 // h2 - MOVQ 24(DI), R11 // r0 - MOVQ 32(DI), R12 // r1 - - CMPQ R15, $16 + MOVQ (DI), R8 + MOVQ 8(DI), R9 + MOVQ 16(DI), R10 + MOVQ 24(DI), R11 + MOVQ 32(DI), R12 + CMPQ R15, $0x10 JB bytes_between_0_and_15 loop: - POLY1305_ADD(SI, R8, R9, R10) + ADDQ (SI), R8 + ADCQ 8(SI), R9 + ADCQ $0x01, R10 + LEAQ 16(SI), SI multiply: - POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14) - SUBQ $16, R15 - CMPQ R15, $16 - JAE loop + MOVQ R11, AX + MULQ R8 + MOVQ AX, BX + MOVQ DX, CX + MOVQ R11, AX + MULQ R9 + ADDQ AX, CX + ADCQ $0x00, DX + MOVQ R11, R13 + IMULQ R10, R13 + ADDQ DX, R13 + MOVQ R12, AX + MULQ R8 + ADDQ AX, CX + ADCQ $0x00, DX + MOVQ DX, R8 + MOVQ R12, R14 + IMULQ R10, R14 + MOVQ R12, AX + MULQ R9 + ADDQ AX, R13 + ADCQ DX, R14 + ADDQ R8, R13 + ADCQ $0x00, R14 + MOVQ BX, R8 + MOVQ CX, R9 + MOVQ R13, R10 + ANDQ $0x03, R10 + MOVQ R13, BX + ANDQ $-4, BX + ADDQ BX, R8 + ADCQ R14, R9 + ADCQ $0x00, R10 + SHRQ $0x02, R14, R13 + SHRQ $0x02, R14 + ADDQ R13, R8 + ADCQ R14, R9 + ADCQ $0x00, R10 + SUBQ $0x10, R15 + CMPQ R15, $0x10 + JAE loop bytes_between_0_and_15: TESTQ R15, R15 JZ done - MOVQ $1, BX + MOVQ $0x00000001, BX XORQ CX, CX XORQ R13, R13 ADDQ R15, SI flush_buffer: - SHLQ $8, BX, CX - SHLQ $8, BX + SHLQ $0x08, BX, CX + SHLQ $0x08, BX MOVB -1(SI), R13 XORQ R13, BX DECQ SI DECQ R15 JNZ flush_buffer - ADDQ BX, R8 ADCQ CX, R9 - ADCQ $0, R10 - MOVQ $16, R15 + ADCQ $0x00, R10 + MOVQ $0x00000010, R15 JMP multiply done: - MOVQ R8, 0(DI) + MOVQ R8, (DI) MOVQ R9, 8(DI) MOVQ R10, 16(DI) RET diff --git a/vendor/golang.org/x/crypto/pbkdf2/pbkdf2.go b/vendor/golang.org/x/crypto/pbkdf2/pbkdf2.go index 904b57e..28cd99c 100644 --- a/vendor/golang.org/x/crypto/pbkdf2/pbkdf2.go +++ b/vendor/golang.org/x/crypto/pbkdf2/pbkdf2.go @@ -16,7 +16,7 @@ Hash Functions SHA-1, SHA-224, SHA-256, SHA-384 and SHA-512 for HMAC. To choose, you can pass the `New` functions from the different SHA packages to pbkdf2.Key. */ -package pbkdf2 // import "golang.org/x/crypto/pbkdf2" +package pbkdf2 import ( "crypto/hmac" diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go b/vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go index 3fd05b2..3685b34 100644 --- a/vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go +++ b/vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go @@ -3,7 +3,7 @@ // license that can be found in the LICENSE file. // Package salsa provides low-level access to functions in the Salsa family. -package salsa // import "golang.org/x/crypto/salsa20/salsa" +package salsa import "math/bits" diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.s b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.s index fcce023..3883e0e 100644 --- a/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.s +++ b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.s @@ -1,880 +1,880 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run salsa20_amd64_asm.go -out ../salsa20_amd64.s -pkg salsa. DO NOT EDIT. //go:build amd64 && !purego && gc -// This code was translated into a form compatible with 6a from the public -// domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html +// func salsa2020XORKeyStream(out *byte, in *byte, n uint64, nonce *byte, key *byte) +// Requires: SSE2 +TEXT ·salsa2020XORKeyStream(SB), $456-40 + // This needs up to 64 bytes at 360(R12); hence the non-obvious frame size. + MOVQ out+0(FP), DI + MOVQ in+8(FP), SI + MOVQ n+16(FP), DX + MOVQ nonce+24(FP), CX + MOVQ key+32(FP), R8 + MOVQ SP, R12 + ADDQ $0x1f, R12 + ANDQ $-32, R12 + MOVQ DX, R9 + MOVQ CX, DX + MOVQ R8, R10 + CMPQ R9, $0x00 + JBE DONE + MOVL 20(R10), CX + MOVL (R10), R8 + MOVL (DX), AX + MOVL 16(R10), R11 + MOVL CX, (R12) + MOVL R8, 4(R12) + MOVL AX, 8(R12) + MOVL R11, 12(R12) + MOVL 8(DX), CX + MOVL 24(R10), R8 + MOVL 4(R10), AX + MOVL 4(DX), R11 + MOVL CX, 16(R12) + MOVL R8, 20(R12) + MOVL AX, 24(R12) + MOVL R11, 28(R12) + MOVL 12(DX), CX + MOVL 12(R10), DX + MOVL 28(R10), R8 + MOVL 8(R10), AX + MOVL DX, 32(R12) + MOVL CX, 36(R12) + MOVL R8, 40(R12) + MOVL AX, 44(R12) + MOVQ $0x61707865, DX + MOVQ $0x3320646e, CX + MOVQ $0x79622d32, R8 + MOVQ $0x6b206574, AX + MOVL DX, 48(R12) + MOVL CX, 52(R12) + MOVL R8, 56(R12) + MOVL AX, 60(R12) + CMPQ R9, $0x00000100 + JB BYTESBETWEEN1AND255 + MOVOA 48(R12), X0 + PSHUFL $0x55, X0, X1 + PSHUFL $0xaa, X0, X2 + PSHUFL $0xff, X0, X3 + PSHUFL $0x00, X0, X0 + MOVOA X1, 64(R12) + MOVOA X2, 80(R12) + MOVOA X3, 96(R12) + MOVOA X0, 112(R12) + MOVOA (R12), X0 + PSHUFL $0xaa, X0, X1 + PSHUFL $0xff, X0, X2 + PSHUFL $0x00, X0, X3 + PSHUFL $0x55, X0, X0 + MOVOA X1, 128(R12) + MOVOA X2, 144(R12) + MOVOA X3, 160(R12) + MOVOA X0, 176(R12) + MOVOA 16(R12), X0 + PSHUFL $0xff, X0, X1 + PSHUFL $0x55, X0, X2 + PSHUFL $0xaa, X0, X0 + MOVOA X1, 192(R12) + MOVOA X2, 208(R12) + MOVOA X0, 224(R12) + MOVOA 32(R12), X0 + PSHUFL $0x00, X0, X1 + PSHUFL $0xaa, X0, X2 + PSHUFL $0xff, X0, X0 + MOVOA X1, 240(R12) + MOVOA X2, 256(R12) + MOVOA X0, 272(R12) -// func salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte) -// This needs up to 64 bytes at 360(R12); hence the non-obvious frame size. -TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment - MOVQ out+0(FP),DI - MOVQ in+8(FP),SI - MOVQ n+16(FP),DX - MOVQ nonce+24(FP),CX - MOVQ key+32(FP),R8 +BYTESATLEAST256: + MOVL 16(R12), DX + MOVL 36(R12), CX + MOVL DX, 288(R12) + MOVL CX, 304(R12) + SHLQ $0x20, CX + ADDQ CX, DX + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 292(R12) + MOVL CX, 308(R12) + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 296(R12) + MOVL CX, 312(R12) + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 300(R12) + MOVL CX, 316(R12) + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 16(R12) + MOVL CX, 36(R12) + MOVQ R9, 352(R12) + MOVQ $0x00000014, DX + MOVOA 64(R12), X0 + MOVOA 80(R12), X1 + MOVOA 96(R12), X2 + MOVOA 256(R12), X3 + MOVOA 272(R12), X4 + MOVOA 128(R12), X5 + MOVOA 144(R12), X6 + MOVOA 176(R12), X7 + MOVOA 192(R12), X8 + MOVOA 208(R12), X9 + MOVOA 224(R12), X10 + MOVOA 304(R12), X11 + MOVOA 112(R12), X12 + MOVOA 160(R12), X13 + MOVOA 240(R12), X14 + MOVOA 288(R12), X15 - MOVQ SP,R12 - ADDQ $31, R12 - ANDQ $~31, R12 +MAINLOOP1: + MOVOA X1, 320(R12) + MOVOA X2, 336(R12) + MOVOA X13, X1 + PADDL X12, X1 + MOVOA X1, X2 + PSLLL $0x07, X1 + PXOR X1, X14 + PSRLL $0x19, X2 + PXOR X2, X14 + MOVOA X7, X1 + PADDL X0, X1 + MOVOA X1, X2 + PSLLL $0x07, X1 + PXOR X1, X11 + PSRLL $0x19, X2 + PXOR X2, X11 + MOVOA X12, X1 + PADDL X14, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X15 + PSRLL $0x17, X2 + PXOR X2, X15 + MOVOA X0, X1 + PADDL X11, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X9 + PSRLL $0x17, X2 + PXOR X2, X9 + MOVOA X14, X1 + PADDL X15, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X13 + PSRLL $0x13, X2 + PXOR X2, X13 + MOVOA X11, X1 + PADDL X9, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X7 + PSRLL $0x13, X2 + PXOR X2, X7 + MOVOA X15, X1 + PADDL X13, X1 + MOVOA X1, X2 + PSLLL $0x12, X1 + PXOR X1, X12 + PSRLL $0x0e, X2 + PXOR X2, X12 + MOVOA 320(R12), X1 + MOVOA X12, 320(R12) + MOVOA X9, X2 + PADDL X7, X2 + MOVOA X2, X12 + PSLLL $0x12, X2 + PXOR X2, X0 + PSRLL $0x0e, X12 + PXOR X12, X0 + MOVOA X5, X2 + PADDL X1, X2 + MOVOA X2, X12 + PSLLL $0x07, X2 + PXOR X2, X3 + PSRLL $0x19, X12 + PXOR X12, X3 + MOVOA 336(R12), X2 + MOVOA X0, 336(R12) + MOVOA X6, X0 + PADDL X2, X0 + MOVOA X0, X12 + PSLLL $0x07, X0 + PXOR X0, X4 + PSRLL $0x19, X12 + PXOR X12, X4 + MOVOA X1, X0 + PADDL X3, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X10 + PSRLL $0x17, X12 + PXOR X12, X10 + MOVOA X2, X0 + PADDL X4, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X8 + PSRLL $0x17, X12 + PXOR X12, X8 + MOVOA X3, X0 + PADDL X10, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X5 + PSRLL $0x13, X12 + PXOR X12, X5 + MOVOA X4, X0 + PADDL X8, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X6 + PSRLL $0x13, X12 + PXOR X12, X6 + MOVOA X10, X0 + PADDL X5, X0 + MOVOA X0, X12 + PSLLL $0x12, X0 + PXOR X0, X1 + PSRLL $0x0e, X12 + PXOR X12, X1 + MOVOA 320(R12), X0 + MOVOA X1, 320(R12) + MOVOA X4, X1 + PADDL X0, X1 + MOVOA X1, X12 + PSLLL $0x07, X1 + PXOR X1, X7 + PSRLL $0x19, X12 + PXOR X12, X7 + MOVOA X8, X1 + PADDL X6, X1 + MOVOA X1, X12 + PSLLL $0x12, X1 + PXOR X1, X2 + PSRLL $0x0e, X12 + PXOR X12, X2 + MOVOA 336(R12), X12 + MOVOA X2, 336(R12) + MOVOA X14, X1 + PADDL X12, X1 + MOVOA X1, X2 + PSLLL $0x07, X1 + PXOR X1, X5 + PSRLL $0x19, X2 + PXOR X2, X5 + MOVOA X0, X1 + PADDL X7, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X10 + PSRLL $0x17, X2 + PXOR X2, X10 + MOVOA X12, X1 + PADDL X5, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X8 + PSRLL $0x17, X2 + PXOR X2, X8 + MOVOA X7, X1 + PADDL X10, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X4 + PSRLL $0x13, X2 + PXOR X2, X4 + MOVOA X5, X1 + PADDL X8, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X14 + PSRLL $0x13, X2 + PXOR X2, X14 + MOVOA X10, X1 + PADDL X4, X1 + MOVOA X1, X2 + PSLLL $0x12, X1 + PXOR X1, X0 + PSRLL $0x0e, X2 + PXOR X2, X0 + MOVOA 320(R12), X1 + MOVOA X0, 320(R12) + MOVOA X8, X0 + PADDL X14, X0 + MOVOA X0, X2 + PSLLL $0x12, X0 + PXOR X0, X12 + PSRLL $0x0e, X2 + PXOR X2, X12 + MOVOA X11, X0 + PADDL X1, X0 + MOVOA X0, X2 + PSLLL $0x07, X0 + PXOR X0, X6 + PSRLL $0x19, X2 + PXOR X2, X6 + MOVOA 336(R12), X2 + MOVOA X12, 336(R12) + MOVOA X3, X0 + PADDL X2, X0 + MOVOA X0, X12 + PSLLL $0x07, X0 + PXOR X0, X13 + PSRLL $0x19, X12 + PXOR X12, X13 + MOVOA X1, X0 + PADDL X6, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X15 + PSRLL $0x17, X12 + PXOR X12, X15 + MOVOA X2, X0 + PADDL X13, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X9 + PSRLL $0x17, X12 + PXOR X12, X9 + MOVOA X6, X0 + PADDL X15, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X11 + PSRLL $0x13, X12 + PXOR X12, X11 + MOVOA X13, X0 + PADDL X9, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X3 + PSRLL $0x13, X12 + PXOR X12, X3 + MOVOA X15, X0 + PADDL X11, X0 + MOVOA X0, X12 + PSLLL $0x12, X0 + PXOR X0, X1 + PSRLL $0x0e, X12 + PXOR X12, X1 + MOVOA X9, X0 + PADDL X3, X0 + MOVOA X0, X12 + PSLLL $0x12, X0 + PXOR X0, X2 + PSRLL $0x0e, X12 + PXOR X12, X2 + MOVOA 320(R12), X12 + MOVOA 336(R12), X0 + SUBQ $0x02, DX + JA MAINLOOP1 + PADDL 112(R12), X12 + PADDL 176(R12), X7 + PADDL 224(R12), X10 + PADDL 272(R12), X4 + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + PSHUFL $0x39, X12, X12 + PSHUFL $0x39, X7, X7 + PSHUFL $0x39, X10, X10 + PSHUFL $0x39, X4, X4 + XORL (SI), DX + XORL 4(SI), CX + XORL 8(SI), R8 + XORL 12(SI), R9 + MOVL DX, (DI) + MOVL CX, 4(DI) + MOVL R8, 8(DI) + MOVL R9, 12(DI) + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + PSHUFL $0x39, X12, X12 + PSHUFL $0x39, X7, X7 + PSHUFL $0x39, X10, X10 + PSHUFL $0x39, X4, X4 + XORL 64(SI), DX + XORL 68(SI), CX + XORL 72(SI), R8 + XORL 76(SI), R9 + MOVL DX, 64(DI) + MOVL CX, 68(DI) + MOVL R8, 72(DI) + MOVL R9, 76(DI) + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + PSHUFL $0x39, X12, X12 + PSHUFL $0x39, X7, X7 + PSHUFL $0x39, X10, X10 + PSHUFL $0x39, X4, X4 + XORL 128(SI), DX + XORL 132(SI), CX + XORL 136(SI), R8 + XORL 140(SI), R9 + MOVL DX, 128(DI) + MOVL CX, 132(DI) + MOVL R8, 136(DI) + MOVL R9, 140(DI) + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + XORL 192(SI), DX + XORL 196(SI), CX + XORL 200(SI), R8 + XORL 204(SI), R9 + MOVL DX, 192(DI) + MOVL CX, 196(DI) + MOVL R8, 200(DI) + MOVL R9, 204(DI) + PADDL 240(R12), X14 + PADDL 64(R12), X0 + PADDL 128(R12), X5 + PADDL 192(R12), X8 + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + PSHUFL $0x39, X14, X14 + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X5, X5 + PSHUFL $0x39, X8, X8 + XORL 16(SI), DX + XORL 20(SI), CX + XORL 24(SI), R8 + XORL 28(SI), R9 + MOVL DX, 16(DI) + MOVL CX, 20(DI) + MOVL R8, 24(DI) + MOVL R9, 28(DI) + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + PSHUFL $0x39, X14, X14 + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X5, X5 + PSHUFL $0x39, X8, X8 + XORL 80(SI), DX + XORL 84(SI), CX + XORL 88(SI), R8 + XORL 92(SI), R9 + MOVL DX, 80(DI) + MOVL CX, 84(DI) + MOVL R8, 88(DI) + MOVL R9, 92(DI) + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + PSHUFL $0x39, X14, X14 + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X5, X5 + PSHUFL $0x39, X8, X8 + XORL 144(SI), DX + XORL 148(SI), CX + XORL 152(SI), R8 + XORL 156(SI), R9 + MOVL DX, 144(DI) + MOVL CX, 148(DI) + MOVL R8, 152(DI) + MOVL R9, 156(DI) + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + XORL 208(SI), DX + XORL 212(SI), CX + XORL 216(SI), R8 + XORL 220(SI), R9 + MOVL DX, 208(DI) + MOVL CX, 212(DI) + MOVL R8, 216(DI) + MOVL R9, 220(DI) + PADDL 288(R12), X15 + PADDL 304(R12), X11 + PADDL 80(R12), X1 + PADDL 144(R12), X6 + MOVD X15, DX + MOVD X11, CX + MOVD X1, R8 + MOVD X6, R9 + PSHUFL $0x39, X15, X15 + PSHUFL $0x39, X11, X11 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X6, X6 + XORL 32(SI), DX + XORL 36(SI), CX + XORL 40(SI), R8 + XORL 44(SI), R9 + MOVL DX, 32(DI) + MOVL CX, 36(DI) + MOVL R8, 40(DI) + MOVL R9, 44(DI) + MOVD X15, DX + MOVD X11, CX + MOVD X1, R8 + MOVD X6, R9 + PSHUFL $0x39, X15, X15 + PSHUFL $0x39, X11, X11 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X6, X6 + XORL 96(SI), DX + XORL 100(SI), CX + XORL 104(SI), R8 + XORL 108(SI), R9 + MOVL DX, 96(DI) + MOVL CX, 100(DI) + MOVL R8, 104(DI) + MOVL R9, 108(DI) + MOVD X15, DX + MOVD X11, CX + MOVD X1, R8 + MOVD X6, R9 + PSHUFL $0x39, X15, X15 + PSHUFL $0x39, X11, X11 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X6, X6 + XORL 160(SI), DX + XORL 164(SI), CX + XORL 168(SI), R8 + XORL 172(SI), R9 + MOVL DX, 160(DI) + MOVL CX, 164(DI) + MOVL R8, 168(DI) + MOVL R9, 172(DI) + MOVD X15, DX + MOVD X11, CX + MOVD X1, R8 + MOVD X6, R9 + XORL 224(SI), DX + XORL 228(SI), CX + XORL 232(SI), R8 + XORL 236(SI), R9 + MOVL DX, 224(DI) + MOVL CX, 228(DI) + MOVL R8, 232(DI) + MOVL R9, 236(DI) + PADDL 160(R12), X13 + PADDL 208(R12), X9 + PADDL 256(R12), X3 + PADDL 96(R12), X2 + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + PSHUFL $0x39, X13, X13 + PSHUFL $0x39, X9, X9 + PSHUFL $0x39, X3, X3 + PSHUFL $0x39, X2, X2 + XORL 48(SI), DX + XORL 52(SI), CX + XORL 56(SI), R8 + XORL 60(SI), R9 + MOVL DX, 48(DI) + MOVL CX, 52(DI) + MOVL R8, 56(DI) + MOVL R9, 60(DI) + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + PSHUFL $0x39, X13, X13 + PSHUFL $0x39, X9, X9 + PSHUFL $0x39, X3, X3 + PSHUFL $0x39, X2, X2 + XORL 112(SI), DX + XORL 116(SI), CX + XORL 120(SI), R8 + XORL 124(SI), R9 + MOVL DX, 112(DI) + MOVL CX, 116(DI) + MOVL R8, 120(DI) + MOVL R9, 124(DI) + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + PSHUFL $0x39, X13, X13 + PSHUFL $0x39, X9, X9 + PSHUFL $0x39, X3, X3 + PSHUFL $0x39, X2, X2 + XORL 176(SI), DX + XORL 180(SI), CX + XORL 184(SI), R8 + XORL 188(SI), R9 + MOVL DX, 176(DI) + MOVL CX, 180(DI) + MOVL R8, 184(DI) + MOVL R9, 188(DI) + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + XORL 240(SI), DX + XORL 244(SI), CX + XORL 248(SI), R8 + XORL 252(SI), R9 + MOVL DX, 240(DI) + MOVL CX, 244(DI) + MOVL R8, 248(DI) + MOVL R9, 252(DI) + MOVQ 352(R12), R9 + SUBQ $0x00000100, R9 + ADDQ $0x00000100, SI + ADDQ $0x00000100, DI + CMPQ R9, $0x00000100 + JAE BYTESATLEAST256 + CMPQ R9, $0x00 + JBE DONE - MOVQ DX,R9 - MOVQ CX,DX - MOVQ R8,R10 - CMPQ R9,$0 - JBE DONE - START: - MOVL 20(R10),CX - MOVL 0(R10),R8 - MOVL 0(DX),AX - MOVL 16(R10),R11 - MOVL CX,0(R12) - MOVL R8, 4 (R12) - MOVL AX, 8 (R12) - MOVL R11, 12 (R12) - MOVL 8(DX),CX - MOVL 24(R10),R8 - MOVL 4(R10),AX - MOVL 4(DX),R11 - MOVL CX,16(R12) - MOVL R8, 20 (R12) - MOVL AX, 24 (R12) - MOVL R11, 28 (R12) - MOVL 12(DX),CX - MOVL 12(R10),DX - MOVL 28(R10),R8 - MOVL 8(R10),AX - MOVL DX,32(R12) - MOVL CX, 36 (R12) - MOVL R8, 40 (R12) - MOVL AX, 44 (R12) - MOVQ $1634760805,DX - MOVQ $857760878,CX - MOVQ $2036477234,R8 - MOVQ $1797285236,AX - MOVL DX,48(R12) - MOVL CX, 52 (R12) - MOVL R8, 56 (R12) - MOVL AX, 60 (R12) - CMPQ R9,$256 - JB BYTESBETWEEN1AND255 - MOVOA 48(R12),X0 - PSHUFL $0X55,X0,X1 - PSHUFL $0XAA,X0,X2 - PSHUFL $0XFF,X0,X3 - PSHUFL $0X00,X0,X0 - MOVOA X1,64(R12) - MOVOA X2,80(R12) - MOVOA X3,96(R12) - MOVOA X0,112(R12) - MOVOA 0(R12),X0 - PSHUFL $0XAA,X0,X1 - PSHUFL $0XFF,X0,X2 - PSHUFL $0X00,X0,X3 - PSHUFL $0X55,X0,X0 - MOVOA X1,128(R12) - MOVOA X2,144(R12) - MOVOA X3,160(R12) - MOVOA X0,176(R12) - MOVOA 16(R12),X0 - PSHUFL $0XFF,X0,X1 - PSHUFL $0X55,X0,X2 - PSHUFL $0XAA,X0,X0 - MOVOA X1,192(R12) - MOVOA X2,208(R12) - MOVOA X0,224(R12) - MOVOA 32(R12),X0 - PSHUFL $0X00,X0,X1 - PSHUFL $0XAA,X0,X2 - PSHUFL $0XFF,X0,X0 - MOVOA X1,240(R12) - MOVOA X2,256(R12) - MOVOA X0,272(R12) - BYTESATLEAST256: - MOVL 16(R12),DX - MOVL 36 (R12),CX - MOVL DX,288(R12) - MOVL CX,304(R12) - SHLQ $32,CX - ADDQ CX,DX - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX, 292 (R12) - MOVL CX, 308 (R12) - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX, 296 (R12) - MOVL CX, 312 (R12) - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX, 300 (R12) - MOVL CX, 316 (R12) - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX,16(R12) - MOVL CX, 36 (R12) - MOVQ R9,352(R12) - MOVQ $20,DX - MOVOA 64(R12),X0 - MOVOA 80(R12),X1 - MOVOA 96(R12),X2 - MOVOA 256(R12),X3 - MOVOA 272(R12),X4 - MOVOA 128(R12),X5 - MOVOA 144(R12),X6 - MOVOA 176(R12),X7 - MOVOA 192(R12),X8 - MOVOA 208(R12),X9 - MOVOA 224(R12),X10 - MOVOA 304(R12),X11 - MOVOA 112(R12),X12 - MOVOA 160(R12),X13 - MOVOA 240(R12),X14 - MOVOA 288(R12),X15 - MAINLOOP1: - MOVOA X1,320(R12) - MOVOA X2,336(R12) - MOVOA X13,X1 - PADDL X12,X1 - MOVOA X1,X2 - PSLLL $7,X1 - PXOR X1,X14 - PSRLL $25,X2 - PXOR X2,X14 - MOVOA X7,X1 - PADDL X0,X1 - MOVOA X1,X2 - PSLLL $7,X1 - PXOR X1,X11 - PSRLL $25,X2 - PXOR X2,X11 - MOVOA X12,X1 - PADDL X14,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X15 - PSRLL $23,X2 - PXOR X2,X15 - MOVOA X0,X1 - PADDL X11,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X9 - PSRLL $23,X2 - PXOR X2,X9 - MOVOA X14,X1 - PADDL X15,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X13 - PSRLL $19,X2 - PXOR X2,X13 - MOVOA X11,X1 - PADDL X9,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X7 - PSRLL $19,X2 - PXOR X2,X7 - MOVOA X15,X1 - PADDL X13,X1 - MOVOA X1,X2 - PSLLL $18,X1 - PXOR X1,X12 - PSRLL $14,X2 - PXOR X2,X12 - MOVOA 320(R12),X1 - MOVOA X12,320(R12) - MOVOA X9,X2 - PADDL X7,X2 - MOVOA X2,X12 - PSLLL $18,X2 - PXOR X2,X0 - PSRLL $14,X12 - PXOR X12,X0 - MOVOA X5,X2 - PADDL X1,X2 - MOVOA X2,X12 - PSLLL $7,X2 - PXOR X2,X3 - PSRLL $25,X12 - PXOR X12,X3 - MOVOA 336(R12),X2 - MOVOA X0,336(R12) - MOVOA X6,X0 - PADDL X2,X0 - MOVOA X0,X12 - PSLLL $7,X0 - PXOR X0,X4 - PSRLL $25,X12 - PXOR X12,X4 - MOVOA X1,X0 - PADDL X3,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X10 - PSRLL $23,X12 - PXOR X12,X10 - MOVOA X2,X0 - PADDL X4,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X8 - PSRLL $23,X12 - PXOR X12,X8 - MOVOA X3,X0 - PADDL X10,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X5 - PSRLL $19,X12 - PXOR X12,X5 - MOVOA X4,X0 - PADDL X8,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X6 - PSRLL $19,X12 - PXOR X12,X6 - MOVOA X10,X0 - PADDL X5,X0 - MOVOA X0,X12 - PSLLL $18,X0 - PXOR X0,X1 - PSRLL $14,X12 - PXOR X12,X1 - MOVOA 320(R12),X0 - MOVOA X1,320(R12) - MOVOA X4,X1 - PADDL X0,X1 - MOVOA X1,X12 - PSLLL $7,X1 - PXOR X1,X7 - PSRLL $25,X12 - PXOR X12,X7 - MOVOA X8,X1 - PADDL X6,X1 - MOVOA X1,X12 - PSLLL $18,X1 - PXOR X1,X2 - PSRLL $14,X12 - PXOR X12,X2 - MOVOA 336(R12),X12 - MOVOA X2,336(R12) - MOVOA X14,X1 - PADDL X12,X1 - MOVOA X1,X2 - PSLLL $7,X1 - PXOR X1,X5 - PSRLL $25,X2 - PXOR X2,X5 - MOVOA X0,X1 - PADDL X7,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X10 - PSRLL $23,X2 - PXOR X2,X10 - MOVOA X12,X1 - PADDL X5,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X8 - PSRLL $23,X2 - PXOR X2,X8 - MOVOA X7,X1 - PADDL X10,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X4 - PSRLL $19,X2 - PXOR X2,X4 - MOVOA X5,X1 - PADDL X8,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X14 - PSRLL $19,X2 - PXOR X2,X14 - MOVOA X10,X1 - PADDL X4,X1 - MOVOA X1,X2 - PSLLL $18,X1 - PXOR X1,X0 - PSRLL $14,X2 - PXOR X2,X0 - MOVOA 320(R12),X1 - MOVOA X0,320(R12) - MOVOA X8,X0 - PADDL X14,X0 - MOVOA X0,X2 - PSLLL $18,X0 - PXOR X0,X12 - PSRLL $14,X2 - PXOR X2,X12 - MOVOA X11,X0 - PADDL X1,X0 - MOVOA X0,X2 - PSLLL $7,X0 - PXOR X0,X6 - PSRLL $25,X2 - PXOR X2,X6 - MOVOA 336(R12),X2 - MOVOA X12,336(R12) - MOVOA X3,X0 - PADDL X2,X0 - MOVOA X0,X12 - PSLLL $7,X0 - PXOR X0,X13 - PSRLL $25,X12 - PXOR X12,X13 - MOVOA X1,X0 - PADDL X6,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X15 - PSRLL $23,X12 - PXOR X12,X15 - MOVOA X2,X0 - PADDL X13,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X9 - PSRLL $23,X12 - PXOR X12,X9 - MOVOA X6,X0 - PADDL X15,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X11 - PSRLL $19,X12 - PXOR X12,X11 - MOVOA X13,X0 - PADDL X9,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X3 - PSRLL $19,X12 - PXOR X12,X3 - MOVOA X15,X0 - PADDL X11,X0 - MOVOA X0,X12 - PSLLL $18,X0 - PXOR X0,X1 - PSRLL $14,X12 - PXOR X12,X1 - MOVOA X9,X0 - PADDL X3,X0 - MOVOA X0,X12 - PSLLL $18,X0 - PXOR X0,X2 - PSRLL $14,X12 - PXOR X12,X2 - MOVOA 320(R12),X12 - MOVOA 336(R12),X0 - SUBQ $2,DX - JA MAINLOOP1 - PADDL 112(R12),X12 - PADDL 176(R12),X7 - PADDL 224(R12),X10 - PADDL 272(R12),X4 - MOVD X12,DX - MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - PSHUFL $0X39,X12,X12 - PSHUFL $0X39,X7,X7 - PSHUFL $0X39,X10,X10 - PSHUFL $0X39,X4,X4 - XORL 0(SI),DX - XORL 4(SI),CX - XORL 8(SI),R8 - XORL 12(SI),R9 - MOVL DX,0(DI) - MOVL CX,4(DI) - MOVL R8,8(DI) - MOVL R9,12(DI) - MOVD X12,DX - MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - PSHUFL $0X39,X12,X12 - PSHUFL $0X39,X7,X7 - PSHUFL $0X39,X10,X10 - PSHUFL $0X39,X4,X4 - XORL 64(SI),DX - XORL 68(SI),CX - XORL 72(SI),R8 - XORL 76(SI),R9 - MOVL DX,64(DI) - MOVL CX,68(DI) - MOVL R8,72(DI) - MOVL R9,76(DI) - MOVD X12,DX - MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - PSHUFL $0X39,X12,X12 - PSHUFL $0X39,X7,X7 - PSHUFL $0X39,X10,X10 - PSHUFL $0X39,X4,X4 - XORL 128(SI),DX - XORL 132(SI),CX - XORL 136(SI),R8 - XORL 140(SI),R9 - MOVL DX,128(DI) - MOVL CX,132(DI) - MOVL R8,136(DI) - MOVL R9,140(DI) - MOVD X12,DX - MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - XORL 192(SI),DX - XORL 196(SI),CX - XORL 200(SI),R8 - XORL 204(SI),R9 - MOVL DX,192(DI) - MOVL CX,196(DI) - MOVL R8,200(DI) - MOVL R9,204(DI) - PADDL 240(R12),X14 - PADDL 64(R12),X0 - PADDL 128(R12),X5 - PADDL 192(R12),X8 - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - PSHUFL $0X39,X14,X14 - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X5,X5 - PSHUFL $0X39,X8,X8 - XORL 16(SI),DX - XORL 20(SI),CX - XORL 24(SI),R8 - XORL 28(SI),R9 - MOVL DX,16(DI) - MOVL CX,20(DI) - MOVL R8,24(DI) - MOVL R9,28(DI) - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - PSHUFL $0X39,X14,X14 - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X5,X5 - PSHUFL $0X39,X8,X8 - XORL 80(SI),DX - XORL 84(SI),CX - XORL 88(SI),R8 - XORL 92(SI),R9 - MOVL DX,80(DI) - MOVL CX,84(DI) - MOVL R8,88(DI) - MOVL R9,92(DI) - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - PSHUFL $0X39,X14,X14 - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X5,X5 - PSHUFL $0X39,X8,X8 - XORL 144(SI),DX - XORL 148(SI),CX - XORL 152(SI),R8 - XORL 156(SI),R9 - MOVL DX,144(DI) - MOVL CX,148(DI) - MOVL R8,152(DI) - MOVL R9,156(DI) - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - XORL 208(SI),DX - XORL 212(SI),CX - XORL 216(SI),R8 - XORL 220(SI),R9 - MOVL DX,208(DI) - MOVL CX,212(DI) - MOVL R8,216(DI) - MOVL R9,220(DI) - PADDL 288(R12),X15 - PADDL 304(R12),X11 - PADDL 80(R12),X1 - PADDL 144(R12),X6 - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - PSHUFL $0X39,X15,X15 - PSHUFL $0X39,X11,X11 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X6,X6 - XORL 32(SI),DX - XORL 36(SI),CX - XORL 40(SI),R8 - XORL 44(SI),R9 - MOVL DX,32(DI) - MOVL CX,36(DI) - MOVL R8,40(DI) - MOVL R9,44(DI) - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - PSHUFL $0X39,X15,X15 - PSHUFL $0X39,X11,X11 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X6,X6 - XORL 96(SI),DX - XORL 100(SI),CX - XORL 104(SI),R8 - XORL 108(SI),R9 - MOVL DX,96(DI) - MOVL CX,100(DI) - MOVL R8,104(DI) - MOVL R9,108(DI) - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - PSHUFL $0X39,X15,X15 - PSHUFL $0X39,X11,X11 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X6,X6 - XORL 160(SI),DX - XORL 164(SI),CX - XORL 168(SI),R8 - XORL 172(SI),R9 - MOVL DX,160(DI) - MOVL CX,164(DI) - MOVL R8,168(DI) - MOVL R9,172(DI) - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - XORL 224(SI),DX - XORL 228(SI),CX - XORL 232(SI),R8 - XORL 236(SI),R9 - MOVL DX,224(DI) - MOVL CX,228(DI) - MOVL R8,232(DI) - MOVL R9,236(DI) - PADDL 160(R12),X13 - PADDL 208(R12),X9 - PADDL 256(R12),X3 - PADDL 96(R12),X2 - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - PSHUFL $0X39,X13,X13 - PSHUFL $0X39,X9,X9 - PSHUFL $0X39,X3,X3 - PSHUFL $0X39,X2,X2 - XORL 48(SI),DX - XORL 52(SI),CX - XORL 56(SI),R8 - XORL 60(SI),R9 - MOVL DX,48(DI) - MOVL CX,52(DI) - MOVL R8,56(DI) - MOVL R9,60(DI) - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - PSHUFL $0X39,X13,X13 - PSHUFL $0X39,X9,X9 - PSHUFL $0X39,X3,X3 - PSHUFL $0X39,X2,X2 - XORL 112(SI),DX - XORL 116(SI),CX - XORL 120(SI),R8 - XORL 124(SI),R9 - MOVL DX,112(DI) - MOVL CX,116(DI) - MOVL R8,120(DI) - MOVL R9,124(DI) - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - PSHUFL $0X39,X13,X13 - PSHUFL $0X39,X9,X9 - PSHUFL $0X39,X3,X3 - PSHUFL $0X39,X2,X2 - XORL 176(SI),DX - XORL 180(SI),CX - XORL 184(SI),R8 - XORL 188(SI),R9 - MOVL DX,176(DI) - MOVL CX,180(DI) - MOVL R8,184(DI) - MOVL R9,188(DI) - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - XORL 240(SI),DX - XORL 244(SI),CX - XORL 248(SI),R8 - XORL 252(SI),R9 - MOVL DX,240(DI) - MOVL CX,244(DI) - MOVL R8,248(DI) - MOVL R9,252(DI) - MOVQ 352(R12),R9 - SUBQ $256,R9 - ADDQ $256,SI - ADDQ $256,DI - CMPQ R9,$256 - JAE BYTESATLEAST256 - CMPQ R9,$0 - JBE DONE - BYTESBETWEEN1AND255: - CMPQ R9,$64 - JAE NOCOPY - MOVQ DI,DX - LEAQ 360(R12),DI - MOVQ R9,CX +BYTESBETWEEN1AND255: + CMPQ R9, $0x40 + JAE NOCOPY + MOVQ DI, DX + LEAQ 360(R12), DI + MOVQ R9, CX REP; MOVSB - LEAQ 360(R12),DI - LEAQ 360(R12),SI - NOCOPY: - MOVQ R9,352(R12) - MOVOA 48(R12),X0 - MOVOA 0(R12),X1 - MOVOA 16(R12),X2 - MOVOA 32(R12),X3 - MOVOA X1,X4 - MOVQ $20,CX - MAINLOOP2: - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X3 - PXOR X6,X3 - PADDL X3,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL $0X93,X3,X3 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X1 - PSHUFL $0X4E,X2,X2 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X1,X1 - PXOR X6,X0 - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X1 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL $0X93,X1,X1 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X3 - PSHUFL $0X4E,X2,X2 - PXOR X6,X3 - PADDL X3,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X3,X3 - PXOR X6,X0 - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X3 - PXOR X6,X3 - PADDL X3,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL $0X93,X3,X3 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X1 - PSHUFL $0X4E,X2,X2 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X1,X1 - PXOR X6,X0 - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X1 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL $0X93,X1,X1 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X3 - PSHUFL $0X4E,X2,X2 - PXOR X6,X3 - SUBQ $4,CX - PADDL X3,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PXOR X7,X7 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X3,X3 - PXOR X6,X0 - JA MAINLOOP2 - PADDL 48(R12),X0 - PADDL 0(R12),X1 - PADDL 16(R12),X2 - PADDL 32(R12),X3 - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X2,X2 - PSHUFL $0X39,X3,X3 - XORL 0(SI),CX - XORL 48(SI),R8 - XORL 32(SI),R9 - XORL 16(SI),AX - MOVL CX,0(DI) - MOVL R8,48(DI) - MOVL R9,32(DI) - MOVL AX,16(DI) - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X2,X2 - PSHUFL $0X39,X3,X3 - XORL 20(SI),CX - XORL 4(SI),R8 - XORL 52(SI),R9 - XORL 36(SI),AX - MOVL CX,20(DI) - MOVL R8,4(DI) - MOVL R9,52(DI) - MOVL AX,36(DI) - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X2,X2 - PSHUFL $0X39,X3,X3 - XORL 40(SI),CX - XORL 24(SI),R8 - XORL 8(SI),R9 - XORL 56(SI),AX - MOVL CX,40(DI) - MOVL R8,24(DI) - MOVL R9,8(DI) - MOVL AX,56(DI) - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - XORL 60(SI),CX - XORL 44(SI),R8 - XORL 28(SI),R9 - XORL 12(SI),AX - MOVL CX,60(DI) - MOVL R8,44(DI) - MOVL R9,28(DI) - MOVL AX,12(DI) - MOVQ 352(R12),R9 - MOVL 16(R12),CX - MOVL 36 (R12),R8 - ADDQ $1,CX - SHLQ $32,R8 - ADDQ R8,CX - MOVQ CX,R8 - SHRQ $32,R8 - MOVL CX,16(R12) - MOVL R8, 36 (R12) - CMPQ R9,$64 - JA BYTESATLEAST65 - JAE BYTESATLEAST64 - MOVQ DI,SI - MOVQ DX,DI - MOVQ R9,CX + LEAQ 360(R12), DI + LEAQ 360(R12), SI + +NOCOPY: + MOVQ R9, 352(R12) + MOVOA 48(R12), X0 + MOVOA (R12), X1 + MOVOA 16(R12), X2 + MOVOA 32(R12), X3 + MOVOA X1, X4 + MOVQ $0x00000014, CX + +MAINLOOP2: + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X3 + PXOR X6, X3 + PADDL X3, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR X5, X2 + PSHUFL $0x93, X3, X3 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X1 + PSHUFL $0x4e, X2, X2 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X1, X1 + PXOR X6, X0 + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X1 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR X5, X2 + PSHUFL $0x93, X1, X1 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X3 + PSHUFL $0x4e, X2, X2 + PXOR X6, X3 + PADDL X3, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X3, X3 + PXOR X6, X0 + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X3 + PXOR X6, X3 + PADDL X3, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR X5, X2 + PSHUFL $0x93, X3, X3 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X1 + PSHUFL $0x4e, X2, X2 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X1, X1 + PXOR X6, X0 + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X1 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR X5, X2 + PSHUFL $0x93, X1, X1 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X3 + PSHUFL $0x4e, X2, X2 + PXOR X6, X3 + SUBQ $0x04, CX + PADDL X3, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PXOR X7, X7 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X3, X3 + PXOR X6, X0 + JA MAINLOOP2 + PADDL 48(R12), X0 + PADDL (R12), X1 + PADDL 16(R12), X2 + PADDL 32(R12), X3 + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X2, X2 + PSHUFL $0x39, X3, X3 + XORL (SI), CX + XORL 48(SI), R8 + XORL 32(SI), R9 + XORL 16(SI), AX + MOVL CX, (DI) + MOVL R8, 48(DI) + MOVL R9, 32(DI) + MOVL AX, 16(DI) + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X2, X2 + PSHUFL $0x39, X3, X3 + XORL 20(SI), CX + XORL 4(SI), R8 + XORL 52(SI), R9 + XORL 36(SI), AX + MOVL CX, 20(DI) + MOVL R8, 4(DI) + MOVL R9, 52(DI) + MOVL AX, 36(DI) + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X2, X2 + PSHUFL $0x39, X3, X3 + XORL 40(SI), CX + XORL 24(SI), R8 + XORL 8(SI), R9 + XORL 56(SI), AX + MOVL CX, 40(DI) + MOVL R8, 24(DI) + MOVL R9, 8(DI) + MOVL AX, 56(DI) + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + XORL 60(SI), CX + XORL 44(SI), R8 + XORL 28(SI), R9 + XORL 12(SI), AX + MOVL CX, 60(DI) + MOVL R8, 44(DI) + MOVL R9, 28(DI) + MOVL AX, 12(DI) + MOVQ 352(R12), R9 + MOVL 16(R12), CX + MOVL 36(R12), R8 + ADDQ $0x01, CX + SHLQ $0x20, R8 + ADDQ R8, CX + MOVQ CX, R8 + SHRQ $0x20, R8 + MOVL CX, 16(R12) + MOVL R8, 36(R12) + CMPQ R9, $0x40 + JA BYTESATLEAST65 + JAE BYTESATLEAST64 + MOVQ DI, SI + MOVQ DX, DI + MOVQ R9, CX REP; MOVSB - BYTESATLEAST64: - DONE: + +BYTESATLEAST64: +DONE: RET - BYTESATLEAST65: - SUBQ $64,R9 - ADDQ $64,DI - ADDQ $64,SI - JMP BYTESBETWEEN1AND255 + +BYTESATLEAST65: + SUBQ $0x40, R9 + ADDQ $0x40, DI + ADDQ $0x40, SI + JMP BYTESBETWEEN1AND255 diff --git a/vendor/golang.org/x/crypto/salsa20/salsa20.go b/vendor/golang.org/x/crypto/salsa20/salsa20.go index 8f4f896..e75c934 100644 --- a/vendor/golang.org/x/crypto/salsa20/salsa20.go +++ b/vendor/golang.org/x/crypto/salsa20/salsa20.go @@ -19,7 +19,7 @@ This package also implements XSalsa20: a version of Salsa20 with a 24-byte nonce as specified in https://cr.yp.to/snuffle/xsalsa-20081128.pdf. Simply passing a 24-byte slice as the nonce triggers XSalsa20. */ -package salsa20 // import "golang.org/x/crypto/salsa20" +package salsa20 // TODO(agl): implement XORKeyStream12 and XORKeyStream8 - the reduced round variants of Salsa20. diff --git a/vendor/golang.org/x/crypto/twofish/twofish.go b/vendor/golang.org/x/crypto/twofish/twofish.go index e4eeae1..6d0a302 100644 --- a/vendor/golang.org/x/crypto/twofish/twofish.go +++ b/vendor/golang.org/x/crypto/twofish/twofish.go @@ -9,7 +9,7 @@ // implementation. Instead, use AES (from crypto/aes, if necessary in an AEAD // mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from // golang.org/x/crypto/chacha20poly1305). -package twofish // import "golang.org/x/crypto/twofish" +package twofish // Twofish is defined in https://www.schneier.com/paper-twofish-paper.pdf [TWOFISH] diff --git a/vendor/golang.org/x/crypto/xtea/cipher.go b/vendor/golang.org/x/crypto/xtea/cipher.go index a4c2fd0..7b4f8aa 100644 --- a/vendor/golang.org/x/crypto/xtea/cipher.go +++ b/vendor/golang.org/x/crypto/xtea/cipher.go @@ -12,7 +12,7 @@ // Deprecated: any new system should use AES (from crypto/aes, if necessary in // an AEAD mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from // golang.org/x/crypto/chacha20poly1305). -package xtea // import "golang.org/x/crypto/xtea" +package xtea // For details, see http://www.cix.co.uk/~klockstone/xtea.pdf diff --git a/vendor/golang.org/x/exp/LICENSE b/vendor/golang.org/x/exp/LICENSE index 6a66aea..2a7cf70 100644 --- a/vendor/golang.org/x/exp/LICENSE +++ b/vendor/golang.org/x/exp/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. +Copyright 2009 The Go Authors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer. copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Google Inc. nor the names of its + * Neither the name of Google LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/vendor/golang.org/x/mod/LICENSE b/vendor/golang.org/x/mod/LICENSE index 6a66aea..2a7cf70 100644 --- a/vendor/golang.org/x/mod/LICENSE +++ b/vendor/golang.org/x/mod/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. +Copyright 2009 The Go Authors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer. copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Google Inc. nor the names of its + * Neither the name of Google LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/vendor/golang.org/x/mod/modfile/read.go b/vendor/golang.org/x/mod/modfile/read.go index 5b5bb5e..de1b982 100644 --- a/vendor/golang.org/x/mod/modfile/read.go +++ b/vendor/golang.org/x/mod/modfile/read.go @@ -225,9 +225,10 @@ func (x *FileSyntax) Cleanup() { if ww == 0 { continue } - if ww == 1 { - // Collapse block into single line. - line := &Line{ + if ww == 1 && len(stmt.RParen.Comments.Before) == 0 { + // Collapse block into single line but keep the Line reference used by the + // parsed File structure. + *stmt.Line[0] = Line{ Comments: Comments{ Before: commentsAdd(stmt.Before, stmt.Line[0].Before), Suffix: commentsAdd(stmt.Line[0].Suffix, stmt.Suffix), @@ -235,7 +236,7 @@ func (x *FileSyntax) Cleanup() { }, Token: stringsAdd(stmt.Token, stmt.Line[0].Token), } - x.Stmt[w] = line + x.Stmt[w] = stmt.Line[0] w++ continue } diff --git a/vendor/golang.org/x/mod/modfile/rule.go b/vendor/golang.org/x/mod/modfile/rule.go index 930b6c5..3e4a1d0 100644 --- a/vendor/golang.org/x/mod/modfile/rule.go +++ b/vendor/golang.org/x/mod/modfile/rule.go @@ -38,10 +38,12 @@ type File struct { Module *Module Go *Go Toolchain *Toolchain + Godebug []*Godebug Require []*Require Exclude []*Exclude Replace []*Replace Retract []*Retract + Tool []*Tool Syntax *FileSyntax } @@ -65,6 +67,13 @@ type Toolchain struct { Syntax *Line } +// A Godebug is a single godebug key=value statement. +type Godebug struct { + Key string + Value string + Syntax *Line +} + // An Exclude is a single exclude statement. type Exclude struct { Mod module.Version @@ -85,6 +94,12 @@ type Retract struct { Syntax *Line } +// A Tool is a single tool statement. +type Tool struct { + Path string + Syntax *Line +} + // A VersionInterval represents a range of versions with upper and lower bounds. // Intervals are closed: both bounds are included. When Low is equal to High, // the interval may refer to a single version ('v1.2.3') or an interval @@ -289,7 +304,7 @@ func parseToFile(file string, data []byte, fix VersionFixer, strict bool) (parse }) } continue - case "module", "require", "exclude", "replace", "retract": + case "module", "godebug", "require", "exclude", "replace", "retract", "tool": for _, l := range x.Line { f.add(&errs, x, l, x.Token[0], l.Token, fix, strict) } @@ -308,6 +323,9 @@ var laxGoVersionRE = lazyregexp.New(`^v?(([1-9][0-9]*)\.(0|[1-9][0-9]*))([^0-9]. // Toolchains must be named beginning with `go1`, // like "go1.20.3" or "go1.20.3-gccgo". As a special case, "default" is also permitted. +// Note that this regexp is a much looser condition than go/version.IsValid, +// for forward compatibility. +// (This code has to be work to identify new toolchains even if we tweak the syntax in the future.) var ToolchainRE = lazyregexp.New(`^default$|^go1($|\.)`) func (f *File) add(errs *ErrorList, block *LineBlock, line *Line, verb string, args []string, fix VersionFixer, strict bool) { @@ -367,7 +385,7 @@ func (f *File) add(errs *ErrorList, block *LineBlock, line *Line, verb string, a } } if !fixed { - errorf("invalid go version '%s': must match format 1.23", args[0]) + errorf("invalid go version '%s': must match format 1.23.0", args[0]) return } } @@ -383,8 +401,8 @@ func (f *File) add(errs *ErrorList, block *LineBlock, line *Line, verb string, a if len(args) != 1 { errorf("toolchain directive expects exactly one argument") return - } else if strict && !ToolchainRE.MatchString(args[0]) { - errorf("invalid toolchain version '%s': must match format go1.23 or local", args[0]) + } else if !ToolchainRE.MatchString(args[0]) { + errorf("invalid toolchain version '%s': must match format go1.23.0 or default", args[0]) return } f.Toolchain = &Toolchain{Syntax: line} @@ -411,6 +429,22 @@ func (f *File) add(errs *ErrorList, block *LineBlock, line *Line, verb string, a } f.Module.Mod = module.Version{Path: s} + case "godebug": + if len(args) != 1 || strings.ContainsAny(args[0], "\"`',") { + errorf("usage: godebug key=value") + return + } + key, value, ok := strings.Cut(args[0], "=") + if !ok { + errorf("usage: godebug key=value") + return + } + f.Godebug = append(f.Godebug, &Godebug{ + Key: key, + Value: value, + Syntax: line, + }) + case "require", "exclude": if len(args) != 2 { errorf("usage: %s module/path v1.2.3", verb) @@ -482,6 +516,21 @@ func (f *File) add(errs *ErrorList, block *LineBlock, line *Line, verb string, a Syntax: line, } f.Retract = append(f.Retract, retract) + + case "tool": + if len(args) != 1 { + errorf("tool directive expects exactly one argument") + return + } + s, err := parseString(&args[0]) + if err != nil { + errorf("invalid quoted string: %v", err) + return + } + f.Tool = append(f.Tool, &Tool{ + Path: s, + Syntax: line, + }) } } @@ -542,7 +591,7 @@ func parseReplace(filename string, line *Line, verb string, args []string, fix V if strings.Contains(ns, "@") { return nil, errorf("replacement module must match format 'path version', not 'path@version'") } - return nil, errorf("replacement module without version must be directory path (rooted or starting with ./ or ../)") + return nil, errorf("replacement module without version must be directory path (rooted or starting with . or ..)") } if filepath.Separator == '/' && strings.Contains(ns, `\`) { return nil, errorf("replacement directory appears to be Windows path (on a non-windows system)") @@ -555,7 +604,6 @@ func parseReplace(filename string, line *Line, verb string, args []string, fix V } if IsDirectoryPath(ns) { return nil, errorf("replacement module directory path %q cannot have version", ns) - } } return &Replace{ @@ -631,7 +679,7 @@ func (f *WorkFile) add(errs *ErrorList, line *Line, verb string, args []string, errorf("go directive expects exactly one argument") return } else if !GoVersionRE.MatchString(args[0]) { - errorf("invalid go version '%s': must match format 1.23", args[0]) + errorf("invalid go version '%s': must match format 1.23.0", args[0]) return } @@ -647,13 +695,29 @@ func (f *WorkFile) add(errs *ErrorList, line *Line, verb string, args []string, errorf("toolchain directive expects exactly one argument") return } else if !ToolchainRE.MatchString(args[0]) { - errorf("invalid toolchain version '%s': must match format go1.23 or local", args[0]) + errorf("invalid toolchain version '%s': must match format go1.23.0 or default", args[0]) return } f.Toolchain = &Toolchain{Syntax: line} f.Toolchain.Name = args[0] + case "godebug": + if len(args) != 1 || strings.ContainsAny(args[0], "\"`',") { + errorf("usage: godebug key=value") + return + } + key, value, ok := strings.Cut(args[0], "=") + if !ok { + errorf("usage: godebug key=value") + return + } + f.Godebug = append(f.Godebug, &Godebug{ + Key: key, + Value: value, + Syntax: line, + }) + case "use": if len(args) != 1 { errorf("usage: %s local/dir", verb) @@ -679,14 +743,15 @@ func (f *WorkFile) add(errs *ErrorList, line *Line, verb string, args []string, } } -// IsDirectoryPath reports whether the given path should be interpreted -// as a directory path. Just like on the go command line, relative paths +// IsDirectoryPath reports whether the given path should be interpreted as a directory path. +// Just like on the go command line, relative paths starting with a '.' or '..' path component // and rooted paths are directory paths; the rest are module paths. func IsDirectoryPath(ns string) bool { // Because go.mod files can move from one system to another, // we check all known path syntaxes, both Unix and Windows. - return strings.HasPrefix(ns, "./") || strings.HasPrefix(ns, "../") || strings.HasPrefix(ns, "/") || - strings.HasPrefix(ns, `.\`) || strings.HasPrefix(ns, `..\`) || strings.HasPrefix(ns, `\`) || + return ns == "." || strings.HasPrefix(ns, "./") || strings.HasPrefix(ns, `.\`) || + ns == ".." || strings.HasPrefix(ns, "../") || strings.HasPrefix(ns, `..\`) || + strings.HasPrefix(ns, "/") || strings.HasPrefix(ns, `\`) || len(ns) >= 2 && ('A' <= ns[0] && ns[0] <= 'Z' || 'a' <= ns[0] && ns[0] <= 'z') && ns[1] == ':' } @@ -928,6 +993,15 @@ func (f *File) Format() ([]byte, error) { // Cleanup cleans out all the cleared entries. func (f *File) Cleanup() { w := 0 + for _, g := range f.Godebug { + if g.Key != "" { + f.Godebug[w] = g + w++ + } + } + f.Godebug = f.Godebug[:w] + + w = 0 for _, r := range f.Require { if r.Mod.Path != "" { f.Require[w] = r @@ -974,6 +1048,8 @@ func (f *File) AddGoStmt(version string) error { var hint Expr if f.Module != nil && f.Module.Syntax != nil { hint = f.Module.Syntax + } else if f.Syntax == nil { + f.Syntax = new(FileSyntax) } f.Go = &Go{ Version: version, @@ -1024,6 +1100,45 @@ func (f *File) AddToolchainStmt(name string) error { return nil } +// AddGodebug sets the first godebug line for key to value, +// preserving any existing comments for that line and removing all +// other godebug lines for key. +// +// If no line currently exists for key, AddGodebug adds a new line +// at the end of the last godebug block. +func (f *File) AddGodebug(key, value string) error { + need := true + for _, g := range f.Godebug { + if g.Key == key { + if need { + g.Value = value + f.Syntax.updateLine(g.Syntax, "godebug", key+"="+value) + need = false + } else { + g.Syntax.markRemoved() + *g = Godebug{} + } + } + } + + if need { + f.addNewGodebug(key, value) + } + return nil +} + +// addNewGodebug adds a new godebug key=value line at the end +// of the last godebug block, regardless of any existing godebug lines for key. +func (f *File) addNewGodebug(key, value string) { + line := f.Syntax.addLine(nil, "godebug", key+"="+value) + g := &Godebug{ + Key: key, + Value: value, + Syntax: line, + } + f.Godebug = append(f.Godebug, g) +} + // AddRequire sets the first require line for path to version vers, // preserving any existing comments for that line and removing all // other lines for path. @@ -1331,6 +1446,16 @@ func (f *File) SetRequireSeparateIndirect(req []*Require) { f.SortBlocks() } +func (f *File) DropGodebug(key string) error { + for _, g := range f.Godebug { + if g.Key == key { + g.Syntax.markRemoved() + *g = Godebug{} + } + } + return nil +} + func (f *File) DropRequire(path string) error { for _, r := range f.Require { if r.Mod.Path == path { @@ -1464,6 +1589,36 @@ func (f *File) DropRetract(vi VersionInterval) error { return nil } +// AddTool adds a new tool directive with the given path. +// It does nothing if the tool line already exists. +func (f *File) AddTool(path string) error { + for _, t := range f.Tool { + if t.Path == path { + return nil + } + } + + f.Tool = append(f.Tool, &Tool{ + Path: path, + Syntax: f.Syntax.addLine(nil, "tool", path), + }) + + f.SortBlocks() + return nil +} + +// RemoveTool removes a tool directive with the given path. +// It does nothing if no such tool directive exists. +func (f *File) DropTool(path string) error { + for _, t := range f.Tool { + if t.Path == path { + t.Syntax.markRemoved() + *t = Tool{} + } + } + return nil +} + func (f *File) SortBlocks() { f.removeDups() // otherwise sorting is unsafe @@ -1490,9 +1645,9 @@ func (f *File) SortBlocks() { } } -// removeDups removes duplicate exclude and replace directives. +// removeDups removes duplicate exclude, replace and tool directives. // -// Earlier exclude directives take priority. +// Earlier exclude and tool directives take priority. // // Later replace directives take priority. // @@ -1502,10 +1657,10 @@ func (f *File) SortBlocks() { // retract directives are not de-duplicated since comments are // meaningful, and versions may be retracted multiple times. func (f *File) removeDups() { - removeDups(f.Syntax, &f.Exclude, &f.Replace) + removeDups(f.Syntax, &f.Exclude, &f.Replace, &f.Tool) } -func removeDups(syntax *FileSyntax, exclude *[]*Exclude, replace *[]*Replace) { +func removeDups(syntax *FileSyntax, exclude *[]*Exclude, replace *[]*Replace, tool *[]*Tool) { kill := make(map[*Line]bool) // Remove duplicate excludes. @@ -1546,6 +1701,24 @@ func removeDups(syntax *FileSyntax, exclude *[]*Exclude, replace *[]*Replace) { } *replace = repl + if tool != nil { + haveTool := make(map[string]bool) + for _, t := range *tool { + if haveTool[t.Path] { + kill[t.Syntax] = true + continue + } + haveTool[t.Path] = true + } + var newTool []*Tool + for _, t := range *tool { + if !kill[t.Syntax] { + newTool = append(newTool, t) + } + } + *tool = newTool + } + // Duplicate require and retract directives are not removed. // Drop killed statements from the syntax tree. diff --git a/vendor/golang.org/x/mod/modfile/work.go b/vendor/golang.org/x/mod/modfile/work.go index d7b9937..5387d0c 100644 --- a/vendor/golang.org/x/mod/modfile/work.go +++ b/vendor/golang.org/x/mod/modfile/work.go @@ -14,6 +14,7 @@ import ( type WorkFile struct { Go *Go Toolchain *Toolchain + Godebug []*Godebug Use []*Use Replace []*Replace @@ -68,7 +69,7 @@ func ParseWork(file string, data []byte, fix VersionFixer) (*WorkFile, error) { Err: fmt.Errorf("unknown block type: %s", strings.Join(x.Token, " ")), }) continue - case "use", "replace": + case "godebug", "use", "replace": for _, l := range x.Line { f.add(&errs, l, x.Token[0], l.Token, fix) } @@ -184,6 +185,55 @@ func (f *WorkFile) DropToolchainStmt() { } } +// AddGodebug sets the first godebug line for key to value, +// preserving any existing comments for that line and removing all +// other godebug lines for key. +// +// If no line currently exists for key, AddGodebug adds a new line +// at the end of the last godebug block. +func (f *WorkFile) AddGodebug(key, value string) error { + need := true + for _, g := range f.Godebug { + if g.Key == key { + if need { + g.Value = value + f.Syntax.updateLine(g.Syntax, "godebug", key+"="+value) + need = false + } else { + g.Syntax.markRemoved() + *g = Godebug{} + } + } + } + + if need { + f.addNewGodebug(key, value) + } + return nil +} + +// addNewGodebug adds a new godebug key=value line at the end +// of the last godebug block, regardless of any existing godebug lines for key. +func (f *WorkFile) addNewGodebug(key, value string) { + line := f.Syntax.addLine(nil, "godebug", key+"="+value) + g := &Godebug{ + Key: key, + Value: value, + Syntax: line, + } + f.Godebug = append(f.Godebug, g) +} + +func (f *WorkFile) DropGodebug(key string) error { + for _, g := range f.Godebug { + if g.Key == key { + g.Syntax.markRemoved() + *g = Godebug{} + } + } + return nil +} + func (f *WorkFile) AddUse(diskPath, modulePath string) error { need := true for _, d := range f.Use { @@ -281,5 +331,5 @@ func (f *WorkFile) SortBlocks() { // retract directives are not de-duplicated since comments are // meaningful, and versions may be retracted multiple times. func (f *WorkFile) removeDups() { - removeDups(f.Syntax, nil, &f.Replace) + removeDups(f.Syntax, nil, &f.Replace, nil) } diff --git a/vendor/golang.org/x/net/LICENSE b/vendor/golang.org/x/net/LICENSE index 6a66aea..2a7cf70 100644 --- a/vendor/golang.org/x/net/LICENSE +++ b/vendor/golang.org/x/net/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. +Copyright 2009 The Go Authors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer. copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Google Inc. nor the names of its + * Neither the name of Google LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/vendor/golang.org/x/sync/LICENSE b/vendor/golang.org/x/sync/LICENSE new file mode 100644 index 0000000..2a7cf70 --- /dev/null +++ b/vendor/golang.org/x/sync/LICENSE @@ -0,0 +1,27 @@ +Copyright 2009 The Go Authors. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google LLC nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/golang.org/x/sync/PATENTS b/vendor/golang.org/x/sync/PATENTS new file mode 100644 index 0000000..7330990 --- /dev/null +++ b/vendor/golang.org/x/sync/PATENTS @@ -0,0 +1,22 @@ +Additional IP Rights Grant (Patents) + +"This implementation" means the copyrightable works distributed by +Google as part of the Go project. + +Google hereby grants to You a perpetual, worldwide, non-exclusive, +no-charge, royalty-free, irrevocable (except as stated in this section) +patent license to make, have made, use, offer to sell, sell, import, +transfer and otherwise run, modify and propagate the contents of this +implementation of Go, where such license applies only to those patent +claims, both currently owned or controlled by Google and acquired in +the future, licensable by Google that are necessarily infringed by this +implementation of Go. This grant does not include claims that would be +infringed only as a consequence of further modification of this +implementation. If you or your agent or exclusive licensee institute or +order or agree to the institution of patent litigation against any +entity (including a cross-claim or counterclaim in a lawsuit) alleging +that this implementation of Go or any code incorporated within this +implementation of Go constitutes direct or contributory patent +infringement, or inducement of patent infringement, then any patent +rights granted to you under this License for this implementation of Go +shall terminate as of the date such litigation is filed. diff --git a/vendor/golang.org/x/sync/errgroup/errgroup.go b/vendor/golang.org/x/sync/errgroup/errgroup.go new file mode 100644 index 0000000..948a3ee --- /dev/null +++ b/vendor/golang.org/x/sync/errgroup/errgroup.go @@ -0,0 +1,135 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package errgroup provides synchronization, error propagation, and Context +// cancelation for groups of goroutines working on subtasks of a common task. +// +// [errgroup.Group] is related to [sync.WaitGroup] but adds handling of tasks +// returning errors. +package errgroup + +import ( + "context" + "fmt" + "sync" +) + +type token struct{} + +// A Group is a collection of goroutines working on subtasks that are part of +// the same overall task. +// +// A zero Group is valid, has no limit on the number of active goroutines, +// and does not cancel on error. +type Group struct { + cancel func(error) + + wg sync.WaitGroup + + sem chan token + + errOnce sync.Once + err error +} + +func (g *Group) done() { + if g.sem != nil { + <-g.sem + } + g.wg.Done() +} + +// WithContext returns a new Group and an associated Context derived from ctx. +// +// The derived Context is canceled the first time a function passed to Go +// returns a non-nil error or the first time Wait returns, whichever occurs +// first. +func WithContext(ctx context.Context) (*Group, context.Context) { + ctx, cancel := withCancelCause(ctx) + return &Group{cancel: cancel}, ctx +} + +// Wait blocks until all function calls from the Go method have returned, then +// returns the first non-nil error (if any) from them. +func (g *Group) Wait() error { + g.wg.Wait() + if g.cancel != nil { + g.cancel(g.err) + } + return g.err +} + +// Go calls the given function in a new goroutine. +// It blocks until the new goroutine can be added without the number of +// active goroutines in the group exceeding the configured limit. +// +// The first call to return a non-nil error cancels the group's context, if the +// group was created by calling WithContext. The error will be returned by Wait. +func (g *Group) Go(f func() error) { + if g.sem != nil { + g.sem <- token{} + } + + g.wg.Add(1) + go func() { + defer g.done() + + if err := f(); err != nil { + g.errOnce.Do(func() { + g.err = err + if g.cancel != nil { + g.cancel(g.err) + } + }) + } + }() +} + +// TryGo calls the given function in a new goroutine only if the number of +// active goroutines in the group is currently below the configured limit. +// +// The return value reports whether the goroutine was started. +func (g *Group) TryGo(f func() error) bool { + if g.sem != nil { + select { + case g.sem <- token{}: + // Note: this allows barging iff channels in general allow barging. + default: + return false + } + } + + g.wg.Add(1) + go func() { + defer g.done() + + if err := f(); err != nil { + g.errOnce.Do(func() { + g.err = err + if g.cancel != nil { + g.cancel(g.err) + } + }) + } + }() + return true +} + +// SetLimit limits the number of active goroutines in this group to at most n. +// A negative value indicates no limit. +// +// Any subsequent call to the Go method will block until it can add an active +// goroutine without exceeding the configured limit. +// +// The limit must not be modified while any goroutines in the group are active. +func (g *Group) SetLimit(n int) { + if n < 0 { + g.sem = nil + return + } + if len(g.sem) != 0 { + panic(fmt.Errorf("errgroup: modify limit while %v goroutines in the group are still active", len(g.sem))) + } + g.sem = make(chan token, n) +} diff --git a/vendor/golang.org/x/sync/errgroup/go120.go b/vendor/golang.org/x/sync/errgroup/go120.go new file mode 100644 index 0000000..f93c740 --- /dev/null +++ b/vendor/golang.org/x/sync/errgroup/go120.go @@ -0,0 +1,13 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build go1.20 + +package errgroup + +import "context" + +func withCancelCause(parent context.Context) (context.Context, func(error)) { + return context.WithCancelCause(parent) +} diff --git a/vendor/golang.org/x/sync/errgroup/pre_go120.go b/vendor/golang.org/x/sync/errgroup/pre_go120.go new file mode 100644 index 0000000..88ce334 --- /dev/null +++ b/vendor/golang.org/x/sync/errgroup/pre_go120.go @@ -0,0 +1,14 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !go1.20 + +package errgroup + +import "context" + +func withCancelCause(parent context.Context) (context.Context, func(error)) { + ctx, cancel := context.WithCancel(parent) + return ctx, func(error) { cancel() } +} diff --git a/vendor/golang.org/x/sync/semaphore/semaphore.go b/vendor/golang.org/x/sync/semaphore/semaphore.go new file mode 100644 index 0000000..b618162 --- /dev/null +++ b/vendor/golang.org/x/sync/semaphore/semaphore.go @@ -0,0 +1,160 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package semaphore provides a weighted semaphore implementation. +package semaphore // import "golang.org/x/sync/semaphore" + +import ( + "container/list" + "context" + "sync" +) + +type waiter struct { + n int64 + ready chan<- struct{} // Closed when semaphore acquired. +} + +// NewWeighted creates a new weighted semaphore with the given +// maximum combined weight for concurrent access. +func NewWeighted(n int64) *Weighted { + w := &Weighted{size: n} + return w +} + +// Weighted provides a way to bound concurrent access to a resource. +// The callers can request access with a given weight. +type Weighted struct { + size int64 + cur int64 + mu sync.Mutex + waiters list.List +} + +// Acquire acquires the semaphore with a weight of n, blocking until resources +// are available or ctx is done. On success, returns nil. On failure, returns +// ctx.Err() and leaves the semaphore unchanged. +func (s *Weighted) Acquire(ctx context.Context, n int64) error { + done := ctx.Done() + + s.mu.Lock() + select { + case <-done: + // ctx becoming done has "happened before" acquiring the semaphore, + // whether it became done before the call began or while we were + // waiting for the mutex. We prefer to fail even if we could acquire + // the mutex without blocking. + s.mu.Unlock() + return ctx.Err() + default: + } + if s.size-s.cur >= n && s.waiters.Len() == 0 { + // Since we hold s.mu and haven't synchronized since checking done, if + // ctx becomes done before we return here, it becoming done must have + // "happened concurrently" with this call - it cannot "happen before" + // we return in this branch. So, we're ok to always acquire here. + s.cur += n + s.mu.Unlock() + return nil + } + + if n > s.size { + // Don't make other Acquire calls block on one that's doomed to fail. + s.mu.Unlock() + <-done + return ctx.Err() + } + + ready := make(chan struct{}) + w := waiter{n: n, ready: ready} + elem := s.waiters.PushBack(w) + s.mu.Unlock() + + select { + case <-done: + s.mu.Lock() + select { + case <-ready: + // Acquired the semaphore after we were canceled. + // Pretend we didn't and put the tokens back. + s.cur -= n + s.notifyWaiters() + default: + isFront := s.waiters.Front() == elem + s.waiters.Remove(elem) + // If we're at the front and there're extra tokens left, notify other waiters. + if isFront && s.size > s.cur { + s.notifyWaiters() + } + } + s.mu.Unlock() + return ctx.Err() + + case <-ready: + // Acquired the semaphore. Check that ctx isn't already done. + // We check the done channel instead of calling ctx.Err because we + // already have the channel, and ctx.Err is O(n) with the nesting + // depth of ctx. + select { + case <-done: + s.Release(n) + return ctx.Err() + default: + } + return nil + } +} + +// TryAcquire acquires the semaphore with a weight of n without blocking. +// On success, returns true. On failure, returns false and leaves the semaphore unchanged. +func (s *Weighted) TryAcquire(n int64) bool { + s.mu.Lock() + success := s.size-s.cur >= n && s.waiters.Len() == 0 + if success { + s.cur += n + } + s.mu.Unlock() + return success +} + +// Release releases the semaphore with a weight of n. +func (s *Weighted) Release(n int64) { + s.mu.Lock() + s.cur -= n + if s.cur < 0 { + s.mu.Unlock() + panic("semaphore: released more than held") + } + s.notifyWaiters() + s.mu.Unlock() +} + +func (s *Weighted) notifyWaiters() { + for { + next := s.waiters.Front() + if next == nil { + break // No more waiters blocked. + } + + w := next.Value.(waiter) + if s.size-s.cur < w.n { + // Not enough tokens for the next waiter. We could keep going (to try to + // find a waiter with a smaller request), but under load that could cause + // starvation for large requests; instead, we leave all remaining waiters + // blocked. + // + // Consider a semaphore used as a read-write lock, with N tokens, N + // readers, and one writer. Each reader can Acquire(1) to obtain a read + // lock. The writer can Acquire(N) to obtain a write lock, excluding all + // of the readers. If we allow the readers to jump ahead in the queue, + // the writer will starve — there is always one token available for every + // reader. + break + } + + s.cur += w.n + s.waiters.Remove(next) + close(w.ready) + } +} diff --git a/vendor/golang.org/x/sys/LICENSE b/vendor/golang.org/x/sys/LICENSE index 6a66aea..2a7cf70 100644 --- a/vendor/golang.org/x/sys/LICENSE +++ b/vendor/golang.org/x/sys/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. +Copyright 2009 The Go Authors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer. copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Google Inc. nor the names of its + * Neither the name of Google LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/vendor/golang.org/x/sys/cpu/cpu.go b/vendor/golang.org/x/sys/cpu/cpu.go index 8fa707a..02609d5 100644 --- a/vendor/golang.org/x/sys/cpu/cpu.go +++ b/vendor/golang.org/x/sys/cpu/cpu.go @@ -105,6 +105,8 @@ var ARM64 struct { HasSVE bool // Scalable Vector Extensions HasSVE2 bool // Scalable Vector Extensions 2 HasASIMDFHM bool // Advanced SIMD multiplication FP16 to FP32 + HasDIT bool // Data Independent Timing support + HasI8MM bool // Advanced SIMD Int8 matrix multiplication instructions _ CacheLinePad } @@ -199,6 +201,25 @@ var S390X struct { _ CacheLinePad } +// RISCV64 contains the supported CPU features and performance characteristics for riscv64 +// platforms. The booleans in RISCV64, with the exception of HasFastMisaligned, indicate +// the presence of RISC-V extensions. +// +// It is safe to assume that all the RV64G extensions are supported and so they are omitted from +// this structure. As riscv64 Go programs require at least RV64G, the code that populates +// this structure cannot run successfully if some of the RV64G extensions are missing. +// The struct is padded to avoid false sharing. +var RISCV64 struct { + _ CacheLinePad + HasFastMisaligned bool // Fast misaligned accesses + HasC bool // Compressed instruction-set extension + HasV bool // Vector extension compatible with RVV 1.0 + HasZba bool // Address generation instructions extension + HasZbb bool // Basic bit-manipulation extension + HasZbs bool // Single-bit instructions extension + _ CacheLinePad +} + func init() { archInit() initOptions() diff --git a/vendor/golang.org/x/sys/cpu/cpu_arm64.go b/vendor/golang.org/x/sys/cpu/cpu_arm64.go index 0e27a21..af2aa99 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_arm64.go +++ b/vendor/golang.org/x/sys/cpu/cpu_arm64.go @@ -38,6 +38,8 @@ func initOptions() { {Name: "dcpop", Feature: &ARM64.HasDCPOP}, {Name: "asimddp", Feature: &ARM64.HasASIMDDP}, {Name: "asimdfhm", Feature: &ARM64.HasASIMDFHM}, + {Name: "dit", Feature: &ARM64.HasDIT}, + {Name: "i8mm", Feature: &ARM64.HasI8MM}, } } @@ -145,6 +147,11 @@ func parseARM64SystemRegisters(isar0, isar1, pfr0 uint64) { ARM64.HasLRCPC = true } + switch extractBits(isar1, 52, 55) { + case 1: + ARM64.HasI8MM = true + } + // ID_AA64PFR0_EL1 switch extractBits(pfr0, 16, 19) { case 0: @@ -168,6 +175,11 @@ func parseARM64SystemRegisters(isar0, isar1, pfr0 uint64) { parseARM64SVERegister(getzfr0()) } + + switch extractBits(pfr0, 48, 51) { + case 1: + ARM64.HasDIT = true + } } func parseARM64SVERegister(zfr0 uint64) { diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux_arm64.go b/vendor/golang.org/x/sys/cpu/cpu_linux_arm64.go index 3d386d0..08f35ea 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_linux_arm64.go +++ b/vendor/golang.org/x/sys/cpu/cpu_linux_arm64.go @@ -35,8 +35,10 @@ const ( hwcap_SHA512 = 1 << 21 hwcap_SVE = 1 << 22 hwcap_ASIMDFHM = 1 << 23 + hwcap_DIT = 1 << 24 hwcap2_SVE2 = 1 << 1 + hwcap2_I8MM = 1 << 13 ) // linuxKernelCanEmulateCPUID reports whether we're running @@ -106,9 +108,12 @@ func doinit() { ARM64.HasSHA512 = isSet(hwCap, hwcap_SHA512) ARM64.HasSVE = isSet(hwCap, hwcap_SVE) ARM64.HasASIMDFHM = isSet(hwCap, hwcap_ASIMDFHM) + ARM64.HasDIT = isSet(hwCap, hwcap_DIT) + // HWCAP2 feature bits ARM64.HasSVE2 = isSet(hwCap2, hwcap2_SVE2) + ARM64.HasI8MM = isSet(hwCap2, hwcap2_I8MM) } func isSet(hwc uint, value uint) bool { diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go b/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go index cd63e73..7d902b6 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go +++ b/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build linux && !arm && !arm64 && !mips64 && !mips64le && !ppc64 && !ppc64le && !s390x +//go:build linux && !arm && !arm64 && !mips64 && !mips64le && !ppc64 && !ppc64le && !s390x && !riscv64 package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux_riscv64.go b/vendor/golang.org/x/sys/cpu/cpu_linux_riscv64.go new file mode 100644 index 0000000..cb4a0c5 --- /dev/null +++ b/vendor/golang.org/x/sys/cpu/cpu_linux_riscv64.go @@ -0,0 +1,137 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cpu + +import ( + "syscall" + "unsafe" +) + +// RISC-V extension discovery code for Linux. The approach here is to first try the riscv_hwprobe +// syscall falling back to HWCAP to check for the C extension if riscv_hwprobe is not available. +// +// A note on detection of the Vector extension using HWCAP. +// +// Support for the Vector extension version 1.0 was added to the Linux kernel in release 6.5. +// Support for the riscv_hwprobe syscall was added in 6.4. It follows that if the riscv_hwprobe +// syscall is not available then neither is the Vector extension (which needs kernel support). +// The riscv_hwprobe syscall should then be all we need to detect the Vector extension. +// However, some RISC-V board manufacturers ship boards with an older kernel on top of which +// they have back-ported various versions of the Vector extension patches but not the riscv_hwprobe +// patches. These kernels advertise support for the Vector extension using HWCAP. Falling +// back to HWCAP to detect the Vector extension, if riscv_hwprobe is not available, or simply not +// bothering with riscv_hwprobe at all and just using HWCAP may then seem like an attractive option. +// +// Unfortunately, simply checking the 'V' bit in AT_HWCAP will not work as this bit is used by +// RISC-V board and cloud instance providers to mean different things. The Lichee Pi 4A board +// and the Scaleway RV1 cloud instances use the 'V' bit to advertise their support for the unratified +// 0.7.1 version of the Vector Specification. The Banana Pi BPI-F3 and the CanMV-K230 board use +// it to advertise support for 1.0 of the Vector extension. Versions 0.7.1 and 1.0 of the Vector +// extension are binary incompatible. HWCAP can then not be used in isolation to populate the +// HasV field as this field indicates that the underlying CPU is compatible with RVV 1.0. +// +// There is a way at runtime to distinguish between versions 0.7.1 and 1.0 of the Vector +// specification by issuing a RVV 1.0 vsetvli instruction and checking the vill bit of the vtype +// register. This check would allow us to safely detect version 1.0 of the Vector extension +// with HWCAP, if riscv_hwprobe were not available. However, the check cannot +// be added until the assembler supports the Vector instructions. +// +// Note the riscv_hwprobe syscall does not suffer from these ambiguities by design as all of the +// extensions it advertises support for are explicitly versioned. It's also worth noting that +// the riscv_hwprobe syscall is the only way to detect multi-letter RISC-V extensions, e.g., Zba. +// These cannot be detected using HWCAP and so riscv_hwprobe must be used to detect the majority +// of RISC-V extensions. +// +// Please see https://docs.kernel.org/arch/riscv/hwprobe.html for more information. + +// golang.org/x/sys/cpu is not allowed to depend on golang.org/x/sys/unix so we must +// reproduce the constants, types and functions needed to make the riscv_hwprobe syscall +// here. + +const ( + // Copied from golang.org/x/sys/unix/ztypes_linux_riscv64.go. + riscv_HWPROBE_KEY_IMA_EXT_0 = 0x4 + riscv_HWPROBE_IMA_C = 0x2 + riscv_HWPROBE_IMA_V = 0x4 + riscv_HWPROBE_EXT_ZBA = 0x8 + riscv_HWPROBE_EXT_ZBB = 0x10 + riscv_HWPROBE_EXT_ZBS = 0x20 + riscv_HWPROBE_KEY_CPUPERF_0 = 0x5 + riscv_HWPROBE_MISALIGNED_FAST = 0x3 + riscv_HWPROBE_MISALIGNED_MASK = 0x7 +) + +const ( + // sys_RISCV_HWPROBE is copied from golang.org/x/sys/unix/zsysnum_linux_riscv64.go. + sys_RISCV_HWPROBE = 258 +) + +// riscvHWProbePairs is copied from golang.org/x/sys/unix/ztypes_linux_riscv64.go. +type riscvHWProbePairs struct { + key int64 + value uint64 +} + +const ( + // CPU features + hwcap_RISCV_ISA_C = 1 << ('C' - 'A') +) + +func doinit() { + // A slice of key/value pair structures is passed to the RISCVHWProbe syscall. The key + // field should be initialised with one of the key constants defined above, e.g., + // RISCV_HWPROBE_KEY_IMA_EXT_0. The syscall will set the value field to the appropriate value. + // If the kernel does not recognise a key it will set the key field to -1 and the value field to 0. + + pairs := []riscvHWProbePairs{ + {riscv_HWPROBE_KEY_IMA_EXT_0, 0}, + {riscv_HWPROBE_KEY_CPUPERF_0, 0}, + } + + // This call only indicates that extensions are supported if they are implemented on all cores. + if riscvHWProbe(pairs, 0) { + if pairs[0].key != -1 { + v := uint(pairs[0].value) + RISCV64.HasC = isSet(v, riscv_HWPROBE_IMA_C) + RISCV64.HasV = isSet(v, riscv_HWPROBE_IMA_V) + RISCV64.HasZba = isSet(v, riscv_HWPROBE_EXT_ZBA) + RISCV64.HasZbb = isSet(v, riscv_HWPROBE_EXT_ZBB) + RISCV64.HasZbs = isSet(v, riscv_HWPROBE_EXT_ZBS) + } + if pairs[1].key != -1 { + v := pairs[1].value & riscv_HWPROBE_MISALIGNED_MASK + RISCV64.HasFastMisaligned = v == riscv_HWPROBE_MISALIGNED_FAST + } + } + + // Let's double check with HWCAP if the C extension does not appear to be supported. + // This may happen if we're running on a kernel older than 6.4. + + if !RISCV64.HasC { + RISCV64.HasC = isSet(hwCap, hwcap_RISCV_ISA_C) + } +} + +func isSet(hwc uint, value uint) bool { + return hwc&value != 0 +} + +// riscvHWProbe is a simplified version of the generated wrapper function found in +// golang.org/x/sys/unix/zsyscall_linux_riscv64.go. We simplify it by removing the +// cpuCount and cpus parameters which we do not need. We always want to pass 0 for +// these parameters here so the kernel only reports the extensions that are present +// on all cores. +func riscvHWProbe(pairs []riscvHWProbePairs, flags uint) bool { + var _zero uintptr + var p0 unsafe.Pointer + if len(pairs) > 0 { + p0 = unsafe.Pointer(&pairs[0]) + } else { + p0 = unsafe.Pointer(&_zero) + } + + _, _, e1 := syscall.Syscall6(sys_RISCV_HWPROBE, uintptr(p0), uintptr(len(pairs)), uintptr(0), uintptr(0), uintptr(flags), 0) + return e1 == 0 +} diff --git a/vendor/golang.org/x/sys/cpu/cpu_riscv64.go b/vendor/golang.org/x/sys/cpu/cpu_riscv64.go index 7f0c79c..aca3199 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_riscv64.go +++ b/vendor/golang.org/x/sys/cpu/cpu_riscv64.go @@ -8,4 +8,13 @@ package cpu const cacheLineSize = 64 -func initOptions() {} +func initOptions() { + options = []option{ + {Name: "fastmisaligned", Feature: &RISCV64.HasFastMisaligned}, + {Name: "c", Feature: &RISCV64.HasC}, + {Name: "v", Feature: &RISCV64.HasV}, + {Name: "zba", Feature: &RISCV64.HasZba}, + {Name: "zbb", Feature: &RISCV64.HasZbb}, + {Name: "zbs", Feature: &RISCV64.HasZbs}, + } +} diff --git a/vendor/golang.org/x/sys/execabs/execabs.go b/vendor/golang.org/x/sys/execabs/execabs.go deleted file mode 100644 index 3bf40fd..0000000 --- a/vendor/golang.org/x/sys/execabs/execabs.go +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package execabs is a drop-in replacement for os/exec -// that requires PATH lookups to find absolute paths. -// That is, execabs.Command("cmd") runs the same PATH lookup -// as exec.Command("cmd"), but if the result is a path -// which is relative, the Run and Start methods will report -// an error instead of running the executable. -// -// See https://blog.golang.org/path-security for more information -// about when it may be necessary or appropriate to use this package. -package execabs - -import ( - "context" - "fmt" - "os/exec" - "path/filepath" - "reflect" - "unsafe" -) - -// ErrNotFound is the error resulting if a path search failed to find an executable file. -// It is an alias for exec.ErrNotFound. -var ErrNotFound = exec.ErrNotFound - -// Cmd represents an external command being prepared or run. -// It is an alias for exec.Cmd. -type Cmd = exec.Cmd - -// Error is returned by LookPath when it fails to classify a file as an executable. -// It is an alias for exec.Error. -type Error = exec.Error - -// An ExitError reports an unsuccessful exit by a command. -// It is an alias for exec.ExitError. -type ExitError = exec.ExitError - -func relError(file, path string) error { - return fmt.Errorf("%s resolves to executable in current directory (.%c%s)", file, filepath.Separator, path) -} - -// LookPath searches for an executable named file in the directories -// named by the PATH environment variable. If file contains a slash, -// it is tried directly and the PATH is not consulted. The result will be -// an absolute path. -// -// LookPath differs from exec.LookPath in its handling of PATH lookups, -// which are used for file names without slashes. If exec.LookPath's -// PATH lookup would have returned an executable from the current directory, -// LookPath instead returns an error. -func LookPath(file string) (string, error) { - path, err := exec.LookPath(file) - if err != nil && !isGo119ErrDot(err) { - return "", err - } - if filepath.Base(file) == file && !filepath.IsAbs(path) { - return "", relError(file, path) - } - return path, nil -} - -func fixCmd(name string, cmd *exec.Cmd) { - if filepath.Base(name) == name && !filepath.IsAbs(cmd.Path) && !isGo119ErrFieldSet(cmd) { - // exec.Command was called with a bare binary name and - // exec.LookPath returned a path which is not absolute. - // Set cmd.lookPathErr and clear cmd.Path so that it - // cannot be run. - lookPathErr := (*error)(unsafe.Pointer(reflect.ValueOf(cmd).Elem().FieldByName("lookPathErr").Addr().Pointer())) - if *lookPathErr == nil { - *lookPathErr = relError(name, cmd.Path) - } - cmd.Path = "" - } -} - -// CommandContext is like Command but includes a context. -// -// The provided context is used to kill the process (by calling os.Process.Kill) -// if the context becomes done before the command completes on its own. -func CommandContext(ctx context.Context, name string, arg ...string) *exec.Cmd { - cmd := exec.CommandContext(ctx, name, arg...) - fixCmd(name, cmd) - return cmd - -} - -// Command returns the Cmd struct to execute the named program with the given arguments. -// See exec.Command for most details. -// -// Command differs from exec.Command in its handling of PATH lookups, -// which are used when the program name contains no slashes. -// If exec.Command would have returned an exec.Cmd configured to run an -// executable from the current directory, Command instead -// returns an exec.Cmd that will return an error from Start or Run. -func Command(name string, arg ...string) *exec.Cmd { - cmd := exec.Command(name, arg...) - fixCmd(name, cmd) - return cmd -} diff --git a/vendor/golang.org/x/sys/execabs/execabs_go118.go b/vendor/golang.org/x/sys/execabs/execabs_go118.go deleted file mode 100644 index 5627d70..0000000 --- a/vendor/golang.org/x/sys/execabs/execabs_go118.go +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright 2022 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !go1.19 - -package execabs - -import "os/exec" - -func isGo119ErrDot(err error) bool { - return false -} - -func isGo119ErrFieldSet(cmd *exec.Cmd) bool { - return false -} diff --git a/vendor/golang.org/x/sys/execabs/execabs_go119.go b/vendor/golang.org/x/sys/execabs/execabs_go119.go deleted file mode 100644 index d60ab1b..0000000 --- a/vendor/golang.org/x/sys/execabs/execabs_go119.go +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright 2022 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build go1.19 - -package execabs - -import ( - "errors" - "os/exec" -) - -func isGo119ErrDot(err error) bool { - return errors.Is(err, exec.ErrDot) -} - -func isGo119ErrFieldSet(cmd *exec.Cmd) bool { - return cmd.Err != nil -} diff --git a/vendor/golang.org/x/sys/unix/README.md b/vendor/golang.org/x/sys/unix/README.md index 7d3c060..6e08a76 100644 --- a/vendor/golang.org/x/sys/unix/README.md +++ b/vendor/golang.org/x/sys/unix/README.md @@ -156,7 +156,7 @@ from the generated architecture-specific files listed below, and merge these into a common file for each OS. The merge is performed in the following steps: -1. Construct the set of common code that is idential in all architecture-specific files. +1. Construct the set of common code that is identical in all architecture-specific files. 2. Write this common code to the merged file. 3. Remove the common code from all architecture-specific files. diff --git a/vendor/golang.org/x/sys/unix/mkerrors.sh b/vendor/golang.org/x/sys/unix/mkerrors.sh index fdcaa97..ac54eca 100644 --- a/vendor/golang.org/x/sys/unix/mkerrors.sh +++ b/vendor/golang.org/x/sys/unix/mkerrors.sh @@ -58,6 +58,7 @@ includes_Darwin=' #define _DARWIN_USE_64_BIT_INODE #define __APPLE_USE_RFC_3542 #include +#include #include #include #include @@ -263,6 +264,7 @@ struct ltchars { #include #include #include +#include #include #include #include @@ -549,6 +551,8 @@ ccflags="$@" $2 !~ "NLA_TYPE_MASK" && $2 !~ /^RTC_VL_(ACCURACY|BACKUP|DATA)/ && $2 ~ /^(NETLINK|NLM|NLMSG|NLA|IFA|IFAN|RT|RTC|RTCF|RTN|RTPROT|RTNH|ARPHRD|ETH_P|NETNSA)_/ || + $2 ~ /^SOCK_|SK_DIAG_|SKNLGRP_$/ || + $2 ~ /^(CONNECT|SAE)_/ || $2 ~ /^FIORDCHK$/ || $2 ~ /^SIOC/ || $2 ~ /^TIOC/ || @@ -652,7 +656,7 @@ errors=$( signals=$( echo '#include ' | $CC -x c - -E -dM $ccflags | awk '$1=="#define" && $2 ~ /^SIG[A-Z0-9]+$/ { print $2 }' | - grep -v 'SIGSTKSIZE\|SIGSTKSZ\|SIGRT\|SIGMAX64' | + grep -E -v '(SIGSTKSIZE|SIGSTKSZ|SIGRT|SIGMAX64)' | sort ) @@ -662,7 +666,7 @@ echo '#include ' | $CC -x c - -E -dM $ccflags | sort >_error.grep echo '#include ' | $CC -x c - -E -dM $ccflags | awk '$1=="#define" && $2 ~ /^SIG[A-Z0-9]+$/ { print "^\t" $2 "[ \t]*=" }' | - grep -v 'SIGSTKSIZE\|SIGSTKSZ\|SIGRT\|SIGMAX64' | + grep -E -v '(SIGSTKSIZE|SIGSTKSZ|SIGRT|SIGMAX64)' | sort >_signal.grep echo '// mkerrors.sh' "$@" diff --git a/vendor/golang.org/x/sys/unix/mremap.go b/vendor/golang.org/x/sys/unix/mremap.go index fd45fe5..3a5e776 100644 --- a/vendor/golang.org/x/sys/unix/mremap.go +++ b/vendor/golang.org/x/sys/unix/mremap.go @@ -50,3 +50,8 @@ func (m *mremapMmapper) Mremap(oldData []byte, newLength int, flags int) (data [ func Mremap(oldData []byte, newLength int, flags int) (data []byte, err error) { return mapper.Mremap(oldData, newLength, flags) } + +func MremapPtr(oldAddr unsafe.Pointer, oldSize uintptr, newAddr unsafe.Pointer, newSize uintptr, flags int) (ret unsafe.Pointer, err error) { + xaddr, err := mapper.mremap(uintptr(oldAddr), oldSize, newSize, flags, uintptr(newAddr)) + return unsafe.Pointer(xaddr), err +} diff --git a/vendor/golang.org/x/sys/unix/syscall_aix.go b/vendor/golang.org/x/sys/unix/syscall_aix.go index 67ce6ce..6f15ba1 100644 --- a/vendor/golang.org/x/sys/unix/syscall_aix.go +++ b/vendor/golang.org/x/sys/unix/syscall_aix.go @@ -360,7 +360,7 @@ func Wait4(pid int, wstatus *WaitStatus, options int, rusage *Rusage) (wpid int, var status _C_int var r Pid_t err = ERESTART - // AIX wait4 may return with ERESTART errno, while the processus is still + // AIX wait4 may return with ERESTART errno, while the process is still // active. for err == ERESTART { r, err = wait4(Pid_t(pid), &status, options, rusage) diff --git a/vendor/golang.org/x/sys/unix/syscall_darwin.go b/vendor/golang.org/x/sys/unix/syscall_darwin.go index 59542a8..099867d 100644 --- a/vendor/golang.org/x/sys/unix/syscall_darwin.go +++ b/vendor/golang.org/x/sys/unix/syscall_darwin.go @@ -402,6 +402,18 @@ func IoctlSetIfreqMTU(fd int, ifreq *IfreqMTU) error { return ioctlPtr(fd, SIOCSIFMTU, unsafe.Pointer(ifreq)) } +//sys renamexNp(from string, to string, flag uint32) (err error) + +func RenamexNp(from string, to string, flag uint32) (err error) { + return renamexNp(from, to, flag) +} + +//sys renameatxNp(fromfd int, from string, tofd int, to string, flag uint32) (err error) + +func RenameatxNp(fromfd int, from string, tofd int, to string, flag uint32) (err error) { + return renameatxNp(fromfd, from, tofd, to, flag) +} + //sys sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) = SYS_SYSCTL func Uname(uname *Utsname) error { @@ -542,6 +554,55 @@ func SysctlKinfoProcSlice(name string, args ...int) ([]KinfoProc, error) { } } +//sys pthread_chdir_np(path string) (err error) + +func PthreadChdir(path string) (err error) { + return pthread_chdir_np(path) +} + +//sys pthread_fchdir_np(fd int) (err error) + +func PthreadFchdir(fd int) (err error) { + return pthread_fchdir_np(fd) +} + +// Connectx calls connectx(2) to initiate a connection on a socket. +// +// srcIf, srcAddr, and dstAddr are filled into a [SaEndpoints] struct and passed as the endpoints argument. +// +// - srcIf is the optional source interface index. 0 means unspecified. +// - srcAddr is the optional source address. nil means unspecified. +// - dstAddr is the destination address. +// +// On success, Connectx returns the number of bytes enqueued for transmission. +func Connectx(fd int, srcIf uint32, srcAddr, dstAddr Sockaddr, associd SaeAssocID, flags uint32, iov []Iovec, connid *SaeConnID) (n uintptr, err error) { + endpoints := SaEndpoints{ + Srcif: srcIf, + } + + if srcAddr != nil { + addrp, addrlen, err := srcAddr.sockaddr() + if err != nil { + return 0, err + } + endpoints.Srcaddr = (*RawSockaddr)(addrp) + endpoints.Srcaddrlen = uint32(addrlen) + } + + if dstAddr != nil { + addrp, addrlen, err := dstAddr.sockaddr() + if err != nil { + return 0, err + } + endpoints.Dstaddr = (*RawSockaddr)(addrp) + endpoints.Dstaddrlen = uint32(addrlen) + } + + err = connectx(fd, &endpoints, associd, flags, iov, &n, connid) + return +} + +//sys connectx(fd int, endpoints *SaEndpoints, associd SaeAssocID, flags uint32, iov []Iovec, n *uintptr, connid *SaeConnID) (err error) //sys sendfile(infd int, outfd int, offset int64, len *int64, hdtr unsafe.Pointer, flags int) (err error) //sys shmat(id int, addr uintptr, flag int) (ret uintptr, err error) diff --git a/vendor/golang.org/x/sys/unix/syscall_hurd.go b/vendor/golang.org/x/sys/unix/syscall_hurd.go index ba46651..a6a2d2f 100644 --- a/vendor/golang.org/x/sys/unix/syscall_hurd.go +++ b/vendor/golang.org/x/sys/unix/syscall_hurd.go @@ -11,6 +11,7 @@ package unix int ioctl(int, unsigned long int, uintptr_t); */ import "C" +import "unsafe" func ioctl(fd int, req uint, arg uintptr) (err error) { r0, er := C.ioctl(C.int(fd), C.ulong(req), C.uintptr_t(arg)) diff --git a/vendor/golang.org/x/sys/unix/syscall_linux.go b/vendor/golang.org/x/sys/unix/syscall_linux.go index 5682e26..f08abd4 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux.go @@ -1295,6 +1295,48 @@ func GetsockoptTCPInfo(fd, level, opt int) (*TCPInfo, error) { return &value, err } +// GetsockoptTCPCCVegasInfo returns algorithm specific congestion control information for a socket using the "vegas" +// algorithm. +// +// The socket's congestion control algorighm can be retrieved via [GetsockoptString] with the [TCP_CONGESTION] option: +// +// algo, err := unix.GetsockoptString(fd, unix.IPPROTO_TCP, unix.TCP_CONGESTION) +func GetsockoptTCPCCVegasInfo(fd, level, opt int) (*TCPVegasInfo, error) { + var value [SizeofTCPCCInfo / 4]uint32 // ensure proper alignment + vallen := _Socklen(SizeofTCPCCInfo) + err := getsockopt(fd, level, opt, unsafe.Pointer(&value[0]), &vallen) + out := (*TCPVegasInfo)(unsafe.Pointer(&value[0])) + return out, err +} + +// GetsockoptTCPCCDCTCPInfo returns algorithm specific congestion control information for a socket using the "dctp" +// algorithm. +// +// The socket's congestion control algorighm can be retrieved via [GetsockoptString] with the [TCP_CONGESTION] option: +// +// algo, err := unix.GetsockoptString(fd, unix.IPPROTO_TCP, unix.TCP_CONGESTION) +func GetsockoptTCPCCDCTCPInfo(fd, level, opt int) (*TCPDCTCPInfo, error) { + var value [SizeofTCPCCInfo / 4]uint32 // ensure proper alignment + vallen := _Socklen(SizeofTCPCCInfo) + err := getsockopt(fd, level, opt, unsafe.Pointer(&value[0]), &vallen) + out := (*TCPDCTCPInfo)(unsafe.Pointer(&value[0])) + return out, err +} + +// GetsockoptTCPCCBBRInfo returns algorithm specific congestion control information for a socket using the "bbr" +// algorithm. +// +// The socket's congestion control algorighm can be retrieved via [GetsockoptString] with the [TCP_CONGESTION] option: +// +// algo, err := unix.GetsockoptString(fd, unix.IPPROTO_TCP, unix.TCP_CONGESTION) +func GetsockoptTCPCCBBRInfo(fd, level, opt int) (*TCPBBRInfo, error) { + var value [SizeofTCPCCInfo / 4]uint32 // ensure proper alignment + vallen := _Socklen(SizeofTCPCCInfo) + err := getsockopt(fd, level, opt, unsafe.Pointer(&value[0]), &vallen) + out := (*TCPBBRInfo)(unsafe.Pointer(&value[0])) + return out, err +} + // GetsockoptString returns the string value of the socket option opt for the // socket associated with fd at the given socket level. func GetsockoptString(fd, level, opt int) (string, error) { @@ -1959,7 +2001,26 @@ func Getpgrp() (pid int) { //sysnb Getpid() (pid int) //sysnb Getppid() (ppid int) //sys Getpriority(which int, who int) (prio int, err error) -//sys Getrandom(buf []byte, flags int) (n int, err error) + +func Getrandom(buf []byte, flags int) (n int, err error) { + vdsoRet, supported := vgetrandom(buf, uint32(flags)) + if supported { + if vdsoRet < 0 { + return 0, errnoErr(syscall.Errno(-vdsoRet)) + } + return vdsoRet, nil + } + var p *byte + if len(buf) > 0 { + p = &buf[0] + } + r, _, e := Syscall(SYS_GETRANDOM, uintptr(unsafe.Pointer(p)), uintptr(len(buf)), uintptr(flags)) + if e != 0 { + return 0, errnoErr(e) + } + return int(r), nil +} + //sysnb Getrusage(who int, rusage *Rusage) (err error) //sysnb Getsid(pid int) (sid int, err error) //sysnb Gettid() (tid int) @@ -2592,3 +2653,4 @@ func SchedGetAttr(pid int, flags uint) (*SchedAttr, error) { } //sys Cachestat(fd uint, crange *CachestatRange, cstat *Cachestat_t, flags uint) (err error) +//sys Mseal(b []byte, flags uint) (err error) diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go b/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go index cf2ee6c..745e5c7 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go @@ -182,3 +182,5 @@ func KexecFileLoad(kernelFd int, initrdFd int, cmdline string, flags int) error } return kexecFileLoad(kernelFd, initrdFd, cmdlineLen, cmdline, flags) } + +const SYS_FSTATAT = SYS_NEWFSTATAT diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go b/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go index 3d0e984..dd2262a 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go @@ -214,3 +214,5 @@ func KexecFileLoad(kernelFd int, initrdFd int, cmdline string, flags int) error } return kexecFileLoad(kernelFd, initrdFd, cmdlineLen, cmdline, flags) } + +const SYS_FSTATAT = SYS_NEWFSTATAT diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go b/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go index 6f5a288..8cf3670 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go @@ -187,3 +187,5 @@ func RISCVHWProbe(pairs []RISCVHWProbePairs, set *CPUSet, flags uint) (err error } return riscvHWProbe(pairs, setSize, set, flags) } + +const SYS_FSTATAT = SYS_NEWFSTATAT diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd.go b/vendor/golang.org/x/sys/unix/syscall_openbsd.go index b25343c..b86ded5 100644 --- a/vendor/golang.org/x/sys/unix/syscall_openbsd.go +++ b/vendor/golang.org/x/sys/unix/syscall_openbsd.go @@ -293,6 +293,7 @@ func Uname(uname *Utsname) error { //sys Mkfifoat(dirfd int, path string, mode uint32) (err error) //sys Mknod(path string, mode uint32, dev int) (err error) //sys Mknodat(dirfd int, path string, mode uint32, dev int) (err error) +//sys Mount(fsType string, dir string, flags int, data unsafe.Pointer) (err error) //sys Nanosleep(time *Timespec, leftover *Timespec) (err error) //sys Open(path string, mode int, perm uint32) (fd int, err error) //sys Openat(dirfd int, path string, mode int, perm uint32) (fd int, err error) diff --git a/vendor/golang.org/x/sys/unix/syscall_unix.go b/vendor/golang.org/x/sys/unix/syscall_unix.go index 77081de..4e92e5a 100644 --- a/vendor/golang.org/x/sys/unix/syscall_unix.go +++ b/vendor/golang.org/x/sys/unix/syscall_unix.go @@ -154,6 +154,15 @@ func Munmap(b []byte) (err error) { return mapper.Munmap(b) } +func MmapPtr(fd int, offset int64, addr unsafe.Pointer, length uintptr, prot int, flags int) (ret unsafe.Pointer, err error) { + xaddr, err := mapper.mmap(uintptr(addr), length, prot, flags, fd, offset) + return unsafe.Pointer(xaddr), err +} + +func MunmapPtr(addr unsafe.Pointer, length uintptr) (err error) { + return mapper.munmap(uintptr(addr), length) +} + func Read(fd int, p []byte) (n int, err error) { n, err = read(fd, p) if raceenabled { diff --git a/vendor/golang.org/x/sys/unix/vgetrandom_linux.go b/vendor/golang.org/x/sys/unix/vgetrandom_linux.go new file mode 100644 index 0000000..07ac8e0 --- /dev/null +++ b/vendor/golang.org/x/sys/unix/vgetrandom_linux.go @@ -0,0 +1,13 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build linux && go1.24 + +package unix + +import _ "unsafe" + +//go:linkname vgetrandom runtime.vgetrandom +//go:noescape +func vgetrandom(p []byte, flags uint32) (ret int, supported bool) diff --git a/vendor/golang.org/x/sys/unix/vgetrandom_unsupported.go b/vendor/golang.org/x/sys/unix/vgetrandom_unsupported.go new file mode 100644 index 0000000..297e97b --- /dev/null +++ b/vendor/golang.org/x/sys/unix/vgetrandom_unsupported.go @@ -0,0 +1,11 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !linux || !go1.24 + +package unix + +func vgetrandom(p []byte, flags uint32) (ret int, supported bool) { + return -1, false +} diff --git a/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go index e40fa85..d73c465 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go @@ -237,6 +237,9 @@ const ( CLOCK_UPTIME_RAW_APPROX = 0x9 CLONE_NOFOLLOW = 0x1 CLONE_NOOWNERCOPY = 0x2 + CONNECT_DATA_AUTHENTICATED = 0x4 + CONNECT_DATA_IDEMPOTENT = 0x2 + CONNECT_RESUME_ON_READ_WRITE = 0x1 CR0 = 0x0 CR1 = 0x1000 CR2 = 0x2000 @@ -1169,6 +1172,11 @@ const ( PT_WRITE_D = 0x5 PT_WRITE_I = 0x4 PT_WRITE_U = 0x6 + RENAME_EXCL = 0x4 + RENAME_NOFOLLOW_ANY = 0x10 + RENAME_RESERVED1 = 0x8 + RENAME_SECLUDE = 0x1 + RENAME_SWAP = 0x2 RLIMIT_AS = 0x5 RLIMIT_CORE = 0x4 RLIMIT_CPU = 0x0 @@ -1260,6 +1268,10 @@ const ( RTV_SSTHRESH = 0x20 RUSAGE_CHILDREN = -0x1 RUSAGE_SELF = 0x0 + SAE_ASSOCID_ALL = 0xffffffff + SAE_ASSOCID_ANY = 0x0 + SAE_CONNID_ALL = 0xffffffff + SAE_CONNID_ANY = 0x0 SCM_CREDS = 0x3 SCM_RIGHTS = 0x1 SCM_TIMESTAMP = 0x2 diff --git a/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go index bb02aa6..4a55a40 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go @@ -237,6 +237,9 @@ const ( CLOCK_UPTIME_RAW_APPROX = 0x9 CLONE_NOFOLLOW = 0x1 CLONE_NOOWNERCOPY = 0x2 + CONNECT_DATA_AUTHENTICATED = 0x4 + CONNECT_DATA_IDEMPOTENT = 0x2 + CONNECT_RESUME_ON_READ_WRITE = 0x1 CR0 = 0x0 CR1 = 0x1000 CR2 = 0x2000 @@ -1169,6 +1172,11 @@ const ( PT_WRITE_D = 0x5 PT_WRITE_I = 0x4 PT_WRITE_U = 0x6 + RENAME_EXCL = 0x4 + RENAME_NOFOLLOW_ANY = 0x10 + RENAME_RESERVED1 = 0x8 + RENAME_SECLUDE = 0x1 + RENAME_SWAP = 0x2 RLIMIT_AS = 0x5 RLIMIT_CORE = 0x4 RLIMIT_CPU = 0x0 @@ -1260,6 +1268,10 @@ const ( RTV_SSTHRESH = 0x20 RUSAGE_CHILDREN = -0x1 RUSAGE_SELF = 0x0 + SAE_ASSOCID_ALL = 0xffffffff + SAE_ASSOCID_ANY = 0x0 + SAE_CONNID_ALL = 0xffffffff + SAE_CONNID_ANY = 0x0 SCM_CREDS = 0x3 SCM_RIGHTS = 0x1 SCM_TIMESTAMP = 0x2 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux.go b/vendor/golang.org/x/sys/unix/zerrors_linux.go index 93a38a9..de3b462 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux.go @@ -457,6 +457,7 @@ const ( B600 = 0x8 B75 = 0x2 B9600 = 0xd + BCACHEFS_SUPER_MAGIC = 0xca451a4e BDEVFS_MAGIC = 0x62646576 BINDERFS_SUPER_MAGIC = 0x6c6f6f70 BINFMTFS_MAGIC = 0x42494e4d @@ -494,6 +495,7 @@ const ( BPF_F_TEST_REG_INVARIANTS = 0x80 BPF_F_TEST_RND_HI32 = 0x4 BPF_F_TEST_RUN_ON_CPU = 0x1 + BPF_F_TEST_SKB_CHECKSUM_COMPLETE = 0x4 BPF_F_TEST_STATE_FREQ = 0x8 BPF_F_TEST_XDP_LIVE_FRAMES = 0x2 BPF_F_XDP_DEV_BOUND_ONLY = 0x40 @@ -502,6 +504,7 @@ const ( BPF_IMM = 0x0 BPF_IND = 0x40 BPF_JA = 0x0 + BPF_JCOND = 0xe0 BPF_JEQ = 0x10 BPF_JGE = 0x30 BPF_JGT = 0x20 @@ -657,6 +660,9 @@ const ( CAN_NPROTO = 0x8 CAN_RAW = 0x1 CAN_RAW_FILTER_MAX = 0x200 + CAN_RAW_XL_VCID_RX_FILTER = 0x4 + CAN_RAW_XL_VCID_TX_PASS = 0x2 + CAN_RAW_XL_VCID_TX_SET = 0x1 CAN_RTR_FLAG = 0x40000000 CAN_SFF_ID_BITS = 0xb CAN_SFF_MASK = 0x7ff @@ -924,6 +930,7 @@ const ( EPOLL_CTL_ADD = 0x1 EPOLL_CTL_DEL = 0x2 EPOLL_CTL_MOD = 0x3 + EPOLL_IOC_TYPE = 0x8a EROFS_SUPER_MAGIC_V1 = 0xe0f5e1e2 ESP_V4_FLOW = 0xa ESP_V6_FLOW = 0xc @@ -937,9 +944,6 @@ const ( ETHTOOL_FEC_OFF = 0x4 ETHTOOL_FEC_RS = 0x8 ETHTOOL_FLAG_ALL = 0x7 - ETHTOOL_FLAG_COMPACT_BITSETS = 0x1 - ETHTOOL_FLAG_OMIT_REPLY = 0x2 - ETHTOOL_FLAG_STATS = 0x4 ETHTOOL_FLASHDEV = 0x33 ETHTOOL_FLASH_MAX_FILENAME = 0x80 ETHTOOL_FWVERS_LEN = 0x20 @@ -1339,6 +1343,7 @@ const ( F_OFD_SETLK = 0x25 F_OFD_SETLKW = 0x26 F_OK = 0x0 + F_SEAL_EXEC = 0x20 F_SEAL_FUTURE_WRITE = 0x10 F_SEAL_GROW = 0x4 F_SEAL_SEAL = 0x1 @@ -1627,6 +1632,7 @@ const ( IP_FREEBIND = 0xf IP_HDRINCL = 0x3 IP_IPSEC_POLICY = 0x10 + IP_LOCAL_PORT_RANGE = 0x33 IP_MAXPACKET = 0xffff IP_MAX_MEMBERSHIPS = 0x14 IP_MF = 0x2000 @@ -1653,6 +1659,7 @@ const ( IP_PMTUDISC_OMIT = 0x5 IP_PMTUDISC_PROBE = 0x3 IP_PMTUDISC_WANT = 0x1 + IP_PROTOCOL = 0x34 IP_RECVERR = 0xb IP_RECVERR_RFC4884 = 0x1a IP_RECVFRAGSIZE = 0x19 @@ -1698,6 +1705,7 @@ const ( KEXEC_ARCH_S390 = 0x160000 KEXEC_ARCH_SH = 0x2a0000 KEXEC_ARCH_X86_64 = 0x3e0000 + KEXEC_CRASH_HOTPLUG_SUPPORT = 0x8 KEXEC_FILE_DEBUG = 0x8 KEXEC_FILE_NO_INITRAMFS = 0x4 KEXEC_FILE_ON_CRASH = 0x2 @@ -1773,6 +1781,7 @@ const ( KEY_SPEC_USER_KEYRING = -0x4 KEY_SPEC_USER_SESSION_KEYRING = -0x5 LANDLOCK_ACCESS_FS_EXECUTE = 0x1 + LANDLOCK_ACCESS_FS_IOCTL_DEV = 0x8000 LANDLOCK_ACCESS_FS_MAKE_BLOCK = 0x800 LANDLOCK_ACCESS_FS_MAKE_CHAR = 0x40 LANDLOCK_ACCESS_FS_MAKE_DIR = 0x80 @@ -1854,6 +1863,19 @@ const ( MAP_FILE = 0x0 MAP_FIXED = 0x10 MAP_FIXED_NOREPLACE = 0x100000 + MAP_HUGE_16GB = 0x88000000 + MAP_HUGE_16KB = 0x38000000 + MAP_HUGE_16MB = 0x60000000 + MAP_HUGE_1GB = 0x78000000 + MAP_HUGE_1MB = 0x50000000 + MAP_HUGE_256MB = 0x70000000 + MAP_HUGE_2GB = 0x7c000000 + MAP_HUGE_2MB = 0x54000000 + MAP_HUGE_32MB = 0x64000000 + MAP_HUGE_512KB = 0x4c000000 + MAP_HUGE_512MB = 0x74000000 + MAP_HUGE_64KB = 0x40000000 + MAP_HUGE_8MB = 0x5c000000 MAP_HUGE_MASK = 0x3f MAP_HUGE_SHIFT = 0x1a MAP_PRIVATE = 0x2 @@ -1901,6 +1923,7 @@ const ( MNT_EXPIRE = 0x4 MNT_FORCE = 0x1 MNT_ID_REQ_SIZE_VER0 = 0x18 + MNT_ID_REQ_SIZE_VER1 = 0x20 MODULE_INIT_COMPRESSED_FILE = 0x4 MODULE_INIT_IGNORE_MODVERSIONS = 0x1 MODULE_INIT_IGNORE_VERMAGIC = 0x2 @@ -2166,10 +2189,10 @@ const ( NFT_REG_SIZE = 0x10 NFT_REJECT_ICMPX_MAX = 0x3 NFT_RT_MAX = 0x4 - NFT_SECMARK_CTX_MAXLEN = 0x100 + NFT_SECMARK_CTX_MAXLEN = 0x1000 NFT_SET_MAXNAMELEN = 0x100 NFT_SOCKET_MAX = 0x3 - NFT_TABLE_F_MASK = 0x3 + NFT_TABLE_F_MASK = 0x7 NFT_TABLE_MAXNAMELEN = 0x100 NFT_TRACETYPE_MAX = 0x3 NFT_TUNNEL_F_MASK = 0x7 @@ -2335,9 +2358,11 @@ const ( PERF_MEM_LVLNUM_IO = 0xa PERF_MEM_LVLNUM_L1 = 0x1 PERF_MEM_LVLNUM_L2 = 0x2 + PERF_MEM_LVLNUM_L2_MHB = 0x5 PERF_MEM_LVLNUM_L3 = 0x3 PERF_MEM_LVLNUM_L4 = 0x4 PERF_MEM_LVLNUM_LFB = 0xc + PERF_MEM_LVLNUM_MSC = 0x6 PERF_MEM_LVLNUM_NA = 0xf PERF_MEM_LVLNUM_PMEM = 0xe PERF_MEM_LVLNUM_RAM = 0xd @@ -2403,12 +2428,14 @@ const ( PERF_RECORD_MISC_USER = 0x2 PERF_SAMPLE_BRANCH_PLM_ALL = 0x7 PERF_SAMPLE_WEIGHT_TYPE = 0x1004000 + PID_FS_MAGIC = 0x50494446 PIPEFS_MAGIC = 0x50495045 PPPIOCGNPMODE = 0xc008744c PPPIOCNEWUNIT = 0xc004743e PRIO_PGRP = 0x1 PRIO_PROCESS = 0x0 PRIO_USER = 0x2 + PROCFS_IOCTL_MAGIC = 'f' PROC_SUPER_MAGIC = 0x9fa0 PROT_EXEC = 0x4 PROT_GROWSDOWN = 0x1000000 @@ -2490,6 +2517,23 @@ const ( PR_PAC_GET_ENABLED_KEYS = 0x3d PR_PAC_RESET_KEYS = 0x36 PR_PAC_SET_ENABLED_KEYS = 0x3c + PR_PPC_DEXCR_CTRL_CLEAR = 0x4 + PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC = 0x10 + PR_PPC_DEXCR_CTRL_EDITABLE = 0x1 + PR_PPC_DEXCR_CTRL_MASK = 0x1f + PR_PPC_DEXCR_CTRL_SET = 0x2 + PR_PPC_DEXCR_CTRL_SET_ONEXEC = 0x8 + PR_PPC_DEXCR_IBRTPD = 0x1 + PR_PPC_DEXCR_NPHIE = 0x3 + PR_PPC_DEXCR_SBHE = 0x0 + PR_PPC_DEXCR_SRAPD = 0x2 + PR_PPC_GET_DEXCR = 0x48 + PR_PPC_SET_DEXCR = 0x49 + PR_RISCV_CTX_SW_FENCEI_OFF = 0x1 + PR_RISCV_CTX_SW_FENCEI_ON = 0x0 + PR_RISCV_SCOPE_PER_PROCESS = 0x0 + PR_RISCV_SCOPE_PER_THREAD = 0x1 + PR_RISCV_SET_ICACHE_FLUSH_CTX = 0x47 PR_RISCV_V_GET_CONTROL = 0x46 PR_RISCV_V_SET_CONTROL = 0x45 PR_RISCV_V_VSTATE_CTRL_CUR_MASK = 0x3 @@ -2894,10 +2938,12 @@ const ( RUSAGE_SELF = 0x0 RUSAGE_THREAD = 0x1 RWF_APPEND = 0x10 + RWF_ATOMIC = 0x40 RWF_DSYNC = 0x2 RWF_HIPRI = 0x1 + RWF_NOAPPEND = 0x20 RWF_NOWAIT = 0x8 - RWF_SUPPORTED = 0x1f + RWF_SUPPORTED = 0x7f RWF_SYNC = 0x4 RWF_WRITE_LIFE_NOT_SET = 0x0 SCHED_BATCH = 0x3 @@ -2918,7 +2964,9 @@ const ( SCHED_RESET_ON_FORK = 0x40000000 SCHED_RR = 0x2 SCM_CREDENTIALS = 0x2 + SCM_PIDFD = 0x4 SCM_RIGHTS = 0x1 + SCM_SECURITY = 0x3 SCM_TIMESTAMP = 0x1d SC_LOG_FLUSH = 0x100000 SECCOMP_ADDFD_FLAG_SEND = 0x2 @@ -3051,6 +3099,8 @@ const ( SIOCSMIIREG = 0x8949 SIOCSRARP = 0x8962 SIOCWANDEV = 0x894a + SK_DIAG_BPF_STORAGE_MAX = 0x3 + SK_DIAG_BPF_STORAGE_REQ_MAX = 0x1 SMACK_MAGIC = 0x43415d53 SMART_AUTOSAVE = 0xd2 SMART_AUTO_OFFLINE = 0xdb @@ -3071,6 +3121,8 @@ const ( SOCKFS_MAGIC = 0x534f434b SOCK_BUF_LOCK_MASK = 0x3 SOCK_DCCP = 0x6 + SOCK_DESTROY = 0x15 + SOCK_DIAG_BY_FAMILY = 0x14 SOCK_IOC_TYPE = 0x89 SOCK_PACKET = 0xa SOCK_RAW = 0x3 @@ -3164,6 +3216,7 @@ const ( STATX_ATTR_MOUNT_ROOT = 0x2000 STATX_ATTR_NODUMP = 0x40 STATX_ATTR_VERITY = 0x100000 + STATX_ATTR_WRITE_ATOMIC = 0x400000 STATX_BASIC_STATS = 0x7ff STATX_BLOCKS = 0x400 STATX_BTIME = 0x800 @@ -3177,8 +3230,10 @@ const ( STATX_MTIME = 0x40 STATX_NLINK = 0x4 STATX_SIZE = 0x200 + STATX_SUBVOL = 0x8000 STATX_TYPE = 0x1 STATX_UID = 0x8 + STATX_WRITE_ATOMIC = 0x10000 STATX__RESERVED = 0x80000000 SYNC_FILE_RANGE_WAIT_AFTER = 0x4 SYNC_FILE_RANGE_WAIT_BEFORE = 0x1 @@ -3260,6 +3315,7 @@ const ( TCP_MAX_WINSHIFT = 0xe TCP_MD5SIG = 0xe TCP_MD5SIG_EXT = 0x20 + TCP_MD5SIG_FLAG_IFINDEX = 0x2 TCP_MD5SIG_FLAG_PREFIX = 0x1 TCP_MD5SIG_MAXKEYLEN = 0x50 TCP_MSS = 0x200 @@ -3576,6 +3632,7 @@ const ( XDP_UMEM_PGOFF_COMPLETION_RING = 0x180000000 XDP_UMEM_PGOFF_FILL_RING = 0x100000000 XDP_UMEM_REG = 0x4 + XDP_UMEM_TX_METADATA_LEN = 0x4 XDP_UMEM_TX_SW_CSUM = 0x2 XDP_UMEM_UNALIGNED_CHUNK_FLAG = 0x1 XDP_USE_NEED_WAKEUP = 0x8 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_386.go b/vendor/golang.org/x/sys/unix/zerrors_linux_386.go index 42ff8c3..8aa6d77 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_386.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_386.go @@ -78,6 +78,8 @@ const ( ECHOPRT = 0x400 EFD_CLOEXEC = 0x80000 EFD_NONBLOCK = 0x800 + EPIOCGPARAMS = 0x80088a02 + EPIOCSPARAMS = 0x40088a01 EPOLL_CLOEXEC = 0x80000 EXTPROC = 0x10000 FF1 = 0x8000 @@ -118,6 +120,7 @@ const ( IXOFF = 0x1000 IXON = 0x400 MAP_32BIT = 0x40 + MAP_ABOVE4G = 0x80 MAP_ANON = 0x20 MAP_ANONYMOUS = 0x20 MAP_DENYWRITE = 0x800 @@ -150,9 +153,14 @@ const ( NFDBITS = 0x20 NLDLY = 0x100 NOFLSH = 0x80 + NS_GET_MNTNS_ID = 0x8008b705 NS_GET_NSTYPE = 0xb703 NS_GET_OWNER_UID = 0xb704 NS_GET_PARENT = 0xb702 + NS_GET_PID_FROM_PIDNS = 0x8004b706 + NS_GET_PID_IN_PIDNS = 0x8004b708 + NS_GET_TGID_FROM_PIDNS = 0x8004b707 + NS_GET_TGID_IN_PIDNS = 0x8004b709 NS_GET_USERNS = 0xb701 OLCUC = 0x2 ONLCR = 0x4 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go index dca4360..da428f4 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go @@ -78,6 +78,8 @@ const ( ECHOPRT = 0x400 EFD_CLOEXEC = 0x80000 EFD_NONBLOCK = 0x800 + EPIOCGPARAMS = 0x80088a02 + EPIOCSPARAMS = 0x40088a01 EPOLL_CLOEXEC = 0x80000 EXTPROC = 0x10000 FF1 = 0x8000 @@ -118,6 +120,7 @@ const ( IXOFF = 0x1000 IXON = 0x400 MAP_32BIT = 0x40 + MAP_ABOVE4G = 0x80 MAP_ANON = 0x20 MAP_ANONYMOUS = 0x20 MAP_DENYWRITE = 0x800 @@ -150,9 +153,14 @@ const ( NFDBITS = 0x40 NLDLY = 0x100 NOFLSH = 0x80 + NS_GET_MNTNS_ID = 0x8008b705 NS_GET_NSTYPE = 0xb703 NS_GET_OWNER_UID = 0xb704 NS_GET_PARENT = 0xb702 + NS_GET_PID_FROM_PIDNS = 0x8004b706 + NS_GET_PID_IN_PIDNS = 0x8004b708 + NS_GET_TGID_FROM_PIDNS = 0x8004b707 + NS_GET_TGID_IN_PIDNS = 0x8004b709 NS_GET_USERNS = 0xb701 OLCUC = 0x2 ONLCR = 0x4 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go b/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go index 5cca668..bf45bfe 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go @@ -78,6 +78,8 @@ const ( ECHOPRT = 0x400 EFD_CLOEXEC = 0x80000 EFD_NONBLOCK = 0x800 + EPIOCGPARAMS = 0x80088a02 + EPIOCSPARAMS = 0x40088a01 EPOLL_CLOEXEC = 0x80000 EXTPROC = 0x10000 FF1 = 0x8000 @@ -148,9 +150,14 @@ const ( NFDBITS = 0x20 NLDLY = 0x100 NOFLSH = 0x80 + NS_GET_MNTNS_ID = 0x8008b705 NS_GET_NSTYPE = 0xb703 NS_GET_OWNER_UID = 0xb704 NS_GET_PARENT = 0xb702 + NS_GET_PID_FROM_PIDNS = 0x8004b706 + NS_GET_PID_IN_PIDNS = 0x8004b708 + NS_GET_TGID_FROM_PIDNS = 0x8004b707 + NS_GET_TGID_IN_PIDNS = 0x8004b709 NS_GET_USERNS = 0xb701 OLCUC = 0x2 ONLCR = 0x4 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go index d8cae6d..71c6716 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go @@ -78,6 +78,8 @@ const ( ECHOPRT = 0x400 EFD_CLOEXEC = 0x80000 EFD_NONBLOCK = 0x800 + EPIOCGPARAMS = 0x80088a02 + EPIOCSPARAMS = 0x40088a01 EPOLL_CLOEXEC = 0x80000 ESR_MAGIC = 0x45535201 EXTPROC = 0x10000 @@ -87,6 +89,7 @@ const ( FICLONE = 0x40049409 FICLONERANGE = 0x4020940d FLUSHO = 0x1000 + FPMR_MAGIC = 0x46504d52 FPSIMD_MAGIC = 0x46508001 FS_IOC_ENABLE_VERITY = 0x40806685 FS_IOC_GETFLAGS = 0x80086601 @@ -151,9 +154,14 @@ const ( NFDBITS = 0x40 NLDLY = 0x100 NOFLSH = 0x80 + NS_GET_MNTNS_ID = 0x8008b705 NS_GET_NSTYPE = 0xb703 NS_GET_OWNER_UID = 0xb704 NS_GET_PARENT = 0xb702 + NS_GET_PID_FROM_PIDNS = 0x8004b706 + NS_GET_PID_IN_PIDNS = 0x8004b708 + NS_GET_TGID_FROM_PIDNS = 0x8004b707 + NS_GET_TGID_IN_PIDNS = 0x8004b709 NS_GET_USERNS = 0xb701 OLCUC = 0x2 ONLCR = 0x4 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go index 28e39af..9476628 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go @@ -78,6 +78,8 @@ const ( ECHOPRT = 0x400 EFD_CLOEXEC = 0x80000 EFD_NONBLOCK = 0x800 + EPIOCGPARAMS = 0x80088a02 + EPIOCSPARAMS = 0x40088a01 EPOLL_CLOEXEC = 0x80000 EXTPROC = 0x10000 FF1 = 0x8000 @@ -152,9 +154,14 @@ const ( NFDBITS = 0x40 NLDLY = 0x100 NOFLSH = 0x80 + NS_GET_MNTNS_ID = 0x8008b705 NS_GET_NSTYPE = 0xb703 NS_GET_OWNER_UID = 0xb704 NS_GET_PARENT = 0xb702 + NS_GET_PID_FROM_PIDNS = 0x8004b706 + NS_GET_PID_IN_PIDNS = 0x8004b708 + NS_GET_TGID_FROM_PIDNS = 0x8004b707 + NS_GET_TGID_IN_PIDNS = 0x8004b709 NS_GET_USERNS = 0xb701 OLCUC = 0x2 ONLCR = 0x4 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go index cd66e92..b9e85f3 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go @@ -78,6 +78,8 @@ const ( ECHOPRT = 0x400 EFD_CLOEXEC = 0x80000 EFD_NONBLOCK = 0x80 + EPIOCGPARAMS = 0x40088a02 + EPIOCSPARAMS = 0x80088a01 EPOLL_CLOEXEC = 0x80000 EXTPROC = 0x10000 FF1 = 0x8000 @@ -148,9 +150,14 @@ const ( NFDBITS = 0x20 NLDLY = 0x100 NOFLSH = 0x80 + NS_GET_MNTNS_ID = 0x4008b705 NS_GET_NSTYPE = 0x2000b703 NS_GET_OWNER_UID = 0x2000b704 NS_GET_PARENT = 0x2000b702 + NS_GET_PID_FROM_PIDNS = 0x4004b706 + NS_GET_PID_IN_PIDNS = 0x4004b708 + NS_GET_TGID_FROM_PIDNS = 0x4004b707 + NS_GET_TGID_IN_PIDNS = 0x4004b709 NS_GET_USERNS = 0x2000b701 OLCUC = 0x2 ONLCR = 0x4 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go index c1595eb..a48b68a 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go @@ -78,6 +78,8 @@ const ( ECHOPRT = 0x400 EFD_CLOEXEC = 0x80000 EFD_NONBLOCK = 0x80 + EPIOCGPARAMS = 0x40088a02 + EPIOCSPARAMS = 0x80088a01 EPOLL_CLOEXEC = 0x80000 EXTPROC = 0x10000 FF1 = 0x8000 @@ -148,9 +150,14 @@ const ( NFDBITS = 0x40 NLDLY = 0x100 NOFLSH = 0x80 + NS_GET_MNTNS_ID = 0x4008b705 NS_GET_NSTYPE = 0x2000b703 NS_GET_OWNER_UID = 0x2000b704 NS_GET_PARENT = 0x2000b702 + NS_GET_PID_FROM_PIDNS = 0x4004b706 + NS_GET_PID_IN_PIDNS = 0x4004b708 + NS_GET_TGID_FROM_PIDNS = 0x4004b707 + NS_GET_TGID_IN_PIDNS = 0x4004b709 NS_GET_USERNS = 0x2000b701 OLCUC = 0x2 ONLCR = 0x4 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go index ee9456b..ea00e85 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go @@ -78,6 +78,8 @@ const ( ECHOPRT = 0x400 EFD_CLOEXEC = 0x80000 EFD_NONBLOCK = 0x80 + EPIOCGPARAMS = 0x40088a02 + EPIOCSPARAMS = 0x80088a01 EPOLL_CLOEXEC = 0x80000 EXTPROC = 0x10000 FF1 = 0x8000 @@ -148,9 +150,14 @@ const ( NFDBITS = 0x40 NLDLY = 0x100 NOFLSH = 0x80 + NS_GET_MNTNS_ID = 0x4008b705 NS_GET_NSTYPE = 0x2000b703 NS_GET_OWNER_UID = 0x2000b704 NS_GET_PARENT = 0x2000b702 + NS_GET_PID_FROM_PIDNS = 0x4004b706 + NS_GET_PID_IN_PIDNS = 0x4004b708 + NS_GET_TGID_FROM_PIDNS = 0x4004b707 + NS_GET_TGID_IN_PIDNS = 0x4004b709 NS_GET_USERNS = 0x2000b701 OLCUC = 0x2 ONLCR = 0x4 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go index 8cfca81..91c6468 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go @@ -78,6 +78,8 @@ const ( ECHOPRT = 0x400 EFD_CLOEXEC = 0x80000 EFD_NONBLOCK = 0x80 + EPIOCGPARAMS = 0x40088a02 + EPIOCSPARAMS = 0x80088a01 EPOLL_CLOEXEC = 0x80000 EXTPROC = 0x10000 FF1 = 0x8000 @@ -148,9 +150,14 @@ const ( NFDBITS = 0x20 NLDLY = 0x100 NOFLSH = 0x80 + NS_GET_MNTNS_ID = 0x4008b705 NS_GET_NSTYPE = 0x2000b703 NS_GET_OWNER_UID = 0x2000b704 NS_GET_PARENT = 0x2000b702 + NS_GET_PID_FROM_PIDNS = 0x4004b706 + NS_GET_PID_IN_PIDNS = 0x4004b708 + NS_GET_TGID_FROM_PIDNS = 0x4004b707 + NS_GET_TGID_IN_PIDNS = 0x4004b709 NS_GET_USERNS = 0x2000b701 OLCUC = 0x2 ONLCR = 0x4 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go index 60b0deb..8cbf38d 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go @@ -78,6 +78,8 @@ const ( ECHOPRT = 0x20 EFD_CLOEXEC = 0x80000 EFD_NONBLOCK = 0x800 + EPIOCGPARAMS = 0x40088a02 + EPIOCSPARAMS = 0x80088a01 EPOLL_CLOEXEC = 0x80000 EXTPROC = 0x10000000 FF1 = 0x4000 @@ -150,9 +152,14 @@ const ( NL3 = 0x300 NLDLY = 0x300 NOFLSH = 0x80000000 + NS_GET_MNTNS_ID = 0x4008b705 NS_GET_NSTYPE = 0x2000b703 NS_GET_OWNER_UID = 0x2000b704 NS_GET_PARENT = 0x2000b702 + NS_GET_PID_FROM_PIDNS = 0x4004b706 + NS_GET_PID_IN_PIDNS = 0x4004b708 + NS_GET_TGID_FROM_PIDNS = 0x4004b707 + NS_GET_TGID_IN_PIDNS = 0x4004b709 NS_GET_USERNS = 0x2000b701 OLCUC = 0x4 ONLCR = 0x2 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go index f90aa72..a2df734 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go @@ -78,6 +78,8 @@ const ( ECHOPRT = 0x20 EFD_CLOEXEC = 0x80000 EFD_NONBLOCK = 0x800 + EPIOCGPARAMS = 0x40088a02 + EPIOCSPARAMS = 0x80088a01 EPOLL_CLOEXEC = 0x80000 EXTPROC = 0x10000000 FF1 = 0x4000 @@ -150,9 +152,14 @@ const ( NL3 = 0x300 NLDLY = 0x300 NOFLSH = 0x80000000 + NS_GET_MNTNS_ID = 0x4008b705 NS_GET_NSTYPE = 0x2000b703 NS_GET_OWNER_UID = 0x2000b704 NS_GET_PARENT = 0x2000b702 + NS_GET_PID_FROM_PIDNS = 0x4004b706 + NS_GET_PID_IN_PIDNS = 0x4004b708 + NS_GET_TGID_FROM_PIDNS = 0x4004b707 + NS_GET_TGID_IN_PIDNS = 0x4004b709 NS_GET_USERNS = 0x2000b701 OLCUC = 0x4 ONLCR = 0x2 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go index ba9e015..2479137 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go @@ -78,6 +78,8 @@ const ( ECHOPRT = 0x20 EFD_CLOEXEC = 0x80000 EFD_NONBLOCK = 0x800 + EPIOCGPARAMS = 0x40088a02 + EPIOCSPARAMS = 0x80088a01 EPOLL_CLOEXEC = 0x80000 EXTPROC = 0x10000000 FF1 = 0x4000 @@ -150,9 +152,14 @@ const ( NL3 = 0x300 NLDLY = 0x300 NOFLSH = 0x80000000 + NS_GET_MNTNS_ID = 0x4008b705 NS_GET_NSTYPE = 0x2000b703 NS_GET_OWNER_UID = 0x2000b704 NS_GET_PARENT = 0x2000b702 + NS_GET_PID_FROM_PIDNS = 0x4004b706 + NS_GET_PID_IN_PIDNS = 0x4004b708 + NS_GET_TGID_FROM_PIDNS = 0x4004b707 + NS_GET_TGID_IN_PIDNS = 0x4004b709 NS_GET_USERNS = 0x2000b701 OLCUC = 0x4 ONLCR = 0x2 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go index 07cdfd6..d265f14 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go @@ -78,6 +78,8 @@ const ( ECHOPRT = 0x400 EFD_CLOEXEC = 0x80000 EFD_NONBLOCK = 0x800 + EPIOCGPARAMS = 0x80088a02 + EPIOCSPARAMS = 0x40088a01 EPOLL_CLOEXEC = 0x80000 EXTPROC = 0x10000 FF1 = 0x8000 @@ -148,9 +150,14 @@ const ( NFDBITS = 0x40 NLDLY = 0x100 NOFLSH = 0x80 + NS_GET_MNTNS_ID = 0x8008b705 NS_GET_NSTYPE = 0xb703 NS_GET_OWNER_UID = 0xb704 NS_GET_PARENT = 0xb702 + NS_GET_PID_FROM_PIDNS = 0x8004b706 + NS_GET_PID_IN_PIDNS = 0x8004b708 + NS_GET_TGID_FROM_PIDNS = 0x8004b707 + NS_GET_TGID_IN_PIDNS = 0x8004b709 NS_GET_USERNS = 0xb701 OLCUC = 0x2 ONLCR = 0x4 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go b/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go index 2f1dd21..3f2d644 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go @@ -78,6 +78,8 @@ const ( ECHOPRT = 0x400 EFD_CLOEXEC = 0x80000 EFD_NONBLOCK = 0x800 + EPIOCGPARAMS = 0x80088a02 + EPIOCSPARAMS = 0x40088a01 EPOLL_CLOEXEC = 0x80000 EXTPROC = 0x10000 FF1 = 0x8000 @@ -148,9 +150,14 @@ const ( NFDBITS = 0x40 NLDLY = 0x100 NOFLSH = 0x80 + NS_GET_MNTNS_ID = 0x8008b705 NS_GET_NSTYPE = 0xb703 NS_GET_OWNER_UID = 0xb704 NS_GET_PARENT = 0xb702 + NS_GET_PID_FROM_PIDNS = 0x8004b706 + NS_GET_PID_IN_PIDNS = 0x8004b708 + NS_GET_TGID_FROM_PIDNS = 0x8004b707 + NS_GET_TGID_IN_PIDNS = 0x8004b709 NS_GET_USERNS = 0xb701 OLCUC = 0x2 ONLCR = 0x4 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go index f40519d..5d8b727 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go @@ -82,6 +82,8 @@ const ( EFD_CLOEXEC = 0x400000 EFD_NONBLOCK = 0x4000 EMT_TAGOVF = 0x1 + EPIOCGPARAMS = 0x40088a02 + EPIOCSPARAMS = 0x80088a01 EPOLL_CLOEXEC = 0x400000 EXTPROC = 0x10000 FF1 = 0x8000 @@ -153,9 +155,14 @@ const ( NFDBITS = 0x40 NLDLY = 0x100 NOFLSH = 0x80 + NS_GET_MNTNS_ID = 0x4008b705 NS_GET_NSTYPE = 0x2000b703 NS_GET_OWNER_UID = 0x2000b704 NS_GET_PARENT = 0x2000b702 + NS_GET_PID_FROM_PIDNS = 0x4004b706 + NS_GET_PID_IN_PIDNS = 0x4004b708 + NS_GET_TGID_FROM_PIDNS = 0x4004b707 + NS_GET_TGID_IN_PIDNS = 0x4004b709 NS_GET_USERNS = 0x2000b701 OLCUC = 0x2 ONLCR = 0x4 diff --git a/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go b/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go index da08b2a..1ec2b14 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go +++ b/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go @@ -581,6 +581,8 @@ const ( AT_EMPTY_PATH = 0x1000 AT_REMOVEDIR = 0x200 RENAME_NOREPLACE = 1 << 0 + ST_RDONLY = 1 + ST_NOSUID = 2 ) const ( diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go index ccb02f2..24b346e 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go @@ -740,6 +740,54 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func renamexNp(from string, to string, flag uint32) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(from) + if err != nil { + return + } + var _p1 *byte + _p1, err = BytePtrFromString(to) + if err != nil { + return + } + _, _, e1 := syscall_syscall(libc_renamex_np_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flag)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_renamex_np_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_renamex_np renamex_np "/usr/lib/libSystem.B.dylib" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func renameatxNp(fromfd int, from string, tofd int, to string, flag uint32) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(from) + if err != nil { + return + } + var _p1 *byte + _p1, err = BytePtrFromString(to) + if err != nil { + return + } + _, _, e1 := syscall_syscall6(libc_renameatx_np_trampoline_addr, uintptr(fromfd), uintptr(unsafe.Pointer(_p0)), uintptr(tofd), uintptr(unsafe.Pointer(_p1)), uintptr(flag), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_renameatx_np_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_renameatx_np renameatx_np "/usr/lib/libSystem.B.dylib" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) { var _p0 unsafe.Pointer if len(mib) > 0 { @@ -760,6 +808,59 @@ var libc_sysctl_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func pthread_chdir_np(path string) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := syscall_syscall(libc_pthread_chdir_np_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_pthread_chdir_np_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_pthread_chdir_np pthread_chdir_np "/usr/lib/libSystem.B.dylib" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func pthread_fchdir_np(fd int) (err error) { + _, _, e1 := syscall_syscall(libc_pthread_fchdir_np_trampoline_addr, uintptr(fd), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_pthread_fchdir_np_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_pthread_fchdir_np pthread_fchdir_np "/usr/lib/libSystem.B.dylib" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func connectx(fd int, endpoints *SaEndpoints, associd SaeAssocID, flags uint32, iov []Iovec, n *uintptr, connid *SaeConnID) (err error) { + var _p0 unsafe.Pointer + if len(iov) > 0 { + _p0 = unsafe.Pointer(&iov[0]) + } else { + _p0 = unsafe.Pointer(&_zero) + } + _, _, e1 := syscall_syscall9(libc_connectx_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(endpoints)), uintptr(associd), uintptr(flags), uintptr(_p0), uintptr(len(iov)), uintptr(unsafe.Pointer(n)), uintptr(unsafe.Pointer(connid)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_connectx_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_connectx connectx "/usr/lib/libSystem.B.dylib" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func sendfile(infd int, outfd int, offset int64, len *int64, hdtr unsafe.Pointer, flags int) (err error) { _, _, e1 := syscall_syscall6(libc_sendfile_trampoline_addr, uintptr(infd), uintptr(outfd), uintptr(offset), uintptr(unsafe.Pointer(len)), uintptr(hdtr), uintptr(flags)) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s index 8b8bb28..ebd2131 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s @@ -223,11 +223,36 @@ TEXT libc_ioctl_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_ioctl_trampoline_addr(SB), RODATA, $8 DATA ·libc_ioctl_trampoline_addr(SB)/8, $libc_ioctl_trampoline<>(SB) +TEXT libc_renamex_np_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_renamex_np(SB) +GLOBL ·libc_renamex_np_trampoline_addr(SB), RODATA, $8 +DATA ·libc_renamex_np_trampoline_addr(SB)/8, $libc_renamex_np_trampoline<>(SB) + +TEXT libc_renameatx_np_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_renameatx_np(SB) +GLOBL ·libc_renameatx_np_trampoline_addr(SB), RODATA, $8 +DATA ·libc_renameatx_np_trampoline_addr(SB)/8, $libc_renameatx_np_trampoline<>(SB) + TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_sysctl(SB) GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $8 DATA ·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB) +TEXT libc_pthread_chdir_np_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_pthread_chdir_np(SB) +GLOBL ·libc_pthread_chdir_np_trampoline_addr(SB), RODATA, $8 +DATA ·libc_pthread_chdir_np_trampoline_addr(SB)/8, $libc_pthread_chdir_np_trampoline<>(SB) + +TEXT libc_pthread_fchdir_np_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_pthread_fchdir_np(SB) +GLOBL ·libc_pthread_fchdir_np_trampoline_addr(SB), RODATA, $8 +DATA ·libc_pthread_fchdir_np_trampoline_addr(SB)/8, $libc_pthread_fchdir_np_trampoline<>(SB) + +TEXT libc_connectx_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_connectx(SB) +GLOBL ·libc_connectx_trampoline_addr(SB), RODATA, $8 +DATA ·libc_connectx_trampoline_addr(SB)/8, $libc_connectx_trampoline<>(SB) + TEXT libc_sendfile_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_sendfile(SB) GLOBL ·libc_sendfile_trampoline_addr(SB), RODATA, $8 diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go index 1b40b99..824b9c2 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go @@ -740,6 +740,54 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func renamexNp(from string, to string, flag uint32) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(from) + if err != nil { + return + } + var _p1 *byte + _p1, err = BytePtrFromString(to) + if err != nil { + return + } + _, _, e1 := syscall_syscall(libc_renamex_np_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flag)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_renamex_np_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_renamex_np renamex_np "/usr/lib/libSystem.B.dylib" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func renameatxNp(fromfd int, from string, tofd int, to string, flag uint32) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(from) + if err != nil { + return + } + var _p1 *byte + _p1, err = BytePtrFromString(to) + if err != nil { + return + } + _, _, e1 := syscall_syscall6(libc_renameatx_np_trampoline_addr, uintptr(fromfd), uintptr(unsafe.Pointer(_p0)), uintptr(tofd), uintptr(unsafe.Pointer(_p1)), uintptr(flag), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_renameatx_np_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_renameatx_np renameatx_np "/usr/lib/libSystem.B.dylib" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) { var _p0 unsafe.Pointer if len(mib) > 0 { @@ -760,6 +808,59 @@ var libc_sysctl_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func pthread_chdir_np(path string) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := syscall_syscall(libc_pthread_chdir_np_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_pthread_chdir_np_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_pthread_chdir_np pthread_chdir_np "/usr/lib/libSystem.B.dylib" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func pthread_fchdir_np(fd int) (err error) { + _, _, e1 := syscall_syscall(libc_pthread_fchdir_np_trampoline_addr, uintptr(fd), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_pthread_fchdir_np_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_pthread_fchdir_np pthread_fchdir_np "/usr/lib/libSystem.B.dylib" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func connectx(fd int, endpoints *SaEndpoints, associd SaeAssocID, flags uint32, iov []Iovec, n *uintptr, connid *SaeConnID) (err error) { + var _p0 unsafe.Pointer + if len(iov) > 0 { + _p0 = unsafe.Pointer(&iov[0]) + } else { + _p0 = unsafe.Pointer(&_zero) + } + _, _, e1 := syscall_syscall9(libc_connectx_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(endpoints)), uintptr(associd), uintptr(flags), uintptr(_p0), uintptr(len(iov)), uintptr(unsafe.Pointer(n)), uintptr(unsafe.Pointer(connid)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_connectx_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_connectx connectx "/usr/lib/libSystem.B.dylib" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func sendfile(infd int, outfd int, offset int64, len *int64, hdtr unsafe.Pointer, flags int) (err error) { _, _, e1 := syscall_syscall6(libc_sendfile_trampoline_addr, uintptr(infd), uintptr(outfd), uintptr(offset), uintptr(unsafe.Pointer(len)), uintptr(hdtr), uintptr(flags)) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s index 08362c1..4f178a2 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s @@ -223,11 +223,36 @@ TEXT libc_ioctl_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_ioctl_trampoline_addr(SB), RODATA, $8 DATA ·libc_ioctl_trampoline_addr(SB)/8, $libc_ioctl_trampoline<>(SB) +TEXT libc_renamex_np_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_renamex_np(SB) +GLOBL ·libc_renamex_np_trampoline_addr(SB), RODATA, $8 +DATA ·libc_renamex_np_trampoline_addr(SB)/8, $libc_renamex_np_trampoline<>(SB) + +TEXT libc_renameatx_np_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_renameatx_np(SB) +GLOBL ·libc_renameatx_np_trampoline_addr(SB), RODATA, $8 +DATA ·libc_renameatx_np_trampoline_addr(SB)/8, $libc_renameatx_np_trampoline<>(SB) + TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_sysctl(SB) GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $8 DATA ·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB) +TEXT libc_pthread_chdir_np_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_pthread_chdir_np(SB) +GLOBL ·libc_pthread_chdir_np_trampoline_addr(SB), RODATA, $8 +DATA ·libc_pthread_chdir_np_trampoline_addr(SB)/8, $libc_pthread_chdir_np_trampoline<>(SB) + +TEXT libc_pthread_fchdir_np_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_pthread_fchdir_np(SB) +GLOBL ·libc_pthread_fchdir_np_trampoline_addr(SB), RODATA, $8 +DATA ·libc_pthread_fchdir_np_trampoline_addr(SB)/8, $libc_pthread_fchdir_np_trampoline<>(SB) + +TEXT libc_connectx_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_connectx(SB) +GLOBL ·libc_connectx_trampoline_addr(SB), RODATA, $8 +DATA ·libc_connectx_trampoline_addr(SB)/8, $libc_connectx_trampoline<>(SB) + TEXT libc_sendfile_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_sendfile(SB) GLOBL ·libc_sendfile_trampoline_addr(SB), RODATA, $8 diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux.go b/vendor/golang.org/x/sys/unix/zsyscall_linux.go index 87d8612..af30da5 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux.go @@ -971,23 +971,6 @@ func Getpriority(which int, who int) (prio int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Getrandom(buf []byte, flags int) (n int, err error) { - var _p0 unsafe.Pointer - if len(buf) > 0 { - _p0 = unsafe.Pointer(&buf[0]) - } else { - _p0 = unsafe.Pointer(&_zero) - } - r0, _, e1 := Syscall(SYS_GETRANDOM, uintptr(_p0), uintptr(len(buf)), uintptr(flags)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func Getrusage(who int, rusage *Rusage) (err error) { _, _, e1 := RawSyscall(SYS_GETRUSAGE, uintptr(who), uintptr(unsafe.Pointer(rusage)), 0) if e1 != 0 { @@ -2229,3 +2212,19 @@ func Cachestat(fd uint, crange *CachestatRange, cstat *Cachestat_t, flags uint) } return } + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Mseal(b []byte, flags uint) (err error) { + var _p0 unsafe.Pointer + if len(b) > 0 { + _p0 = unsafe.Pointer(&b[0]) + } else { + _p0 = unsafe.Pointer(&_zero) + } + _, _, e1 := Syscall(SYS_MSEAL, uintptr(_p0), uintptr(len(b)), uintptr(flags)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go index 9dc4241..1851df1 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go @@ -1493,6 +1493,30 @@ var libc_mknodat_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Mount(fsType string, dir string, flags int, data unsafe.Pointer) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(fsType) + if err != nil { + return + } + var _p1 *byte + _p1, err = BytePtrFromString(dir) + if err != nil { + return + } + _, _, e1 := syscall_syscall6(libc_mount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flags), uintptr(data), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_mount_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_mount mount "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Nanosleep(time *Timespec, leftover *Timespec) (err error) { _, _, e1 := syscall_syscall(libc_nanosleep_trampoline_addr, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.s index 41b5617..0b43c69 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.s @@ -463,6 +463,11 @@ TEXT libc_mknodat_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_mknodat_trampoline_addr(SB), RODATA, $4 DATA ·libc_mknodat_trampoline_addr(SB)/4, $libc_mknodat_trampoline<>(SB) +TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_mount(SB) +GLOBL ·libc_mount_trampoline_addr(SB), RODATA, $4 +DATA ·libc_mount_trampoline_addr(SB)/4, $libc_mount_trampoline<>(SB) + TEXT libc_nanosleep_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_nanosleep(SB) GLOBL ·libc_nanosleep_trampoline_addr(SB), RODATA, $4 diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go index 0d3a075..e1ec0db 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go @@ -1493,6 +1493,30 @@ var libc_mknodat_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Mount(fsType string, dir string, flags int, data unsafe.Pointer) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(fsType) + if err != nil { + return + } + var _p1 *byte + _p1, err = BytePtrFromString(dir) + if err != nil { + return + } + _, _, e1 := syscall_syscall6(libc_mount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flags), uintptr(data), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_mount_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_mount mount "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Nanosleep(time *Timespec, leftover *Timespec) (err error) { _, _, e1 := syscall_syscall(libc_nanosleep_trampoline_addr, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.s index 4019a65..880c6d6 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.s @@ -463,6 +463,11 @@ TEXT libc_mknodat_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_mknodat_trampoline_addr(SB), RODATA, $8 DATA ·libc_mknodat_trampoline_addr(SB)/8, $libc_mknodat_trampoline<>(SB) +TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_mount(SB) +GLOBL ·libc_mount_trampoline_addr(SB), RODATA, $8 +DATA ·libc_mount_trampoline_addr(SB)/8, $libc_mount_trampoline<>(SB) + TEXT libc_nanosleep_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_nanosleep(SB) GLOBL ·libc_nanosleep_trampoline_addr(SB), RODATA, $8 diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go index c39f777..7c8452a 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go @@ -1493,6 +1493,30 @@ var libc_mknodat_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Mount(fsType string, dir string, flags int, data unsafe.Pointer) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(fsType) + if err != nil { + return + } + var _p1 *byte + _p1, err = BytePtrFromString(dir) + if err != nil { + return + } + _, _, e1 := syscall_syscall6(libc_mount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flags), uintptr(data), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_mount_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_mount mount "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Nanosleep(time *Timespec, leftover *Timespec) (err error) { _, _, e1 := syscall_syscall(libc_nanosleep_trampoline_addr, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.s index ac4af24..b8ef95b 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.s @@ -463,6 +463,11 @@ TEXT libc_mknodat_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_mknodat_trampoline_addr(SB), RODATA, $4 DATA ·libc_mknodat_trampoline_addr(SB)/4, $libc_mknodat_trampoline<>(SB) +TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_mount(SB) +GLOBL ·libc_mount_trampoline_addr(SB), RODATA, $4 +DATA ·libc_mount_trampoline_addr(SB)/4, $libc_mount_trampoline<>(SB) + TEXT libc_nanosleep_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_nanosleep(SB) GLOBL ·libc_nanosleep_trampoline_addr(SB), RODATA, $4 diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go index 57571d0..2ffdf86 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go @@ -1493,6 +1493,30 @@ var libc_mknodat_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Mount(fsType string, dir string, flags int, data unsafe.Pointer) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(fsType) + if err != nil { + return + } + var _p1 *byte + _p1, err = BytePtrFromString(dir) + if err != nil { + return + } + _, _, e1 := syscall_syscall6(libc_mount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flags), uintptr(data), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_mount_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_mount mount "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Nanosleep(time *Timespec, leftover *Timespec) (err error) { _, _, e1 := syscall_syscall(libc_nanosleep_trampoline_addr, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.s index f77d532..2af3b5c 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.s @@ -463,6 +463,11 @@ TEXT libc_mknodat_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_mknodat_trampoline_addr(SB), RODATA, $8 DATA ·libc_mknodat_trampoline_addr(SB)/8, $libc_mknodat_trampoline<>(SB) +TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_mount(SB) +GLOBL ·libc_mount_trampoline_addr(SB), RODATA, $8 +DATA ·libc_mount_trampoline_addr(SB)/8, $libc_mount_trampoline<>(SB) + TEXT libc_nanosleep_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_nanosleep(SB) GLOBL ·libc_nanosleep_trampoline_addr(SB), RODATA, $8 diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.go index e62963e..1da08d5 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.go @@ -1493,6 +1493,30 @@ var libc_mknodat_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Mount(fsType string, dir string, flags int, data unsafe.Pointer) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(fsType) + if err != nil { + return + } + var _p1 *byte + _p1, err = BytePtrFromString(dir) + if err != nil { + return + } + _, _, e1 := syscall_syscall6(libc_mount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flags), uintptr(data), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_mount_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_mount mount "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Nanosleep(time *Timespec, leftover *Timespec) (err error) { _, _, e1 := syscall_syscall(libc_nanosleep_trampoline_addr, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.s index fae140b..b7a2513 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.s @@ -463,6 +463,11 @@ TEXT libc_mknodat_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_mknodat_trampoline_addr(SB), RODATA, $8 DATA ·libc_mknodat_trampoline_addr(SB)/8, $libc_mknodat_trampoline<>(SB) +TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_mount(SB) +GLOBL ·libc_mount_trampoline_addr(SB), RODATA, $8 +DATA ·libc_mount_trampoline_addr(SB)/8, $libc_mount_trampoline<>(SB) + TEXT libc_nanosleep_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_nanosleep(SB) GLOBL ·libc_nanosleep_trampoline_addr(SB), RODATA, $8 diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.go index 0083135..6e85b0a 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.go @@ -1493,6 +1493,30 @@ var libc_mknodat_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Mount(fsType string, dir string, flags int, data unsafe.Pointer) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(fsType) + if err != nil { + return + } + var _p1 *byte + _p1, err = BytePtrFromString(dir) + if err != nil { + return + } + _, _, e1 := syscall_syscall6(libc_mount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flags), uintptr(data), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_mount_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_mount mount "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Nanosleep(time *Timespec, leftover *Timespec) (err error) { _, _, e1 := syscall_syscall(libc_nanosleep_trampoline_addr, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.s index 9d1e0ff..f15dadf 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.s @@ -555,6 +555,12 @@ TEXT libc_mknodat_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_mknodat_trampoline_addr(SB), RODATA, $8 DATA ·libc_mknodat_trampoline_addr(SB)/8, $libc_mknodat_trampoline<>(SB) +TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0 + CALL libc_mount(SB) + RET +GLOBL ·libc_mount_trampoline_addr(SB), RODATA, $8 +DATA ·libc_mount_trampoline_addr(SB)/8, $libc_mount_trampoline<>(SB) + TEXT libc_nanosleep_trampoline<>(SB),NOSPLIT,$0-0 CALL libc_nanosleep(SB) RET diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.go index 79029ed..28b487d 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.go @@ -1493,6 +1493,30 @@ var libc_mknodat_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Mount(fsType string, dir string, flags int, data unsafe.Pointer) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(fsType) + if err != nil { + return + } + var _p1 *byte + _p1, err = BytePtrFromString(dir) + if err != nil { + return + } + _, _, e1 := syscall_syscall6(libc_mount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flags), uintptr(data), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_mount_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_mount mount "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Nanosleep(time *Timespec, leftover *Timespec) (err error) { _, _, e1 := syscall_syscall(libc_nanosleep_trampoline_addr, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.s index da115f9..1e7f321 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.s @@ -463,6 +463,11 @@ TEXT libc_mknodat_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_mknodat_trampoline_addr(SB), RODATA, $8 DATA ·libc_mknodat_trampoline_addr(SB)/8, $libc_mknodat_trampoline<>(SB) +TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_mount(SB) +GLOBL ·libc_mount_trampoline_addr(SB), RODATA, $8 +DATA ·libc_mount_trampoline_addr(SB)/8, $libc_mount_trampoline<>(SB) + TEXT libc_nanosleep_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_nanosleep(SB) GLOBL ·libc_nanosleep_trampoline_addr(SB), RODATA, $8 diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go index 53aef5d..524b082 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go @@ -457,4 +457,5 @@ const ( SYS_LSM_GET_SELF_ATTR = 459 SYS_LSM_SET_SELF_ATTR = 460 SYS_LSM_LIST_MODULES = 461 + SYS_MSEAL = 462 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go index 71d5247..f485dbf 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go @@ -341,6 +341,7 @@ const ( SYS_STATX = 332 SYS_IO_PGETEVENTS = 333 SYS_RSEQ = 334 + SYS_URETPROBE = 335 SYS_PIDFD_SEND_SIGNAL = 424 SYS_IO_URING_SETUP = 425 SYS_IO_URING_ENTER = 426 @@ -379,4 +380,5 @@ const ( SYS_LSM_GET_SELF_ATTR = 459 SYS_LSM_SET_SELF_ATTR = 460 SYS_LSM_LIST_MODULES = 461 + SYS_MSEAL = 462 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go index c747706..70b35bf 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go @@ -421,4 +421,5 @@ const ( SYS_LSM_GET_SELF_ATTR = 459 SYS_LSM_SET_SELF_ATTR = 460 SYS_LSM_LIST_MODULES = 461 + SYS_MSEAL = 462 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go index f96e214..1893e2f 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go @@ -85,7 +85,7 @@ const ( SYS_SPLICE = 76 SYS_TEE = 77 SYS_READLINKAT = 78 - SYS_FSTATAT = 79 + SYS_NEWFSTATAT = 79 SYS_FSTAT = 80 SYS_SYNC = 81 SYS_FSYNC = 82 @@ -324,4 +324,5 @@ const ( SYS_LSM_GET_SELF_ATTR = 459 SYS_LSM_SET_SELF_ATTR = 460 SYS_LSM_LIST_MODULES = 461 + SYS_MSEAL = 462 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go index 2842534..16a4017 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go @@ -84,6 +84,8 @@ const ( SYS_SPLICE = 76 SYS_TEE = 77 SYS_READLINKAT = 78 + SYS_NEWFSTATAT = 79 + SYS_FSTAT = 80 SYS_SYNC = 81 SYS_FSYNC = 82 SYS_FDATASYNC = 83 @@ -318,4 +320,5 @@ const ( SYS_LSM_GET_SELF_ATTR = 459 SYS_LSM_SET_SELF_ATTR = 460 SYS_LSM_LIST_MODULES = 461 + SYS_MSEAL = 462 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go index d095301..7e567f1 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go @@ -441,4 +441,5 @@ const ( SYS_LSM_GET_SELF_ATTR = 4459 SYS_LSM_SET_SELF_ATTR = 4460 SYS_LSM_LIST_MODULES = 4461 + SYS_MSEAL = 4462 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go index 295c7f4..38ae55e 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go @@ -371,4 +371,5 @@ const ( SYS_LSM_GET_SELF_ATTR = 5459 SYS_LSM_SET_SELF_ATTR = 5460 SYS_LSM_LIST_MODULES = 5461 + SYS_MSEAL = 5462 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go index d1a9eac..55e92e6 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go @@ -371,4 +371,5 @@ const ( SYS_LSM_GET_SELF_ATTR = 5459 SYS_LSM_SET_SELF_ATTR = 5460 SYS_LSM_LIST_MODULES = 5461 + SYS_MSEAL = 5462 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go index bec157c..60658d6 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go @@ -441,4 +441,5 @@ const ( SYS_LSM_GET_SELF_ATTR = 4459 SYS_LSM_SET_SELF_ATTR = 4460 SYS_LSM_LIST_MODULES = 4461 + SYS_MSEAL = 4462 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go index 7ee7bdc..e203e8a 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go @@ -448,4 +448,5 @@ const ( SYS_LSM_GET_SELF_ATTR = 459 SYS_LSM_SET_SELF_ATTR = 460 SYS_LSM_LIST_MODULES = 461 + SYS_MSEAL = 462 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go index fad1f25..5944b97 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go @@ -420,4 +420,5 @@ const ( SYS_LSM_GET_SELF_ATTR = 459 SYS_LSM_SET_SELF_ATTR = 460 SYS_LSM_LIST_MODULES = 461 + SYS_MSEAL = 462 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go index 7d3e163..c66d416 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go @@ -420,4 +420,5 @@ const ( SYS_LSM_GET_SELF_ATTR = 459 SYS_LSM_SET_SELF_ATTR = 460 SYS_LSM_LIST_MODULES = 461 + SYS_MSEAL = 462 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go index 0ed53ad..a5459e7 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go @@ -84,7 +84,7 @@ const ( SYS_SPLICE = 76 SYS_TEE = 77 SYS_READLINKAT = 78 - SYS_FSTATAT = 79 + SYS_NEWFSTATAT = 79 SYS_FSTAT = 80 SYS_SYNC = 81 SYS_FSYNC = 82 @@ -325,4 +325,5 @@ const ( SYS_LSM_GET_SELF_ATTR = 459 SYS_LSM_SET_SELF_ATTR = 460 SYS_LSM_LIST_MODULES = 461 + SYS_MSEAL = 462 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go index 2fba04a..01d8682 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go @@ -386,4 +386,5 @@ const ( SYS_LSM_GET_SELF_ATTR = 459 SYS_LSM_SET_SELF_ATTR = 460 SYS_LSM_LIST_MODULES = 461 + SYS_MSEAL = 462 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go index 621d00d..7b703e7 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go @@ -399,4 +399,5 @@ const ( SYS_LSM_GET_SELF_ATTR = 459 SYS_LSM_SET_SELF_ATTR = 460 SYS_LSM_LIST_MODULES = 461 + SYS_MSEAL = 462 ) diff --git a/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go index 091d107..d003c3d 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go @@ -306,6 +306,19 @@ type XVSockPgen struct { type _Socklen uint32 +type SaeAssocID uint32 + +type SaeConnID uint32 + +type SaEndpoints struct { + Srcif uint32 + Srcaddr *RawSockaddr + Srcaddrlen uint32 + Dstaddr *RawSockaddr + Dstaddrlen uint32 + _ [4]byte +} + type Xucred struct { Version uint32 Uid uint32 diff --git a/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go index 28ff4ef..0d45a94 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go @@ -306,6 +306,19 @@ type XVSockPgen struct { type _Socklen uint32 +type SaeAssocID uint32 + +type SaeConnID uint32 + +type SaEndpoints struct { + Srcif uint32 + Srcaddr *RawSockaddr + Srcaddrlen uint32 + Dstaddr *RawSockaddr + Dstaddrlen uint32 + _ [4]byte +} + type Xucred struct { Version uint32 Uid uint32 diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go index 6cbd094..51e13eb 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go +++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go @@ -625,6 +625,7 @@ const ( POLLRDNORM = 0x40 POLLWRBAND = 0x100 POLLWRNORM = 0x4 + POLLRDHUP = 0x4000 ) type CapRights struct { diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go index 7c03b6e..d002d8e 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go @@ -630,6 +630,7 @@ const ( POLLRDNORM = 0x40 POLLWRBAND = 0x100 POLLWRNORM = 0x4 + POLLRDHUP = 0x4000 ) type CapRights struct { diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go index 422107e..3f863d8 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go +++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go @@ -616,6 +616,7 @@ const ( POLLRDNORM = 0x40 POLLWRBAND = 0x100 POLLWRNORM = 0x4 + POLLRDHUP = 0x4000 ) type CapRights struct { diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go index 505a12a..61c7293 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go @@ -610,6 +610,7 @@ const ( POLLRDNORM = 0x40 POLLWRBAND = 0x100 POLLWRNORM = 0x4 + POLLRDHUP = 0x4000 ) type CapRights struct { diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go index cc986c7..b5d1741 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go @@ -612,6 +612,7 @@ const ( POLLRDNORM = 0x40 POLLWRBAND = 0x100 POLLWRNORM = 0x4 + POLLRDHUP = 0x4000 ) type CapRights struct { diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux.go b/vendor/golang.org/x/sys/unix/ztypes_linux.go index 0036746..3a69e45 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux.go @@ -87,30 +87,35 @@ type StatxTimestamp struct { } type Statx_t struct { - Mask uint32 - Blksize uint32 - Attributes uint64 - Nlink uint32 - Uid uint32 - Gid uint32 - Mode uint16 - _ [1]uint16 - Ino uint64 - Size uint64 - Blocks uint64 - Attributes_mask uint64 - Atime StatxTimestamp - Btime StatxTimestamp - Ctime StatxTimestamp - Mtime StatxTimestamp - Rdev_major uint32 - Rdev_minor uint32 - Dev_major uint32 - Dev_minor uint32 - Mnt_id uint64 - Dio_mem_align uint32 - Dio_offset_align uint32 - _ [12]uint64 + Mask uint32 + Blksize uint32 + Attributes uint64 + Nlink uint32 + Uid uint32 + Gid uint32 + Mode uint16 + _ [1]uint16 + Ino uint64 + Size uint64 + Blocks uint64 + Attributes_mask uint64 + Atime StatxTimestamp + Btime StatxTimestamp + Ctime StatxTimestamp + Mtime StatxTimestamp + Rdev_major uint32 + Rdev_minor uint32 + Dev_major uint32 + Dev_minor uint32 + Mnt_id uint64 + Dio_mem_align uint32 + Dio_offset_align uint32 + Subvol uint64 + Atomic_write_unit_min uint32 + Atomic_write_unit_max uint32 + Atomic_write_segments_max uint32 + _ [1]uint32 + _ [9]uint64 } type Fsid struct { @@ -515,6 +520,29 @@ type TCPInfo struct { Total_rto_time uint32 } +type TCPVegasInfo struct { + Enabled uint32 + Rttcnt uint32 + Rtt uint32 + Minrtt uint32 +} + +type TCPDCTCPInfo struct { + Enabled uint16 + Ce_state uint16 + Alpha uint32 + Ab_ecn uint32 + Ab_tot uint32 +} + +type TCPBBRInfo struct { + Bw_lo uint32 + Bw_hi uint32 + Min_rtt uint32 + Pacing_gain uint32 + Cwnd_gain uint32 +} + type CanFilter struct { Id uint32 Mask uint32 @@ -556,6 +584,7 @@ const ( SizeofICMPv6Filter = 0x20 SizeofUcred = 0xc SizeofTCPInfo = 0xf8 + SizeofTCPCCInfo = 0x14 SizeofCanFilter = 0x8 SizeofTCPRepairOpt = 0x8 ) @@ -2485,7 +2514,7 @@ type XDPMmapOffsets struct { type XDPUmemReg struct { Addr uint64 Len uint64 - Chunk_size uint32 + Size uint32 Headroom uint32 Flags uint32 Tx_metadata_len uint32 @@ -3473,7 +3502,7 @@ const ( DEVLINK_PORT_FN_ATTR_STATE = 0x2 DEVLINK_PORT_FN_ATTR_OPSTATE = 0x3 DEVLINK_PORT_FN_ATTR_CAPS = 0x4 - DEVLINK_PORT_FUNCTION_ATTR_MAX = 0x5 + DEVLINK_PORT_FUNCTION_ATTR_MAX = 0x6 ) type FsverityDigest struct { @@ -3765,7 +3794,7 @@ const ( ETHTOOL_MSG_PSE_GET = 0x24 ETHTOOL_MSG_PSE_SET = 0x25 ETHTOOL_MSG_RSS_GET = 0x26 - ETHTOOL_MSG_USER_MAX = 0x2b + ETHTOOL_MSG_USER_MAX = 0x2c ETHTOOL_MSG_KERNEL_NONE = 0x0 ETHTOOL_MSG_STRSET_GET_REPLY = 0x1 ETHTOOL_MSG_LINKINFO_GET_REPLY = 0x2 @@ -3805,7 +3834,10 @@ const ( ETHTOOL_MSG_MODULE_NTF = 0x24 ETHTOOL_MSG_PSE_GET_REPLY = 0x25 ETHTOOL_MSG_RSS_GET_REPLY = 0x26 - ETHTOOL_MSG_KERNEL_MAX = 0x2b + ETHTOOL_MSG_KERNEL_MAX = 0x2c + ETHTOOL_FLAG_COMPACT_BITSETS = 0x1 + ETHTOOL_FLAG_OMIT_REPLY = 0x2 + ETHTOOL_FLAG_STATS = 0x4 ETHTOOL_A_HEADER_UNSPEC = 0x0 ETHTOOL_A_HEADER_DEV_INDEX = 0x1 ETHTOOL_A_HEADER_DEV_NAME = 0x2 @@ -3947,7 +3979,7 @@ const ( ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL = 0x17 ETHTOOL_A_COALESCE_USE_CQE_MODE_TX = 0x18 ETHTOOL_A_COALESCE_USE_CQE_MODE_RX = 0x19 - ETHTOOL_A_COALESCE_MAX = 0x1c + ETHTOOL_A_COALESCE_MAX = 0x1e ETHTOOL_A_PAUSE_UNSPEC = 0x0 ETHTOOL_A_PAUSE_HEADER = 0x1 ETHTOOL_A_PAUSE_AUTONEG = 0x2 @@ -3975,7 +4007,7 @@ const ( ETHTOOL_A_TSINFO_TX_TYPES = 0x3 ETHTOOL_A_TSINFO_RX_FILTERS = 0x4 ETHTOOL_A_TSINFO_PHC_INDEX = 0x5 - ETHTOOL_A_TSINFO_MAX = 0x5 + ETHTOOL_A_TSINFO_MAX = 0x6 ETHTOOL_A_CABLE_TEST_UNSPEC = 0x0 ETHTOOL_A_CABLE_TEST_HEADER = 0x1 ETHTOOL_A_CABLE_TEST_MAX = 0x1 @@ -4605,7 +4637,7 @@ const ( NL80211_ATTR_MAC_HINT = 0xc8 NL80211_ATTR_MAC_MASK = 0xd7 NL80211_ATTR_MAX_AP_ASSOC_STA = 0xca - NL80211_ATTR_MAX = 0x149 + NL80211_ATTR_MAX = 0x14c NL80211_ATTR_MAX_CRIT_PROT_DURATION = 0xb4 NL80211_ATTR_MAX_CSA_COUNTERS = 0xce NL80211_ATTR_MAX_MATCH_SETS = 0x85 @@ -5209,7 +5241,7 @@ const ( NL80211_FREQUENCY_ATTR_GO_CONCURRENT = 0xf NL80211_FREQUENCY_ATTR_INDOOR_ONLY = 0xe NL80211_FREQUENCY_ATTR_IR_CONCURRENT = 0xf - NL80211_FREQUENCY_ATTR_MAX = 0x1f + NL80211_FREQUENCY_ATTR_MAX = 0x21 NL80211_FREQUENCY_ATTR_MAX_TX_POWER = 0x6 NL80211_FREQUENCY_ATTR_NO_10MHZ = 0x11 NL80211_FREQUENCY_ATTR_NO_160MHZ = 0xc @@ -5703,7 +5735,7 @@ const ( NL80211_STA_FLAG_ASSOCIATED = 0x7 NL80211_STA_FLAG_AUTHENTICATED = 0x5 NL80211_STA_FLAG_AUTHORIZED = 0x1 - NL80211_STA_FLAG_MAX = 0x7 + NL80211_STA_FLAG_MAX = 0x8 NL80211_STA_FLAG_MAX_OLD_API = 0x6 NL80211_STA_FLAG_MFP = 0x4 NL80211_STA_FLAG_SHORT_PREAMBLE = 0x2 @@ -6001,3 +6033,34 @@ type CachestatRange struct { Off uint64 Len uint64 } + +const ( + SK_MEMINFO_RMEM_ALLOC = 0x0 + SK_MEMINFO_RCVBUF = 0x1 + SK_MEMINFO_WMEM_ALLOC = 0x2 + SK_MEMINFO_SNDBUF = 0x3 + SK_MEMINFO_FWD_ALLOC = 0x4 + SK_MEMINFO_WMEM_QUEUED = 0x5 + SK_MEMINFO_OPTMEM = 0x6 + SK_MEMINFO_BACKLOG = 0x7 + SK_MEMINFO_DROPS = 0x8 + SK_MEMINFO_VARS = 0x9 + SKNLGRP_NONE = 0x0 + SKNLGRP_INET_TCP_DESTROY = 0x1 + SKNLGRP_INET_UDP_DESTROY = 0x2 + SKNLGRP_INET6_TCP_DESTROY = 0x3 + SKNLGRP_INET6_UDP_DESTROY = 0x4 + SK_DIAG_BPF_STORAGE_REQ_NONE = 0x0 + SK_DIAG_BPF_STORAGE_REQ_MAP_FD = 0x1 + SK_DIAG_BPF_STORAGE_REP_NONE = 0x0 + SK_DIAG_BPF_STORAGE = 0x1 + SK_DIAG_BPF_STORAGE_NONE = 0x0 + SK_DIAG_BPF_STORAGE_PAD = 0x1 + SK_DIAG_BPF_STORAGE_MAP_ID = 0x2 + SK_DIAG_BPF_STORAGE_MAP_VALUE = 0x3 +) + +type SockDiagReq struct { + Family uint8 + Protocol uint8 +} diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go index 15adc04..ad05b51 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go @@ -727,6 +727,37 @@ const ( RISCV_HWPROBE_EXT_ZBA = 0x8 RISCV_HWPROBE_EXT_ZBB = 0x10 RISCV_HWPROBE_EXT_ZBS = 0x20 + RISCV_HWPROBE_EXT_ZICBOZ = 0x40 + RISCV_HWPROBE_EXT_ZBC = 0x80 + RISCV_HWPROBE_EXT_ZBKB = 0x100 + RISCV_HWPROBE_EXT_ZBKC = 0x200 + RISCV_HWPROBE_EXT_ZBKX = 0x400 + RISCV_HWPROBE_EXT_ZKND = 0x800 + RISCV_HWPROBE_EXT_ZKNE = 0x1000 + RISCV_HWPROBE_EXT_ZKNH = 0x2000 + RISCV_HWPROBE_EXT_ZKSED = 0x4000 + RISCV_HWPROBE_EXT_ZKSH = 0x8000 + RISCV_HWPROBE_EXT_ZKT = 0x10000 + RISCV_HWPROBE_EXT_ZVBB = 0x20000 + RISCV_HWPROBE_EXT_ZVBC = 0x40000 + RISCV_HWPROBE_EXT_ZVKB = 0x80000 + RISCV_HWPROBE_EXT_ZVKG = 0x100000 + RISCV_HWPROBE_EXT_ZVKNED = 0x200000 + RISCV_HWPROBE_EXT_ZVKNHA = 0x400000 + RISCV_HWPROBE_EXT_ZVKNHB = 0x800000 + RISCV_HWPROBE_EXT_ZVKSED = 0x1000000 + RISCV_HWPROBE_EXT_ZVKSH = 0x2000000 + RISCV_HWPROBE_EXT_ZVKT = 0x4000000 + RISCV_HWPROBE_EXT_ZFH = 0x8000000 + RISCV_HWPROBE_EXT_ZFHMIN = 0x10000000 + RISCV_HWPROBE_EXT_ZIHINTNTL = 0x20000000 + RISCV_HWPROBE_EXT_ZVFH = 0x40000000 + RISCV_HWPROBE_EXT_ZVFHMIN = 0x80000000 + RISCV_HWPROBE_EXT_ZFA = 0x100000000 + RISCV_HWPROBE_EXT_ZTSO = 0x200000000 + RISCV_HWPROBE_EXT_ZACAS = 0x400000000 + RISCV_HWPROBE_EXT_ZICOND = 0x800000000 + RISCV_HWPROBE_EXT_ZIHINTPAUSE = 0x1000000000 RISCV_HWPROBE_KEY_CPUPERF_0 = 0x5 RISCV_HWPROBE_MISALIGNED_UNKNOWN = 0x0 RISCV_HWPROBE_MISALIGNED_EMULATED = 0x1 @@ -734,4 +765,6 @@ const ( RISCV_HWPROBE_MISALIGNED_FAST = 0x3 RISCV_HWPROBE_MISALIGNED_UNSUPPORTED = 0x4 RISCV_HWPROBE_MISALIGNED_MASK = 0x7 + RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE = 0x6 + RISCV_HWPROBE_WHICH_CPUS = 0x1 ) diff --git a/vendor/golang.org/x/sys/windows/dll_windows.go b/vendor/golang.org/x/sys/windows/dll_windows.go index 115341f..4e613cf 100644 --- a/vendor/golang.org/x/sys/windows/dll_windows.go +++ b/vendor/golang.org/x/sys/windows/dll_windows.go @@ -65,7 +65,7 @@ func LoadDLL(name string) (dll *DLL, err error) { return d, nil } -// MustLoadDLL is like LoadDLL but panics if load operation failes. +// MustLoadDLL is like LoadDLL but panics if load operation fails. func MustLoadDLL(name string) *DLL { d, e := LoadDLL(name) if e != nil { diff --git a/vendor/golang.org/x/sys/windows/security_windows.go b/vendor/golang.org/x/sys/windows/security_windows.go index 26be94a..b6e1ab7 100644 --- a/vendor/golang.org/x/sys/windows/security_windows.go +++ b/vendor/golang.org/x/sys/windows/security_windows.go @@ -68,6 +68,7 @@ type UserInfo10 struct { //sys NetUserGetInfo(serverName *uint16, userName *uint16, level uint32, buf **byte) (neterr error) = netapi32.NetUserGetInfo //sys NetGetJoinInformation(server *uint16, name **uint16, bufType *uint32) (neterr error) = netapi32.NetGetJoinInformation //sys NetApiBufferFree(buf *byte) (neterr error) = netapi32.NetApiBufferFree +//sys NetUserEnum(serverName *uint16, level uint32, filter uint32, buf **byte, prefMaxLen uint32, entriesRead *uint32, totalEntries *uint32, resumeHandle *uint32) (neterr error) = netapi32.NetUserEnum const ( // do not reorder @@ -893,7 +894,7 @@ type ACL struct { aclRevision byte sbz1 byte aclSize uint16 - aceCount uint16 + AceCount uint16 sbz2 uint16 } @@ -1086,6 +1087,27 @@ type EXPLICIT_ACCESS struct { Trustee TRUSTEE } +// https://learn.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-ace_header +type ACE_HEADER struct { + AceType uint8 + AceFlags uint8 + AceSize uint16 +} + +// https://learn.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-access_allowed_ace +type ACCESS_ALLOWED_ACE struct { + Header ACE_HEADER + Mask ACCESS_MASK + SidStart uint32 +} + +const ( + // Constants for AceType + // https://learn.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-ace_header + ACCESS_ALLOWED_ACE_TYPE = 0 + ACCESS_DENIED_ACE_TYPE = 1 +) + // This type is the union inside of TRUSTEE and must be created using one of the TrusteeValueFrom* functions. type TrusteeValue uintptr @@ -1157,6 +1179,7 @@ type OBJECTS_AND_NAME struct { //sys makeSelfRelativeSD(absoluteSD *SECURITY_DESCRIPTOR, selfRelativeSD *SECURITY_DESCRIPTOR, selfRelativeSDSize *uint32) (err error) = advapi32.MakeSelfRelativeSD //sys setEntriesInAcl(countExplicitEntries uint32, explicitEntries *EXPLICIT_ACCESS, oldACL *ACL, newACL **ACL) (ret error) = advapi32.SetEntriesInAclW +//sys GetAce(acl *ACL, aceIndex uint32, pAce **ACCESS_ALLOWED_ACE) (err error) = advapi32.GetAce // Control returns the security descriptor control bits. func (sd *SECURITY_DESCRIPTOR) Control() (control SECURITY_DESCRIPTOR_CONTROL, revision uint32, err error) { diff --git a/vendor/golang.org/x/sys/windows/syscall_windows.go b/vendor/golang.org/x/sys/windows/syscall_windows.go index 6525c62..5cee9a3 100644 --- a/vendor/golang.org/x/sys/windows/syscall_windows.go +++ b/vendor/golang.org/x/sys/windows/syscall_windows.go @@ -17,8 +17,10 @@ import ( "unsafe" ) -type Handle uintptr -type HWND uintptr +type ( + Handle uintptr + HWND uintptr +) const ( InvalidHandle = ^Handle(0) @@ -211,6 +213,10 @@ func NewCallbackCDecl(fn interface{}) uintptr { //sys OpenProcess(desiredAccess uint32, inheritHandle bool, processId uint32) (handle Handle, err error) //sys ShellExecute(hwnd Handle, verb *uint16, file *uint16, args *uint16, cwd *uint16, showCmd int32) (err error) [failretval<=32] = shell32.ShellExecuteW //sys GetWindowThreadProcessId(hwnd HWND, pid *uint32) (tid uint32, err error) = user32.GetWindowThreadProcessId +//sys LoadKeyboardLayout(name *uint16, flags uint32) (hkl Handle, err error) [failretval==0] = user32.LoadKeyboardLayoutW +//sys UnloadKeyboardLayout(hkl Handle) (err error) = user32.UnloadKeyboardLayout +//sys GetKeyboardLayout(tid uint32) (hkl Handle) = user32.GetKeyboardLayout +//sys ToUnicodeEx(vkey uint32, scancode uint32, keystate *byte, pwszBuff *uint16, cchBuff int32, flags uint32, hkl Handle) (ret int32) = user32.ToUnicodeEx //sys GetShellWindow() (shellWindow HWND) = user32.GetShellWindow //sys MessageBox(hwnd HWND, text *uint16, caption *uint16, boxtype uint32) (ret int32, err error) [failretval==0] = user32.MessageBoxW //sys ExitWindowsEx(flags uint32, reason uint32) (err error) = user32.ExitWindowsEx @@ -307,6 +313,10 @@ func NewCallbackCDecl(fn interface{}) uintptr { //sys SetConsoleMode(console Handle, mode uint32) (err error) = kernel32.SetConsoleMode //sys GetConsoleScreenBufferInfo(console Handle, info *ConsoleScreenBufferInfo) (err error) = kernel32.GetConsoleScreenBufferInfo //sys setConsoleCursorPosition(console Handle, position uint32) (err error) = kernel32.SetConsoleCursorPosition +//sys GetConsoleCP() (cp uint32, err error) = kernel32.GetConsoleCP +//sys GetConsoleOutputCP() (cp uint32, err error) = kernel32.GetConsoleOutputCP +//sys SetConsoleCP(cp uint32) (err error) = kernel32.SetConsoleCP +//sys SetConsoleOutputCP(cp uint32) (err error) = kernel32.SetConsoleOutputCP //sys WriteConsole(console Handle, buf *uint16, towrite uint32, written *uint32, reserved *byte) (err error) = kernel32.WriteConsoleW //sys ReadConsole(console Handle, buf *uint16, toread uint32, read *uint32, inputControl *byte) (err error) = kernel32.ReadConsoleW //sys resizePseudoConsole(pconsole Handle, size uint32) (hr error) = kernel32.ResizePseudoConsole @@ -1368,9 +1378,11 @@ func SetsockoptLinger(fd Handle, level, opt int, l *Linger) (err error) { func SetsockoptInet4Addr(fd Handle, level, opt int, value [4]byte) (err error) { return Setsockopt(fd, int32(level), int32(opt), (*byte)(unsafe.Pointer(&value[0])), 4) } + func SetsockoptIPMreq(fd Handle, level, opt int, mreq *IPMreq) (err error) { return Setsockopt(fd, int32(level), int32(opt), (*byte)(unsafe.Pointer(mreq)), int32(unsafe.Sizeof(*mreq))) } + func SetsockoptIPv6Mreq(fd Handle, level, opt int, mreq *IPv6Mreq) (err error) { return syscall.EWINDOWS } diff --git a/vendor/golang.org/x/sys/windows/types_windows.go b/vendor/golang.org/x/sys/windows/types_windows.go index d8cb71d..7b97a15 100644 --- a/vendor/golang.org/x/sys/windows/types_windows.go +++ b/vendor/golang.org/x/sys/windows/types_windows.go @@ -1060,6 +1060,7 @@ const ( SIO_GET_EXTENSION_FUNCTION_POINTER = IOC_INOUT | IOC_WS2 | 6 SIO_KEEPALIVE_VALS = IOC_IN | IOC_VENDOR | 4 SIO_UDP_CONNRESET = IOC_IN | IOC_VENDOR | 12 + SIO_UDP_NETRESET = IOC_IN | IOC_VENDOR | 15 // cf. http://support.microsoft.com/default.aspx?scid=kb;en-us;257460 @@ -2003,7 +2004,21 @@ const ( MOVEFILE_FAIL_IF_NOT_TRACKABLE = 0x20 ) -const GAA_FLAG_INCLUDE_PREFIX = 0x00000010 +// Flags for GetAdaptersAddresses, see +// https://learn.microsoft.com/en-us/windows/win32/api/iphlpapi/nf-iphlpapi-getadaptersaddresses. +const ( + GAA_FLAG_SKIP_UNICAST = 0x1 + GAA_FLAG_SKIP_ANYCAST = 0x2 + GAA_FLAG_SKIP_MULTICAST = 0x4 + GAA_FLAG_SKIP_DNS_SERVER = 0x8 + GAA_FLAG_INCLUDE_PREFIX = 0x10 + GAA_FLAG_SKIP_FRIENDLY_NAME = 0x20 + GAA_FLAG_INCLUDE_WINS_INFO = 0x40 + GAA_FLAG_INCLUDE_GATEWAYS = 0x80 + GAA_FLAG_INCLUDE_ALL_INTERFACES = 0x100 + GAA_FLAG_INCLUDE_ALL_COMPARTMENTS = 0x200 + GAA_FLAG_INCLUDE_TUNNEL_BINDINGORDER = 0x400 +) const ( IF_TYPE_OTHER = 1 @@ -2017,6 +2032,50 @@ const ( IF_TYPE_IEEE1394 = 144 ) +// Enum NL_PREFIX_ORIGIN for [IpAdapterUnicastAddress], see +// https://learn.microsoft.com/en-us/windows/win32/api/nldef/ne-nldef-nl_prefix_origin +const ( + IpPrefixOriginOther = 0 + IpPrefixOriginManual = 1 + IpPrefixOriginWellKnown = 2 + IpPrefixOriginDhcp = 3 + IpPrefixOriginRouterAdvertisement = 4 + IpPrefixOriginUnchanged = 1 << 4 +) + +// Enum NL_SUFFIX_ORIGIN for [IpAdapterUnicastAddress], see +// https://learn.microsoft.com/en-us/windows/win32/api/nldef/ne-nldef-nl_suffix_origin +const ( + NlsoOther = 0 + NlsoManual = 1 + NlsoWellKnown = 2 + NlsoDhcp = 3 + NlsoLinkLayerAddress = 4 + NlsoRandom = 5 + IpSuffixOriginOther = 0 + IpSuffixOriginManual = 1 + IpSuffixOriginWellKnown = 2 + IpSuffixOriginDhcp = 3 + IpSuffixOriginLinkLayerAddress = 4 + IpSuffixOriginRandom = 5 + IpSuffixOriginUnchanged = 1 << 4 +) + +// Enum NL_DAD_STATE for [IpAdapterUnicastAddress], see +// https://learn.microsoft.com/en-us/windows/win32/api/nldef/ne-nldef-nl_dad_state +const ( + NldsInvalid = 0 + NldsTentative = 1 + NldsDuplicate = 2 + NldsDeprecated = 3 + NldsPreferred = 4 + IpDadStateInvalid = 0 + IpDadStateTentative = 1 + IpDadStateDuplicate = 2 + IpDadStateDeprecated = 3 + IpDadStatePreferred = 4 +) + type SocketAddress struct { Sockaddr *syscall.RawSockaddrAny SockaddrLength int32 @@ -3404,3 +3463,14 @@ type DCB struct { EvtChar byte wReserved1 uint16 } + +// Keyboard Layout Flags. +// See https://learn.microsoft.com/en-us/windows/win32/api/winuser/nf-winuser-loadkeyboardlayoutw +const ( + KLF_ACTIVATE = 0x00000001 + KLF_SUBSTITUTE_OK = 0x00000002 + KLF_REORDER = 0x00000008 + KLF_REPLACELANG = 0x00000010 + KLF_NOTELLSHELL = 0x00000080 + KLF_SETFORPROCESS = 0x00000100 +) diff --git a/vendor/golang.org/x/sys/windows/zsyscall_windows.go b/vendor/golang.org/x/sys/windows/zsyscall_windows.go index 5c6035d..4c2e1bd 100644 --- a/vendor/golang.org/x/sys/windows/zsyscall_windows.go +++ b/vendor/golang.org/x/sys/windows/zsyscall_windows.go @@ -91,6 +91,7 @@ var ( procEnumServicesStatusExW = modadvapi32.NewProc("EnumServicesStatusExW") procEqualSid = modadvapi32.NewProc("EqualSid") procFreeSid = modadvapi32.NewProc("FreeSid") + procGetAce = modadvapi32.NewProc("GetAce") procGetLengthSid = modadvapi32.NewProc("GetLengthSid") procGetNamedSecurityInfoW = modadvapi32.NewProc("GetNamedSecurityInfoW") procGetSecurityDescriptorControl = modadvapi32.NewProc("GetSecurityDescriptorControl") @@ -246,7 +247,9 @@ var ( procGetCommandLineW = modkernel32.NewProc("GetCommandLineW") procGetComputerNameExW = modkernel32.NewProc("GetComputerNameExW") procGetComputerNameW = modkernel32.NewProc("GetComputerNameW") + procGetConsoleCP = modkernel32.NewProc("GetConsoleCP") procGetConsoleMode = modkernel32.NewProc("GetConsoleMode") + procGetConsoleOutputCP = modkernel32.NewProc("GetConsoleOutputCP") procGetConsoleScreenBufferInfo = modkernel32.NewProc("GetConsoleScreenBufferInfo") procGetCurrentDirectoryW = modkernel32.NewProc("GetCurrentDirectoryW") procGetCurrentProcessId = modkernel32.NewProc("GetCurrentProcessId") @@ -346,8 +349,10 @@ var ( procSetCommMask = modkernel32.NewProc("SetCommMask") procSetCommState = modkernel32.NewProc("SetCommState") procSetCommTimeouts = modkernel32.NewProc("SetCommTimeouts") + procSetConsoleCP = modkernel32.NewProc("SetConsoleCP") procSetConsoleCursorPosition = modkernel32.NewProc("SetConsoleCursorPosition") procSetConsoleMode = modkernel32.NewProc("SetConsoleMode") + procSetConsoleOutputCP = modkernel32.NewProc("SetConsoleOutputCP") procSetCurrentDirectoryW = modkernel32.NewProc("SetCurrentDirectoryW") procSetDefaultDllDirectories = modkernel32.NewProc("SetDefaultDllDirectories") procSetDllDirectoryW = modkernel32.NewProc("SetDllDirectoryW") @@ -401,6 +406,7 @@ var ( procTransmitFile = modmswsock.NewProc("TransmitFile") procNetApiBufferFree = modnetapi32.NewProc("NetApiBufferFree") procNetGetJoinInformation = modnetapi32.NewProc("NetGetJoinInformation") + procNetUserEnum = modnetapi32.NewProc("NetUserEnum") procNetUserGetInfo = modnetapi32.NewProc("NetUserGetInfo") procNtCreateFile = modntdll.NewProc("NtCreateFile") procNtCreateNamedPipeFile = modntdll.NewProc("NtCreateNamedPipeFile") @@ -476,12 +482,16 @@ var ( procGetDesktopWindow = moduser32.NewProc("GetDesktopWindow") procGetForegroundWindow = moduser32.NewProc("GetForegroundWindow") procGetGUIThreadInfo = moduser32.NewProc("GetGUIThreadInfo") + procGetKeyboardLayout = moduser32.NewProc("GetKeyboardLayout") procGetShellWindow = moduser32.NewProc("GetShellWindow") procGetWindowThreadProcessId = moduser32.NewProc("GetWindowThreadProcessId") procIsWindow = moduser32.NewProc("IsWindow") procIsWindowUnicode = moduser32.NewProc("IsWindowUnicode") procIsWindowVisible = moduser32.NewProc("IsWindowVisible") + procLoadKeyboardLayoutW = moduser32.NewProc("LoadKeyboardLayoutW") procMessageBoxW = moduser32.NewProc("MessageBoxW") + procToUnicodeEx = moduser32.NewProc("ToUnicodeEx") + procUnloadKeyboardLayout = moduser32.NewProc("UnloadKeyboardLayout") procCreateEnvironmentBlock = moduserenv.NewProc("CreateEnvironmentBlock") procDestroyEnvironmentBlock = moduserenv.NewProc("DestroyEnvironmentBlock") procGetUserProfileDirectoryW = moduserenv.NewProc("GetUserProfileDirectoryW") @@ -787,6 +797,14 @@ func FreeSid(sid *SID) (err error) { return } +func GetAce(acl *ACL, aceIndex uint32, pAce **ACCESS_ALLOWED_ACE) (err error) { + r1, _, e1 := syscall.Syscall(procGetAce.Addr(), 3, uintptr(unsafe.Pointer(acl)), uintptr(aceIndex), uintptr(unsafe.Pointer(pAce))) + if r1 == 0 { + err = errnoErr(e1) + } + return +} + func GetLengthSid(sid *SID) (len uint32) { r0, _, _ := syscall.Syscall(procGetLengthSid.Addr(), 1, uintptr(unsafe.Pointer(sid)), 0, 0) len = uint32(r0) @@ -2148,6 +2166,15 @@ func GetComputerName(buf *uint16, n *uint32) (err error) { return } +func GetConsoleCP() (cp uint32, err error) { + r0, _, e1 := syscall.Syscall(procGetConsoleCP.Addr(), 0, 0, 0, 0) + cp = uint32(r0) + if cp == 0 { + err = errnoErr(e1) + } + return +} + func GetConsoleMode(console Handle, mode *uint32) (err error) { r1, _, e1 := syscall.Syscall(procGetConsoleMode.Addr(), 2, uintptr(console), uintptr(unsafe.Pointer(mode)), 0) if r1 == 0 { @@ -2156,6 +2183,15 @@ func GetConsoleMode(console Handle, mode *uint32) (err error) { return } +func GetConsoleOutputCP() (cp uint32, err error) { + r0, _, e1 := syscall.Syscall(procGetConsoleOutputCP.Addr(), 0, 0, 0, 0) + cp = uint32(r0) + if cp == 0 { + err = errnoErr(e1) + } + return +} + func GetConsoleScreenBufferInfo(console Handle, info *ConsoleScreenBufferInfo) (err error) { r1, _, e1 := syscall.Syscall(procGetConsoleScreenBufferInfo.Addr(), 2, uintptr(console), uintptr(unsafe.Pointer(info)), 0) if r1 == 0 { @@ -3024,6 +3060,14 @@ func SetCommTimeouts(handle Handle, timeouts *CommTimeouts) (err error) { return } +func SetConsoleCP(cp uint32) (err error) { + r1, _, e1 := syscall.Syscall(procSetConsoleCP.Addr(), 1, uintptr(cp), 0, 0) + if r1 == 0 { + err = errnoErr(e1) + } + return +} + func setConsoleCursorPosition(console Handle, position uint32) (err error) { r1, _, e1 := syscall.Syscall(procSetConsoleCursorPosition.Addr(), 2, uintptr(console), uintptr(position), 0) if r1 == 0 { @@ -3040,6 +3084,14 @@ func SetConsoleMode(console Handle, mode uint32) (err error) { return } +func SetConsoleOutputCP(cp uint32) (err error) { + r1, _, e1 := syscall.Syscall(procSetConsoleOutputCP.Addr(), 1, uintptr(cp), 0, 0) + if r1 == 0 { + err = errnoErr(e1) + } + return +} + func SetCurrentDirectory(path *uint16) (err error) { r1, _, e1 := syscall.Syscall(procSetCurrentDirectoryW.Addr(), 1, uintptr(unsafe.Pointer(path)), 0, 0) if r1 == 0 { @@ -3486,6 +3538,14 @@ func NetGetJoinInformation(server *uint16, name **uint16, bufType *uint32) (nete return } +func NetUserEnum(serverName *uint16, level uint32, filter uint32, buf **byte, prefMaxLen uint32, entriesRead *uint32, totalEntries *uint32, resumeHandle *uint32) (neterr error) { + r0, _, _ := syscall.Syscall9(procNetUserEnum.Addr(), 8, uintptr(unsafe.Pointer(serverName)), uintptr(level), uintptr(filter), uintptr(unsafe.Pointer(buf)), uintptr(prefMaxLen), uintptr(unsafe.Pointer(entriesRead)), uintptr(unsafe.Pointer(totalEntries)), uintptr(unsafe.Pointer(resumeHandle)), 0) + if r0 != 0 { + neterr = syscall.Errno(r0) + } + return +} + func NetUserGetInfo(serverName *uint16, userName *uint16, level uint32, buf **byte) (neterr error) { r0, _, _ := syscall.Syscall6(procNetUserGetInfo.Addr(), 4, uintptr(unsafe.Pointer(serverName)), uintptr(unsafe.Pointer(userName)), uintptr(level), uintptr(unsafe.Pointer(buf)), 0, 0) if r0 != 0 { @@ -4064,6 +4124,12 @@ func GetGUIThreadInfo(thread uint32, info *GUIThreadInfo) (err error) { return } +func GetKeyboardLayout(tid uint32) (hkl Handle) { + r0, _, _ := syscall.Syscall(procGetKeyboardLayout.Addr(), 1, uintptr(tid), 0, 0) + hkl = Handle(r0) + return +} + func GetShellWindow() (shellWindow HWND) { r0, _, _ := syscall.Syscall(procGetShellWindow.Addr(), 0, 0, 0, 0) shellWindow = HWND(r0) @@ -4097,6 +4163,15 @@ func IsWindowVisible(hwnd HWND) (isVisible bool) { return } +func LoadKeyboardLayout(name *uint16, flags uint32) (hkl Handle, err error) { + r0, _, e1 := syscall.Syscall(procLoadKeyboardLayoutW.Addr(), 2, uintptr(unsafe.Pointer(name)), uintptr(flags), 0) + hkl = Handle(r0) + if hkl == 0 { + err = errnoErr(e1) + } + return +} + func MessageBox(hwnd HWND, text *uint16, caption *uint16, boxtype uint32) (ret int32, err error) { r0, _, e1 := syscall.Syscall6(procMessageBoxW.Addr(), 4, uintptr(hwnd), uintptr(unsafe.Pointer(text)), uintptr(unsafe.Pointer(caption)), uintptr(boxtype), 0, 0) ret = int32(r0) @@ -4106,6 +4181,20 @@ func MessageBox(hwnd HWND, text *uint16, caption *uint16, boxtype uint32) (ret i return } +func ToUnicodeEx(vkey uint32, scancode uint32, keystate *byte, pwszBuff *uint16, cchBuff int32, flags uint32, hkl Handle) (ret int32) { + r0, _, _ := syscall.Syscall9(procToUnicodeEx.Addr(), 7, uintptr(vkey), uintptr(scancode), uintptr(unsafe.Pointer(keystate)), uintptr(unsafe.Pointer(pwszBuff)), uintptr(cchBuff), uintptr(flags), uintptr(hkl), 0, 0) + ret = int32(r0) + return +} + +func UnloadKeyboardLayout(hkl Handle) (err error) { + r1, _, e1 := syscall.Syscall(procUnloadKeyboardLayout.Addr(), 1, uintptr(hkl), 0, 0) + if r1 == 0 { + err = errnoErr(e1) + } + return +} + func CreateEnvironmentBlock(block **uint16, token Token, inheritExisting bool) (err error) { var _p0 uint32 if inheritExisting { diff --git a/vendor/golang.org/x/term/LICENSE b/vendor/golang.org/x/term/LICENSE index 6a66aea..2a7cf70 100644 --- a/vendor/golang.org/x/term/LICENSE +++ b/vendor/golang.org/x/term/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. +Copyright 2009 The Go Authors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer. copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Google Inc. nor the names of its + * Neither the name of Google LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/vendor/golang.org/x/term/term_windows.go b/vendor/golang.org/x/term/term_windows.go index 465f560..df6bf94 100644 --- a/vendor/golang.org/x/term/term_windows.go +++ b/vendor/golang.org/x/term/term_windows.go @@ -26,6 +26,7 @@ func makeRaw(fd int) (*State, error) { return nil, err } raw := st &^ (windows.ENABLE_ECHO_INPUT | windows.ENABLE_PROCESSED_INPUT | windows.ENABLE_LINE_INPUT | windows.ENABLE_PROCESSED_OUTPUT) + raw |= windows.ENABLE_VIRTUAL_TERMINAL_INPUT if err := windows.SetConsoleMode(windows.Handle(fd), raw); err != nil { return nil, err } diff --git a/vendor/golang.org/x/text/LICENSE b/vendor/golang.org/x/text/LICENSE index 6a66aea..2a7cf70 100644 --- a/vendor/golang.org/x/text/LICENSE +++ b/vendor/golang.org/x/text/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. +Copyright 2009 The Go Authors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer. copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Google Inc. nor the names of its + * Neither the name of Google LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/vendor/golang.org/x/tools/LICENSE b/vendor/golang.org/x/tools/LICENSE index 6a66aea..2a7cf70 100644 --- a/vendor/golang.org/x/tools/LICENSE +++ b/vendor/golang.org/x/tools/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. +Copyright 2009 The Go Authors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer. copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Google Inc. nor the names of its + * Neither the name of Google LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/vendor/golang.org/x/tools/cover/profile.go b/vendor/golang.org/x/tools/cover/profile.go new file mode 100644 index 0000000..47a9a54 --- /dev/null +++ b/vendor/golang.org/x/tools/cover/profile.go @@ -0,0 +1,266 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package cover provides support for parsing coverage profiles +// generated by "go test -coverprofile=cover.out". +package cover // import "golang.org/x/tools/cover" + +import ( + "bufio" + "errors" + "fmt" + "io" + "math" + "os" + "sort" + "strconv" + "strings" +) + +// Profile represents the profiling data for a specific file. +type Profile struct { + FileName string + Mode string + Blocks []ProfileBlock +} + +// ProfileBlock represents a single block of profiling data. +type ProfileBlock struct { + StartLine, StartCol int + EndLine, EndCol int + NumStmt, Count int +} + +type byFileName []*Profile + +func (p byFileName) Len() int { return len(p) } +func (p byFileName) Less(i, j int) bool { return p[i].FileName < p[j].FileName } +func (p byFileName) Swap(i, j int) { p[i], p[j] = p[j], p[i] } + +// ParseProfiles parses profile data in the specified file and returns a +// Profile for each source file described therein. +func ParseProfiles(fileName string) ([]*Profile, error) { + pf, err := os.Open(fileName) + if err != nil { + return nil, err + } + defer pf.Close() + return ParseProfilesFromReader(pf) +} + +// ParseProfilesFromReader parses profile data from the Reader and +// returns a Profile for each source file described therein. +func ParseProfilesFromReader(rd io.Reader) ([]*Profile, error) { + // First line is "mode: foo", where foo is "set", "count", or "atomic". + // Rest of file is in the format + // encoding/base64/base64.go:34.44,37.40 3 1 + // where the fields are: name.go:line.column,line.column numberOfStatements count + files := make(map[string]*Profile) + s := bufio.NewScanner(rd) + mode := "" + for s.Scan() { + line := s.Text() + if mode == "" { + const p = "mode: " + if !strings.HasPrefix(line, p) || line == p { + return nil, fmt.Errorf("bad mode line: %v", line) + } + mode = line[len(p):] + continue + } + fn, b, err := parseLine(line) + if err != nil { + return nil, fmt.Errorf("line %q doesn't match expected format: %v", line, err) + } + p := files[fn] + if p == nil { + p = &Profile{ + FileName: fn, + Mode: mode, + } + files[fn] = p + } + p.Blocks = append(p.Blocks, b) + } + if err := s.Err(); err != nil { + return nil, err + } + for _, p := range files { + sort.Sort(blocksByStart(p.Blocks)) + // Merge samples from the same location. + j := 1 + for i := 1; i < len(p.Blocks); i++ { + b := p.Blocks[i] + last := p.Blocks[j-1] + if b.StartLine == last.StartLine && + b.StartCol == last.StartCol && + b.EndLine == last.EndLine && + b.EndCol == last.EndCol { + if b.NumStmt != last.NumStmt { + return nil, fmt.Errorf("inconsistent NumStmt: changed from %d to %d", last.NumStmt, b.NumStmt) + } + if mode == "set" { + p.Blocks[j-1].Count |= b.Count + } else { + p.Blocks[j-1].Count += b.Count + } + continue + } + p.Blocks[j] = b + j++ + } + p.Blocks = p.Blocks[:j] + } + // Generate a sorted slice. + profiles := make([]*Profile, 0, len(files)) + for _, profile := range files { + profiles = append(profiles, profile) + } + sort.Sort(byFileName(profiles)) + return profiles, nil +} + +// parseLine parses a line from a coverage file. +// It is equivalent to the regex +// ^(.+):([0-9]+)\.([0-9]+),([0-9]+)\.([0-9]+) ([0-9]+) ([0-9]+)$ +// +// However, it is much faster: https://golang.org/cl/179377 +func parseLine(l string) (fileName string, block ProfileBlock, err error) { + end := len(l) + + b := ProfileBlock{} + b.Count, end, err = seekBack(l, ' ', end, "Count") + if err != nil { + return "", b, err + } + b.NumStmt, end, err = seekBack(l, ' ', end, "NumStmt") + if err != nil { + return "", b, err + } + b.EndCol, end, err = seekBack(l, '.', end, "EndCol") + if err != nil { + return "", b, err + } + b.EndLine, end, err = seekBack(l, ',', end, "EndLine") + if err != nil { + return "", b, err + } + b.StartCol, end, err = seekBack(l, '.', end, "StartCol") + if err != nil { + return "", b, err + } + b.StartLine, end, err = seekBack(l, ':', end, "StartLine") + if err != nil { + return "", b, err + } + fn := l[0:end] + if fn == "" { + return "", b, errors.New("a FileName cannot be blank") + } + return fn, b, nil +} + +// seekBack searches backwards from end to find sep in l, then returns the +// value between sep and end as an integer. +// If seekBack fails, the returned error will reference what. +func seekBack(l string, sep byte, end int, what string) (value int, nextSep int, err error) { + // Since we're seeking backwards and we know only ASCII is legal for these values, + // we can ignore the possibility of non-ASCII characters. + for start := end - 1; start >= 0; start-- { + if l[start] == sep { + i, err := strconv.Atoi(l[start+1 : end]) + if err != nil { + return 0, 0, fmt.Errorf("couldn't parse %q: %v", what, err) + } + if i < 0 { + return 0, 0, fmt.Errorf("negative values are not allowed for %s, found %d", what, i) + } + return i, start, nil + } + } + return 0, 0, fmt.Errorf("couldn't find a %s before %s", string(sep), what) +} + +type blocksByStart []ProfileBlock + +func (b blocksByStart) Len() int { return len(b) } +func (b blocksByStart) Swap(i, j int) { b[i], b[j] = b[j], b[i] } +func (b blocksByStart) Less(i, j int) bool { + bi, bj := b[i], b[j] + return bi.StartLine < bj.StartLine || bi.StartLine == bj.StartLine && bi.StartCol < bj.StartCol +} + +// Boundary represents the position in a source file of the beginning or end of a +// block as reported by the coverage profile. In HTML mode, it will correspond to +// the opening or closing of a tag and will be used to colorize the source +type Boundary struct { + Offset int // Location as a byte offset in the source file. + Start bool // Is this the start of a block? + Count int // Event count from the cover profile. + Norm float64 // Count normalized to [0..1]. + Index int // Order in input file. +} + +// Boundaries returns a Profile as a set of Boundary objects within the provided src. +func (p *Profile) Boundaries(src []byte) (boundaries []Boundary) { + // Find maximum count. + max := 0 + for _, b := range p.Blocks { + if b.Count > max { + max = b.Count + } + } + // Divisor for normalization. + divisor := math.Log(float64(max)) + + // boundary returns a Boundary, populating the Norm field with a normalized Count. + index := 0 + boundary := func(offset int, start bool, count int) Boundary { + b := Boundary{Offset: offset, Start: start, Count: count, Index: index} + index++ + if !start || count == 0 { + return b + } + if max <= 1 { + b.Norm = 0.8 // Profile is in"set" mode; we want a heat map. Use cov8 in the CSS. + } else if count > 0 { + b.Norm = math.Log(float64(count)) / divisor + } + return b + } + + line, col := 1, 2 // TODO: Why is this 2? + for si, bi := 0, 0; si < len(src) && bi < len(p.Blocks); { + b := p.Blocks[bi] + if b.StartLine == line && b.StartCol == col { + boundaries = append(boundaries, boundary(si, true, b.Count)) + } + if b.EndLine == line && b.EndCol == col || line > b.EndLine { + boundaries = append(boundaries, boundary(si, false, 0)) + bi++ + continue // Don't advance through src; maybe the next block starts here. + } + if src[si] == '\n' { + line++ + col = 0 + } + col++ + si++ + } + sort.Sort(boundariesByPos(boundaries)) + return +} + +type boundariesByPos []Boundary + +func (b boundariesByPos) Len() int { return len(b) } +func (b boundariesByPos) Swap(i, j int) { b[i], b[j] = b[j], b[i] } +func (b boundariesByPos) Less(i, j int) bool { + if b[i].Offset == b[j].Offset { + // Boundaries at the same offset should be ordered according to + // their original position. + return b[i].Index < b[j].Index + } + return b[i].Offset < b[j].Offset +} diff --git a/vendor/golang.org/x/tools/go/ast/astutil/enclosing.go b/vendor/golang.org/x/tools/go/ast/astutil/enclosing.go index 9fa5aa1..6e34df4 100644 --- a/vendor/golang.org/x/tools/go/ast/astutil/enclosing.go +++ b/vendor/golang.org/x/tools/go/ast/astutil/enclosing.go @@ -11,8 +11,6 @@ import ( "go/ast" "go/token" "sort" - - "golang.org/x/tools/internal/typeparams" ) // PathEnclosingInterval returns the node that encloses the source @@ -108,8 +106,21 @@ func PathEnclosingInterval(root *ast.File, start, end token.Pos) (path []ast.Nod // Does augmented child strictly contain [start, end)? if augPos <= start && end <= augEnd { - _, isToken := child.(tokenNode) - return isToken || visit(child) + if is[tokenNode](child) { + return true + } + + // childrenOf elides the FuncType node beneath FuncDecl. + // Add it back here for TypeParams, Params, Results, + // all FieldLists). But we don't add it back for the "func" token + // even though it is is the tree at FuncDecl.Type.Func. + if decl, ok := node.(*ast.FuncDecl); ok { + if fields, ok := child.(*ast.FieldList); ok && fields != decl.Recv { + path = append(path, decl.Type) + } + } + + return visit(child) } // Does [start, end) overlap multiple children? @@ -315,6 +326,8 @@ func childrenOf(n ast.Node) []ast.Node { // // As a workaround, we inline the case for FuncType // here and order things correctly. + // We also need to insert the elided FuncType just + // before the 'visit' recursion. // children = nil // discard ast.Walk(FuncDecl) info subtrees children = append(children, tok(n.Type.Func, len("func"))) @@ -322,7 +335,7 @@ func childrenOf(n ast.Node) []ast.Node { children = append(children, n.Recv) } children = append(children, n.Name) - if tparams := typeparams.ForFuncType(n.Type); tparams != nil { + if tparams := n.Type.TypeParams; tparams != nil { children = append(children, tparams) } if n.Type.Params != nil { @@ -377,7 +390,7 @@ func childrenOf(n ast.Node) []ast.Node { tok(n.Lbrack, len("[")), tok(n.Rbrack, len("]"))) - case *typeparams.IndexListExpr: + case *ast.IndexListExpr: children = append(children, tok(n.Lbrack, len("[")), tok(n.Rbrack, len("]"))) @@ -588,7 +601,7 @@ func NodeDescription(n ast.Node) string { return "decrement statement" case *ast.IndexExpr: return "index expression" - case *typeparams.IndexListExpr: + case *ast.IndexListExpr: return "index list expression" case *ast.InterfaceType: return "interface type" @@ -634,3 +647,8 @@ func NodeDescription(n ast.Node) string { } panic(fmt.Sprintf("unexpected node type: %T", n)) } + +func is[T any](x any) bool { + _, ok := x.(T) + return ok +} diff --git a/vendor/golang.org/x/tools/go/ast/astutil/rewrite.go b/vendor/golang.org/x/tools/go/ast/astutil/rewrite.go index f430b21..58934f7 100644 --- a/vendor/golang.org/x/tools/go/ast/astutil/rewrite.go +++ b/vendor/golang.org/x/tools/go/ast/astutil/rewrite.go @@ -9,8 +9,6 @@ import ( "go/ast" "reflect" "sort" - - "golang.org/x/tools/internal/typeparams" ) // An ApplyFunc is invoked by Apply for each node n, even if n is nil, @@ -252,7 +250,7 @@ func (a *application) apply(parent ast.Node, name string, iter *iterator, n ast. a.apply(n, "X", nil, n.X) a.apply(n, "Index", nil, n.Index) - case *typeparams.IndexListExpr: + case *ast.IndexListExpr: a.apply(n, "X", nil, n.X) a.applyList(n, "Indices") @@ -293,7 +291,7 @@ func (a *application) apply(parent ast.Node, name string, iter *iterator, n ast. a.apply(n, "Fields", nil, n.Fields) case *ast.FuncType: - if tparams := typeparams.ForFuncType(n); tparams != nil { + if tparams := n.TypeParams; tparams != nil { a.apply(n, "TypeParams", nil, tparams) } a.apply(n, "Params", nil, n.Params) @@ -408,7 +406,7 @@ func (a *application) apply(parent ast.Node, name string, iter *iterator, n ast. case *ast.TypeSpec: a.apply(n, "Doc", nil, n.Doc) a.apply(n, "Name", nil, n.Name) - if tparams := typeparams.ForTypeSpec(n); tparams != nil { + if tparams := n.TypeParams; tparams != nil { a.apply(n, "TypeParams", nil, tparams) } a.apply(n, "Type", nil, n.Type) diff --git a/vendor/golang.org/x/tools/go/ast/astutil/util.go b/vendor/golang.org/x/tools/go/ast/astutil/util.go index 919d530..ca71e3e 100644 --- a/vendor/golang.org/x/tools/go/ast/astutil/util.go +++ b/vendor/golang.org/x/tools/go/ast/astutil/util.go @@ -7,12 +7,5 @@ package astutil import "go/ast" // Unparen returns e with any enclosing parentheses stripped. -func Unparen(e ast.Expr) ast.Expr { - for { - p, ok := e.(*ast.ParenExpr) - if !ok { - return e - } - e = p.X - } -} +// Deprecated: use [ast.Unparen]. +func Unparen(e ast.Expr) ast.Expr { return ast.Unparen(e) } diff --git a/vendor/golang.org/x/tools/go/ast/inspector/inspector.go b/vendor/golang.org/x/tools/go/ast/inspector/inspector.go index 1fc1de0..0e0ba4c 100644 --- a/vendor/golang.org/x/tools/go/ast/inspector/inspector.go +++ b/vendor/golang.org/x/tools/go/ast/inspector/inspector.go @@ -73,6 +73,15 @@ func (in *Inspector) Preorder(types []ast.Node, f func(ast.Node)) { // check, Preorder is almost twice as fast as Nodes. The two // features seem to contribute similar slowdowns (~1.4x each). + // This function is equivalent to the PreorderSeq call below, + // but to avoid the additional dynamic call (which adds 13-35% + // to the benchmarks), we expand it out. + // + // in.PreorderSeq(types...)(func(n ast.Node) bool { + // f(n) + // return true + // }) + mask := maskOf(types) for i := 0; i < len(in.events); { ev := in.events[i] diff --git a/vendor/golang.org/x/tools/go/ast/inspector/iter.go b/vendor/golang.org/x/tools/go/ast/inspector/iter.go new file mode 100644 index 0000000..b7e9591 --- /dev/null +++ b/vendor/golang.org/x/tools/go/ast/inspector/iter.go @@ -0,0 +1,85 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build go1.23 + +package inspector + +import ( + "go/ast" + "iter" +) + +// PreorderSeq returns an iterator that visits all the +// nodes of the files supplied to New in depth-first order. +// It visits each node n before n's children. +// The complete traversal sequence is determined by ast.Inspect. +// +// The types argument, if non-empty, enables type-based +// filtering of events: only nodes whose type matches an +// element of the types slice are included in the sequence. +func (in *Inspector) PreorderSeq(types ...ast.Node) iter.Seq[ast.Node] { + + // This implementation is identical to Preorder, + // except that it supports breaking out of the loop. + + return func(yield func(ast.Node) bool) { + mask := maskOf(types) + for i := 0; i < len(in.events); { + ev := in.events[i] + if ev.index > i { + // push + if ev.typ&mask != 0 { + if !yield(ev.node) { + break + } + } + pop := ev.index + if in.events[pop].typ&mask == 0 { + // Subtrees do not contain types: skip them and pop. + i = pop + 1 + continue + } + } + i++ + } + } +} + +// All[N] returns an iterator over all the nodes of type N. +// N must be a pointer-to-struct type that implements ast.Node. +// +// Example: +// +// for call := range All[*ast.CallExpr](in) { ... } +func All[N interface { + *S + ast.Node +}, S any](in *Inspector) iter.Seq[N] { + + // To avoid additional dynamic call overheads, + // we duplicate rather than call the logic of PreorderSeq. + + mask := typeOf((N)(nil)) + return func(yield func(N) bool) { + for i := 0; i < len(in.events); { + ev := in.events[i] + if ev.index > i { + // push + if ev.typ&mask != 0 { + if !yield(ev.node.(N)) { + break + } + } + pop := ev.index + if in.events[pop].typ&mask == 0 { + // Subtrees do not contain types: skip them and pop. + i = pop + 1 + continue + } + } + i++ + } + } +} diff --git a/vendor/golang.org/x/tools/go/ast/inspector/typeof.go b/vendor/golang.org/x/tools/go/ast/inspector/typeof.go index 703c813..2a872f8 100644 --- a/vendor/golang.org/x/tools/go/ast/inspector/typeof.go +++ b/vendor/golang.org/x/tools/go/ast/inspector/typeof.go @@ -12,8 +12,6 @@ package inspector import ( "go/ast" "math" - - "golang.org/x/tools/internal/typeparams" ) const ( @@ -171,7 +169,7 @@ func typeOf(n ast.Node) uint64 { return 1 << nIncDecStmt case *ast.IndexExpr: return 1 << nIndexExpr - case *typeparams.IndexListExpr: + case *ast.IndexListExpr: return 1 << nIndexListExpr case *ast.InterfaceType: return 1 << nInterfaceType diff --git a/vendor/golang.org/x/tools/go/gcexportdata/gcexportdata.go b/vendor/golang.org/x/tools/go/gcexportdata/gcexportdata.go new file mode 100644 index 0000000..137cc8d --- /dev/null +++ b/vendor/golang.org/x/tools/go/gcexportdata/gcexportdata.go @@ -0,0 +1,186 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package gcexportdata provides functions for locating, reading, and +// writing export data files containing type information produced by the +// gc compiler. This package supports go1.7 export data format and all +// later versions. +// +// Although it might seem convenient for this package to live alongside +// go/types in the standard library, this would cause version skew +// problems for developer tools that use it, since they must be able to +// consume the outputs of the gc compiler both before and after a Go +// update such as from Go 1.7 to Go 1.8. Because this package lives in +// golang.org/x/tools, sites can update their version of this repo some +// time before the Go 1.8 release and rebuild and redeploy their +// developer tools, which will then be able to consume both Go 1.7 and +// Go 1.8 export data files, so they will work before and after the +// Go update. (See discussion at https://golang.org/issue/15651.) +package gcexportdata // import "golang.org/x/tools/go/gcexportdata" + +import ( + "bufio" + "bytes" + "encoding/json" + "fmt" + "go/token" + "go/types" + "io" + "os/exec" + + "golang.org/x/tools/internal/gcimporter" +) + +// Find returns the name of an object (.o) or archive (.a) file +// containing type information for the specified import path, +// using the go command. +// If no file was found, an empty filename is returned. +// +// A relative srcDir is interpreted relative to the current working directory. +// +// Find also returns the package's resolved (canonical) import path, +// reflecting the effects of srcDir and vendoring on importPath. +// +// Deprecated: Use the higher-level API in golang.org/x/tools/go/packages, +// which is more efficient. +func Find(importPath, srcDir string) (filename, path string) { + cmd := exec.Command("go", "list", "-json", "-export", "--", importPath) + cmd.Dir = srcDir + out, err := cmd.Output() + if err != nil { + return "", "" + } + var data struct { + ImportPath string + Export string + } + json.Unmarshal(out, &data) + return data.Export, data.ImportPath +} + +// NewReader returns a reader for the export data section of an object +// (.o) or archive (.a) file read from r. The new reader may provide +// additional trailing data beyond the end of the export data. +func NewReader(r io.Reader) (io.Reader, error) { + buf := bufio.NewReader(r) + _, size, err := gcimporter.FindExportData(buf) + if err != nil { + return nil, err + } + + if size >= 0 { + // We were given an archive and found the __.PKGDEF in it. + // This tells us the size of the export data, and we don't + // need to return the entire file. + return &io.LimitedReader{ + R: buf, + N: size, + }, nil + } else { + // We were given an object file. As such, we don't know how large + // the export data is and must return the entire file. + return buf, nil + } +} + +// readAll works the same way as io.ReadAll, but avoids allocations and copies +// by preallocating a byte slice of the necessary size if the size is known up +// front. This is always possible when the input is an archive. In that case, +// NewReader will return the known size using an io.LimitedReader. +func readAll(r io.Reader) ([]byte, error) { + if lr, ok := r.(*io.LimitedReader); ok { + data := make([]byte, lr.N) + _, err := io.ReadFull(lr, data) + return data, err + } + return io.ReadAll(r) +} + +// Read reads export data from in, decodes it, and returns type +// information for the package. +// +// The package path (effectively its linker symbol prefix) is +// specified by path, since unlike the package name, this information +// may not be recorded in the export data. +// +// File position information is added to fset. +// +// Read may inspect and add to the imports map to ensure that references +// within the export data to other packages are consistent. The caller +// must ensure that imports[path] does not exist, or exists but is +// incomplete (see types.Package.Complete), and Read inserts the +// resulting package into this map entry. +// +// On return, the state of the reader is undefined. +func Read(in io.Reader, fset *token.FileSet, imports map[string]*types.Package, path string) (*types.Package, error) { + data, err := readAll(in) + if err != nil { + return nil, fmt.Errorf("reading export data for %q: %v", path, err) + } + + if bytes.HasPrefix(data, []byte("!")) { + return nil, fmt.Errorf("can't read export data for %q directly from an archive file (call gcexportdata.NewReader first to extract export data)", path) + } + + // The indexed export format starts with an 'i'; the older + // binary export format starts with a 'c', 'd', or 'v' + // (from "version"). Select appropriate importer. + if len(data) > 0 { + switch data[0] { + case 'v', 'c', 'd': // binary, till go1.10 + return nil, fmt.Errorf("binary (%c) import format is no longer supported", data[0]) + + case 'i': // indexed, till go1.19 + _, pkg, err := gcimporter.IImportData(fset, imports, data[1:], path) + return pkg, err + + case 'u': // unified, from go1.20 + _, pkg, err := gcimporter.UImportData(fset, imports, data[1:], path) + return pkg, err + + default: + l := len(data) + if l > 10 { + l = 10 + } + return nil, fmt.Errorf("unexpected export data with prefix %q for path %s", string(data[:l]), path) + } + } + return nil, fmt.Errorf("empty export data for %s", path) +} + +// Write writes encoded type information for the specified package to out. +// The FileSet provides file position information for named objects. +func Write(out io.Writer, fset *token.FileSet, pkg *types.Package) error { + if _, err := io.WriteString(out, "i"); err != nil { + return err + } + return gcimporter.IExportData(out, fset, pkg) +} + +// ReadBundle reads an export bundle from in, decodes it, and returns type +// information for the packages. +// File position information is added to fset. +// +// ReadBundle may inspect and add to the imports map to ensure that references +// within the export bundle to other packages are consistent. +// +// On return, the state of the reader is undefined. +// +// Experimental: This API is experimental and may change in the future. +func ReadBundle(in io.Reader, fset *token.FileSet, imports map[string]*types.Package) ([]*types.Package, error) { + data, err := readAll(in) + if err != nil { + return nil, fmt.Errorf("reading export bundle: %v", err) + } + return gcimporter.IImportBundle(fset, imports, data) +} + +// WriteBundle writes encoded type information for the specified packages to out. +// The FileSet provides file position information for named objects. +// +// Experimental: This API is experimental and may change in the future. +func WriteBundle(out io.Writer, fset *token.FileSet, pkgs []*types.Package) error { + return gcimporter.IExportBundle(out, fset, pkgs) +} diff --git a/vendor/golang.org/x/tools/go/gcexportdata/importer.go b/vendor/golang.org/x/tools/go/gcexportdata/importer.go new file mode 100644 index 0000000..37a7247 --- /dev/null +++ b/vendor/golang.org/x/tools/go/gcexportdata/importer.go @@ -0,0 +1,75 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gcexportdata + +import ( + "fmt" + "go/token" + "go/types" + "os" +) + +// NewImporter returns a new instance of the types.Importer interface +// that reads type information from export data files written by gc. +// The Importer also satisfies types.ImporterFrom. +// +// Export data files are located using "go build" workspace conventions +// and the build.Default context. +// +// Use this importer instead of go/importer.For("gc", ...) to avoid the +// version-skew problems described in the documentation of this package, +// or to control the FileSet or access the imports map populated during +// package loading. +// +// Deprecated: Use the higher-level API in golang.org/x/tools/go/packages, +// which is more efficient. +func NewImporter(fset *token.FileSet, imports map[string]*types.Package) types.ImporterFrom { + return importer{fset, imports} +} + +type importer struct { + fset *token.FileSet + imports map[string]*types.Package +} + +func (imp importer) Import(importPath string) (*types.Package, error) { + return imp.ImportFrom(importPath, "", 0) +} + +func (imp importer) ImportFrom(importPath, srcDir string, mode types.ImportMode) (_ *types.Package, err error) { + filename, path := Find(importPath, srcDir) + if filename == "" { + if importPath == "unsafe" { + // Even for unsafe, call Find first in case + // the package was vendored. + return types.Unsafe, nil + } + return nil, fmt.Errorf("can't find import: %s", importPath) + } + + if pkg, ok := imp.imports[path]; ok && pkg.Complete() { + return pkg, nil // cache hit + } + + // open file + f, err := os.Open(filename) + if err != nil { + return nil, err + } + defer func() { + f.Close() + if err != nil { + // add file name to error + err = fmt.Errorf("reading export data: %s: %v", filename, err) + } + }() + + r, err := NewReader(f) + if err != nil { + return nil, err + } + + return Read(r, imp.fset, imp.imports, path) +} diff --git a/vendor/golang.org/x/tools/go/packages/doc.go b/vendor/golang.org/x/tools/go/packages/doc.go new file mode 100644 index 0000000..f1931d1 --- /dev/null +++ b/vendor/golang.org/x/tools/go/packages/doc.go @@ -0,0 +1,251 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* +Package packages loads Go packages for inspection and analysis. + +The [Load] function takes as input a list of patterns and returns a +list of [Package] values describing individual packages matched by those +patterns. +A [Config] specifies configuration options, the most important of which is +the [LoadMode], which controls the amount of detail in the loaded packages. + +Load passes most patterns directly to the underlying build tool. +The default build tool is the go command. +Its supported patterns are described at +https://pkg.go.dev/cmd/go#hdr-Package_lists_and_patterns. +Other build systems may be supported by providing a "driver"; +see [The driver protocol]. + +All patterns with the prefix "query=", where query is a +non-empty string of letters from [a-z], are reserved and may be +interpreted as query operators. + +Two query operators are currently supported: "file" and "pattern". + +The query "file=path/to/file.go" matches the package or packages enclosing +the Go source file path/to/file.go. For example "file=~/go/src/fmt/print.go" +might return the packages "fmt" and "fmt [fmt.test]". + +The query "pattern=string" causes "string" to be passed directly to +the underlying build tool. In most cases this is unnecessary, +but an application can use Load("pattern=" + x) as an escaping mechanism +to ensure that x is not interpreted as a query operator if it contains '='. + +All other query operators are reserved for future use and currently +cause Load to report an error. + +The Package struct provides basic information about the package, including + + - ID, a unique identifier for the package in the returned set; + - GoFiles, the names of the package's Go source files; + - Imports, a map from source import strings to the Packages they name; + - Types, the type information for the package's exported symbols; + - Syntax, the parsed syntax trees for the package's source code; and + - TypesInfo, the result of a complete type-check of the package syntax trees. + +(See the documentation for type Package for the complete list of fields +and more detailed descriptions.) + +For example, + + Load(nil, "bytes", "unicode...") + +returns four Package structs describing the standard library packages +bytes, unicode, unicode/utf16, and unicode/utf8. Note that one pattern +can match multiple packages and that a package might be matched by +multiple patterns: in general it is not possible to determine which +packages correspond to which patterns. + +Note that the list returned by Load contains only the packages matched +by the patterns. Their dependencies can be found by walking the import +graph using the Imports fields. + +The Load function can be configured by passing a pointer to a Config as +the first argument. A nil Config is equivalent to the zero Config, which +causes Load to run in [LoadFiles] mode, collecting minimal information. +See the documentation for type Config for details. + +As noted earlier, the Config.Mode controls the amount of detail +reported about the loaded packages. See the documentation for type LoadMode +for details. + +Most tools should pass their command-line arguments (after any flags) +uninterpreted to Load, so that it can interpret them +according to the conventions of the underlying build system. + +See the Example function for typical usage. + +# The driver protocol + +Load may be used to load Go packages even in Go projects that use +alternative build systems, by installing an appropriate "driver" +program for the build system and specifying its location in the +GOPACKAGESDRIVER environment variable. +For example, +https://github.com/bazelbuild/rules_go/wiki/Editor-and-tool-integration +explains how to use the driver for Bazel. + +The driver program is responsible for interpreting patterns in its +preferred notation and reporting information about the packages that +those patterns identify. Drivers must also support the special "file=" +and "pattern=" patterns described above. + +The patterns are provided as positional command-line arguments. A +JSON-encoded [DriverRequest] message providing additional information +is written to the driver's standard input. The driver must write a +JSON-encoded [DriverResponse] message to its standard output. (This +message differs from the JSON schema produced by 'go list'.) + +The value of the PWD environment variable seen by the driver process +is the preferred name of its working directory. (The working directory +may have other aliases due to symbolic links; see the comment on the +Dir field of [exec.Cmd] for related information.) +When the driver process emits in its response the name of a file +that is a descendant of this directory, it must use an absolute path +that has the value of PWD as a prefix, to ensure that the returned +filenames satisfy the original query. +*/ +package packages // import "golang.org/x/tools/go/packages" + +/* + +Motivation and design considerations + +The new package's design solves problems addressed by two existing +packages: go/build, which locates and describes packages, and +golang.org/x/tools/go/loader, which loads, parses and type-checks them. +The go/build.Package structure encodes too much of the 'go build' way +of organizing projects, leaving us in need of a data type that describes a +package of Go source code independent of the underlying build system. +We wanted something that works equally well with go build and vgo, and +also other build systems such as Bazel and Blaze, making it possible to +construct analysis tools that work in all these environments. +Tools such as errcheck and staticcheck were essentially unavailable to +the Go community at Google, and some of Google's internal tools for Go +are unavailable externally. +This new package provides a uniform way to obtain package metadata by +querying each of these build systems, optionally supporting their +preferred command-line notations for packages, so that tools integrate +neatly with users' build environments. The Metadata query function +executes an external query tool appropriate to the current workspace. + +Loading packages always returns the complete import graph "all the way down", +even if all you want is information about a single package, because the query +mechanisms of all the build systems we currently support ({go,vgo} list, and +blaze/bazel aspect-based query) cannot provide detailed information +about one package without visiting all its dependencies too, so there is +no additional asymptotic cost to providing transitive information. +(This property might not be true of a hypothetical 5th build system.) + +In calls to TypeCheck, all initial packages, and any package that +transitively depends on one of them, must be loaded from source. +Consider A->B->C->D->E: if A,C are initial, A,B,C must be loaded from +source; D may be loaded from export data, and E may not be loaded at all +(though it's possible that D's export data mentions it, so a +types.Package may be created for it and exposed.) + +The old loader had a feature to suppress type-checking of function +bodies on a per-package basis, primarily intended to reduce the work of +obtaining type information for imported packages. Now that imports are +satisfied by export data, the optimization no longer seems necessary. + +Despite some early attempts, the old loader did not exploit export data, +instead always using the equivalent of WholeProgram mode. This was due +to the complexity of mixing source and export data packages (now +resolved by the upward traversal mentioned above), and because export data +files were nearly always missing or stale. Now that 'go build' supports +caching, all the underlying build systems can guarantee to produce +export data in a reasonable (amortized) time. + +Test "main" packages synthesized by the build system are now reported as +first-class packages, avoiding the need for clients (such as go/ssa) to +reinvent this generation logic. + +One way in which go/packages is simpler than the old loader is in its +treatment of in-package tests. In-package tests are packages that +consist of all the files of the library under test, plus the test files. +The old loader constructed in-package tests by a two-phase process of +mutation called "augmentation": first it would construct and type check +all the ordinary library packages and type-check the packages that +depend on them; then it would add more (test) files to the package and +type-check again. This two-phase approach had four major problems: +1) in processing the tests, the loader modified the library package, + leaving no way for a client application to see both the test + package and the library package; one would mutate into the other. +2) because test files can declare additional methods on types defined in + the library portion of the package, the dispatch of method calls in + the library portion was affected by the presence of the test files. + This should have been a clue that the packages were logically + different. +3) this model of "augmentation" assumed at most one in-package test + per library package, which is true of projects using 'go build', + but not other build systems. +4) because of the two-phase nature of test processing, all packages that + import the library package had to be processed before augmentation, + forcing a "one-shot" API and preventing the client from calling Load + in several times in sequence as is now possible in WholeProgram mode. + (TypeCheck mode has a similar one-shot restriction for a different reason.) + +Early drafts of this package supported "multi-shot" operation. +Although it allowed clients to make a sequence of calls (or concurrent +calls) to Load, building up the graph of Packages incrementally, +it was of marginal value: it complicated the API +(since it allowed some options to vary across calls but not others), +it complicated the implementation, +it cannot be made to work in Types mode, as explained above, +and it was less efficient than making one combined call (when this is possible). +Among the clients we have inspected, none made multiple calls to load +but could not be easily and satisfactorily modified to make only a single call. +However, applications changes may be required. +For example, the ssadump command loads the user-specified packages +and in addition the runtime package. It is tempting to simply append +"runtime" to the user-provided list, but that does not work if the user +specified an ad-hoc package such as [a.go b.go]. +Instead, ssadump no longer requests the runtime package, +but seeks it among the dependencies of the user-specified packages, +and emits an error if it is not found. + +Questions & Tasks + +- Add GOARCH/GOOS? + They are not portable concepts, but could be made portable. + Our goal has been to allow users to express themselves using the conventions + of the underlying build system: if the build system honors GOARCH + during a build and during a metadata query, then so should + applications built atop that query mechanism. + Conversely, if the target architecture of the build is determined by + command-line flags, the application can pass the relevant + flags through to the build system using a command such as: + myapp -query_flag="--cpu=amd64" -query_flag="--os=darwin" + However, this approach is low-level, unwieldy, and non-portable. + GOOS and GOARCH seem important enough to warrant a dedicated option. + +- How should we handle partial failures such as a mixture of good and + malformed patterns, existing and non-existent packages, successful and + failed builds, import failures, import cycles, and so on, in a call to + Load? + +- Support bazel, blaze, and go1.10 list, not just go1.11 list. + +- Handle (and test) various partial success cases, e.g. + a mixture of good packages and: + invalid patterns + nonexistent packages + empty packages + packages with malformed package or import declarations + unreadable files + import cycles + other parse errors + type errors + Make sure we record errors at the correct place in the graph. + +- Missing packages among initial arguments are not reported. + Return bogus packages for them, like golist does. + +- "undeclared name" errors (for example) are reported out of source file + order. I suspect this is due to the breadth-first resolution now used + by go/types. Is that a bug? Discuss with gri. + +*/ diff --git a/vendor/golang.org/x/tools/go/packages/external.go b/vendor/golang.org/x/tools/go/packages/external.go new file mode 100644 index 0000000..8f7afcb --- /dev/null +++ b/vendor/golang.org/x/tools/go/packages/external.go @@ -0,0 +1,156 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package packages + +// This file defines the protocol that enables an external "driver" +// tool to supply package metadata in place of 'go list'. + +import ( + "bytes" + "encoding/json" + "fmt" + "os" + "os/exec" + "strings" +) + +// DriverRequest defines the schema of a request for package metadata +// from an external driver program. The JSON-encoded DriverRequest +// message is provided to the driver program's standard input. The +// query patterns are provided as command-line arguments. +// +// See the package documentation for an overview. +type DriverRequest struct { + Mode LoadMode `json:"mode"` + + // Env specifies the environment the underlying build system should be run in. + Env []string `json:"env"` + + // BuildFlags are flags that should be passed to the underlying build system. + BuildFlags []string `json:"build_flags"` + + // Tests specifies whether the patterns should also return test packages. + Tests bool `json:"tests"` + + // Overlay maps file paths (relative to the driver's working directory) + // to the contents of overlay files (see Config.Overlay). + Overlay map[string][]byte `json:"overlay"` +} + +// DriverResponse defines the schema of a response from an external +// driver program, providing the results of a query for package +// metadata. The driver program must write a JSON-encoded +// DriverResponse message to its standard output. +// +// See the package documentation for an overview. +type DriverResponse struct { + // NotHandled is returned if the request can't be handled by the current + // driver. If an external driver returns a response with NotHandled, the + // rest of the DriverResponse is ignored, and go/packages will fallback + // to the next driver. If go/packages is extended in the future to support + // lists of multiple drivers, go/packages will fall back to the next driver. + NotHandled bool + + // Compiler and Arch are the arguments pass of types.SizesFor + // to get a types.Sizes to use when type checking. + Compiler string + Arch string + + // Roots is the set of package IDs that make up the root packages. + // We have to encode this separately because when we encode a single package + // we cannot know if it is one of the roots as that requires knowledge of the + // graph it is part of. + Roots []string `json:",omitempty"` + + // Packages is the full set of packages in the graph. + // The packages are not connected into a graph. + // The Imports if populated will be stubs that only have their ID set. + // Imports will be connected and then type and syntax information added in a + // later pass (see refine). + Packages []*Package + + // GoVersion is the minor version number used by the driver + // (e.g. the go command on the PATH) when selecting .go files. + // Zero means unknown. + GoVersion int +} + +// driver is the type for functions that query the build system for the +// packages named by the patterns. +type driver func(cfg *Config, patterns ...string) (*DriverResponse, error) + +// findExternalDriver returns the file path of a tool that supplies +// the build system package structure, or "" if not found. +// If GOPACKAGESDRIVER is set in the environment findExternalTool returns its +// value, otherwise it searches for a binary named gopackagesdriver on the PATH. +func findExternalDriver(cfg *Config) driver { + const toolPrefix = "GOPACKAGESDRIVER=" + tool := "" + for _, env := range cfg.Env { + if val := strings.TrimPrefix(env, toolPrefix); val != env { + tool = val + } + } + if tool != "" && tool == "off" { + return nil + } + if tool == "" { + var err error + tool, err = exec.LookPath("gopackagesdriver") + if err != nil { + return nil + } + } + return func(cfg *Config, words ...string) (*DriverResponse, error) { + req, err := json.Marshal(DriverRequest{ + Mode: cfg.Mode, + Env: cfg.Env, + BuildFlags: cfg.BuildFlags, + Tests: cfg.Tests, + Overlay: cfg.Overlay, + }) + if err != nil { + return nil, fmt.Errorf("failed to encode message to driver tool: %v", err) + } + + buf := new(bytes.Buffer) + stderr := new(bytes.Buffer) + cmd := exec.CommandContext(cfg.Context, tool, words...) + cmd.Dir = cfg.Dir + // The cwd gets resolved to the real path. On Darwin, where + // /tmp is a symlink, this breaks anything that expects the + // working directory to keep the original path, including the + // go command when dealing with modules. + // + // os.Getwd stdlib has a special feature where if the + // cwd and the PWD are the same node then it trusts + // the PWD, so by setting it in the env for the child + // process we fix up all the paths returned by the go + // command. + // + // (See similar trick in Invocation.run in ../../internal/gocommand/invoke.go) + cmd.Env = append(slicesClip(cfg.Env), "PWD="+cfg.Dir) + cmd.Stdin = bytes.NewReader(req) + cmd.Stdout = buf + cmd.Stderr = stderr + + if err := cmd.Run(); err != nil { + return nil, fmt.Errorf("%v: %v: %s", tool, err, cmd.Stderr) + } + if len(stderr.Bytes()) != 0 && os.Getenv("GOPACKAGESPRINTDRIVERERRORS") != "" { + fmt.Fprintf(os.Stderr, "%s stderr: <<%s>>\n", cmdDebugStr(cmd), stderr) + } + + var response DriverResponse + if err := json.Unmarshal(buf.Bytes(), &response); err != nil { + return nil, err + } + return &response, nil + } +} + +// slicesClip removes unused capacity from the slice, returning s[:len(s):len(s)]. +// TODO(adonovan): use go1.21 slices.Clip. +func slicesClip[S ~[]E, E any](s S) S { return s[:len(s):len(s)] } diff --git a/vendor/golang.org/x/tools/go/packages/golist.go b/vendor/golang.org/x/tools/go/packages/golist.go new file mode 100644 index 0000000..1a3a5b4 --- /dev/null +++ b/vendor/golang.org/x/tools/go/packages/golist.go @@ -0,0 +1,1066 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package packages + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "log" + "os" + "os/exec" + "path" + "path/filepath" + "reflect" + "sort" + "strconv" + "strings" + "sync" + "unicode" + + "golang.org/x/tools/internal/gocommand" + "golang.org/x/tools/internal/packagesinternal" +) + +// debug controls verbose logging. +var debug, _ = strconv.ParseBool(os.Getenv("GOPACKAGESDEBUG")) + +// A goTooOldError reports that the go command +// found by exec.LookPath is too old to use the new go list behavior. +type goTooOldError struct { + error +} + +// responseDeduper wraps a DriverResponse, deduplicating its contents. +type responseDeduper struct { + seenRoots map[string]bool + seenPackages map[string]*Package + dr *DriverResponse +} + +func newDeduper() *responseDeduper { + return &responseDeduper{ + dr: &DriverResponse{}, + seenRoots: map[string]bool{}, + seenPackages: map[string]*Package{}, + } +} + +// addAll fills in r with a DriverResponse. +func (r *responseDeduper) addAll(dr *DriverResponse) { + for _, pkg := range dr.Packages { + r.addPackage(pkg) + } + for _, root := range dr.Roots { + r.addRoot(root) + } + r.dr.GoVersion = dr.GoVersion +} + +func (r *responseDeduper) addPackage(p *Package) { + if r.seenPackages[p.ID] != nil { + return + } + r.seenPackages[p.ID] = p + r.dr.Packages = append(r.dr.Packages, p) +} + +func (r *responseDeduper) addRoot(id string) { + if r.seenRoots[id] { + return + } + r.seenRoots[id] = true + r.dr.Roots = append(r.dr.Roots, id) +} + +type golistState struct { + cfg *Config + ctx context.Context + + envOnce sync.Once + goEnvError error + goEnv map[string]string + + rootsOnce sync.Once + rootDirsError error + rootDirs map[string]string + + goVersionOnce sync.Once + goVersionError error + goVersion int // The X in Go 1.X. + + // vendorDirs caches the (non)existence of vendor directories. + vendorDirs map[string]bool +} + +// getEnv returns Go environment variables. Only specific variables are +// populated -- computing all of them is slow. +func (state *golistState) getEnv() (map[string]string, error) { + state.envOnce.Do(func() { + var b *bytes.Buffer + b, state.goEnvError = state.invokeGo("env", "-json", "GOMOD", "GOPATH") + if state.goEnvError != nil { + return + } + + state.goEnv = make(map[string]string) + decoder := json.NewDecoder(b) + if state.goEnvError = decoder.Decode(&state.goEnv); state.goEnvError != nil { + return + } + }) + return state.goEnv, state.goEnvError +} + +// mustGetEnv is a convenience function that can be used if getEnv has already succeeded. +func (state *golistState) mustGetEnv() map[string]string { + env, err := state.getEnv() + if err != nil { + panic(fmt.Sprintf("mustGetEnv: %v", err)) + } + return env +} + +// goListDriver uses the go list command to interpret the patterns and produce +// the build system package structure. +// See driver for more details. +func goListDriver(cfg *Config, patterns ...string) (_ *DriverResponse, err error) { + // Make sure that any asynchronous go commands are killed when we return. + parentCtx := cfg.Context + if parentCtx == nil { + parentCtx = context.Background() + } + ctx, cancel := context.WithCancel(parentCtx) + defer cancel() + + response := newDeduper() + + state := &golistState{ + cfg: cfg, + ctx: ctx, + vendorDirs: map[string]bool{}, + } + + // Fill in response.Sizes asynchronously if necessary. + if cfg.Mode&NeedTypesSizes != 0 || cfg.Mode&NeedTypes != 0 { + errCh := make(chan error) + go func() { + compiler, arch, err := getSizesForArgs(ctx, state.cfgInvocation(), cfg.gocmdRunner) + response.dr.Compiler = compiler + response.dr.Arch = arch + errCh <- err + }() + defer func() { + if sizesErr := <-errCh; sizesErr != nil { + err = sizesErr + } + }() + } + + // Determine files requested in contains patterns + var containFiles []string + restPatterns := make([]string, 0, len(patterns)) + // Extract file= and other [querytype]= patterns. Report an error if querytype + // doesn't exist. +extractQueries: + for _, pattern := range patterns { + eqidx := strings.Index(pattern, "=") + if eqidx < 0 { + restPatterns = append(restPatterns, pattern) + } else { + query, value := pattern[:eqidx], pattern[eqidx+len("="):] + switch query { + case "file": + containFiles = append(containFiles, value) + case "pattern": + restPatterns = append(restPatterns, value) + case "": // not a reserved query + restPatterns = append(restPatterns, pattern) + default: + for _, rune := range query { + if rune < 'a' || rune > 'z' { // not a reserved query + restPatterns = append(restPatterns, pattern) + continue extractQueries + } + } + // Reject all other patterns containing "=" + return nil, fmt.Errorf("invalid query type %q in query pattern %q", query, pattern) + } + } + } + + // See if we have any patterns to pass through to go list. Zero initial + // patterns also requires a go list call, since it's the equivalent of + // ".". + if len(restPatterns) > 0 || len(patterns) == 0 { + dr, err := state.createDriverResponse(restPatterns...) + if err != nil { + return nil, err + } + response.addAll(dr) + } + + if len(containFiles) != 0 { + if err := state.runContainsQueries(response, containFiles); err != nil { + return nil, err + } + } + + // (We may yet return an error due to defer.) + return response.dr, nil +} + +func (state *golistState) runContainsQueries(response *responseDeduper, queries []string) error { + for _, query := range queries { + // TODO(matloob): Do only one query per directory. + fdir := filepath.Dir(query) + // Pass absolute path of directory to go list so that it knows to treat it as a directory, + // not a package path. + pattern, err := filepath.Abs(fdir) + if err != nil { + return fmt.Errorf("could not determine absolute path of file= query path %q: %v", query, err) + } + dirResponse, err := state.createDriverResponse(pattern) + + // If there was an error loading the package, or no packages are returned, + // or the package is returned with errors, try to load the file as an + // ad-hoc package. + // Usually the error will appear in a returned package, but may not if we're + // in module mode and the ad-hoc is located outside a module. + if err != nil || len(dirResponse.Packages) == 0 || len(dirResponse.Packages) == 1 && len(dirResponse.Packages[0].GoFiles) == 0 && + len(dirResponse.Packages[0].Errors) == 1 { + var queryErr error + if dirResponse, queryErr = state.adhocPackage(pattern, query); queryErr != nil { + return err // return the original error + } + } + isRoot := make(map[string]bool, len(dirResponse.Roots)) + for _, root := range dirResponse.Roots { + isRoot[root] = true + } + for _, pkg := range dirResponse.Packages { + // Add any new packages to the main set + // We don't bother to filter packages that will be dropped by the changes of roots, + // that will happen anyway during graph construction outside this function. + // Over-reporting packages is not a problem. + response.addPackage(pkg) + // if the package was not a root one, it cannot have the file + if !isRoot[pkg.ID] { + continue + } + for _, pkgFile := range pkg.GoFiles { + if filepath.Base(query) == filepath.Base(pkgFile) { + response.addRoot(pkg.ID) + break + } + } + } + } + return nil +} + +// adhocPackage attempts to load or construct an ad-hoc package for a given +// query, if the original call to the driver produced inadequate results. +func (state *golistState) adhocPackage(pattern, query string) (*DriverResponse, error) { + response, err := state.createDriverResponse(query) + if err != nil { + return nil, err + } + // If we get nothing back from `go list`, + // try to make this file into its own ad-hoc package. + // TODO(rstambler): Should this check against the original response? + if len(response.Packages) == 0 { + response.Packages = append(response.Packages, &Package{ + ID: "command-line-arguments", + PkgPath: query, + GoFiles: []string{query}, + CompiledGoFiles: []string{query}, + Imports: make(map[string]*Package), + }) + response.Roots = append(response.Roots, "command-line-arguments") + } + // Handle special cases. + if len(response.Packages) == 1 { + // golang/go#33482: If this is a file= query for ad-hoc packages where + // the file only exists on an overlay, and exists outside of a module, + // add the file to the package and remove the errors. + if response.Packages[0].ID == "command-line-arguments" || + filepath.ToSlash(response.Packages[0].PkgPath) == filepath.ToSlash(query) { + if len(response.Packages[0].GoFiles) == 0 { + filename := filepath.Join(pattern, filepath.Base(query)) // avoid recomputing abspath + // TODO(matloob): check if the file is outside of a root dir? + for path := range state.cfg.Overlay { + if path == filename { + response.Packages[0].Errors = nil + response.Packages[0].GoFiles = []string{path} + response.Packages[0].CompiledGoFiles = []string{path} + } + } + } + } + } + return response, nil +} + +// Fields must match go list; +// see $GOROOT/src/cmd/go/internal/load/pkg.go. +type jsonPackage struct { + ImportPath string + Dir string + Name string + Export string + GoFiles []string + CompiledGoFiles []string + IgnoredGoFiles []string + IgnoredOtherFiles []string + EmbedPatterns []string + EmbedFiles []string + CFiles []string + CgoFiles []string + CXXFiles []string + MFiles []string + HFiles []string + FFiles []string + SFiles []string + SwigFiles []string + SwigCXXFiles []string + SysoFiles []string + Imports []string + ImportMap map[string]string + Deps []string + Module *Module + TestGoFiles []string + TestImports []string + XTestGoFiles []string + XTestImports []string + ForTest string // q in a "p [q.test]" package, else "" + DepOnly bool + + Error *packagesinternal.PackageError + DepsErrors []*packagesinternal.PackageError +} + +type jsonPackageError struct { + ImportStack []string + Pos string + Err string +} + +func otherFiles(p *jsonPackage) [][]string { + return [][]string{p.CFiles, p.CXXFiles, p.MFiles, p.HFiles, p.FFiles, p.SFiles, p.SwigFiles, p.SwigCXXFiles, p.SysoFiles} +} + +// createDriverResponse uses the "go list" command to expand the pattern +// words and return a response for the specified packages. +func (state *golistState) createDriverResponse(words ...string) (*DriverResponse, error) { + // go list uses the following identifiers in ImportPath and Imports: + // + // "p" -- importable package or main (command) + // "q.test" -- q's test executable + // "p [q.test]" -- variant of p as built for q's test executable + // "q_test [q.test]" -- q's external test package + // + // The packages p that are built differently for a test q.test + // are q itself, plus any helpers used by the external test q_test, + // typically including "testing" and all its dependencies. + + // Run "go list" for complete + // information on the specified packages. + goVersion, err := state.getGoVersion() + if err != nil { + return nil, err + } + buf, err := state.invokeGo("list", golistargs(state.cfg, words, goVersion)...) + if err != nil { + return nil, err + } + + seen := make(map[string]*jsonPackage) + pkgs := make(map[string]*Package) + additionalErrors := make(map[string][]Error) + // Decode the JSON and convert it to Package form. + response := &DriverResponse{ + GoVersion: goVersion, + } + for dec := json.NewDecoder(buf); dec.More(); { + p := new(jsonPackage) + if err := dec.Decode(p); err != nil { + return nil, fmt.Errorf("JSON decoding failed: %v", err) + } + + if p.ImportPath == "" { + // The documentation for go list says that “[e]rroneous packages will have + // a non-empty ImportPath”. If for some reason it comes back empty, we + // prefer to error out rather than silently discarding data or handing + // back a package without any way to refer to it. + if p.Error != nil { + return nil, Error{ + Pos: p.Error.Pos, + Msg: p.Error.Err, + } + } + return nil, fmt.Errorf("package missing import path: %+v", p) + } + + // Work around https://golang.org/issue/33157: + // go list -e, when given an absolute path, will find the package contained at + // that directory. But when no package exists there, it will return a fake package + // with an error and the ImportPath set to the absolute path provided to go list. + // Try to convert that absolute path to what its package path would be if it's + // contained in a known module or GOPATH entry. This will allow the package to be + // properly "reclaimed" when overlays are processed. + if filepath.IsAbs(p.ImportPath) && p.Error != nil { + pkgPath, ok, err := state.getPkgPath(p.ImportPath) + if err != nil { + return nil, err + } + if ok { + p.ImportPath = pkgPath + } + } + + if old, found := seen[p.ImportPath]; found { + // If one version of the package has an error, and the other doesn't, assume + // that this is a case where go list is reporting a fake dependency variant + // of the imported package: When a package tries to invalidly import another + // package, go list emits a variant of the imported package (with the same + // import path, but with an error on it, and the package will have a + // DepError set on it). An example of when this can happen is for imports of + // main packages: main packages can not be imported, but they may be + // separately matched and listed by another pattern. + // See golang.org/issue/36188 for more details. + + // The plan is that eventually, hopefully in Go 1.15, the error will be + // reported on the importing package rather than the duplicate "fake" + // version of the imported package. Once all supported versions of Go + // have the new behavior this logic can be deleted. + // TODO(matloob): delete the workaround logic once all supported versions of + // Go return the errors on the proper package. + + // There should be exactly one version of a package that doesn't have an + // error. + if old.Error == nil && p.Error == nil { + if !reflect.DeepEqual(p, old) { + return nil, fmt.Errorf("internal error: go list gives conflicting information for package %v", p.ImportPath) + } + continue + } + + // Determine if this package's error needs to be bubbled up. + // This is a hack, and we expect for go list to eventually set the error + // on the package. + if old.Error != nil { + var errkind string + if strings.Contains(old.Error.Err, "not an importable package") { + errkind = "not an importable package" + } else if strings.Contains(old.Error.Err, "use of internal package") && strings.Contains(old.Error.Err, "not allowed") { + errkind = "use of internal package not allowed" + } + if errkind != "" { + if len(old.Error.ImportStack) < 1 { + return nil, fmt.Errorf(`internal error: go list gave a %q error with empty import stack`, errkind) + } + importingPkg := old.Error.ImportStack[len(old.Error.ImportStack)-1] + if importingPkg == old.ImportPath { + // Using an older version of Go which put this package itself on top of import + // stack, instead of the importer. Look for importer in second from top + // position. + if len(old.Error.ImportStack) < 2 { + return nil, fmt.Errorf(`internal error: go list gave a %q error with an import stack without importing package`, errkind) + } + importingPkg = old.Error.ImportStack[len(old.Error.ImportStack)-2] + } + additionalErrors[importingPkg] = append(additionalErrors[importingPkg], Error{ + Pos: old.Error.Pos, + Msg: old.Error.Err, + Kind: ListError, + }) + } + } + + // Make sure that if there's a version of the package without an error, + // that's the one reported to the user. + if old.Error == nil { + continue + } + + // This package will replace the old one at the end of the loop. + } + seen[p.ImportPath] = p + + pkg := &Package{ + Name: p.Name, + ID: p.ImportPath, + GoFiles: absJoin(p.Dir, p.GoFiles, p.CgoFiles), + CompiledGoFiles: absJoin(p.Dir, p.CompiledGoFiles), + OtherFiles: absJoin(p.Dir, otherFiles(p)...), + EmbedFiles: absJoin(p.Dir, p.EmbedFiles), + EmbedPatterns: absJoin(p.Dir, p.EmbedPatterns), + IgnoredFiles: absJoin(p.Dir, p.IgnoredGoFiles, p.IgnoredOtherFiles), + forTest: p.ForTest, + depsErrors: p.DepsErrors, + Module: p.Module, + } + + if (state.cfg.Mode&typecheckCgo) != 0 && len(p.CgoFiles) != 0 { + if len(p.CompiledGoFiles) > len(p.GoFiles) { + // We need the cgo definitions, which are in the first + // CompiledGoFile after the non-cgo ones. This is a hack but there + // isn't currently a better way to find it. We also need the pure + // Go files and unprocessed cgo files, all of which are already + // in pkg.GoFiles. + cgoTypes := p.CompiledGoFiles[len(p.GoFiles)] + pkg.CompiledGoFiles = append([]string{cgoTypes}, pkg.GoFiles...) + } else { + // golang/go#38990: go list silently fails to do cgo processing + pkg.CompiledGoFiles = nil + pkg.Errors = append(pkg.Errors, Error{ + Msg: "go list failed to return CompiledGoFiles. This may indicate failure to perform cgo processing; try building at the command line. See https://golang.org/issue/38990.", + Kind: ListError, + }) + } + } + + // Work around https://golang.org/issue/28749: + // cmd/go puts assembly, C, and C++ files in CompiledGoFiles. + // Remove files from CompiledGoFiles that are non-go files + // (or are not files that look like they are from the cache). + if len(pkg.CompiledGoFiles) > 0 { + out := pkg.CompiledGoFiles[:0] + for _, f := range pkg.CompiledGoFiles { + if ext := filepath.Ext(f); ext != ".go" && ext != "" { // ext == "" means the file is from the cache, so probably cgo-processed file + continue + } + out = append(out, f) + } + pkg.CompiledGoFiles = out + } + + // Extract the PkgPath from the package's ID. + if i := strings.IndexByte(pkg.ID, ' '); i >= 0 { + pkg.PkgPath = pkg.ID[:i] + } else { + pkg.PkgPath = pkg.ID + } + + if pkg.PkgPath == "unsafe" { + pkg.CompiledGoFiles = nil // ignore fake unsafe.go file (#59929) + } else if len(pkg.CompiledGoFiles) == 0 { + // Work around for pre-go.1.11 versions of go list. + // TODO(matloob): they should be handled by the fallback. + // Can we delete this? + pkg.CompiledGoFiles = pkg.GoFiles + } + + // Assume go list emits only absolute paths for Dir. + if p.Dir != "" && !filepath.IsAbs(p.Dir) { + log.Fatalf("internal error: go list returned non-absolute Package.Dir: %s", p.Dir) + } + + if p.Export != "" && !filepath.IsAbs(p.Export) { + pkg.ExportFile = filepath.Join(p.Dir, p.Export) + } else { + pkg.ExportFile = p.Export + } + + // imports + // + // Imports contains the IDs of all imported packages. + // ImportsMap records (path, ID) only where they differ. + ids := make(map[string]bool) + for _, id := range p.Imports { + ids[id] = true + } + pkg.Imports = make(map[string]*Package) + for path, id := range p.ImportMap { + pkg.Imports[path] = &Package{ID: id} // non-identity import + delete(ids, id) + } + for id := range ids { + if id == "C" { + continue + } + + pkg.Imports[id] = &Package{ID: id} // identity import + } + if !p.DepOnly { + response.Roots = append(response.Roots, pkg.ID) + } + + // Temporary work-around for golang/go#39986. Parse filenames out of + // error messages. This happens if there are unrecoverable syntax + // errors in the source, so we can't match on a specific error message. + // + // TODO(rfindley): remove this heuristic, in favor of considering + // InvalidGoFiles from the list driver. + if err := p.Error; err != nil && state.shouldAddFilenameFromError(p) { + addFilenameFromPos := func(pos string) bool { + split := strings.Split(pos, ":") + if len(split) < 1 { + return false + } + filename := strings.TrimSpace(split[0]) + if filename == "" { + return false + } + if !filepath.IsAbs(filename) { + filename = filepath.Join(state.cfg.Dir, filename) + } + info, _ := os.Stat(filename) + if info == nil { + return false + } + pkg.CompiledGoFiles = append(pkg.CompiledGoFiles, filename) + pkg.GoFiles = append(pkg.GoFiles, filename) + return true + } + found := addFilenameFromPos(err.Pos) + // In some cases, go list only reports the error position in the + // error text, not the error position. One such case is when the + // file's package name is a keyword (see golang.org/issue/39763). + if !found { + addFilenameFromPos(err.Err) + } + } + + if p.Error != nil { + msg := strings.TrimSpace(p.Error.Err) // Trim to work around golang.org/issue/32363. + // Address golang.org/issue/35964 by appending import stack to error message. + if msg == "import cycle not allowed" && len(p.Error.ImportStack) != 0 { + msg += fmt.Sprintf(": import stack: %v", p.Error.ImportStack) + } + pkg.Errors = append(pkg.Errors, Error{ + Pos: p.Error.Pos, + Msg: msg, + Kind: ListError, + }) + } + + pkgs[pkg.ID] = pkg + } + + for id, errs := range additionalErrors { + if p, ok := pkgs[id]; ok { + p.Errors = append(p.Errors, errs...) + } + } + for _, pkg := range pkgs { + response.Packages = append(response.Packages, pkg) + } + sort.Slice(response.Packages, func(i, j int) bool { return response.Packages[i].ID < response.Packages[j].ID }) + + return response, nil +} + +func (state *golistState) shouldAddFilenameFromError(p *jsonPackage) bool { + if len(p.GoFiles) > 0 || len(p.CompiledGoFiles) > 0 { + return false + } + + goV, err := state.getGoVersion() + if err != nil { + return false + } + + // On Go 1.14 and earlier, only add filenames from errors if the import stack is empty. + // The import stack behaves differently for these versions than newer Go versions. + if goV < 15 { + return len(p.Error.ImportStack) == 0 + } + + // On Go 1.15 and later, only parse filenames out of error if there's no import stack, + // or the current package is at the top of the import stack. This is not guaranteed + // to work perfectly, but should avoid some cases where files in errors don't belong to this + // package. + return len(p.Error.ImportStack) == 0 || p.Error.ImportStack[len(p.Error.ImportStack)-1] == p.ImportPath +} + +// getGoVersion returns the effective minor version of the go command. +func (state *golistState) getGoVersion() (int, error) { + state.goVersionOnce.Do(func() { + state.goVersion, state.goVersionError = gocommand.GoVersion(state.ctx, state.cfgInvocation(), state.cfg.gocmdRunner) + }) + return state.goVersion, state.goVersionError +} + +// getPkgPath finds the package path of a directory if it's relative to a root +// directory. +func (state *golistState) getPkgPath(dir string) (string, bool, error) { + absDir, err := filepath.Abs(dir) + if err != nil { + return "", false, err + } + roots, err := state.determineRootDirs() + if err != nil { + return "", false, err + } + + for rdir, rpath := range roots { + // Make sure that the directory is in the module, + // to avoid creating a path relative to another module. + if !strings.HasPrefix(absDir, rdir) { + continue + } + // TODO(matloob): This doesn't properly handle symlinks. + r, err := filepath.Rel(rdir, dir) + if err != nil { + continue + } + if rpath != "" { + // We choose only one root even though the directory even it can belong in multiple modules + // or GOPATH entries. This is okay because we only need to work with absolute dirs when a + // file is missing from disk, for instance when gopls calls go/packages in an overlay. + // Once the file is saved, gopls, or the next invocation of the tool will get the correct + // result straight from golist. + // TODO(matloob): Implement module tiebreaking? + return path.Join(rpath, filepath.ToSlash(r)), true, nil + } + return filepath.ToSlash(r), true, nil + } + return "", false, nil +} + +// absJoin absolutizes and flattens the lists of files. +func absJoin(dir string, fileses ...[]string) (res []string) { + for _, files := range fileses { + for _, file := range files { + if !filepath.IsAbs(file) { + file = filepath.Join(dir, file) + } + res = append(res, file) + } + } + return res +} + +func jsonFlag(cfg *Config, goVersion int) string { + if goVersion < 19 { + return "-json" + } + var fields []string + added := make(map[string]bool) + addFields := func(fs ...string) { + for _, f := range fs { + if !added[f] { + added[f] = true + fields = append(fields, f) + } + } + } + addFields("Name", "ImportPath", "Error") // These fields are always needed + if cfg.Mode&NeedFiles != 0 || cfg.Mode&NeedTypes != 0 { + addFields("Dir", "GoFiles", "IgnoredGoFiles", "IgnoredOtherFiles", "CFiles", + "CgoFiles", "CXXFiles", "MFiles", "HFiles", "FFiles", "SFiles", + "SwigFiles", "SwigCXXFiles", "SysoFiles") + if cfg.Tests { + addFields("TestGoFiles", "XTestGoFiles") + } + } + if cfg.Mode&NeedTypes != 0 { + // CompiledGoFiles seems to be required for the test case TestCgoNoSyntax, + // even when -compiled isn't passed in. + // TODO(#52435): Should we make the test ask for -compiled, or automatically + // request CompiledGoFiles in certain circumstances? + addFields("Dir", "CompiledGoFiles") + } + if cfg.Mode&NeedCompiledGoFiles != 0 { + addFields("Dir", "CompiledGoFiles", "Export") + } + if cfg.Mode&NeedImports != 0 { + // When imports are requested, DepOnly is used to distinguish between packages + // explicitly requested and transitive imports of those packages. + addFields("DepOnly", "Imports", "ImportMap") + if cfg.Tests { + addFields("TestImports", "XTestImports") + } + } + if cfg.Mode&NeedDeps != 0 { + addFields("DepOnly") + } + if usesExportData(cfg) { + // Request Dir in the unlikely case Export is not absolute. + addFields("Dir", "Export") + } + if cfg.Mode&needInternalForTest != 0 { + addFields("ForTest") + } + if cfg.Mode&needInternalDepsErrors != 0 { + addFields("DepsErrors") + } + if cfg.Mode&NeedModule != 0 { + addFields("Module") + } + if cfg.Mode&NeedEmbedFiles != 0 { + addFields("EmbedFiles") + } + if cfg.Mode&NeedEmbedPatterns != 0 { + addFields("EmbedPatterns") + } + return "-json=" + strings.Join(fields, ",") +} + +func golistargs(cfg *Config, words []string, goVersion int) []string { + const findFlags = NeedImports | NeedTypes | NeedSyntax | NeedTypesInfo + fullargs := []string{ + "-e", jsonFlag(cfg, goVersion), + fmt.Sprintf("-compiled=%t", cfg.Mode&(NeedCompiledGoFiles|NeedSyntax|NeedTypes|NeedTypesInfo|NeedTypesSizes) != 0), + fmt.Sprintf("-test=%t", cfg.Tests), + fmt.Sprintf("-export=%t", usesExportData(cfg)), + fmt.Sprintf("-deps=%t", cfg.Mode&NeedImports != 0), + // go list doesn't let you pass -test and -find together, + // probably because you'd just get the TestMain. + fmt.Sprintf("-find=%t", !cfg.Tests && cfg.Mode&findFlags == 0 && !usesExportData(cfg)), + } + + // golang/go#60456: with go1.21 and later, go list serves pgo variants, which + // can be costly to compute and may result in redundant processing for the + // caller. Disable these variants. If someone wants to add e.g. a NeedPGO + // mode flag, that should be a separate proposal. + if goVersion >= 21 { + fullargs = append(fullargs, "-pgo=off") + } + + fullargs = append(fullargs, cfg.BuildFlags...) + fullargs = append(fullargs, "--") + fullargs = append(fullargs, words...) + return fullargs +} + +// cfgInvocation returns an Invocation that reflects cfg's settings. +func (state *golistState) cfgInvocation() gocommand.Invocation { + cfg := state.cfg + return gocommand.Invocation{ + BuildFlags: cfg.BuildFlags, + ModFile: cfg.modFile, + ModFlag: cfg.modFlag, + CleanEnv: cfg.Env != nil, + Env: cfg.Env, + Logf: cfg.Logf, + WorkingDir: cfg.Dir, + Overlay: cfg.goListOverlayFile, + } +} + +// invokeGo returns the stdout of a go command invocation. +func (state *golistState) invokeGo(verb string, args ...string) (*bytes.Buffer, error) { + cfg := state.cfg + + inv := state.cfgInvocation() + inv.Verb = verb + inv.Args = args + gocmdRunner := cfg.gocmdRunner + if gocmdRunner == nil { + gocmdRunner = &gocommand.Runner{} + } + stdout, stderr, friendlyErr, err := gocmdRunner.RunRaw(cfg.Context, inv) + if err != nil { + // Check for 'go' executable not being found. + if ee, ok := err.(*exec.Error); ok && ee.Err == exec.ErrNotFound { + return nil, fmt.Errorf("'go list' driver requires 'go', but %s", exec.ErrNotFound) + } + + exitErr, ok := err.(*exec.ExitError) + if !ok { + // Catastrophic error: + // - context cancellation + return nil, fmt.Errorf("couldn't run 'go': %w", err) + } + + // Old go version? + if strings.Contains(stderr.String(), "flag provided but not defined") { + return nil, goTooOldError{fmt.Errorf("unsupported version of go: %s: %s", exitErr, stderr)} + } + + // Related to #24854 + if len(stderr.String()) > 0 && strings.Contains(stderr.String(), "unexpected directory layout") { + return nil, friendlyErr + } + + // Is there an error running the C compiler in cgo? This will be reported in the "Error" field + // and should be suppressed by go list -e. + // + // This condition is not perfect yet because the error message can include other error messages than runtime/cgo. + isPkgPathRune := func(r rune) bool { + // From https://golang.org/ref/spec#Import_declarations: + // Implementation restriction: A compiler may restrict ImportPaths to non-empty strings + // using only characters belonging to Unicode's L, M, N, P, and S general categories + // (the Graphic characters without spaces) and may also exclude the + // characters !"#$%&'()*,:;<=>?[\]^`{|} and the Unicode replacement character U+FFFD. + return unicode.IsOneOf([]*unicode.RangeTable{unicode.L, unicode.M, unicode.N, unicode.P, unicode.S}, r) && + !strings.ContainsRune("!\"#$%&'()*,:;<=>?[\\]^`{|}\uFFFD", r) + } + // golang/go#36770: Handle case where cmd/go prints module download messages before the error. + msg := stderr.String() + for strings.HasPrefix(msg, "go: downloading") { + msg = msg[strings.IndexRune(msg, '\n')+1:] + } + if len(stderr.String()) > 0 && strings.HasPrefix(stderr.String(), "# ") { + msg := msg[len("# "):] + if strings.HasPrefix(strings.TrimLeftFunc(msg, isPkgPathRune), "\n") { + return stdout, nil + } + // Treat pkg-config errors as a special case (golang.org/issue/36770). + if strings.HasPrefix(msg, "pkg-config") { + return stdout, nil + } + } + + // This error only appears in stderr. See golang.org/cl/166398 for a fix in go list to show + // the error in the Err section of stdout in case -e option is provided. + // This fix is provided for backwards compatibility. + if len(stderr.String()) > 0 && strings.Contains(stderr.String(), "named files must be .go files") { + output := fmt.Sprintf(`{"ImportPath": "command-line-arguments","Incomplete": true,"Error": {"Pos": "","Err": %q}}`, + strings.Trim(stderr.String(), "\n")) + return bytes.NewBufferString(output), nil + } + + // Similar to the previous error, but currently lacks a fix in Go. + if len(stderr.String()) > 0 && strings.Contains(stderr.String(), "named files must all be in one directory") { + output := fmt.Sprintf(`{"ImportPath": "command-line-arguments","Incomplete": true,"Error": {"Pos": "","Err": %q}}`, + strings.Trim(stderr.String(), "\n")) + return bytes.NewBufferString(output), nil + } + + // Backwards compatibility for Go 1.11 because 1.12 and 1.13 put the directory in the ImportPath. + // If the package doesn't exist, put the absolute path of the directory into the error message, + // as Go 1.13 list does. + const noSuchDirectory = "no such directory" + if len(stderr.String()) > 0 && strings.Contains(stderr.String(), noSuchDirectory) { + errstr := stderr.String() + abspath := strings.TrimSpace(errstr[strings.Index(errstr, noSuchDirectory)+len(noSuchDirectory):]) + output := fmt.Sprintf(`{"ImportPath": %q,"Incomplete": true,"Error": {"Pos": "","Err": %q}}`, + abspath, strings.Trim(stderr.String(), "\n")) + return bytes.NewBufferString(output), nil + } + + // Workaround for #29280: go list -e has incorrect behavior when an ad-hoc package doesn't exist. + // Note that the error message we look for in this case is different that the one looked for above. + if len(stderr.String()) > 0 && strings.Contains(stderr.String(), "no such file or directory") { + output := fmt.Sprintf(`{"ImportPath": "command-line-arguments","Incomplete": true,"Error": {"Pos": "","Err": %q}}`, + strings.Trim(stderr.String(), "\n")) + return bytes.NewBufferString(output), nil + } + + // Workaround for #34273. go list -e with GO111MODULE=on has incorrect behavior when listing a + // directory outside any module. + if len(stderr.String()) > 0 && strings.Contains(stderr.String(), "outside available modules") { + output := fmt.Sprintf(`{"ImportPath": %q,"Incomplete": true,"Error": {"Pos": "","Err": %q}}`, + // TODO(matloob): command-line-arguments isn't correct here. + "command-line-arguments", strings.Trim(stderr.String(), "\n")) + return bytes.NewBufferString(output), nil + } + + // Another variation of the previous error + if len(stderr.String()) > 0 && strings.Contains(stderr.String(), "outside module root") { + output := fmt.Sprintf(`{"ImportPath": %q,"Incomplete": true,"Error": {"Pos": "","Err": %q}}`, + // TODO(matloob): command-line-arguments isn't correct here. + "command-line-arguments", strings.Trim(stderr.String(), "\n")) + return bytes.NewBufferString(output), nil + } + + // Workaround for an instance of golang.org/issue/26755: go list -e will return a non-zero exit + // status if there's a dependency on a package that doesn't exist. But it should return + // a zero exit status and set an error on that package. + if len(stderr.String()) > 0 && strings.Contains(stderr.String(), "no Go files in") { + // Don't clobber stdout if `go list` actually returned something. + if len(stdout.String()) > 0 { + return stdout, nil + } + // try to extract package name from string + stderrStr := stderr.String() + var importPath string + colon := strings.Index(stderrStr, ":") + if colon > 0 && strings.HasPrefix(stderrStr, "go build ") { + importPath = stderrStr[len("go build "):colon] + } + output := fmt.Sprintf(`{"ImportPath": %q,"Incomplete": true,"Error": {"Pos": "","Err": %q}}`, + importPath, strings.Trim(stderrStr, "\n")) + return bytes.NewBufferString(output), nil + } + + // Export mode entails a build. + // If that build fails, errors appear on stderr + // (despite the -e flag) and the Export field is blank. + // Do not fail in that case. + // The same is true if an ad-hoc package given to go list doesn't exist. + // TODO(matloob): Remove these once we can depend on go list to exit with a zero status with -e even when + // packages don't exist or a build fails. + if !usesExportData(cfg) && !containsGoFile(args) { + return nil, friendlyErr + } + } + return stdout, nil +} + +func containsGoFile(s []string) bool { + for _, f := range s { + if strings.HasSuffix(f, ".go") { + return true + } + } + return false +} + +func cmdDebugStr(cmd *exec.Cmd) string { + env := make(map[string]string) + for _, kv := range cmd.Env { + split := strings.SplitN(kv, "=", 2) + k, v := split[0], split[1] + env[k] = v + } + + var args []string + for _, arg := range cmd.Args { + quoted := strconv.Quote(arg) + if quoted[1:len(quoted)-1] != arg || strings.Contains(arg, " ") { + args = append(args, quoted) + } else { + args = append(args, arg) + } + } + return fmt.Sprintf("GOROOT=%v GOPATH=%v GO111MODULE=%v GOPROXY=%v PWD=%v %v", env["GOROOT"], env["GOPATH"], env["GO111MODULE"], env["GOPROXY"], env["PWD"], strings.Join(args, " ")) +} + +// getSizesForArgs queries 'go list' for the appropriate +// Compiler and GOARCH arguments to pass to [types.SizesFor]. +func getSizesForArgs(ctx context.Context, inv gocommand.Invocation, gocmdRunner *gocommand.Runner) (string, string, error) { + inv.Verb = "list" + inv.Args = []string{"-f", "{{context.GOARCH}} {{context.Compiler}}", "--", "unsafe"} + stdout, stderr, friendlyErr, rawErr := gocmdRunner.RunRaw(ctx, inv) + var goarch, compiler string + if rawErr != nil { + rawErrMsg := rawErr.Error() + if strings.Contains(rawErrMsg, "cannot find main module") || + strings.Contains(rawErrMsg, "go.mod file not found") { + // User's running outside of a module. + // All bets are off. Get GOARCH and guess compiler is gc. + // TODO(matloob): Is this a problem in practice? + inv.Verb = "env" + inv.Args = []string{"GOARCH"} + envout, enverr := gocmdRunner.Run(ctx, inv) + if enverr != nil { + return "", "", enverr + } + goarch = strings.TrimSpace(envout.String()) + compiler = "gc" + } else if friendlyErr != nil { + return "", "", friendlyErr + } else { + // This should be unreachable, but be defensive + // in case RunRaw's error results are inconsistent. + return "", "", rawErr + } + } else { + fields := strings.Fields(stdout.String()) + if len(fields) < 2 { + return "", "", fmt.Errorf("could not parse GOARCH and Go compiler in format \" \":\nstdout: <<%s>>\nstderr: <<%s>>", + stdout.String(), stderr.String()) + } + goarch = fields[0] + compiler = fields[1] + } + return compiler, goarch, nil +} diff --git a/vendor/golang.org/x/tools/go/packages/golist_overlay.go b/vendor/golang.org/x/tools/go/packages/golist_overlay.go new file mode 100644 index 0000000..d823c47 --- /dev/null +++ b/vendor/golang.org/x/tools/go/packages/golist_overlay.go @@ -0,0 +1,83 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package packages + +import ( + "encoding/json" + "path/filepath" + + "golang.org/x/tools/internal/gocommand" +) + +// determineRootDirs returns a mapping from absolute directories that could +// contain code to their corresponding import path prefixes. +func (state *golistState) determineRootDirs() (map[string]string, error) { + env, err := state.getEnv() + if err != nil { + return nil, err + } + if env["GOMOD"] != "" { + state.rootsOnce.Do(func() { + state.rootDirs, state.rootDirsError = state.determineRootDirsModules() + }) + } else { + state.rootsOnce.Do(func() { + state.rootDirs, state.rootDirsError = state.determineRootDirsGOPATH() + }) + } + return state.rootDirs, state.rootDirsError +} + +func (state *golistState) determineRootDirsModules() (map[string]string, error) { + // List all of the modules--the first will be the directory for the main + // module. Any replaced modules will also need to be treated as roots. + // Editing files in the module cache isn't a great idea, so we don't + // plan to ever support that. + out, err := state.invokeGo("list", "-m", "-json", "all") + if err != nil { + // 'go list all' will fail if we're outside of a module and + // GO111MODULE=on. Try falling back without 'all'. + var innerErr error + out, innerErr = state.invokeGo("list", "-m", "-json") + if innerErr != nil { + return nil, err + } + } + roots := map[string]string{} + modules := map[string]string{} + var i int + for dec := json.NewDecoder(out); dec.More(); { + mod := new(gocommand.ModuleJSON) + if err := dec.Decode(mod); err != nil { + return nil, err + } + if mod.Dir != "" && mod.Path != "" { + // This is a valid module; add it to the map. + absDir, err := filepath.Abs(mod.Dir) + if err != nil { + return nil, err + } + modules[absDir] = mod.Path + // The first result is the main module. + if i == 0 || mod.Replace != nil && mod.Replace.Path != "" { + roots[absDir] = mod.Path + } + } + i++ + } + return roots, nil +} + +func (state *golistState) determineRootDirsGOPATH() (map[string]string, error) { + m := map[string]string{} + for _, dir := range filepath.SplitList(state.mustGetEnv()["GOPATH"]) { + absDir, err := filepath.Abs(dir) + if err != nil { + return nil, err + } + m[filepath.Join(absDir, "src")] = "" + } + return m, nil +} diff --git a/vendor/golang.org/x/tools/go/packages/loadmode_string.go b/vendor/golang.org/x/tools/go/packages/loadmode_string.go new file mode 100644 index 0000000..5fcad6e --- /dev/null +++ b/vendor/golang.org/x/tools/go/packages/loadmode_string.go @@ -0,0 +1,54 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package packages + +import ( + "fmt" + "strings" +) + +var modes = [...]struct { + mode LoadMode + name string +}{ + {NeedName, "NeedName"}, + {NeedFiles, "NeedFiles"}, + {NeedCompiledGoFiles, "NeedCompiledGoFiles"}, + {NeedImports, "NeedImports"}, + {NeedDeps, "NeedDeps"}, + {NeedExportFile, "NeedExportFile"}, + {NeedTypes, "NeedTypes"}, + {NeedSyntax, "NeedSyntax"}, + {NeedTypesInfo, "NeedTypesInfo"}, + {NeedTypesSizes, "NeedTypesSizes"}, + {NeedModule, "NeedModule"}, + {NeedEmbedFiles, "NeedEmbedFiles"}, + {NeedEmbedPatterns, "NeedEmbedPatterns"}, +} + +func (mode LoadMode) String() string { + if mode == 0 { + return "LoadMode(0)" + } + var out []string + // named bits + for _, item := range modes { + if (mode & item.mode) != 0 { + mode ^= item.mode + out = append(out, item.name) + } + } + // unnamed residue + if mode != 0 { + if out == nil { + return fmt.Sprintf("LoadMode(%#x)", int(mode)) + } + out = append(out, fmt.Sprintf("%#x", int(mode))) + } + if len(out) == 1 { + return out[0] + } + return "(" + strings.Join(out, "|") + ")" +} diff --git a/vendor/golang.org/x/tools/go/packages/packages.go b/vendor/golang.org/x/tools/go/packages/packages.go new file mode 100644 index 0000000..f227f1b --- /dev/null +++ b/vendor/golang.org/x/tools/go/packages/packages.go @@ -0,0 +1,1527 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package packages + +// See doc.go for package documentation and implementation notes. + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "go/ast" + "go/parser" + "go/scanner" + "go/token" + "go/types" + "io" + "log" + "os" + "path/filepath" + "runtime" + "strings" + "sync" + "time" + + "golang.org/x/sync/errgroup" + + "golang.org/x/tools/go/gcexportdata" + "golang.org/x/tools/internal/gocommand" + "golang.org/x/tools/internal/packagesinternal" + "golang.org/x/tools/internal/typesinternal" + "golang.org/x/tools/internal/versions" +) + +// A LoadMode controls the amount of detail to return when loading. +// The bits below can be combined to specify which fields should be +// filled in the result packages. +// +// The zero value is a special case, equivalent to combining +// the NeedName, NeedFiles, and NeedCompiledGoFiles bits. +// +// ID and Errors (if present) will always be filled. +// [Load] may return more information than requested. +// +// Unfortunately there are a number of open bugs related to +// interactions among the LoadMode bits: +// - https://github.com/golang/go/issues/56633 +// - https://github.com/golang/go/issues/56677 +// - https://github.com/golang/go/issues/58726 +// - https://github.com/golang/go/issues/63517 +type LoadMode int + +const ( + // NeedName adds Name and PkgPath. + NeedName LoadMode = 1 << iota + + // NeedFiles adds GoFiles and OtherFiles. + NeedFiles + + // NeedCompiledGoFiles adds CompiledGoFiles. + NeedCompiledGoFiles + + // NeedImports adds Imports. If NeedDeps is not set, the Imports field will contain + // "placeholder" Packages with only the ID set. + NeedImports + + // NeedDeps adds the fields requested by the LoadMode in the packages in Imports. + NeedDeps + + // NeedExportFile adds ExportFile. + NeedExportFile + + // NeedTypes adds Types, Fset, and IllTyped. + NeedTypes + + // NeedSyntax adds Syntax and Fset. + NeedSyntax + + // NeedTypesInfo adds TypesInfo. + NeedTypesInfo + + // NeedTypesSizes adds TypesSizes. + NeedTypesSizes + + // needInternalDepsErrors adds the internal deps errors field for use by gopls. + needInternalDepsErrors + + // needInternalForTest adds the internal forTest field. + // Tests must also be set on the context for this field to be populated. + needInternalForTest + + // typecheckCgo enables full support for type checking cgo. Requires Go 1.15+. + // Modifies CompiledGoFiles and Types, and has no effect on its own. + typecheckCgo + + // NeedModule adds Module. + NeedModule + + // NeedEmbedFiles adds EmbedFiles. + NeedEmbedFiles + + // NeedEmbedPatterns adds EmbedPatterns. + NeedEmbedPatterns + + // Be sure to update loadmode_string.go when adding new items! +) + +const ( + // LoadFiles loads the name and file names for the initial packages. + // + // Deprecated: LoadFiles exists for historical compatibility + // and should not be used. Please directly specify the needed fields using the Need values. + LoadFiles = NeedName | NeedFiles | NeedCompiledGoFiles + + // LoadImports loads the name, file names, and import mapping for the initial packages. + // + // Deprecated: LoadImports exists for historical compatibility + // and should not be used. Please directly specify the needed fields using the Need values. + LoadImports = LoadFiles | NeedImports + + // LoadTypes loads exported type information for the initial packages. + // + // Deprecated: LoadTypes exists for historical compatibility + // and should not be used. Please directly specify the needed fields using the Need values. + LoadTypes = LoadImports | NeedTypes | NeedTypesSizes + + // LoadSyntax loads typed syntax for the initial packages. + // + // Deprecated: LoadSyntax exists for historical compatibility + // and should not be used. Please directly specify the needed fields using the Need values. + LoadSyntax = LoadTypes | NeedSyntax | NeedTypesInfo + + // LoadAllSyntax loads typed syntax for the initial packages and all dependencies. + // + // Deprecated: LoadAllSyntax exists for historical compatibility + // and should not be used. Please directly specify the needed fields using the Need values. + LoadAllSyntax = LoadSyntax | NeedDeps + + // Deprecated: NeedExportsFile is a historical misspelling of NeedExportFile. + NeedExportsFile = NeedExportFile +) + +// A Config specifies details about how packages should be loaded. +// The zero value is a valid configuration. +// +// Calls to Load do not modify this struct. +// +// TODO(adonovan): #67702: this is currently false: in fact, +// calls to [Load] do not modify the public fields of this struct, but +// may modify hidden fields, so concurrent calls to [Load] must not +// use the same Config. But perhaps we should reestablish the +// documented invariant. +type Config struct { + // Mode controls the level of information returned for each package. + Mode LoadMode + + // Context specifies the context for the load operation. + // Cancelling the context may cause [Load] to abort and + // return an error. + Context context.Context + + // Logf is the logger for the config. + // If the user provides a logger, debug logging is enabled. + // If the GOPACKAGESDEBUG environment variable is set to true, + // but the logger is nil, default to log.Printf. + Logf func(format string, args ...interface{}) + + // Dir is the directory in which to run the build system's query tool + // that provides information about the packages. + // If Dir is empty, the tool is run in the current directory. + Dir string + + // Env is the environment to use when invoking the build system's query tool. + // If Env is nil, the current environment is used. + // As in os/exec's Cmd, only the last value in the slice for + // each environment key is used. To specify the setting of only + // a few variables, append to the current environment, as in: + // + // opt.Env = append(os.Environ(), "GOOS=plan9", "GOARCH=386") + // + Env []string + + // gocmdRunner guards go command calls from concurrency errors. + gocmdRunner *gocommand.Runner + + // BuildFlags is a list of command-line flags to be passed through to + // the build system's query tool. + BuildFlags []string + + // modFile will be used for -modfile in go command invocations. + modFile string + + // modFlag will be used for -modfile in go command invocations. + modFlag string + + // Fset provides source position information for syntax trees and types. + // If Fset is nil, Load will use a new fileset, but preserve Fset's value. + Fset *token.FileSet + + // ParseFile is called to read and parse each file + // when preparing a package's type-checked syntax tree. + // It must be safe to call ParseFile simultaneously from multiple goroutines. + // If ParseFile is nil, the loader will uses parser.ParseFile. + // + // ParseFile should parse the source from src and use filename only for + // recording position information. + // + // An application may supply a custom implementation of ParseFile + // to change the effective file contents or the behavior of the parser, + // or to modify the syntax tree. For example, selectively eliminating + // unwanted function bodies can significantly accelerate type checking. + ParseFile func(fset *token.FileSet, filename string, src []byte) (*ast.File, error) + + // If Tests is set, the loader includes not just the packages + // matching a particular pattern but also any related test packages, + // including test-only variants of the package and the test executable. + // + // For example, when using the go command, loading "fmt" with Tests=true + // returns four packages, with IDs "fmt" (the standard package), + // "fmt [fmt.test]" (the package as compiled for the test), + // "fmt_test" (the test functions from source files in package fmt_test), + // and "fmt.test" (the test binary). + // + // In build systems with explicit names for tests, + // setting Tests may have no effect. + Tests bool + + // Overlay is a mapping from absolute file paths to file contents. + // + // For each map entry, [Load] uses the alternative file + // contents provided by the overlay mapping instead of reading + // from the file system. This mechanism can be used to enable + // editor-integrated tools to correctly analyze the contents + // of modified but unsaved buffers, for example. + // + // The overlay mapping is passed to the build system's driver + // (see "The driver protocol") so that it too can report + // consistent package metadata about unsaved files. However, + // drivers may vary in their level of support for overlays. + Overlay map[string][]byte + + // goListOverlayFile is the JSON file that encodes the Overlay + // mapping, used by 'go list -overlay=...' + goListOverlayFile string +} + +// Load loads and returns the Go packages named by the given patterns. +// +// The cfg parameter specifies loading options; nil behaves the same as an empty [Config]. +// +// The [Config.Mode] field is a set of bits that determine what kinds +// of information should be computed and returned. Modes that require +// more information tend to be slower. See [LoadMode] for details +// and important caveats. Its zero value is equivalent to +// [NeedName] | [NeedFiles] | [NeedCompiledGoFiles]. +// +// Each call to Load returns a new set of [Package] instances. +// The Packages and their Imports form a directed acyclic graph. +// +// If the [NeedTypes] mode flag was set, each call to Load uses a new +// [types.Importer], so [types.Object] and [types.Type] values from +// different calls to Load must not be mixed as they will have +// inconsistent notions of type identity. +// +// If any of the patterns was invalid as defined by the +// underlying build system, Load returns an error. +// It may return an empty list of packages without an error, +// for instance for an empty expansion of a valid wildcard. +// Errors associated with a particular package are recorded in the +// corresponding Package's Errors list, and do not cause Load to +// return an error. Clients may need to handle such errors before +// proceeding with further analysis. The [PrintErrors] function is +// provided for convenient display of all errors. +func Load(cfg *Config, patterns ...string) ([]*Package, error) { + ld := newLoader(cfg) + response, external, err := defaultDriver(&ld.Config, patterns...) + if err != nil { + return nil, err + } + + ld.sizes = types.SizesFor(response.Compiler, response.Arch) + if ld.sizes == nil && ld.Config.Mode&(NeedTypes|NeedTypesSizes|NeedTypesInfo) != 0 { + // Type size information is needed but unavailable. + if external { + // An external driver may fail to populate the Compiler/GOARCH fields, + // especially since they are relatively new (see #63700). + // Provide a sensible fallback in this case. + ld.sizes = types.SizesFor("gc", runtime.GOARCH) + if ld.sizes == nil { // gccgo-only arch + ld.sizes = types.SizesFor("gc", "amd64") + } + } else { + // Go list should never fail to deliver accurate size information. + // Reject the whole Load since the error is the same for every package. + return nil, fmt.Errorf("can't determine type sizes for compiler %q on GOARCH %q", + response.Compiler, response.Arch) + } + } + + return ld.refine(response) +} + +// defaultDriver is a driver that implements go/packages' fallback behavior. +// It will try to request to an external driver, if one exists. If there's +// no external driver, or the driver returns a response with NotHandled set, +// defaultDriver will fall back to the go list driver. +// The boolean result indicates that an external driver handled the request. +func defaultDriver(cfg *Config, patterns ...string) (*DriverResponse, bool, error) { + const ( + // windowsArgMax specifies the maximum command line length for + // the Windows' CreateProcess function. + windowsArgMax = 32767 + // maxEnvSize is a very rough estimation of the maximum environment + // size of a user. + maxEnvSize = 16384 + // safeArgMax specifies the maximum safe command line length to use + // by the underlying driver excl. the environment. We choose the Windows' + // ARG_MAX as the starting point because it's one of the lowest ARG_MAX + // constants out of the different supported platforms, + // e.g., https://www.in-ulm.de/~mascheck/various/argmax/#results. + safeArgMax = windowsArgMax - maxEnvSize + ) + chunks, err := splitIntoChunks(patterns, safeArgMax) + if err != nil { + return nil, false, err + } + + if driver := findExternalDriver(cfg); driver != nil { + response, err := callDriverOnChunks(driver, cfg, chunks) + if err != nil { + return nil, false, err + } else if !response.NotHandled { + return response, true, nil + } + // (fall through) + } + + // go list fallback + // + // Write overlays once, as there are many calls + // to 'go list' (one per chunk plus others too). + overlay, cleanupOverlay, err := gocommand.WriteOverlays(cfg.Overlay) + if err != nil { + return nil, false, err + } + defer cleanupOverlay() + cfg.goListOverlayFile = overlay + + response, err := callDriverOnChunks(goListDriver, cfg, chunks) + if err != nil { + return nil, false, err + } + return response, false, err +} + +// splitIntoChunks chunks the slice so that the total number of characters +// in a chunk is no longer than argMax. +func splitIntoChunks(patterns []string, argMax int) ([][]string, error) { + if argMax <= 0 { + return nil, errors.New("failed to split patterns into chunks, negative safe argMax value") + } + var chunks [][]string + charsInChunk := 0 + nextChunkStart := 0 + for i, v := range patterns { + vChars := len(v) + if vChars > argMax { + // a single pattern is longer than the maximum safe ARG_MAX, hardly should happen + return nil, errors.New("failed to split patterns into chunks, a pattern is too long") + } + charsInChunk += vChars + 1 // +1 is for a whitespace between patterns that has to be counted too + if charsInChunk > argMax { + chunks = append(chunks, patterns[nextChunkStart:i]) + nextChunkStart = i + charsInChunk = vChars + } + } + // add the last chunk + if nextChunkStart < len(patterns) { + chunks = append(chunks, patterns[nextChunkStart:]) + } + return chunks, nil +} + +func callDriverOnChunks(driver driver, cfg *Config, chunks [][]string) (*DriverResponse, error) { + if len(chunks) == 0 { + return driver(cfg) + } + responses := make([]*DriverResponse, len(chunks)) + errNotHandled := errors.New("driver returned NotHandled") + var g errgroup.Group + for i, chunk := range chunks { + i := i + chunk := chunk + g.Go(func() (err error) { + responses[i], err = driver(cfg, chunk...) + if responses[i] != nil && responses[i].NotHandled { + err = errNotHandled + } + return err + }) + } + if err := g.Wait(); err != nil { + if errors.Is(err, errNotHandled) { + return &DriverResponse{NotHandled: true}, nil + } + return nil, err + } + return mergeResponses(responses...), nil +} + +func mergeResponses(responses ...*DriverResponse) *DriverResponse { + if len(responses) == 0 { + return nil + } + response := newDeduper() + response.dr.NotHandled = false + response.dr.Compiler = responses[0].Compiler + response.dr.Arch = responses[0].Arch + response.dr.GoVersion = responses[0].GoVersion + for _, v := range responses { + response.addAll(v) + } + return response.dr +} + +// A Package describes a loaded Go package. +// +// It also defines part of the JSON schema of [DriverResponse]. +// See the package documentation for an overview. +type Package struct { + // ID is a unique identifier for a package, + // in a syntax provided by the underlying build system. + // + // Because the syntax varies based on the build system, + // clients should treat IDs as opaque and not attempt to + // interpret them. + ID string + + // Name is the package name as it appears in the package source code. + Name string + + // PkgPath is the package path as used by the go/types package. + PkgPath string + + // Errors contains any errors encountered querying the metadata + // of the package, or while parsing or type-checking its files. + Errors []Error + + // TypeErrors contains the subset of errors produced during type checking. + TypeErrors []types.Error + + // GoFiles lists the absolute file paths of the package's Go source files. + // It may include files that should not be compiled, for example because + // they contain non-matching build tags, are documentary pseudo-files such as + // unsafe/unsafe.go or builtin/builtin.go, or are subject to cgo preprocessing. + GoFiles []string + + // CompiledGoFiles lists the absolute file paths of the package's source + // files that are suitable for type checking. + // This may differ from GoFiles if files are processed before compilation. + CompiledGoFiles []string + + // OtherFiles lists the absolute file paths of the package's non-Go source files, + // including assembly, C, C++, Fortran, Objective-C, SWIG, and so on. + OtherFiles []string + + // EmbedFiles lists the absolute file paths of the package's files + // embedded with go:embed. + EmbedFiles []string + + // EmbedPatterns lists the absolute file patterns of the package's + // files embedded with go:embed. + EmbedPatterns []string + + // IgnoredFiles lists source files that are not part of the package + // using the current build configuration but that might be part of + // the package using other build configurations. + IgnoredFiles []string + + // ExportFile is the absolute path to a file containing type + // information for the package as provided by the build system. + ExportFile string + + // Imports maps import paths appearing in the package's Go source files + // to corresponding loaded Packages. + Imports map[string]*Package + + // Module is the module information for the package if it exists. + // + // Note: it may be missing for std and cmd; see Go issue #65816. + Module *Module + + // -- The following fields are not part of the driver JSON schema. -- + + // Types provides type information for the package. + // The NeedTypes LoadMode bit sets this field for packages matching the + // patterns; type information for dependencies may be missing or incomplete, + // unless NeedDeps and NeedImports are also set. + // + // Each call to [Load] returns a consistent set of type + // symbols, as defined by the comment at [types.Identical]. + // Avoid mixing type information from two or more calls to [Load]. + Types *types.Package `json:"-"` + + // Fset provides position information for Types, TypesInfo, and Syntax. + // It is set only when Types is set. + Fset *token.FileSet `json:"-"` + + // IllTyped indicates whether the package or any dependency contains errors. + // It is set only when Types is set. + IllTyped bool `json:"-"` + + // Syntax is the package's syntax trees, for the files listed in CompiledGoFiles. + // + // The NeedSyntax LoadMode bit populates this field for packages matching the patterns. + // If NeedDeps and NeedImports are also set, this field will also be populated + // for dependencies. + // + // Syntax is kept in the same order as CompiledGoFiles, with the caveat that nils are + // removed. If parsing returned nil, Syntax may be shorter than CompiledGoFiles. + Syntax []*ast.File `json:"-"` + + // TypesInfo provides type information about the package's syntax trees. + // It is set only when Syntax is set. + TypesInfo *types.Info `json:"-"` + + // TypesSizes provides the effective size function for types in TypesInfo. + TypesSizes types.Sizes `json:"-"` + + // -- internal -- + + // forTest is the package under test, if any. + forTest string + + // depsErrors is the DepsErrors field from the go list response, if any. + depsErrors []*packagesinternal.PackageError +} + +// Module provides module information for a package. +// +// It also defines part of the JSON schema of [DriverResponse]. +// See the package documentation for an overview. +type Module struct { + Path string // module path + Version string // module version + Replace *Module // replaced by this module + Time *time.Time // time version was created + Main bool // is this the main module? + Indirect bool // is this module only an indirect dependency of main module? + Dir string // directory holding files for this module, if any + GoMod string // path to go.mod file used when loading this module, if any + GoVersion string // go version used in module + Error *ModuleError // error loading module +} + +// ModuleError holds errors loading a module. +type ModuleError struct { + Err string // the error itself +} + +func init() { + packagesinternal.GetForTest = func(p interface{}) string { + return p.(*Package).forTest + } + packagesinternal.GetDepsErrors = func(p interface{}) []*packagesinternal.PackageError { + return p.(*Package).depsErrors + } + packagesinternal.SetModFile = func(config interface{}, value string) { + config.(*Config).modFile = value + } + packagesinternal.SetModFlag = func(config interface{}, value string) { + config.(*Config).modFlag = value + } + packagesinternal.TypecheckCgo = int(typecheckCgo) + packagesinternal.DepsErrors = int(needInternalDepsErrors) + packagesinternal.ForTest = int(needInternalForTest) +} + +// An Error describes a problem with a package's metadata, syntax, or types. +type Error struct { + Pos string // "file:line:col" or "file:line" or "" or "-" + Msg string + Kind ErrorKind +} + +// ErrorKind describes the source of the error, allowing the user to +// differentiate between errors generated by the driver, the parser, or the +// type-checker. +type ErrorKind int + +const ( + UnknownError ErrorKind = iota + ListError + ParseError + TypeError +) + +func (err Error) Error() string { + pos := err.Pos + if pos == "" { + pos = "-" // like token.Position{}.String() + } + return pos + ": " + err.Msg +} + +// flatPackage is the JSON form of Package +// It drops all the type and syntax fields, and transforms the Imports +// +// TODO(adonovan): identify this struct with Package, effectively +// publishing the JSON protocol. +type flatPackage struct { + ID string + Name string `json:",omitempty"` + PkgPath string `json:",omitempty"` + Errors []Error `json:",omitempty"` + GoFiles []string `json:",omitempty"` + CompiledGoFiles []string `json:",omitempty"` + OtherFiles []string `json:",omitempty"` + EmbedFiles []string `json:",omitempty"` + EmbedPatterns []string `json:",omitempty"` + IgnoredFiles []string `json:",omitempty"` + ExportFile string `json:",omitempty"` + Imports map[string]string `json:",omitempty"` +} + +// MarshalJSON returns the Package in its JSON form. +// For the most part, the structure fields are written out unmodified, and +// the type and syntax fields are skipped. +// The imports are written out as just a map of path to package id. +// The errors are written using a custom type that tries to preserve the +// structure of error types we know about. +// +// This method exists to enable support for additional build systems. It is +// not intended for use by clients of the API and we may change the format. +func (p *Package) MarshalJSON() ([]byte, error) { + flat := &flatPackage{ + ID: p.ID, + Name: p.Name, + PkgPath: p.PkgPath, + Errors: p.Errors, + GoFiles: p.GoFiles, + CompiledGoFiles: p.CompiledGoFiles, + OtherFiles: p.OtherFiles, + EmbedFiles: p.EmbedFiles, + EmbedPatterns: p.EmbedPatterns, + IgnoredFiles: p.IgnoredFiles, + ExportFile: p.ExportFile, + } + if len(p.Imports) > 0 { + flat.Imports = make(map[string]string, len(p.Imports)) + for path, ipkg := range p.Imports { + flat.Imports[path] = ipkg.ID + } + } + return json.Marshal(flat) +} + +// UnmarshalJSON reads in a Package from its JSON format. +// See MarshalJSON for details about the format accepted. +func (p *Package) UnmarshalJSON(b []byte) error { + flat := &flatPackage{} + if err := json.Unmarshal(b, &flat); err != nil { + return err + } + *p = Package{ + ID: flat.ID, + Name: flat.Name, + PkgPath: flat.PkgPath, + Errors: flat.Errors, + GoFiles: flat.GoFiles, + CompiledGoFiles: flat.CompiledGoFiles, + OtherFiles: flat.OtherFiles, + EmbedFiles: flat.EmbedFiles, + EmbedPatterns: flat.EmbedPatterns, + IgnoredFiles: flat.IgnoredFiles, + ExportFile: flat.ExportFile, + } + if len(flat.Imports) > 0 { + p.Imports = make(map[string]*Package, len(flat.Imports)) + for path, id := range flat.Imports { + p.Imports[path] = &Package{ID: id} + } + } + return nil +} + +func (p *Package) String() string { return p.ID } + +// loaderPackage augments Package with state used during the loading phase +type loaderPackage struct { + *Package + importErrors map[string]error // maps each bad import to its error + loadOnce sync.Once + color uint8 // for cycle detection + needsrc bool // load from source (Mode >= LoadTypes) + needtypes bool // type information is either requested or depended on + initial bool // package was matched by a pattern + goVersion int // minor version number of go command on PATH +} + +// loader holds the working state of a single call to load. +type loader struct { + pkgs map[string]*loaderPackage + Config + sizes types.Sizes // non-nil if needed by mode + parseCache map[string]*parseValue + parseCacheMu sync.Mutex + exportMu sync.Mutex // enforces mutual exclusion of exportdata operations + + // Config.Mode contains the implied mode (see impliedLoadMode). + // Implied mode contains all the fields we need the data for. + // In requestedMode there are the actually requested fields. + // We'll zero them out before returning packages to the user. + // This makes it easier for us to get the conditions where + // we need certain modes right. + requestedMode LoadMode +} + +type parseValue struct { + f *ast.File + err error + ready chan struct{} +} + +func newLoader(cfg *Config) *loader { + ld := &loader{ + parseCache: map[string]*parseValue{}, + } + if cfg != nil { + ld.Config = *cfg + // If the user has provided a logger, use it. + ld.Config.Logf = cfg.Logf + } + if ld.Config.Logf == nil { + // If the GOPACKAGESDEBUG environment variable is set to true, + // but the user has not provided a logger, default to log.Printf. + if debug { + ld.Config.Logf = log.Printf + } else { + ld.Config.Logf = func(format string, args ...interface{}) {} + } + } + if ld.Config.Mode == 0 { + ld.Config.Mode = NeedName | NeedFiles | NeedCompiledGoFiles // Preserve zero behavior of Mode for backwards compatibility. + } + if ld.Config.Env == nil { + ld.Config.Env = os.Environ() + } + if ld.Config.gocmdRunner == nil { + ld.Config.gocmdRunner = &gocommand.Runner{} + } + if ld.Context == nil { + ld.Context = context.Background() + } + if ld.Dir == "" { + if dir, err := os.Getwd(); err == nil { + ld.Dir = dir + } + } + + // Save the actually requested fields. We'll zero them out before returning packages to the user. + ld.requestedMode = ld.Mode + ld.Mode = impliedLoadMode(ld.Mode) + + if ld.Mode&NeedTypes != 0 || ld.Mode&NeedSyntax != 0 { + if ld.Fset == nil { + ld.Fset = token.NewFileSet() + } + + // ParseFile is required even in LoadTypes mode + // because we load source if export data is missing. + if ld.ParseFile == nil { + ld.ParseFile = func(fset *token.FileSet, filename string, src []byte) (*ast.File, error) { + // We implicitly promise to keep doing ast.Object resolution. :( + const mode = parser.AllErrors | parser.ParseComments + return parser.ParseFile(fset, filename, src, mode) + } + } + } + + return ld +} + +// refine connects the supplied packages into a graph and then adds type +// and syntax information as requested by the LoadMode. +func (ld *loader) refine(response *DriverResponse) ([]*Package, error) { + roots := response.Roots + rootMap := make(map[string]int, len(roots)) + for i, root := range roots { + rootMap[root] = i + } + ld.pkgs = make(map[string]*loaderPackage) + // first pass, fixup and build the map and roots + var initial = make([]*loaderPackage, len(roots)) + for _, pkg := range response.Packages { + rootIndex := -1 + if i, found := rootMap[pkg.ID]; found { + rootIndex = i + } + + // Overlays can invalidate export data. + // TODO(matloob): make this check fine-grained based on dependencies on overlaid files + exportDataInvalid := len(ld.Overlay) > 0 || pkg.ExportFile == "" && pkg.PkgPath != "unsafe" + // This package needs type information if the caller requested types and the package is + // either a root, or it's a non-root and the user requested dependencies ... + needtypes := (ld.Mode&NeedTypes|NeedTypesInfo != 0 && (rootIndex >= 0 || ld.Mode&NeedDeps != 0)) + // This package needs source if the call requested source (or types info, which implies source) + // and the package is either a root, or itas a non- root and the user requested dependencies... + needsrc := ((ld.Mode&(NeedSyntax|NeedTypesInfo) != 0 && (rootIndex >= 0 || ld.Mode&NeedDeps != 0)) || + // ... or if we need types and the exportData is invalid. We fall back to (incompletely) + // typechecking packages from source if they fail to compile. + (ld.Mode&(NeedTypes|NeedTypesInfo) != 0 && exportDataInvalid)) && pkg.PkgPath != "unsafe" + lpkg := &loaderPackage{ + Package: pkg, + needtypes: needtypes, + needsrc: needsrc, + goVersion: response.GoVersion, + } + ld.pkgs[lpkg.ID] = lpkg + if rootIndex >= 0 { + initial[rootIndex] = lpkg + lpkg.initial = true + } + } + for i, root := range roots { + if initial[i] == nil { + return nil, fmt.Errorf("root package %v is missing", root) + } + } + + if ld.Mode&NeedImports != 0 { + // Materialize the import graph. + + const ( + white = 0 // new + grey = 1 // in progress + black = 2 // complete + ) + + // visit traverses the import graph, depth-first, + // and materializes the graph as Packages.Imports. + // + // Valid imports are saved in the Packages.Import map. + // Invalid imports (cycles and missing nodes) are saved in the importErrors map. + // Thus, even in the presence of both kinds of errors, + // the Import graph remains a DAG. + // + // visit returns whether the package needs src or has a transitive + // dependency on a package that does. These are the only packages + // for which we load source code. + var stack []*loaderPackage + var visit func(lpkg *loaderPackage) bool + visit = func(lpkg *loaderPackage) bool { + switch lpkg.color { + case black: + return lpkg.needsrc + case grey: + panic("internal error: grey node") + } + lpkg.color = grey + stack = append(stack, lpkg) // push + stubs := lpkg.Imports // the structure form has only stubs with the ID in the Imports + lpkg.Imports = make(map[string]*Package, len(stubs)) + for importPath, ipkg := range stubs { + var importErr error + imp := ld.pkgs[ipkg.ID] + if imp == nil { + // (includes package "C" when DisableCgo) + importErr = fmt.Errorf("missing package: %q", ipkg.ID) + } else if imp.color == grey { + importErr = fmt.Errorf("import cycle: %s", stack) + } + if importErr != nil { + if lpkg.importErrors == nil { + lpkg.importErrors = make(map[string]error) + } + lpkg.importErrors[importPath] = importErr + continue + } + + if visit(imp) { + lpkg.needsrc = true + } + lpkg.Imports[importPath] = imp.Package + } + + // Complete type information is required for the + // immediate dependencies of each source package. + if lpkg.needsrc && ld.Mode&NeedTypes != 0 { + for _, ipkg := range lpkg.Imports { + ld.pkgs[ipkg.ID].needtypes = true + } + } + + // NeedTypeSizes causes TypeSizes to be set even + // on packages for which types aren't needed. + if ld.Mode&NeedTypesSizes != 0 { + lpkg.TypesSizes = ld.sizes + } + stack = stack[:len(stack)-1] // pop + lpkg.color = black + + return lpkg.needsrc + } + + // For each initial package, create its import DAG. + for _, lpkg := range initial { + visit(lpkg) + } + + } else { + // !NeedImports: drop the stub (ID-only) import packages + // that we are not even going to try to resolve. + for _, lpkg := range initial { + lpkg.Imports = nil + } + } + + // Load type data and syntax if needed, starting at + // the initial packages (roots of the import DAG). + if ld.Mode&NeedTypes != 0 || ld.Mode&NeedSyntax != 0 { + var wg sync.WaitGroup + for _, lpkg := range initial { + wg.Add(1) + go func(lpkg *loaderPackage) { + ld.loadRecursive(lpkg) + wg.Done() + }(lpkg) + } + wg.Wait() + } + + // If the context is done, return its error and + // throw out [likely] incomplete packages. + if err := ld.Context.Err(); err != nil { + return nil, err + } + + result := make([]*Package, len(initial)) + for i, lpkg := range initial { + result[i] = lpkg.Package + } + for i := range ld.pkgs { + // Clear all unrequested fields, + // to catch programs that use more than they request. + if ld.requestedMode&NeedName == 0 { + ld.pkgs[i].Name = "" + ld.pkgs[i].PkgPath = "" + } + if ld.requestedMode&NeedFiles == 0 { + ld.pkgs[i].GoFiles = nil + ld.pkgs[i].OtherFiles = nil + ld.pkgs[i].IgnoredFiles = nil + } + if ld.requestedMode&NeedEmbedFiles == 0 { + ld.pkgs[i].EmbedFiles = nil + } + if ld.requestedMode&NeedEmbedPatterns == 0 { + ld.pkgs[i].EmbedPatterns = nil + } + if ld.requestedMode&NeedCompiledGoFiles == 0 { + ld.pkgs[i].CompiledGoFiles = nil + } + if ld.requestedMode&NeedImports == 0 { + ld.pkgs[i].Imports = nil + } + if ld.requestedMode&NeedExportFile == 0 { + ld.pkgs[i].ExportFile = "" + } + if ld.requestedMode&NeedTypes == 0 { + ld.pkgs[i].Types = nil + ld.pkgs[i].IllTyped = false + } + if ld.requestedMode&NeedSyntax == 0 { + ld.pkgs[i].Syntax = nil + } + if ld.requestedMode&NeedTypes == 0 && ld.requestedMode&NeedSyntax == 0 { + ld.pkgs[i].Fset = nil + } + if ld.requestedMode&NeedTypesInfo == 0 { + ld.pkgs[i].TypesInfo = nil + } + if ld.requestedMode&NeedTypesSizes == 0 { + ld.pkgs[i].TypesSizes = nil + } + if ld.requestedMode&NeedModule == 0 { + ld.pkgs[i].Module = nil + } + } + + return result, nil +} + +// loadRecursive loads the specified package and its dependencies, +// recursively, in parallel, in topological order. +// It is atomic and idempotent. +// Precondition: ld.Mode&NeedTypes. +func (ld *loader) loadRecursive(lpkg *loaderPackage) { + lpkg.loadOnce.Do(func() { + // Load the direct dependencies, in parallel. + var wg sync.WaitGroup + for _, ipkg := range lpkg.Imports { + imp := ld.pkgs[ipkg.ID] + wg.Add(1) + go func(imp *loaderPackage) { + ld.loadRecursive(imp) + wg.Done() + }(imp) + } + wg.Wait() + ld.loadPackage(lpkg) + }) +} + +// loadPackage loads the specified package. +// It must be called only once per Package, +// after immediate dependencies are loaded. +// Precondition: ld.Mode & NeedTypes. +func (ld *loader) loadPackage(lpkg *loaderPackage) { + if lpkg.PkgPath == "unsafe" { + // Fill in the blanks to avoid surprises. + lpkg.Types = types.Unsafe + lpkg.Fset = ld.Fset + lpkg.Syntax = []*ast.File{} + lpkg.TypesInfo = new(types.Info) + lpkg.TypesSizes = ld.sizes + return + } + + // Call NewPackage directly with explicit name. + // This avoids skew between golist and go/types when the files' + // package declarations are inconsistent. + lpkg.Types = types.NewPackage(lpkg.PkgPath, lpkg.Name) + lpkg.Fset = ld.Fset + + // Start shutting down if the context is done and do not load + // source or export data files. + // Packages that import this one will have ld.Context.Err() != nil. + // ld.Context.Err() will be returned later by refine. + if ld.Context.Err() != nil { + return + } + + // Subtle: we populate all Types fields with an empty Package + // before loading export data so that export data processing + // never has to create a types.Package for an indirect dependency, + // which would then require that such created packages be explicitly + // inserted back into the Import graph as a final step after export data loading. + // (Hence this return is after the Types assignment.) + // The Diamond test exercises this case. + if !lpkg.needtypes && !lpkg.needsrc { + return + } + if !lpkg.needsrc { + if err := ld.loadFromExportData(lpkg); err != nil { + lpkg.Errors = append(lpkg.Errors, Error{ + Pos: "-", + Msg: err.Error(), + Kind: UnknownError, // e.g. can't find/open/parse export data + }) + } + return // not a source package, don't get syntax trees + } + + appendError := func(err error) { + // Convert various error types into the one true Error. + var errs []Error + switch err := err.(type) { + case Error: + // from driver + errs = append(errs, err) + + case *os.PathError: + // from parser + errs = append(errs, Error{ + Pos: err.Path + ":1", + Msg: err.Err.Error(), + Kind: ParseError, + }) + + case scanner.ErrorList: + // from parser + for _, err := range err { + errs = append(errs, Error{ + Pos: err.Pos.String(), + Msg: err.Msg, + Kind: ParseError, + }) + } + + case types.Error: + // from type checker + lpkg.TypeErrors = append(lpkg.TypeErrors, err) + errs = append(errs, Error{ + Pos: err.Fset.Position(err.Pos).String(), + Msg: err.Msg, + Kind: TypeError, + }) + + default: + // unexpected impoverished error from parser? + errs = append(errs, Error{ + Pos: "-", + Msg: err.Error(), + Kind: UnknownError, + }) + + // If you see this error message, please file a bug. + log.Printf("internal error: error %q (%T) without position", err, err) + } + + lpkg.Errors = append(lpkg.Errors, errs...) + } + + // If the go command on the PATH is newer than the runtime, + // then the go/{scanner,ast,parser,types} packages from the + // standard library may be unable to process the files + // selected by go list. + // + // There is currently no way to downgrade the effective + // version of the go command (see issue 52078), so we proceed + // with the newer go command but, in case of parse or type + // errors, we emit an additional diagnostic. + // + // See: + // - golang.org/issue/52078 (flag to set release tags) + // - golang.org/issue/50825 (gopls legacy version support) + // - golang.org/issue/55883 (go/packages confusing error) + // + // Should we assert a hard minimum of (currently) go1.16 here? + var runtimeVersion int + if _, err := fmt.Sscanf(runtime.Version(), "go1.%d", &runtimeVersion); err == nil && runtimeVersion < lpkg.goVersion { + defer func() { + if len(lpkg.Errors) > 0 { + appendError(Error{ + Pos: "-", + Msg: fmt.Sprintf("This application uses version go1.%d of the source-processing packages but runs version go1.%d of 'go list'. It may fail to process source files that rely on newer language features. If so, rebuild the application using a newer version of Go.", runtimeVersion, lpkg.goVersion), + Kind: UnknownError, + }) + } + }() + } + + if ld.Config.Mode&NeedTypes != 0 && len(lpkg.CompiledGoFiles) == 0 && lpkg.ExportFile != "" { + // The config requested loading sources and types, but sources are missing. + // Add an error to the package and fall back to loading from export data. + appendError(Error{"-", fmt.Sprintf("sources missing for package %s", lpkg.ID), ParseError}) + _ = ld.loadFromExportData(lpkg) // ignore any secondary errors + + return // can't get syntax trees for this package + } + + files, errs := ld.parseFiles(lpkg.CompiledGoFiles) + for _, err := range errs { + appendError(err) + } + + lpkg.Syntax = files + if ld.Config.Mode&NeedTypes == 0 { + return + } + + // Start shutting down if the context is done and do not type check. + // Packages that import this one will have ld.Context.Err() != nil. + // ld.Context.Err() will be returned later by refine. + if ld.Context.Err() != nil { + return + } + + lpkg.TypesInfo = &types.Info{ + Types: make(map[ast.Expr]types.TypeAndValue), + Defs: make(map[*ast.Ident]types.Object), + Uses: make(map[*ast.Ident]types.Object), + Implicits: make(map[ast.Node]types.Object), + Instances: make(map[*ast.Ident]types.Instance), + Scopes: make(map[ast.Node]*types.Scope), + Selections: make(map[*ast.SelectorExpr]*types.Selection), + } + versions.InitFileVersions(lpkg.TypesInfo) + lpkg.TypesSizes = ld.sizes + + importer := importerFunc(func(path string) (*types.Package, error) { + if path == "unsafe" { + return types.Unsafe, nil + } + + // The imports map is keyed by import path. + ipkg := lpkg.Imports[path] + if ipkg == nil { + if err := lpkg.importErrors[path]; err != nil { + return nil, err + } + // There was skew between the metadata and the + // import declarations, likely due to an edit + // race, or because the ParseFile feature was + // used to supply alternative file contents. + return nil, fmt.Errorf("no metadata for %s", path) + } + + if ipkg.Types != nil && ipkg.Types.Complete() { + return ipkg.Types, nil + } + log.Fatalf("internal error: package %q without types was imported from %q", path, lpkg) + panic("unreachable") + }) + + // type-check + tc := &types.Config{ + Importer: importer, + + // Type-check bodies of functions only in initial packages. + // Example: for import graph A->B->C and initial packages {A,C}, + // we can ignore function bodies in B. + IgnoreFuncBodies: ld.Mode&NeedDeps == 0 && !lpkg.initial, + + Error: appendError, + Sizes: ld.sizes, // may be nil + } + if lpkg.Module != nil && lpkg.Module.GoVersion != "" { + tc.GoVersion = "go" + lpkg.Module.GoVersion + } + if (ld.Mode & typecheckCgo) != 0 { + if !typesinternal.SetUsesCgo(tc) { + appendError(Error{ + Msg: "typecheckCgo requires Go 1.15+", + Kind: ListError, + }) + return + } + } + + typErr := types.NewChecker(tc, ld.Fset, lpkg.Types, lpkg.TypesInfo).Files(lpkg.Syntax) + lpkg.importErrors = nil // no longer needed + + // In go/types go1.21 and go1.22, Checker.Files failed fast with a + // a "too new" error, without calling tc.Error and without + // proceeding to type-check the package (#66525). + // We rely on the runtimeVersion error to give the suggested remedy. + if typErr != nil && len(lpkg.Errors) == 0 && len(lpkg.Syntax) > 0 { + if msg := typErr.Error(); strings.HasPrefix(msg, "package requires newer Go version") { + appendError(types.Error{ + Fset: ld.Fset, + Pos: lpkg.Syntax[0].Package, + Msg: msg, + }) + } + } + + // If !Cgo, the type-checker uses FakeImportC mode, so + // it doesn't invoke the importer for import "C", + // nor report an error for the import, + // or for any undefined C.f reference. + // We must detect this explicitly and correctly + // mark the package as IllTyped (by reporting an error). + // TODO(adonovan): if these errors are annoying, + // we could just set IllTyped quietly. + if tc.FakeImportC { + outer: + for _, f := range lpkg.Syntax { + for _, imp := range f.Imports { + if imp.Path.Value == `"C"` { + err := types.Error{Fset: ld.Fset, Pos: imp.Pos(), Msg: `import "C" ignored`} + appendError(err) + break outer + } + } + } + } + + // If types.Checker.Files had an error that was unreported, + // make sure to report the unknown error so the package is illTyped. + if typErr != nil && len(lpkg.Errors) == 0 { + appendError(typErr) + } + + // Record accumulated errors. + illTyped := len(lpkg.Errors) > 0 + if !illTyped { + for _, imp := range lpkg.Imports { + if imp.IllTyped { + illTyped = true + break + } + } + } + lpkg.IllTyped = illTyped +} + +// An importFunc is an implementation of the single-method +// types.Importer interface based on a function value. +type importerFunc func(path string) (*types.Package, error) + +func (f importerFunc) Import(path string) (*types.Package, error) { return f(path) } + +// We use a counting semaphore to limit +// the number of parallel I/O calls per process. +var ioLimit = make(chan bool, 20) + +func (ld *loader) parseFile(filename string) (*ast.File, error) { + ld.parseCacheMu.Lock() + v, ok := ld.parseCache[filename] + if ok { + // cache hit + ld.parseCacheMu.Unlock() + <-v.ready + } else { + // cache miss + v = &parseValue{ready: make(chan struct{})} + ld.parseCache[filename] = v + ld.parseCacheMu.Unlock() + + var src []byte + for f, contents := range ld.Config.Overlay { + if sameFile(f, filename) { + src = contents + } + } + var err error + if src == nil { + ioLimit <- true // wait + src, err = os.ReadFile(filename) + <-ioLimit // signal + } + if err != nil { + v.err = err + } else { + v.f, v.err = ld.ParseFile(ld.Fset, filename, src) + } + + close(v.ready) + } + return v.f, v.err +} + +// parseFiles reads and parses the Go source files and returns the ASTs +// of the ones that could be at least partially parsed, along with a +// list of I/O and parse errors encountered. +// +// Because files are scanned in parallel, the token.Pos +// positions of the resulting ast.Files are not ordered. +func (ld *loader) parseFiles(filenames []string) ([]*ast.File, []error) { + var wg sync.WaitGroup + n := len(filenames) + parsed := make([]*ast.File, n) + errors := make([]error, n) + for i, file := range filenames { + wg.Add(1) + go func(i int, filename string) { + parsed[i], errors[i] = ld.parseFile(filename) + wg.Done() + }(i, file) + } + wg.Wait() + + // Eliminate nils, preserving order. + var o int + for _, f := range parsed { + if f != nil { + parsed[o] = f + o++ + } + } + parsed = parsed[:o] + + o = 0 + for _, err := range errors { + if err != nil { + errors[o] = err + o++ + } + } + errors = errors[:o] + + return parsed, errors +} + +// sameFile returns true if x and y have the same basename and denote +// the same file. +func sameFile(x, y string) bool { + if x == y { + // It could be the case that y doesn't exist. + // For instance, it may be an overlay file that + // hasn't been written to disk. To handle that case + // let x == y through. (We added the exact absolute path + // string to the CompiledGoFiles list, so the unwritten + // overlay case implies x==y.) + return true + } + if strings.EqualFold(filepath.Base(x), filepath.Base(y)) { // (optimisation) + if xi, err := os.Stat(x); err == nil { + if yi, err := os.Stat(y); err == nil { + return os.SameFile(xi, yi) + } + } + } + return false +} + +// loadFromExportData ensures that type information is present for the specified +// package, loading it from an export data file on the first request. +// On success it sets lpkg.Types to a new Package. +func (ld *loader) loadFromExportData(lpkg *loaderPackage) error { + if lpkg.PkgPath == "" { + log.Fatalf("internal error: Package %s has no PkgPath", lpkg) + } + + // Because gcexportdata.Read has the potential to create or + // modify the types.Package for each node in the transitive + // closure of dependencies of lpkg, all exportdata operations + // must be sequential. (Finer-grained locking would require + // changes to the gcexportdata API.) + // + // The exportMu lock guards the lpkg.Types field and the + // types.Package it points to, for each loaderPackage in the graph. + // + // Not all accesses to Package.Pkg need to be protected by exportMu: + // graph ordering ensures that direct dependencies of source + // packages are fully loaded before the importer reads their Pkg field. + ld.exportMu.Lock() + defer ld.exportMu.Unlock() + + if tpkg := lpkg.Types; tpkg != nil && tpkg.Complete() { + return nil // cache hit + } + + lpkg.IllTyped = true // fail safe + + if lpkg.ExportFile == "" { + // Errors while building export data will have been printed to stderr. + return fmt.Errorf("no export data file") + } + f, err := os.Open(lpkg.ExportFile) + if err != nil { + return err + } + defer f.Close() + + // Read gc export data. + // + // We don't currently support gccgo export data because all + // underlying workspaces use the gc toolchain. (Even build + // systems that support gccgo don't use it for workspace + // queries.) + r, err := gcexportdata.NewReader(f) + if err != nil { + return fmt.Errorf("reading %s: %v", lpkg.ExportFile, err) + } + + // Build the view. + // + // The gcexportdata machinery has no concept of package ID. + // It identifies packages by their PkgPath, which although not + // globally unique is unique within the scope of one invocation + // of the linker, type-checker, or gcexportdata. + // + // So, we must build a PkgPath-keyed view of the global + // (conceptually ID-keyed) cache of packages and pass it to + // gcexportdata. The view must contain every existing + // package that might possibly be mentioned by the + // current package---its transitive closure. + // + // In loadPackage, we unconditionally create a types.Package for + // each dependency so that export data loading does not + // create new ones. + // + // TODO(adonovan): it would be simpler and more efficient + // if the export data machinery invoked a callback to + // get-or-create a package instead of a map. + // + view := make(map[string]*types.Package) // view seen by gcexportdata + seen := make(map[*loaderPackage]bool) // all visited packages + var visit func(pkgs map[string]*Package) + visit = func(pkgs map[string]*Package) { + for _, p := range pkgs { + lpkg := ld.pkgs[p.ID] + if !seen[lpkg] { + seen[lpkg] = true + view[lpkg.PkgPath] = lpkg.Types + visit(lpkg.Imports) + } + } + } + visit(lpkg.Imports) + + viewLen := len(view) + 1 // adding the self package + // Parse the export data. + // (May modify incomplete packages in view but not create new ones.) + tpkg, err := gcexportdata.Read(r, ld.Fset, view, lpkg.PkgPath) + if err != nil { + return fmt.Errorf("reading %s: %v", lpkg.ExportFile, err) + } + if _, ok := view["go.shape"]; ok { + // Account for the pseudopackage "go.shape" that gets + // created by generic code. + viewLen++ + } + if viewLen != len(view) { + log.Panicf("golang.org/x/tools/go/packages: unexpected new packages during load of %s", lpkg.PkgPath) + } + + lpkg.Types = tpkg + lpkg.IllTyped = false + return nil +} + +// impliedLoadMode returns loadMode with its dependencies. +func impliedLoadMode(loadMode LoadMode) LoadMode { + if loadMode&(NeedDeps|NeedTypes|NeedTypesInfo) != 0 { + // All these things require knowing the import graph. + loadMode |= NeedImports + } + if loadMode&NeedTypes != 0 { + // Types require the GoVersion from Module. + loadMode |= NeedModule + } + + return loadMode +} + +func usesExportData(cfg *Config) bool { + return cfg.Mode&NeedExportFile != 0 || cfg.Mode&NeedTypes != 0 && cfg.Mode&NeedDeps == 0 +} + +var _ interface{} = io.Discard // assert build toolchain is go1.16 or later diff --git a/vendor/golang.org/x/tools/go/packages/visit.go b/vendor/golang.org/x/tools/go/packages/visit.go new file mode 100644 index 0000000..df14ffd --- /dev/null +++ b/vendor/golang.org/x/tools/go/packages/visit.go @@ -0,0 +1,68 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package packages + +import ( + "fmt" + "os" + "sort" +) + +// Visit visits all the packages in the import graph whose roots are +// pkgs, calling the optional pre function the first time each package +// is encountered (preorder), and the optional post function after a +// package's dependencies have been visited (postorder). +// The boolean result of pre(pkg) determines whether +// the imports of package pkg are visited. +func Visit(pkgs []*Package, pre func(*Package) bool, post func(*Package)) { + seen := make(map[*Package]bool) + var visit func(*Package) + visit = func(pkg *Package) { + if !seen[pkg] { + seen[pkg] = true + + if pre == nil || pre(pkg) { + paths := make([]string, 0, len(pkg.Imports)) + for path := range pkg.Imports { + paths = append(paths, path) + } + sort.Strings(paths) // Imports is a map, this makes visit stable + for _, path := range paths { + visit(pkg.Imports[path]) + } + } + + if post != nil { + post(pkg) + } + } + } + for _, pkg := range pkgs { + visit(pkg) + } +} + +// PrintErrors prints to os.Stderr the accumulated errors of all +// packages in the import graph rooted at pkgs, dependencies first. +// PrintErrors returns the number of errors printed. +func PrintErrors(pkgs []*Package) int { + var n int + errModules := make(map[*Module]bool) + Visit(pkgs, nil, func(pkg *Package) { + for _, err := range pkg.Errors { + fmt.Fprintln(os.Stderr, err) + n++ + } + + // Print pkg.Module.Error once if present. + mod := pkg.Module + if mod != nil && mod.Error != nil && !errModules[mod] { + errModules[mod] = true + fmt.Fprintln(os.Stderr, mod.Error.Err) + n++ + } + }) + return n +} diff --git a/vendor/golang.org/x/tools/go/types/objectpath/objectpath.go b/vendor/golang.org/x/tools/go/types/objectpath/objectpath.go new file mode 100644 index 0000000..a70b727 --- /dev/null +++ b/vendor/golang.org/x/tools/go/types/objectpath/objectpath.go @@ -0,0 +1,788 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package objectpath defines a naming scheme for types.Objects +// (that is, named entities in Go programs) relative to their enclosing +// package. +// +// Type-checker objects are canonical, so they are usually identified by +// their address in memory (a pointer), but a pointer has meaning only +// within one address space. By contrast, objectpath names allow the +// identity of an object to be sent from one program to another, +// establishing a correspondence between types.Object variables that are +// distinct but logically equivalent. +// +// A single object may have multiple paths. In this example, +// +// type A struct{ X int } +// type B A +// +// the field X has two paths due to its membership of both A and B. +// The For(obj) function always returns one of these paths, arbitrarily +// but consistently. +package objectpath + +import ( + "fmt" + "go/types" + "strconv" + "strings" + + "golang.org/x/tools/internal/aliases" + "golang.org/x/tools/internal/typesinternal" +) + +// TODO(adonovan): think about generic aliases. + +// A Path is an opaque name that identifies a types.Object +// relative to its package. Conceptually, the name consists of a +// sequence of destructuring operations applied to the package scope +// to obtain the original object. +// The name does not include the package itself. +type Path string + +// Encoding +// +// An object path is a textual and (with training) human-readable encoding +// of a sequence of destructuring operators, starting from a types.Package. +// The sequences represent a path through the package/object/type graph. +// We classify these operators by their type: +// +// PO package->object Package.Scope.Lookup +// OT object->type Object.Type +// TT type->type Type.{Elem,Key,{,{,Recv}Type}Params,Results,Underlying,Rhs} [EKPRUTrCa] +// TO type->object Type.{At,Field,Method,Obj} [AFMO] +// +// All valid paths start with a package and end at an object +// and thus may be defined by the regular language: +// +// objectpath = PO (OT TT* TO)* +// +// The concrete encoding follows directly: +// - The only PO operator is Package.Scope.Lookup, which requires an identifier. +// - The only OT operator is Object.Type, +// which we encode as '.' because dot cannot appear in an identifier. +// - The TT operators are encoded as [EKPRUTrCa]; +// two of these ({,Recv}TypeParams) require an integer operand, +// which is encoded as a string of decimal digits. +// - The TO operators are encoded as [AFMO]; +// three of these (At,Field,Method) require an integer operand, +// which is encoded as a string of decimal digits. +// These indices are stable across different representations +// of the same package, even source and export data. +// The indices used are implementation specific and may not correspond to +// the argument to the go/types function. +// +// In the example below, +// +// package p +// +// type T interface { +// f() (a string, b struct{ X int }) +// } +// +// field X has the path "T.UM0.RA1.F0", +// representing the following sequence of operations: +// +// p.Lookup("T") T +// .Type().Underlying().Method(0). f +// .Type().Results().At(1) b +// .Type().Field(0) X +// +// The encoding is not maximally compact---every R or P is +// followed by an A, for example---but this simplifies the +// encoder and decoder. +const ( + // object->type operators + opType = '.' // .Type() (Object) + + // type->type operators + opElem = 'E' // .Elem() (Pointer, Slice, Array, Chan, Map) + opKey = 'K' // .Key() (Map) + opParams = 'P' // .Params() (Signature) + opResults = 'R' // .Results() (Signature) + opUnderlying = 'U' // .Underlying() (Named) + opTypeParam = 'T' // .TypeParams.At(i) (Named, Signature) + opRecvTypeParam = 'r' // .RecvTypeParams.At(i) (Signature) + opConstraint = 'C' // .Constraint() (TypeParam) + opRhs = 'a' // .Rhs() (Alias) + + // type->object operators + opAt = 'A' // .At(i) (Tuple) + opField = 'F' // .Field(i) (Struct) + opMethod = 'M' // .Method(i) (Named or Interface; not Struct: "promoted" names are ignored) + opObj = 'O' // .Obj() (Named, TypeParam) +) + +// For is equivalent to new(Encoder).For(obj). +// +// It may be more efficient to reuse a single Encoder across several calls. +func For(obj types.Object) (Path, error) { + return new(Encoder).For(obj) +} + +// An Encoder amortizes the cost of encoding the paths of multiple objects. +// The zero value of an Encoder is ready to use. +type Encoder struct { + scopeMemo map[*types.Scope][]types.Object // memoization of scopeObjects +} + +// For returns the path to an object relative to its package, +// or an error if the object is not accessible from the package's Scope. +// +// The For function guarantees to return a path only for the following objects: +// - package-level types +// - exported package-level non-types +// - methods +// - parameter and result variables +// - struct fields +// These objects are sufficient to define the API of their package. +// The objects described by a package's export data are drawn from this set. +// +// The set of objects accessible from a package's Scope depends on +// whether the package was produced by type-checking syntax, or +// reading export data; the latter may have a smaller Scope since +// export data trims objects that are not reachable from an exported +// declaration. For example, the For function will return a path for +// an exported method of an unexported type that is not reachable +// from any public declaration; this path will cause the Object +// function to fail if called on a package loaded from export data. +// TODO(adonovan): is this a bug or feature? Should this package +// compute accessibility in the same way? +// +// For does not return a path for predeclared names, imported package +// names, local names, and unexported package-level names (except +// types). +// +// Example: given this definition, +// +// package p +// +// type T interface { +// f() (a string, b struct{ X int }) +// } +// +// For(X) would return a path that denotes the following sequence of operations: +// +// p.Scope().Lookup("T") (TypeName T) +// .Type().Underlying().Method(0). (method Func f) +// .Type().Results().At(1) (field Var b) +// .Type().Field(0) (field Var X) +// +// where p is the package (*types.Package) to which X belongs. +func (enc *Encoder) For(obj types.Object) (Path, error) { + pkg := obj.Pkg() + + // This table lists the cases of interest. + // + // Object Action + // ------ ------ + // nil reject + // builtin reject + // pkgname reject + // label reject + // var + // package-level accept + // func param/result accept + // local reject + // struct field accept + // const + // package-level accept + // local reject + // func + // package-level accept + // init functions reject + // concrete method accept + // interface method accept + // type + // package-level accept + // local reject + // + // The only accessible package-level objects are members of pkg itself. + // + // The cases are handled in four steps: + // + // 1. reject nil and builtin + // 2. accept package-level objects + // 3. reject obviously invalid objects + // 4. search the API for the path to the param/result/field/method. + + // 1. reference to nil or builtin? + if pkg == nil { + return "", fmt.Errorf("predeclared %s has no path", obj) + } + scope := pkg.Scope() + + // 2. package-level object? + if scope.Lookup(obj.Name()) == obj { + // Only exported objects (and non-exported types) have a path. + // Non-exported types may be referenced by other objects. + if _, ok := obj.(*types.TypeName); !ok && !obj.Exported() { + return "", fmt.Errorf("no path for non-exported %v", obj) + } + return Path(obj.Name()), nil + } + + // 3. Not a package-level object. + // Reject obviously non-viable cases. + switch obj := obj.(type) { + case *types.TypeName: + if _, ok := types.Unalias(obj.Type()).(*types.TypeParam); !ok { + // With the exception of type parameters, only package-level type names + // have a path. + return "", fmt.Errorf("no path for %v", obj) + } + case *types.Const, // Only package-level constants have a path. + *types.Label, // Labels are function-local. + *types.PkgName: // PkgNames are file-local. + return "", fmt.Errorf("no path for %v", obj) + + case *types.Var: + // Could be: + // - a field (obj.IsField()) + // - a func parameter or result + // - a local var. + // Sadly there is no way to distinguish + // a param/result from a local + // so we must proceed to the find. + + case *types.Func: + // A func, if not package-level, must be a method. + if recv := obj.Type().(*types.Signature).Recv(); recv == nil { + return "", fmt.Errorf("func is not a method: %v", obj) + } + + if path, ok := enc.concreteMethod(obj); ok { + // Fast path for concrete methods that avoids looping over scope. + return path, nil + } + + default: + panic(obj) + } + + // 4. Search the API for the path to the var (field/param/result) or method. + + // First inspect package-level named types. + // In the presence of path aliases, these give + // the best paths because non-types may + // refer to types, but not the reverse. + empty := make([]byte, 0, 48) // initial space + objs := enc.scopeObjects(scope) + for _, o := range objs { + tname, ok := o.(*types.TypeName) + if !ok { + continue // handle non-types in second pass + } + + path := append(empty, o.Name()...) + path = append(path, opType) + + T := o.Type() + if alias, ok := T.(*types.Alias); ok { + if r := findTypeParam(obj, aliases.TypeParams(alias), path, opTypeParam, nil); r != nil { + return Path(r), nil + } + if r := find(obj, aliases.Rhs(alias), append(path, opRhs), nil); r != nil { + return Path(r), nil + } + + } else if tname.IsAlias() { + // legacy alias + if r := find(obj, T, path, nil); r != nil { + return Path(r), nil + } + + } else if named, ok := T.(*types.Named); ok { + // defined (named) type + if r := findTypeParam(obj, named.TypeParams(), path, opTypeParam, nil); r != nil { + return Path(r), nil + } + if r := find(obj, named.Underlying(), append(path, opUnderlying), nil); r != nil { + return Path(r), nil + } + } + } + + // Then inspect everything else: + // non-types, and declared methods of defined types. + for _, o := range objs { + path := append(empty, o.Name()...) + if _, ok := o.(*types.TypeName); !ok { + if o.Exported() { + // exported non-type (const, var, func) + if r := find(obj, o.Type(), append(path, opType), nil); r != nil { + return Path(r), nil + } + } + continue + } + + // Inspect declared methods of defined types. + if T, ok := types.Unalias(o.Type()).(*types.Named); ok { + path = append(path, opType) + // The method index here is always with respect + // to the underlying go/types data structures, + // which ultimately derives from source order + // and must be preserved by export data. + for i := 0; i < T.NumMethods(); i++ { + m := T.Method(i) + path2 := appendOpArg(path, opMethod, i) + if m == obj { + return Path(path2), nil // found declared method + } + if r := find(obj, m.Type(), append(path2, opType), nil); r != nil { + return Path(r), nil + } + } + } + } + + return "", fmt.Errorf("can't find path for %v in %s", obj, pkg.Path()) +} + +func appendOpArg(path []byte, op byte, arg int) []byte { + path = append(path, op) + path = strconv.AppendInt(path, int64(arg), 10) + return path +} + +// concreteMethod returns the path for meth, which must have a non-nil receiver. +// The second return value indicates success and may be false if the method is +// an interface method or if it is an instantiated method. +// +// This function is just an optimization that avoids the general scope walking +// approach. You are expected to fall back to the general approach if this +// function fails. +func (enc *Encoder) concreteMethod(meth *types.Func) (Path, bool) { + // Concrete methods can only be declared on package-scoped named types. For + // that reason we can skip the expensive walk over the package scope: the + // path will always be package -> named type -> method. We can trivially get + // the type name from the receiver, and only have to look over the type's + // methods to find the method index. + // + // Methods on generic types require special consideration, however. Consider + // the following package: + // + // L1: type S[T any] struct{} + // L2: func (recv S[A]) Foo() { recv.Bar() } + // L3: func (recv S[B]) Bar() { } + // L4: type Alias = S[int] + // L5: func _[T any]() { var s S[int]; s.Foo() } + // + // The receivers of methods on generic types are instantiations. L2 and L3 + // instantiate S with the type-parameters A and B, which are scoped to the + // respective methods. L4 and L5 each instantiate S with int. Each of these + // instantiations has its own method set, full of methods (and thus objects) + // with receivers whose types are the respective instantiations. In other + // words, we have + // + // S[A].Foo, S[A].Bar + // S[B].Foo, S[B].Bar + // S[int].Foo, S[int].Bar + // + // We may thus be trying to produce object paths for any of these objects. + // + // S[A].Foo and S[B].Bar are the origin methods, and their paths are S.Foo + // and S.Bar, which are the paths that this function naturally produces. + // + // S[A].Bar, S[B].Foo, and both methods on S[int] are instantiations that + // don't correspond to the origin methods. For S[int], this is significant. + // The most precise object path for S[int].Foo, for example, is Alias.Foo, + // not S.Foo. Our function, however, would produce S.Foo, which would + // resolve to a different object. + // + // For S[A].Bar and S[B].Foo it could be argued that S.Bar and S.Foo are + // still the correct paths, since only the origin methods have meaningful + // paths. But this is likely only true for trivial cases and has edge cases. + // Since this function is only an optimization, we err on the side of giving + // up, deferring to the slower but definitely correct algorithm. Most users + // of objectpath will only be giving us origin methods, anyway, as referring + // to instantiated methods is usually not useful. + + if meth.Origin() != meth { + return "", false + } + + _, named := typesinternal.ReceiverNamed(meth.Type().(*types.Signature).Recv()) + if named == nil { + return "", false + } + + if types.IsInterface(named) { + // Named interfaces don't have to be package-scoped + // + // TODO(dominikh): opt: if scope.Lookup(name) == named, then we can apply this optimization to interface + // methods, too, I think. + return "", false + } + + // Preallocate space for the name, opType, opMethod, and some digits. + name := named.Obj().Name() + path := make([]byte, 0, len(name)+8) + path = append(path, name...) + path = append(path, opType) + + // Method indices are w.r.t. the go/types data structures, + // ultimately deriving from source order, + // which is preserved by export data. + for i := 0; i < named.NumMethods(); i++ { + if named.Method(i) == meth { + path = appendOpArg(path, opMethod, i) + return Path(path), true + } + } + + // Due to golang/go#59944, go/types fails to associate the receiver with + // certain methods on cgo types. + // + // TODO(rfindley): replace this panic once golang/go#59944 is fixed in all Go + // versions gopls supports. + return "", false + // panic(fmt.Sprintf("couldn't find method %s on type %s; methods: %#v", meth, named, enc.namedMethods(named))) +} + +// find finds obj within type T, returning the path to it, or nil if not found. +// +// The seen map is used to short circuit cycles through type parameters. If +// nil, it will be allocated as necessary. +func find(obj types.Object, T types.Type, path []byte, seen map[*types.TypeName]bool) []byte { + switch T := T.(type) { + case *types.Alias: + return find(obj, types.Unalias(T), path, seen) + case *types.Basic, *types.Named: + // Named types belonging to pkg were handled already, + // so T must belong to another package. No path. + return nil + case *types.Pointer: + return find(obj, T.Elem(), append(path, opElem), seen) + case *types.Slice: + return find(obj, T.Elem(), append(path, opElem), seen) + case *types.Array: + return find(obj, T.Elem(), append(path, opElem), seen) + case *types.Chan: + return find(obj, T.Elem(), append(path, opElem), seen) + case *types.Map: + if r := find(obj, T.Key(), append(path, opKey), seen); r != nil { + return r + } + return find(obj, T.Elem(), append(path, opElem), seen) + case *types.Signature: + if r := findTypeParam(obj, T.RecvTypeParams(), path, opRecvTypeParam, nil); r != nil { + return r + } + if r := findTypeParam(obj, T.TypeParams(), path, opTypeParam, seen); r != nil { + return r + } + if r := find(obj, T.Params(), append(path, opParams), seen); r != nil { + return r + } + return find(obj, T.Results(), append(path, opResults), seen) + case *types.Struct: + for i := 0; i < T.NumFields(); i++ { + fld := T.Field(i) + path2 := appendOpArg(path, opField, i) + if fld == obj { + return path2 // found field var + } + if r := find(obj, fld.Type(), append(path2, opType), seen); r != nil { + return r + } + } + return nil + case *types.Tuple: + for i := 0; i < T.Len(); i++ { + v := T.At(i) + path2 := appendOpArg(path, opAt, i) + if v == obj { + return path2 // found param/result var + } + if r := find(obj, v.Type(), append(path2, opType), seen); r != nil { + return r + } + } + return nil + case *types.Interface: + for i := 0; i < T.NumMethods(); i++ { + m := T.Method(i) + path2 := appendOpArg(path, opMethod, i) + if m == obj { + return path2 // found interface method + } + if r := find(obj, m.Type(), append(path2, opType), seen); r != nil { + return r + } + } + return nil + case *types.TypeParam: + name := T.Obj() + if name == obj { + return append(path, opObj) + } + if seen[name] { + return nil + } + if seen == nil { + seen = make(map[*types.TypeName]bool) + } + seen[name] = true + if r := find(obj, T.Constraint(), append(path, opConstraint), seen); r != nil { + return r + } + return nil + } + panic(T) +} + +func findTypeParam(obj types.Object, list *types.TypeParamList, path []byte, op byte, seen map[*types.TypeName]bool) []byte { + for i := 0; i < list.Len(); i++ { + tparam := list.At(i) + path2 := appendOpArg(path, op, i) + if r := find(obj, tparam, path2, seen); r != nil { + return r + } + } + return nil +} + +// Object returns the object denoted by path p within the package pkg. +func Object(pkg *types.Package, p Path) (types.Object, error) { + pathstr := string(p) + if pathstr == "" { + return nil, fmt.Errorf("empty path") + } + + var pkgobj, suffix string + if dot := strings.IndexByte(pathstr, opType); dot < 0 { + pkgobj = pathstr + } else { + pkgobj = pathstr[:dot] + suffix = pathstr[dot:] // suffix starts with "." + } + + obj := pkg.Scope().Lookup(pkgobj) + if obj == nil { + return nil, fmt.Errorf("package %s does not contain %q", pkg.Path(), pkgobj) + } + + // abstraction of *types.{Pointer,Slice,Array,Chan,Map} + type hasElem interface { + Elem() types.Type + } + // abstraction of *types.{Named,Signature} + type hasTypeParams interface { + TypeParams() *types.TypeParamList + } + // abstraction of *types.{Named,TypeParam} + type hasObj interface { + Obj() *types.TypeName + } + + // The loop state is the pair (t, obj), + // exactly one of which is non-nil, initially obj. + // All suffixes start with '.' (the only object->type operation), + // followed by optional type->type operations, + // then a type->object operation. + // The cycle then repeats. + var t types.Type + for suffix != "" { + code := suffix[0] + suffix = suffix[1:] + + // Codes [AFMTr] have an integer operand. + var index int + switch code { + case opAt, opField, opMethod, opTypeParam, opRecvTypeParam: + rest := strings.TrimLeft(suffix, "0123456789") + numerals := suffix[:len(suffix)-len(rest)] + suffix = rest + i, err := strconv.Atoi(numerals) + if err != nil { + return nil, fmt.Errorf("invalid path: bad numeric operand %q for code %q", numerals, code) + } + index = int(i) + case opObj: + // no operand + default: + // The suffix must end with a type->object operation. + if suffix == "" { + return nil, fmt.Errorf("invalid path: ends with %q, want [AFMO]", code) + } + } + + if code == opType { + if t != nil { + return nil, fmt.Errorf("invalid path: unexpected %q in type context", opType) + } + t = obj.Type() + obj = nil + continue + } + + if t == nil { + return nil, fmt.Errorf("invalid path: code %q in object context", code) + } + + // Inv: t != nil, obj == nil + + t = types.Unalias(t) + switch code { + case opElem: + hasElem, ok := t.(hasElem) // Pointer, Slice, Array, Chan, Map + if !ok { + return nil, fmt.Errorf("cannot apply %q to %s (got %T, want pointer, slice, array, chan or map)", code, t, t) + } + t = hasElem.Elem() + + case opKey: + mapType, ok := t.(*types.Map) + if !ok { + return nil, fmt.Errorf("cannot apply %q to %s (got %T, want map)", code, t, t) + } + t = mapType.Key() + + case opParams: + sig, ok := t.(*types.Signature) + if !ok { + return nil, fmt.Errorf("cannot apply %q to %s (got %T, want signature)", code, t, t) + } + t = sig.Params() + + case opResults: + sig, ok := t.(*types.Signature) + if !ok { + return nil, fmt.Errorf("cannot apply %q to %s (got %T, want signature)", code, t, t) + } + t = sig.Results() + + case opUnderlying: + named, ok := t.(*types.Named) + if !ok { + return nil, fmt.Errorf("cannot apply %q to %s (got %T, want named)", code, t, t) + } + t = named.Underlying() + + case opRhs: + if alias, ok := t.(*types.Alias); ok { + t = aliases.Rhs(alias) + } else if false && aliases.Enabled() { + // The Enabled check is too expensive, so for now we + // simply assume that aliases are not enabled. + // TODO(adonovan): replace with "if true {" when go1.24 is assured. + return nil, fmt.Errorf("cannot apply %q to %s (got %T, want alias)", code, t, t) + } + + case opTypeParam: + hasTypeParams, ok := t.(hasTypeParams) // Named, Signature + if !ok { + return nil, fmt.Errorf("cannot apply %q to %s (got %T, want named or signature)", code, t, t) + } + tparams := hasTypeParams.TypeParams() + if n := tparams.Len(); index >= n { + return nil, fmt.Errorf("tuple index %d out of range [0-%d)", index, n) + } + t = tparams.At(index) + + case opRecvTypeParam: + sig, ok := t.(*types.Signature) // Signature + if !ok { + return nil, fmt.Errorf("cannot apply %q to %s (got %T, want signature)", code, t, t) + } + rtparams := sig.RecvTypeParams() + if n := rtparams.Len(); index >= n { + return nil, fmt.Errorf("tuple index %d out of range [0-%d)", index, n) + } + t = rtparams.At(index) + + case opConstraint: + tparam, ok := t.(*types.TypeParam) + if !ok { + return nil, fmt.Errorf("cannot apply %q to %s (got %T, want type parameter)", code, t, t) + } + t = tparam.Constraint() + + case opAt: + tuple, ok := t.(*types.Tuple) + if !ok { + return nil, fmt.Errorf("cannot apply %q to %s (got %T, want tuple)", code, t, t) + } + if n := tuple.Len(); index >= n { + return nil, fmt.Errorf("tuple index %d out of range [0-%d)", index, n) + } + obj = tuple.At(index) + t = nil + + case opField: + structType, ok := t.(*types.Struct) + if !ok { + return nil, fmt.Errorf("cannot apply %q to %s (got %T, want struct)", code, t, t) + } + if n := structType.NumFields(); index >= n { + return nil, fmt.Errorf("field index %d out of range [0-%d)", index, n) + } + obj = structType.Field(index) + t = nil + + case opMethod: + switch t := t.(type) { + case *types.Interface: + if index >= t.NumMethods() { + return nil, fmt.Errorf("method index %d out of range [0-%d)", index, t.NumMethods()) + } + obj = t.Method(index) // Id-ordered + + case *types.Named: + if index >= t.NumMethods() { + return nil, fmt.Errorf("method index %d out of range [0-%d)", index, t.NumMethods()) + } + obj = t.Method(index) + + default: + return nil, fmt.Errorf("cannot apply %q to %s (got %T, want interface or named)", code, t, t) + } + t = nil + + case opObj: + hasObj, ok := t.(hasObj) + if !ok { + return nil, fmt.Errorf("cannot apply %q to %s (got %T, want named or type param)", code, t, t) + } + obj = hasObj.Obj() + t = nil + + default: + return nil, fmt.Errorf("invalid path: unknown code %q", code) + } + } + + if obj == nil { + panic(p) // path does not end in an object-valued operator + } + + if obj.Pkg() != pkg { + return nil, fmt.Errorf("path denotes %s, which belongs to a different package", obj) + } + + return obj, nil // success +} + +// scopeObjects is a memoization of scope objects. +// Callers must not modify the result. +func (enc *Encoder) scopeObjects(scope *types.Scope) []types.Object { + m := enc.scopeMemo + if m == nil { + m = make(map[*types.Scope][]types.Object) + enc.scopeMemo = m + } + objs, ok := m[scope] + if !ok { + names := scope.Names() // allocates and sorts + objs = make([]types.Object, len(names)) + for i, name := range names { + objs[i] = scope.Lookup(name) + } + m[scope] = objs + } + return objs +} diff --git a/vendor/golang.org/x/tools/go/types/typeutil/callee.go b/vendor/golang.org/x/tools/go/types/typeutil/callee.go new file mode 100644 index 0000000..7543803 --- /dev/null +++ b/vendor/golang.org/x/tools/go/types/typeutil/callee.go @@ -0,0 +1,68 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package typeutil + +import ( + "go/ast" + "go/types" + + "golang.org/x/tools/internal/typeparams" +) + +// Callee returns the named target of a function call, if any: +// a function, method, builtin, or variable. +// +// Functions and methods may potentially have type parameters. +func Callee(info *types.Info, call *ast.CallExpr) types.Object { + fun := ast.Unparen(call.Fun) + + // Look through type instantiation if necessary. + isInstance := false + switch fun.(type) { + case *ast.IndexExpr, *ast.IndexListExpr: + // When extracting the callee from an *IndexExpr, we need to check that + // it is a *types.Func and not a *types.Var. + // Example: Don't match a slice m within the expression `m[0]()`. + isInstance = true + fun, _, _, _ = typeparams.UnpackIndexExpr(fun) + } + + var obj types.Object + switch fun := fun.(type) { + case *ast.Ident: + obj = info.Uses[fun] // type, var, builtin, or declared func + case *ast.SelectorExpr: + if sel, ok := info.Selections[fun]; ok { + obj = sel.Obj() // method or field + } else { + obj = info.Uses[fun.Sel] // qualified identifier? + } + } + if _, ok := obj.(*types.TypeName); ok { + return nil // T(x) is a conversion, not a call + } + // A Func is required to match instantiations. + if _, ok := obj.(*types.Func); isInstance && !ok { + return nil // Was not a Func. + } + return obj +} + +// StaticCallee returns the target (function or method) of a static function +// call, if any. It returns nil for calls to builtins. +// +// Note: for calls of instantiated functions and methods, StaticCallee returns +// the corresponding generic function or method on the generic type. +func StaticCallee(info *types.Info, call *ast.CallExpr) *types.Func { + if f, ok := Callee(info, call).(*types.Func); ok && !interfaceMethod(f) { + return f + } + return nil +} + +func interfaceMethod(f *types.Func) bool { + recv := f.Type().(*types.Signature).Recv() + return recv != nil && types.IsInterface(recv.Type()) +} diff --git a/vendor/golang.org/x/tools/go/types/typeutil/imports.go b/vendor/golang.org/x/tools/go/types/typeutil/imports.go new file mode 100644 index 0000000..b81ce0c --- /dev/null +++ b/vendor/golang.org/x/tools/go/types/typeutil/imports.go @@ -0,0 +1,30 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package typeutil + +import "go/types" + +// Dependencies returns all dependencies of the specified packages. +// +// Dependent packages appear in topological order: if package P imports +// package Q, Q appears earlier than P in the result. +// The algorithm follows import statements in the order they +// appear in the source code, so the result is a total order. +func Dependencies(pkgs ...*types.Package) []*types.Package { + var result []*types.Package + seen := make(map[*types.Package]bool) + var visit func(pkgs []*types.Package) + visit = func(pkgs []*types.Package) { + for _, p := range pkgs { + if !seen[p] { + seen[p] = true + visit(p.Imports()) + result = append(result, p) + } + } + } + visit(pkgs) + return result +} diff --git a/vendor/golang.org/x/tools/go/types/typeutil/map.go b/vendor/golang.org/x/tools/go/types/typeutil/map.go new file mode 100644 index 0000000..8d824f7 --- /dev/null +++ b/vendor/golang.org/x/tools/go/types/typeutil/map.go @@ -0,0 +1,517 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package typeutil defines various utilities for types, such as Map, +// a mapping from types.Type to any values. +package typeutil // import "golang.org/x/tools/go/types/typeutil" + +import ( + "bytes" + "fmt" + "go/types" + "reflect" + + "golang.org/x/tools/internal/typeparams" +) + +// Map is a hash-table-based mapping from types (types.Type) to +// arbitrary any values. The concrete types that implement +// the Type interface are pointers. Since they are not canonicalized, +// == cannot be used to check for equivalence, and thus we cannot +// simply use a Go map. +// +// Just as with map[K]V, a nil *Map is a valid empty map. +// +// Not thread-safe. +type Map struct { + hasher Hasher // shared by many Maps + table map[uint32][]entry // maps hash to bucket; entry.key==nil means unused + length int // number of map entries +} + +// entry is an entry (key/value association) in a hash bucket. +type entry struct { + key types.Type + value any +} + +// SetHasher sets the hasher used by Map. +// +// All Hashers are functionally equivalent but contain internal state +// used to cache the results of hashing previously seen types. +// +// A single Hasher created by MakeHasher() may be shared among many +// Maps. This is recommended if the instances have many keys in +// common, as it will amortize the cost of hash computation. +// +// A Hasher may grow without bound as new types are seen. Even when a +// type is deleted from the map, the Hasher never shrinks, since other +// types in the map may reference the deleted type indirectly. +// +// Hashers are not thread-safe, and read-only operations such as +// Map.Lookup require updates to the hasher, so a full Mutex lock (not a +// read-lock) is require around all Map operations if a shared +// hasher is accessed from multiple threads. +// +// If SetHasher is not called, the Map will create a private hasher at +// the first call to Insert. +func (m *Map) SetHasher(hasher Hasher) { + m.hasher = hasher +} + +// Delete removes the entry with the given key, if any. +// It returns true if the entry was found. +func (m *Map) Delete(key types.Type) bool { + if m != nil && m.table != nil { + hash := m.hasher.Hash(key) + bucket := m.table[hash] + for i, e := range bucket { + if e.key != nil && types.Identical(key, e.key) { + // We can't compact the bucket as it + // would disturb iterators. + bucket[i] = entry{} + m.length-- + return true + } + } + } + return false +} + +// At returns the map entry for the given key. +// The result is nil if the entry is not present. +func (m *Map) At(key types.Type) any { + if m != nil && m.table != nil { + for _, e := range m.table[m.hasher.Hash(key)] { + if e.key != nil && types.Identical(key, e.key) { + return e.value + } + } + } + return nil +} + +// Set sets the map entry for key to val, +// and returns the previous entry, if any. +func (m *Map) Set(key types.Type, value any) (prev any) { + if m.table != nil { + hash := m.hasher.Hash(key) + bucket := m.table[hash] + var hole *entry + for i, e := range bucket { + if e.key == nil { + hole = &bucket[i] + } else if types.Identical(key, e.key) { + prev = e.value + bucket[i].value = value + return + } + } + + if hole != nil { + *hole = entry{key, value} // overwrite deleted entry + } else { + m.table[hash] = append(bucket, entry{key, value}) + } + } else { + if m.hasher.memo == nil { + m.hasher = MakeHasher() + } + hash := m.hasher.Hash(key) + m.table = map[uint32][]entry{hash: {entry{key, value}}} + } + + m.length++ + return +} + +// Len returns the number of map entries. +func (m *Map) Len() int { + if m != nil { + return m.length + } + return 0 +} + +// Iterate calls function f on each entry in the map in unspecified order. +// +// If f should mutate the map, Iterate provides the same guarantees as +// Go maps: if f deletes a map entry that Iterate has not yet reached, +// f will not be invoked for it, but if f inserts a map entry that +// Iterate has not yet reached, whether or not f will be invoked for +// it is unspecified. +func (m *Map) Iterate(f func(key types.Type, value any)) { + if m != nil { + for _, bucket := range m.table { + for _, e := range bucket { + if e.key != nil { + f(e.key, e.value) + } + } + } + } +} + +// Keys returns a new slice containing the set of map keys. +// The order is unspecified. +func (m *Map) Keys() []types.Type { + keys := make([]types.Type, 0, m.Len()) + m.Iterate(func(key types.Type, _ any) { + keys = append(keys, key) + }) + return keys +} + +func (m *Map) toString(values bool) string { + if m == nil { + return "{}" + } + var buf bytes.Buffer + fmt.Fprint(&buf, "{") + sep := "" + m.Iterate(func(key types.Type, value any) { + fmt.Fprint(&buf, sep) + sep = ", " + fmt.Fprint(&buf, key) + if values { + fmt.Fprintf(&buf, ": %q", value) + } + }) + fmt.Fprint(&buf, "}") + return buf.String() +} + +// String returns a string representation of the map's entries. +// Values are printed using fmt.Sprintf("%v", v). +// Order is unspecified. +func (m *Map) String() string { + return m.toString(true) +} + +// KeysString returns a string representation of the map's key set. +// Order is unspecified. +func (m *Map) KeysString() string { + return m.toString(false) +} + +//////////////////////////////////////////////////////////////////////// +// Hasher + +// A Hasher maps each type to its hash value. +// For efficiency, a hasher uses memoization; thus its memory +// footprint grows monotonically over time. +// Hashers are not thread-safe. +// Hashers have reference semantics. +// Call MakeHasher to create a Hasher. +type Hasher struct { + memo map[types.Type]uint32 + + // ptrMap records pointer identity. + ptrMap map[any]uint32 + + // sigTParams holds type parameters from the signature being hashed. + // Signatures are considered identical modulo renaming of type parameters, so + // within the scope of a signature type the identity of the signature's type + // parameters is just their index. + // + // Since the language does not currently support referring to uninstantiated + // generic types or functions, and instantiated signatures do not have type + // parameter lists, we should never encounter a second non-empty type + // parameter list when hashing a generic signature. + sigTParams *types.TypeParamList +} + +// MakeHasher returns a new Hasher instance. +func MakeHasher() Hasher { + return Hasher{ + memo: make(map[types.Type]uint32), + ptrMap: make(map[any]uint32), + sigTParams: nil, + } +} + +// Hash computes a hash value for the given type t such that +// Identical(t, t') => Hash(t) == Hash(t'). +func (h Hasher) Hash(t types.Type) uint32 { + hash, ok := h.memo[t] + if !ok { + hash = h.hashFor(t) + h.memo[t] = hash + } + return hash +} + +// hashString computes the Fowler–Noll–Vo hash of s. +func hashString(s string) uint32 { + var h uint32 + for i := 0; i < len(s); i++ { + h ^= uint32(s[i]) + h *= 16777619 + } + return h +} + +// hashFor computes the hash of t. +func (h Hasher) hashFor(t types.Type) uint32 { + // See Identical for rationale. + switch t := t.(type) { + case *types.Basic: + return uint32(t.Kind()) + + case *types.Alias: + return h.Hash(types.Unalias(t)) + + case *types.Array: + return 9043 + 2*uint32(t.Len()) + 3*h.Hash(t.Elem()) + + case *types.Slice: + return 9049 + 2*h.Hash(t.Elem()) + + case *types.Struct: + var hash uint32 = 9059 + for i, n := 0, t.NumFields(); i < n; i++ { + f := t.Field(i) + if f.Anonymous() { + hash += 8861 + } + hash += hashString(t.Tag(i)) + hash += hashString(f.Name()) // (ignore f.Pkg) + hash += h.Hash(f.Type()) + } + return hash + + case *types.Pointer: + return 9067 + 2*h.Hash(t.Elem()) + + case *types.Signature: + var hash uint32 = 9091 + if t.Variadic() { + hash *= 8863 + } + + // Use a separate hasher for types inside of the signature, where type + // parameter identity is modified to be (index, constraint). We must use a + // new memo for this hasher as type identity may be affected by this + // masking. For example, in func[T any](*T), the identity of *T depends on + // whether we are mapping the argument in isolation, or recursively as part + // of hashing the signature. + // + // We should never encounter a generic signature while hashing another + // generic signature, but defensively set sigTParams only if h.mask is + // unset. + tparams := t.TypeParams() + if h.sigTParams == nil && tparams.Len() != 0 { + h = Hasher{ + // There may be something more efficient than discarding the existing + // memo, but it would require detecting whether types are 'tainted' by + // references to type parameters. + memo: make(map[types.Type]uint32), + // Re-using ptrMap ensures that pointer identity is preserved in this + // hasher. + ptrMap: h.ptrMap, + sigTParams: tparams, + } + } + + for i := 0; i < tparams.Len(); i++ { + tparam := tparams.At(i) + hash += 7 * h.Hash(tparam.Constraint()) + } + + return hash + 3*h.hashTuple(t.Params()) + 5*h.hashTuple(t.Results()) + + case *types.Union: + return h.hashUnion(t) + + case *types.Interface: + // Interfaces are identical if they have the same set of methods, with + // identical names and types, and they have the same set of type + // restrictions. See go/types.identical for more details. + var hash uint32 = 9103 + + // Hash methods. + for i, n := 0, t.NumMethods(); i < n; i++ { + // Method order is not significant. + // Ignore m.Pkg(). + m := t.Method(i) + // Use shallow hash on method signature to + // avoid anonymous interface cycles. + hash += 3*hashString(m.Name()) + 5*h.shallowHash(m.Type()) + } + + // Hash type restrictions. + terms, err := typeparams.InterfaceTermSet(t) + // if err != nil t has invalid type restrictions. + if err == nil { + hash += h.hashTermSet(terms) + } + + return hash + + case *types.Map: + return 9109 + 2*h.Hash(t.Key()) + 3*h.Hash(t.Elem()) + + case *types.Chan: + return 9127 + 2*uint32(t.Dir()) + 3*h.Hash(t.Elem()) + + case *types.Named: + hash := h.hashPtr(t.Obj()) + targs := t.TypeArgs() + for i := 0; i < targs.Len(); i++ { + targ := targs.At(i) + hash += 2 * h.Hash(targ) + } + return hash + + case *types.TypeParam: + return h.hashTypeParam(t) + + case *types.Tuple: + return h.hashTuple(t) + } + + panic(fmt.Sprintf("%T: %v", t, t)) +} + +func (h Hasher) hashTuple(tuple *types.Tuple) uint32 { + // See go/types.identicalTypes for rationale. + n := tuple.Len() + hash := 9137 + 2*uint32(n) + for i := 0; i < n; i++ { + hash += 3 * h.Hash(tuple.At(i).Type()) + } + return hash +} + +func (h Hasher) hashUnion(t *types.Union) uint32 { + // Hash type restrictions. + terms, err := typeparams.UnionTermSet(t) + // if err != nil t has invalid type restrictions. Fall back on a non-zero + // hash. + if err != nil { + return 9151 + } + return h.hashTermSet(terms) +} + +func (h Hasher) hashTermSet(terms []*types.Term) uint32 { + hash := 9157 + 2*uint32(len(terms)) + for _, term := range terms { + // term order is not significant. + termHash := h.Hash(term.Type()) + if term.Tilde() { + termHash *= 9161 + } + hash += 3 * termHash + } + return hash +} + +// hashTypeParam returns a hash of the type parameter t, with a hash value +// depending on whether t is contained in h.sigTParams. +// +// If h.sigTParams is set and contains t, then we are in the process of hashing +// a signature, and the hash value of t must depend only on t's index and +// constraint: signatures are considered identical modulo type parameter +// renaming. To avoid infinite recursion, we only hash the type parameter +// index, and rely on types.Identical to handle signatures where constraints +// are not identical. +// +// Otherwise the hash of t depends only on t's pointer identity. +func (h Hasher) hashTypeParam(t *types.TypeParam) uint32 { + if h.sigTParams != nil { + i := t.Index() + if i >= 0 && i < h.sigTParams.Len() && t == h.sigTParams.At(i) { + return 9173 + 3*uint32(i) + } + } + return h.hashPtr(t.Obj()) +} + +// hashPtr hashes the pointer identity of ptr. It uses h.ptrMap to ensure that +// pointers values are not dependent on the GC. +func (h Hasher) hashPtr(ptr any) uint32 { + if hash, ok := h.ptrMap[ptr]; ok { + return hash + } + hash := uint32(reflect.ValueOf(ptr).Pointer()) + h.ptrMap[ptr] = hash + return hash +} + +// shallowHash computes a hash of t without looking at any of its +// element Types, to avoid potential anonymous cycles in the types of +// interface methods. +// +// When an unnamed non-empty interface type appears anywhere among the +// arguments or results of an interface method, there is a potential +// for endless recursion. Consider: +// +// type X interface { m() []*interface { X } } +// +// The problem is that the Methods of the interface in m's result type +// include m itself; there is no mention of the named type X that +// might help us break the cycle. +// (See comment in go/types.identical, case *Interface, for more.) +func (h Hasher) shallowHash(t types.Type) uint32 { + // t is the type of an interface method (Signature), + // its params or results (Tuples), or their immediate + // elements (mostly Slice, Pointer, Basic, Named), + // so there's no need to optimize anything else. + switch t := t.(type) { + case *types.Alias: + return h.shallowHash(types.Unalias(t)) + + case *types.Signature: + var hash uint32 = 604171 + if t.Variadic() { + hash *= 971767 + } + // The Signature/Tuple recursion is always finite + // and invariably shallow. + return hash + 1062599*h.shallowHash(t.Params()) + 1282529*h.shallowHash(t.Results()) + + case *types.Tuple: + n := t.Len() + hash := 9137 + 2*uint32(n) + for i := 0; i < n; i++ { + hash += 53471161 * h.shallowHash(t.At(i).Type()) + } + return hash + + case *types.Basic: + return 45212177 * uint32(t.Kind()) + + case *types.Array: + return 1524181 + 2*uint32(t.Len()) + + case *types.Slice: + return 2690201 + + case *types.Struct: + return 3326489 + + case *types.Pointer: + return 4393139 + + case *types.Union: + return 562448657 + + case *types.Interface: + return 2124679 // no recursion here + + case *types.Map: + return 9109 + + case *types.Chan: + return 9127 + + case *types.Named: + return h.hashPtr(t.Obj()) + + case *types.TypeParam: + return h.hashPtr(t.Obj()) + } + panic(fmt.Sprintf("shallowHash: %T: %v", t, t)) +} diff --git a/vendor/golang.org/x/tools/go/types/typeutil/methodsetcache.go b/vendor/golang.org/x/tools/go/types/typeutil/methodsetcache.go new file mode 100644 index 0000000..f766602 --- /dev/null +++ b/vendor/golang.org/x/tools/go/types/typeutil/methodsetcache.go @@ -0,0 +1,71 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file implements a cache of method sets. + +package typeutil + +import ( + "go/types" + "sync" +) + +// A MethodSetCache records the method set of each type T for which +// MethodSet(T) is called so that repeat queries are fast. +// The zero value is a ready-to-use cache instance. +type MethodSetCache struct { + mu sync.Mutex + named map[*types.Named]struct{ value, pointer *types.MethodSet } // method sets for named N and *N + others map[types.Type]*types.MethodSet // all other types +} + +// MethodSet returns the method set of type T. It is thread-safe. +// +// If cache is nil, this function is equivalent to types.NewMethodSet(T). +// Utility functions can thus expose an optional *MethodSetCache +// parameter to clients that care about performance. +func (cache *MethodSetCache) MethodSet(T types.Type) *types.MethodSet { + if cache == nil { + return types.NewMethodSet(T) + } + cache.mu.Lock() + defer cache.mu.Unlock() + + switch T := types.Unalias(T).(type) { + case *types.Named: + return cache.lookupNamed(T).value + + case *types.Pointer: + if N, ok := types.Unalias(T.Elem()).(*types.Named); ok { + return cache.lookupNamed(N).pointer + } + } + + // all other types + // (The map uses pointer equivalence, not type identity.) + mset := cache.others[T] + if mset == nil { + mset = types.NewMethodSet(T) + if cache.others == nil { + cache.others = make(map[types.Type]*types.MethodSet) + } + cache.others[T] = mset + } + return mset +} + +func (cache *MethodSetCache) lookupNamed(named *types.Named) struct{ value, pointer *types.MethodSet } { + if cache.named == nil { + cache.named = make(map[*types.Named]struct{ value, pointer *types.MethodSet }) + } + // Avoid recomputing mset(*T) for each distinct Pointer + // instance whose underlying type is a named type. + msets, ok := cache.named[named] + if !ok { + msets.value = types.NewMethodSet(named) + msets.pointer = types.NewMethodSet(types.NewPointer(named)) + cache.named[named] = msets + } + return msets +} diff --git a/vendor/golang.org/x/tools/go/types/typeutil/ui.go b/vendor/golang.org/x/tools/go/types/typeutil/ui.go new file mode 100644 index 0000000..9dda6a2 --- /dev/null +++ b/vendor/golang.org/x/tools/go/types/typeutil/ui.go @@ -0,0 +1,53 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package typeutil + +// This file defines utilities for user interfaces that display types. + +import ( + "go/types" +) + +// IntuitiveMethodSet returns the intuitive method set of a type T, +// which is the set of methods you can call on an addressable value of +// that type. +// +// The result always contains MethodSet(T), and is exactly MethodSet(T) +// for interface types and for pointer-to-concrete types. +// For all other concrete types T, the result additionally +// contains each method belonging to *T if there is no identically +// named method on T itself. +// +// This corresponds to user intuition about method sets; +// this function is intended only for user interfaces. +// +// The order of the result is as for types.MethodSet(T). +func IntuitiveMethodSet(T types.Type, msets *MethodSetCache) []*types.Selection { + isPointerToConcrete := func(T types.Type) bool { + ptr, ok := types.Unalias(T).(*types.Pointer) + return ok && !types.IsInterface(ptr.Elem()) + } + + var result []*types.Selection + mset := msets.MethodSet(T) + if types.IsInterface(T) || isPointerToConcrete(T) { + for i, n := 0, mset.Len(); i < n; i++ { + result = append(result, mset.At(i)) + } + } else { + // T is some other concrete type. + // Report methods of T and *T, preferring those of T. + pmset := msets.MethodSet(types.NewPointer(T)) + for i, n := 0, pmset.Len(); i < n; i++ { + meth := pmset.At(i) + if m := mset.Lookup(meth.Obj().Pkg(), meth.Obj().Name()); m != nil { + meth = m + } + result = append(result, meth) + } + + } + return result +} diff --git a/vendor/golang.org/x/tools/imports/forward.go b/vendor/golang.org/x/tools/imports/forward.go index d2547c7..cb6db88 100644 --- a/vendor/golang.org/x/tools/imports/forward.go +++ b/vendor/golang.org/x/tools/imports/forward.go @@ -7,8 +7,8 @@ package imports // import "golang.org/x/tools/imports" import ( - "io/ioutil" "log" + "os" "golang.org/x/tools/internal/gocommand" intimp "golang.org/x/tools/internal/imports" @@ -44,7 +44,7 @@ var LocalPrefix string func Process(filename string, src []byte, opt *Options) ([]byte, error) { var err error if src == nil { - src, err = ioutil.ReadFile(filename) + src, err = os.ReadFile(filename) if err != nil { return nil, err } diff --git a/vendor/golang.org/x/tools/internal/aliases/aliases.go b/vendor/golang.org/x/tools/internal/aliases/aliases.go new file mode 100644 index 0000000..b9425f5 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/aliases/aliases.go @@ -0,0 +1,38 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package aliases + +import ( + "go/token" + "go/types" +) + +// Package aliases defines backward compatible shims +// for the types.Alias type representation added in 1.22. +// This defines placeholders for x/tools until 1.26. + +// NewAlias creates a new TypeName in Package pkg that +// is an alias for the type rhs. +// +// The enabled parameter determines whether the resulting [TypeName]'s +// type is an [types.Alias]. Its value must be the result of a call to +// [Enabled], which computes the effective value of +// GODEBUG=gotypesalias=... by invoking the type checker. The Enabled +// function is expensive and should be called once per task (e.g. +// package import), not once per call to NewAlias. +// +// Precondition: enabled || len(tparams)==0. +// If materialized aliases are disabled, there must not be any type parameters. +func NewAlias(enabled bool, pos token.Pos, pkg *types.Package, name string, rhs types.Type, tparams []*types.TypeParam) *types.TypeName { + if enabled { + tname := types.NewTypeName(pos, pkg, name, nil) + SetTypeParams(types.NewAlias(tname, rhs), tparams) + return tname + } + if len(tparams) > 0 { + panic("cannot create an alias with type parameters when gotypesalias is not enabled") + } + return types.NewTypeName(pos, pkg, name, rhs) +} diff --git a/vendor/golang.org/x/tools/internal/aliases/aliases_go122.go b/vendor/golang.org/x/tools/internal/aliases/aliases_go122.go new file mode 100644 index 0000000..7716a33 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/aliases/aliases_go122.go @@ -0,0 +1,80 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package aliases + +import ( + "go/ast" + "go/parser" + "go/token" + "go/types" +) + +// Rhs returns the type on the right-hand side of the alias declaration. +func Rhs(alias *types.Alias) types.Type { + if alias, ok := any(alias).(interface{ Rhs() types.Type }); ok { + return alias.Rhs() // go1.23+ + } + + // go1.22's Alias didn't have the Rhs method, + // so Unalias is the best we can do. + return types.Unalias(alias) +} + +// TypeParams returns the type parameter list of the alias. +func TypeParams(alias *types.Alias) *types.TypeParamList { + if alias, ok := any(alias).(interface{ TypeParams() *types.TypeParamList }); ok { + return alias.TypeParams() // go1.23+ + } + return nil +} + +// SetTypeParams sets the type parameters of the alias type. +func SetTypeParams(alias *types.Alias, tparams []*types.TypeParam) { + if alias, ok := any(alias).(interface { + SetTypeParams(tparams []*types.TypeParam) + }); ok { + alias.SetTypeParams(tparams) // go1.23+ + } else if len(tparams) > 0 { + panic("cannot set type parameters of an Alias type in go1.22") + } +} + +// TypeArgs returns the type arguments used to instantiate the Alias type. +func TypeArgs(alias *types.Alias) *types.TypeList { + if alias, ok := any(alias).(interface{ TypeArgs() *types.TypeList }); ok { + return alias.TypeArgs() // go1.23+ + } + return nil // empty (go1.22) +} + +// Origin returns the generic Alias type of which alias is an instance. +// If alias is not an instance of a generic alias, Origin returns alias. +func Origin(alias *types.Alias) *types.Alias { + if alias, ok := any(alias).(interface{ Origin() *types.Alias }); ok { + return alias.Origin() // go1.23+ + } + return alias // not an instance of a generic alias (go1.22) +} + +// Enabled reports whether [NewAlias] should create [types.Alias] types. +// +// This function is expensive! Call it sparingly. +func Enabled() bool { + // The only reliable way to compute the answer is to invoke go/types. + // We don't parse the GODEBUG environment variable, because + // (a) it's tricky to do so in a manner that is consistent + // with the godebug package; in particular, a simple + // substring check is not good enough. The value is a + // rightmost-wins list of options. But more importantly: + // (b) it is impossible to detect changes to the effective + // setting caused by os.Setenv("GODEBUG"), as happens in + // many tests. Therefore any attempt to cache the result + // is just incorrect. + fset := token.NewFileSet() + f, _ := parser.ParseFile(fset, "a.go", "package p; type A = int", parser.SkipObjectResolution) + pkg, _ := new(types.Config).Check("p", fset, []*ast.File{f}, nil) + _, enabled := pkg.Scope().Lookup("A").Type().(*types.Alias) + return enabled +} diff --git a/vendor/golang.org/x/tools/internal/event/keys/util.go b/vendor/golang.org/x/tools/internal/event/keys/util.go new file mode 100644 index 0000000..c0e8e73 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/event/keys/util.go @@ -0,0 +1,21 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package keys + +import ( + "sort" + "strings" +) + +// Join returns a canonical join of the keys in S: +// a sorted comma-separated string list. +func Join[S ~[]T, T ~string](s S) string { + strs := make([]string, 0, len(s)) + for _, v := range s { + strs = append(strs, string(v)) + } + sort.Strings(strs) + return strings.Join(strs, ",") +} diff --git a/vendor/golang.org/x/tools/internal/event/tag/tag.go b/vendor/golang.org/x/tools/internal/event/tag/tag.go deleted file mode 100644 index 581b26c..0000000 --- a/vendor/golang.org/x/tools/internal/event/tag/tag.go +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package tag provides the labels used for telemetry throughout gopls. -package tag - -import ( - "golang.org/x/tools/internal/event/keys" -) - -var ( - // create the label keys we use - Method = keys.NewString("method", "") - StatusCode = keys.NewString("status.code", "") - StatusMessage = keys.NewString("status.message", "") - RPCID = keys.NewString("id", "") - RPCDirection = keys.NewString("direction", "") - File = keys.NewString("file", "") - Directory = keys.New("directory", "") - URI = keys.New("URI", "") - Package = keys.NewString("package", "") // sorted comma-separated list of Package IDs - PackagePath = keys.NewString("package_path", "") - Query = keys.New("query", "") - Snapshot = keys.NewUInt64("snapshot", "") - Operation = keys.NewString("operation", "") - - Position = keys.New("position", "") - Category = keys.NewString("category", "") - PackageCount = keys.NewInt("packages", "") - Files = keys.New("files", "") - Port = keys.NewInt("port", "") - Type = keys.New("type", "") - HoverKind = keys.NewString("hoverkind", "") - - NewServer = keys.NewString("new_server", "A new server was added") - EndServer = keys.NewString("end_server", "A server was shut down") - - ServerID = keys.NewString("server", "The server ID an event is related to") - Logfile = keys.NewString("logfile", "") - DebugAddress = keys.NewString("debug_address", "") - GoplsPath = keys.NewString("gopls_path", "") - ClientID = keys.NewString("client_id", "") - - Level = keys.NewInt("level", "The logging level") -) - -var ( - // create the stats we measure - Started = keys.NewInt64("started", "Count of started RPCs.") - ReceivedBytes = keys.NewInt64("received_bytes", "Bytes received.") //, unit.Bytes) - SentBytes = keys.NewInt64("sent_bytes", "Bytes sent.") //, unit.Bytes) - Latency = keys.NewFloat64("latency_ms", "Elapsed time in milliseconds") //, unit.Milliseconds) -) - -const ( - Inbound = "in" - Outbound = "out" -) diff --git a/vendor/golang.org/x/tools/internal/fastwalk/fastwalk.go b/vendor/golang.org/x/tools/internal/fastwalk/fastwalk.go deleted file mode 100644 index c40c7e9..0000000 --- a/vendor/golang.org/x/tools/internal/fastwalk/fastwalk.go +++ /dev/null @@ -1,196 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package fastwalk provides a faster version of [filepath.Walk] for file system -// scanning tools. -package fastwalk - -import ( - "errors" - "os" - "path/filepath" - "runtime" - "sync" -) - -// ErrTraverseLink is used as a return value from WalkFuncs to indicate that the -// symlink named in the call may be traversed. -var ErrTraverseLink = errors.New("fastwalk: traverse symlink, assuming target is a directory") - -// ErrSkipFiles is a used as a return value from WalkFuncs to indicate that the -// callback should not be called for any other files in the current directory. -// Child directories will still be traversed. -var ErrSkipFiles = errors.New("fastwalk: skip remaining files in directory") - -// Walk is a faster implementation of [filepath.Walk]. -// -// [filepath.Walk]'s design necessarily calls [os.Lstat] on each file, -// even if the caller needs less info. -// Many tools need only the type of each file. -// On some platforms, this information is provided directly by the readdir -// system call, avoiding the need to stat each file individually. -// fastwalk_unix.go contains a fork of the syscall routines. -// -// See golang.org/issue/16399. -// -// Walk walks the file tree rooted at root, calling walkFn for -// each file or directory in the tree, including root. -// -// If Walk returns [filepath.SkipDir], the directory is skipped. -// -// Unlike [filepath.Walk]: -// - file stat calls must be done by the user. -// The only provided metadata is the file type, which does not include -// any permission bits. -// - multiple goroutines stat the filesystem concurrently. The provided -// walkFn must be safe for concurrent use. -// - Walk can follow symlinks if walkFn returns the TraverseLink -// sentinel error. It is the walkFn's responsibility to prevent -// Walk from going into symlink cycles. -func Walk(root string, walkFn func(path string, typ os.FileMode) error) error { - // TODO(bradfitz): make numWorkers configurable? We used a - // minimum of 4 to give the kernel more info about multiple - // things we want, in hopes its I/O scheduling can take - // advantage of that. Hopefully most are in cache. Maybe 4 is - // even too low of a minimum. Profile more. - numWorkers := 4 - if n := runtime.NumCPU(); n > numWorkers { - numWorkers = n - } - - // Make sure to wait for all workers to finish, otherwise - // walkFn could still be called after returning. This Wait call - // runs after close(e.donec) below. - var wg sync.WaitGroup - defer wg.Wait() - - w := &walker{ - fn: walkFn, - enqueuec: make(chan walkItem, numWorkers), // buffered for performance - workc: make(chan walkItem, numWorkers), // buffered for performance - donec: make(chan struct{}), - - // buffered for correctness & not leaking goroutines: - resc: make(chan error, numWorkers), - } - defer close(w.donec) - - for i := 0; i < numWorkers; i++ { - wg.Add(1) - go w.doWork(&wg) - } - todo := []walkItem{{dir: root}} - out := 0 - for { - workc := w.workc - var workItem walkItem - if len(todo) == 0 { - workc = nil - } else { - workItem = todo[len(todo)-1] - } - select { - case workc <- workItem: - todo = todo[:len(todo)-1] - out++ - case it := <-w.enqueuec: - todo = append(todo, it) - case err := <-w.resc: - out-- - if err != nil { - return err - } - if out == 0 && len(todo) == 0 { - // It's safe to quit here, as long as the buffered - // enqueue channel isn't also readable, which might - // happen if the worker sends both another unit of - // work and its result before the other select was - // scheduled and both w.resc and w.enqueuec were - // readable. - select { - case it := <-w.enqueuec: - todo = append(todo, it) - default: - return nil - } - } - } - } -} - -// doWork reads directories as instructed (via workc) and runs the -// user's callback function. -func (w *walker) doWork(wg *sync.WaitGroup) { - defer wg.Done() - for { - select { - case <-w.donec: - return - case it := <-w.workc: - select { - case <-w.donec: - return - case w.resc <- w.walk(it.dir, !it.callbackDone): - } - } - } -} - -type walker struct { - fn func(path string, typ os.FileMode) error - - donec chan struct{} // closed on fastWalk's return - workc chan walkItem // to workers - enqueuec chan walkItem // from workers - resc chan error // from workers -} - -type walkItem struct { - dir string - callbackDone bool // callback already called; don't do it again -} - -func (w *walker) enqueue(it walkItem) { - select { - case w.enqueuec <- it: - case <-w.donec: - } -} - -func (w *walker) onDirEnt(dirName, baseName string, typ os.FileMode) error { - joined := dirName + string(os.PathSeparator) + baseName - if typ == os.ModeDir { - w.enqueue(walkItem{dir: joined}) - return nil - } - - err := w.fn(joined, typ) - if typ == os.ModeSymlink { - if err == ErrTraverseLink { - // Set callbackDone so we don't call it twice for both the - // symlink-as-symlink and the symlink-as-directory later: - w.enqueue(walkItem{dir: joined, callbackDone: true}) - return nil - } - if err == filepath.SkipDir { - // Permit SkipDir on symlinks too. - return nil - } - } - return err -} - -func (w *walker) walk(root string, runUserCallback bool) error { - if runUserCallback { - err := w.fn(root, os.ModeDir) - if err == filepath.SkipDir { - return nil - } - if err != nil { - return err - } - } - - return readDir(root, w.onDirEnt) -} diff --git a/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_darwin.go b/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_darwin.go deleted file mode 100644 index 0ca55e0..0000000 --- a/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_darwin.go +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright 2022 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build darwin && cgo -// +build darwin,cgo - -package fastwalk - -/* -#include - -// fastwalk_readdir_r wraps readdir_r so that we don't have to pass a dirent** -// result pointer which triggers CGO's "Go pointer to Go pointer" check unless -// we allocat the result dirent* with malloc. -// -// fastwalk_readdir_r returns 0 on success, -1 upon reaching the end of the -// directory, or a positive error number to indicate failure. -static int fastwalk_readdir_r(DIR *fd, struct dirent *entry) { - struct dirent *result; - int ret = readdir_r(fd, entry, &result); - if (ret == 0 && result == NULL) { - ret = -1; // EOF - } - return ret; -} -*/ -import "C" - -import ( - "os" - "syscall" - "unsafe" -) - -func readDir(dirName string, fn func(dirName, entName string, typ os.FileMode) error) error { - fd, err := openDir(dirName) - if err != nil { - return &os.PathError{Op: "opendir", Path: dirName, Err: err} - } - defer C.closedir(fd) - - skipFiles := false - var dirent syscall.Dirent - for { - ret := int(C.fastwalk_readdir_r(fd, (*C.struct_dirent)(unsafe.Pointer(&dirent)))) - if ret != 0 { - if ret == -1 { - break // EOF - } - if ret == int(syscall.EINTR) { - continue - } - return &os.PathError{Op: "readdir", Path: dirName, Err: syscall.Errno(ret)} - } - if dirent.Ino == 0 { - continue - } - typ := dtToType(dirent.Type) - if skipFiles && typ.IsRegular() { - continue - } - name := (*[len(syscall.Dirent{}.Name)]byte)(unsafe.Pointer(&dirent.Name))[:] - name = name[:dirent.Namlen] - for i, c := range name { - if c == 0 { - name = name[:i] - break - } - } - // Check for useless names before allocating a string. - if string(name) == "." || string(name) == ".." { - continue - } - if err := fn(dirName, string(name), typ); err != nil { - if err != ErrSkipFiles { - return err - } - skipFiles = true - } - } - - return nil -} - -func dtToType(typ uint8) os.FileMode { - switch typ { - case syscall.DT_BLK: - return os.ModeDevice - case syscall.DT_CHR: - return os.ModeDevice | os.ModeCharDevice - case syscall.DT_DIR: - return os.ModeDir - case syscall.DT_FIFO: - return os.ModeNamedPipe - case syscall.DT_LNK: - return os.ModeSymlink - case syscall.DT_REG: - return 0 - case syscall.DT_SOCK: - return os.ModeSocket - } - return ^os.FileMode(0) -} - -// openDir wraps opendir(3) and handles any EINTR errors. The returned *DIR -// needs to be closed with closedir(3). -func openDir(path string) (*C.DIR, error) { - name, err := syscall.BytePtrFromString(path) - if err != nil { - return nil, err - } - for { - fd, err := C.opendir((*C.char)(unsafe.Pointer(name))) - if err != syscall.EINTR { - return fd, err - } - } -} diff --git a/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_dirent_fileno.go b/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_dirent_fileno.go deleted file mode 100644 index d58595d..0000000 --- a/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_dirent_fileno.go +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build freebsd || openbsd || netbsd -// +build freebsd openbsd netbsd - -package fastwalk - -import "syscall" - -func direntInode(dirent *syscall.Dirent) uint64 { - return uint64(dirent.Fileno) -} diff --git a/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_dirent_ino.go b/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_dirent_ino.go deleted file mode 100644 index d392289..0000000 --- a/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_dirent_ino.go +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build (linux || (darwin && !cgo)) && !appengine -// +build linux darwin,!cgo -// +build !appengine - -package fastwalk - -import "syscall" - -func direntInode(dirent *syscall.Dirent) uint64 { - return dirent.Ino -} diff --git a/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_dirent_namlen_bsd.go b/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_dirent_namlen_bsd.go deleted file mode 100644 index 38a4db6..0000000 --- a/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_dirent_namlen_bsd.go +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build (darwin && !cgo) || freebsd || openbsd || netbsd -// +build darwin,!cgo freebsd openbsd netbsd - -package fastwalk - -import "syscall" - -func direntNamlen(dirent *syscall.Dirent) uint64 { - return uint64(dirent.Namlen) -} diff --git a/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_dirent_namlen_linux.go b/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_dirent_namlen_linux.go deleted file mode 100644 index c82e57d..0000000 --- a/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_dirent_namlen_linux.go +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build linux && !appengine -// +build linux,!appengine - -package fastwalk - -import ( - "bytes" - "syscall" - "unsafe" -) - -func direntNamlen(dirent *syscall.Dirent) uint64 { - const fixedHdr = uint16(unsafe.Offsetof(syscall.Dirent{}.Name)) - nameBuf := (*[unsafe.Sizeof(dirent.Name)]byte)(unsafe.Pointer(&dirent.Name[0])) - const nameBufLen = uint16(len(nameBuf)) - limit := dirent.Reclen - fixedHdr - if limit > nameBufLen { - limit = nameBufLen - } - nameLen := bytes.IndexByte(nameBuf[:limit], 0) - if nameLen < 0 { - panic("failed to find terminating 0 byte in dirent") - } - return uint64(nameLen) -} diff --git a/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_portable.go b/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_portable.go deleted file mode 100644 index 085d311..0000000 --- a/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_portable.go +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build appengine || (!linux && !darwin && !freebsd && !openbsd && !netbsd) -// +build appengine !linux,!darwin,!freebsd,!openbsd,!netbsd - -package fastwalk - -import ( - "io/ioutil" - "os" -) - -// readDir calls fn for each directory entry in dirName. -// It does not descend into directories or follow symlinks. -// If fn returns a non-nil error, readDir returns with that error -// immediately. -func readDir(dirName string, fn func(dirName, entName string, typ os.FileMode) error) error { - fis, err := ioutil.ReadDir(dirName) - if err != nil { - return err - } - skipFiles := false - for _, fi := range fis { - if fi.Mode().IsRegular() && skipFiles { - continue - } - if err := fn(dirName, fi.Name(), fi.Mode()&os.ModeType); err != nil { - if err == ErrSkipFiles { - skipFiles = true - continue - } - return err - } - } - return nil -} diff --git a/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_unix.go b/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_unix.go deleted file mode 100644 index f12f1a7..0000000 --- a/vendor/golang.org/x/tools/internal/fastwalk/fastwalk_unix.go +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build (linux || freebsd || openbsd || netbsd || (darwin && !cgo)) && !appengine -// +build linux freebsd openbsd netbsd darwin,!cgo -// +build !appengine - -package fastwalk - -import ( - "fmt" - "os" - "syscall" - "unsafe" -) - -const blockSize = 8 << 10 - -// unknownFileMode is a sentinel (and bogus) os.FileMode -// value used to represent a syscall.DT_UNKNOWN Dirent.Type. -const unknownFileMode os.FileMode = os.ModeNamedPipe | os.ModeSocket | os.ModeDevice - -func readDir(dirName string, fn func(dirName, entName string, typ os.FileMode) error) error { - fd, err := open(dirName, 0, 0) - if err != nil { - return &os.PathError{Op: "open", Path: dirName, Err: err} - } - defer syscall.Close(fd) - - // The buffer must be at least a block long. - buf := make([]byte, blockSize) // stack-allocated; doesn't escape - bufp := 0 // starting read position in buf - nbuf := 0 // end valid data in buf - skipFiles := false - for { - if bufp >= nbuf { - bufp = 0 - nbuf, err = readDirent(fd, buf) - if err != nil { - return os.NewSyscallError("readdirent", err) - } - if nbuf <= 0 { - return nil - } - } - consumed, name, typ := parseDirEnt(buf[bufp:nbuf]) - bufp += consumed - if name == "" || name == "." || name == ".." { - continue - } - // Fallback for filesystems (like old XFS) that don't - // support Dirent.Type and have DT_UNKNOWN (0) there - // instead. - if typ == unknownFileMode { - fi, err := os.Lstat(dirName + "/" + name) - if err != nil { - // It got deleted in the meantime. - if os.IsNotExist(err) { - continue - } - return err - } - typ = fi.Mode() & os.ModeType - } - if skipFiles && typ.IsRegular() { - continue - } - if err := fn(dirName, name, typ); err != nil { - if err == ErrSkipFiles { - skipFiles = true - continue - } - return err - } - } -} - -func parseDirEnt(buf []byte) (consumed int, name string, typ os.FileMode) { - // golang.org/issue/37269 - dirent := &syscall.Dirent{} - copy((*[unsafe.Sizeof(syscall.Dirent{})]byte)(unsafe.Pointer(dirent))[:], buf) - if v := unsafe.Offsetof(dirent.Reclen) + unsafe.Sizeof(dirent.Reclen); uintptr(len(buf)) < v { - panic(fmt.Sprintf("buf size of %d smaller than dirent header size %d", len(buf), v)) - } - if len(buf) < int(dirent.Reclen) { - panic(fmt.Sprintf("buf size %d < record length %d", len(buf), dirent.Reclen)) - } - consumed = int(dirent.Reclen) - if direntInode(dirent) == 0 { // File absent in directory. - return - } - switch dirent.Type { - case syscall.DT_REG: - typ = 0 - case syscall.DT_DIR: - typ = os.ModeDir - case syscall.DT_LNK: - typ = os.ModeSymlink - case syscall.DT_BLK: - typ = os.ModeDevice - case syscall.DT_FIFO: - typ = os.ModeNamedPipe - case syscall.DT_SOCK: - typ = os.ModeSocket - case syscall.DT_UNKNOWN: - typ = unknownFileMode - default: - // Skip weird things. - // It's probably a DT_WHT (http://lwn.net/Articles/325369/) - // or something. Revisit if/when this package is moved outside - // of goimports. goimports only cares about regular files, - // symlinks, and directories. - return - } - - nameBuf := (*[unsafe.Sizeof(dirent.Name)]byte)(unsafe.Pointer(&dirent.Name[0])) - nameLen := direntNamlen(dirent) - - // Special cases for common things: - if nameLen == 1 && nameBuf[0] == '.' { - name = "." - } else if nameLen == 2 && nameBuf[0] == '.' && nameBuf[1] == '.' { - name = ".." - } else { - name = string(nameBuf[:nameLen]) - } - return -} - -// According to https://golang.org/doc/go1.14#runtime -// A consequence of the implementation of preemption is that on Unix systems, including Linux and macOS -// systems, programs built with Go 1.14 will receive more signals than programs built with earlier releases. -// -// This causes syscall.Open and syscall.ReadDirent sometimes fail with EINTR errors. -// We need to retry in this case. -func open(path string, mode int, perm uint32) (fd int, err error) { - for { - fd, err := syscall.Open(path, mode, perm) - if err != syscall.EINTR { - return fd, err - } - } -} - -func readDirent(fd int, buf []byte) (n int, err error) { - for { - nbuf, err := syscall.ReadDirent(fd, buf) - if err != syscall.EINTR { - return nbuf, err - } - } -} diff --git a/vendor/golang.org/x/tools/internal/gcimporter/bimport.go b/vendor/golang.org/x/tools/internal/gcimporter/bimport.go new file mode 100644 index 0000000..d79a605 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/gcimporter/bimport.go @@ -0,0 +1,89 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file contains the remaining vestiges of +// $GOROOT/src/go/internal/gcimporter/bimport.go. + +package gcimporter + +import ( + "fmt" + "go/token" + "go/types" + "sync" +) + +func errorf(format string, args ...interface{}) { + panic(fmt.Sprintf(format, args...)) +} + +const deltaNewFile = -64 // see cmd/compile/internal/gc/bexport.go + +// Synthesize a token.Pos +type fakeFileSet struct { + fset *token.FileSet + files map[string]*fileInfo +} + +type fileInfo struct { + file *token.File + lastline int +} + +const maxlines = 64 * 1024 + +func (s *fakeFileSet) pos(file string, line, column int) token.Pos { + // TODO(mdempsky): Make use of column. + + // Since we don't know the set of needed file positions, we reserve maxlines + // positions per file. We delay calling token.File.SetLines until all + // positions have been calculated (by way of fakeFileSet.setLines), so that + // we can avoid setting unnecessary lines. See also golang/go#46586. + f := s.files[file] + if f == nil { + f = &fileInfo{file: s.fset.AddFile(file, -1, maxlines)} + s.files[file] = f + } + if line > maxlines { + line = 1 + } + if line > f.lastline { + f.lastline = line + } + + // Return a fake position assuming that f.file consists only of newlines. + return token.Pos(f.file.Base() + line - 1) +} + +func (s *fakeFileSet) setLines() { + fakeLinesOnce.Do(func() { + fakeLines = make([]int, maxlines) + for i := range fakeLines { + fakeLines[i] = i + } + }) + for _, f := range s.files { + f.file.SetLines(fakeLines[:f.lastline]) + } +} + +var ( + fakeLines []int + fakeLinesOnce sync.Once +) + +func chanDir(d int) types.ChanDir { + // tag values must match the constants in cmd/compile/internal/gc/go.go + switch d { + case 1 /* Crecv */ : + return types.RecvOnly + case 2 /* Csend */ : + return types.SendOnly + case 3 /* Cboth */ : + return types.SendRecv + default: + errorf("unexpected channel dir %d", d) + return 0 + } +} diff --git a/vendor/golang.org/x/tools/internal/gcimporter/exportdata.go b/vendor/golang.org/x/tools/internal/gcimporter/exportdata.go new file mode 100644 index 0000000..f6437fe --- /dev/null +++ b/vendor/golang.org/x/tools/internal/gcimporter/exportdata.go @@ -0,0 +1,99 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file is a copy of $GOROOT/src/go/internal/gcimporter/exportdata.go. + +// This file implements FindExportData. + +package gcimporter + +import ( + "bufio" + "fmt" + "io" + "strconv" + "strings" +) + +func readGopackHeader(r *bufio.Reader) (name string, size int64, err error) { + // See $GOROOT/include/ar.h. + hdr := make([]byte, 16+12+6+6+8+10+2) + _, err = io.ReadFull(r, hdr) + if err != nil { + return + } + // leave for debugging + if false { + fmt.Printf("header: %s", hdr) + } + s := strings.TrimSpace(string(hdr[16+12+6+6+8:][:10])) + length, err := strconv.Atoi(s) + size = int64(length) + if err != nil || hdr[len(hdr)-2] != '`' || hdr[len(hdr)-1] != '\n' { + err = fmt.Errorf("invalid archive header") + return + } + name = strings.TrimSpace(string(hdr[:16])) + return +} + +// FindExportData positions the reader r at the beginning of the +// export data section of an underlying GC-created object/archive +// file by reading from it. The reader must be positioned at the +// start of the file before calling this function. The hdr result +// is the string before the export data, either "$$" or "$$B". +// The size result is the length of the export data in bytes, or -1 if not known. +func FindExportData(r *bufio.Reader) (hdr string, size int64, err error) { + // Read first line to make sure this is an object file. + line, err := r.ReadSlice('\n') + if err != nil { + err = fmt.Errorf("can't find export data (%v)", err) + return + } + + if string(line) == "!\n" { + // Archive file. Scan to __.PKGDEF. + var name string + if name, size, err = readGopackHeader(r); err != nil { + return + } + + // First entry should be __.PKGDEF. + if name != "__.PKGDEF" { + err = fmt.Errorf("go archive is missing __.PKGDEF") + return + } + + // Read first line of __.PKGDEF data, so that line + // is once again the first line of the input. + if line, err = r.ReadSlice('\n'); err != nil { + err = fmt.Errorf("can't find export data (%v)", err) + return + } + size -= int64(len(line)) + } + + // Now at __.PKGDEF in archive or still at beginning of file. + // Either way, line should begin with "go object ". + if !strings.HasPrefix(string(line), "go object ") { + err = fmt.Errorf("not a Go object file") + return + } + + // Skip over object header to export data. + // Begins after first line starting with $$. + for line[0] != '$' { + if line, err = r.ReadSlice('\n'); err != nil { + err = fmt.Errorf("can't find export data (%v)", err) + return + } + size -= int64(len(line)) + } + hdr = string(line) + if size < 0 { + size = -1 + } + + return +} diff --git a/vendor/golang.org/x/tools/internal/gcimporter/gcimporter.go b/vendor/golang.org/x/tools/internal/gcimporter/gcimporter.go new file mode 100644 index 0000000..e6c5d51 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/gcimporter/gcimporter.go @@ -0,0 +1,271 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file is a reduced copy of $GOROOT/src/go/internal/gcimporter/gcimporter.go. + +// Package gcimporter provides various functions for reading +// gc-generated object files that can be used to implement the +// Importer interface defined by the Go 1.5 standard library package. +// +// The encoding is deterministic: if the encoder is applied twice to +// the same types.Package data structure, both encodings are equal. +// This property may be important to avoid spurious changes in +// applications such as build systems. +// +// However, the encoder is not necessarily idempotent. Importing an +// exported package may yield a types.Package that, while it +// represents the same set of Go types as the original, may differ in +// the details of its internal representation. Because of these +// differences, re-encoding the imported package may yield a +// different, but equally valid, encoding of the package. +package gcimporter // import "golang.org/x/tools/internal/gcimporter" + +import ( + "bufio" + "bytes" + "fmt" + "go/build" + "go/token" + "go/types" + "io" + "os" + "os/exec" + "path/filepath" + "strings" + "sync" +) + +const ( + // Enable debug during development: it adds some additional checks, and + // prevents errors from being recovered. + debug = false + + // If trace is set, debugging output is printed to std out. + trace = false +) + +var exportMap sync.Map // package dir → func() (string, bool) + +// lookupGorootExport returns the location of the export data +// (normally found in the build cache, but located in GOROOT/pkg +// in prior Go releases) for the package located in pkgDir. +// +// (We use the package's directory instead of its import path +// mainly to simplify handling of the packages in src/vendor +// and cmd/vendor.) +func lookupGorootExport(pkgDir string) (string, bool) { + f, ok := exportMap.Load(pkgDir) + if !ok { + var ( + listOnce sync.Once + exportPath string + ) + f, _ = exportMap.LoadOrStore(pkgDir, func() (string, bool) { + listOnce.Do(func() { + cmd := exec.Command("go", "list", "-export", "-f", "{{.Export}}", pkgDir) + cmd.Dir = build.Default.GOROOT + var output []byte + output, err := cmd.Output() + if err != nil { + return + } + + exports := strings.Split(string(bytes.TrimSpace(output)), "\n") + if len(exports) != 1 { + return + } + + exportPath = exports[0] + }) + + return exportPath, exportPath != "" + }) + } + + return f.(func() (string, bool))() +} + +var pkgExts = [...]string{".a", ".o"} + +// FindPkg returns the filename and unique package id for an import +// path based on package information provided by build.Import (using +// the build.Default build.Context). A relative srcDir is interpreted +// relative to the current working directory. +// If no file was found, an empty filename is returned. +func FindPkg(path, srcDir string) (filename, id string) { + if path == "" { + return + } + + var noext string + switch { + default: + // "x" -> "$GOPATH/pkg/$GOOS_$GOARCH/x.ext", "x" + // Don't require the source files to be present. + if abs, err := filepath.Abs(srcDir); err == nil { // see issue 14282 + srcDir = abs + } + bp, _ := build.Import(path, srcDir, build.FindOnly|build.AllowBinary) + if bp.PkgObj == "" { + var ok bool + if bp.Goroot && bp.Dir != "" { + filename, ok = lookupGorootExport(bp.Dir) + } + if !ok { + id = path // make sure we have an id to print in error message + return + } + } else { + noext = strings.TrimSuffix(bp.PkgObj, ".a") + id = bp.ImportPath + } + + case build.IsLocalImport(path): + // "./x" -> "/this/directory/x.ext", "/this/directory/x" + noext = filepath.Join(srcDir, path) + id = noext + + case filepath.IsAbs(path): + // for completeness only - go/build.Import + // does not support absolute imports + // "/x" -> "/x.ext", "/x" + noext = path + id = path + } + + if false { // for debugging + if path != id { + fmt.Printf("%s -> %s\n", path, id) + } + } + + if filename != "" { + if f, err := os.Stat(filename); err == nil && !f.IsDir() { + return + } + } + + // try extensions + for _, ext := range pkgExts { + filename = noext + ext + if f, err := os.Stat(filename); err == nil && !f.IsDir() { + return + } + } + + filename = "" // not found + return +} + +// Import imports a gc-generated package given its import path and srcDir, adds +// the corresponding package object to the packages map, and returns the object. +// The packages map must contain all packages already imported. +func Import(packages map[string]*types.Package, path, srcDir string, lookup func(path string) (io.ReadCloser, error)) (pkg *types.Package, err error) { + var rc io.ReadCloser + var filename, id string + if lookup != nil { + // With custom lookup specified, assume that caller has + // converted path to a canonical import path for use in the map. + if path == "unsafe" { + return types.Unsafe, nil + } + id = path + + // No need to re-import if the package was imported completely before. + if pkg = packages[id]; pkg != nil && pkg.Complete() { + return + } + f, err := lookup(path) + if err != nil { + return nil, err + } + rc = f + } else { + filename, id = FindPkg(path, srcDir) + if filename == "" { + if path == "unsafe" { + return types.Unsafe, nil + } + return nil, fmt.Errorf("can't find import: %q", id) + } + + // no need to re-import if the package was imported completely before + if pkg = packages[id]; pkg != nil && pkg.Complete() { + return + } + + // open file + f, err := os.Open(filename) + if err != nil { + return nil, err + } + defer func() { + if err != nil { + // add file name to error + err = fmt.Errorf("%s: %v", filename, err) + } + }() + rc = f + } + defer rc.Close() + + var hdr string + var size int64 + buf := bufio.NewReader(rc) + if hdr, size, err = FindExportData(buf); err != nil { + return + } + + switch hdr { + case "$$B\n": + var data []byte + data, err = io.ReadAll(buf) + if err != nil { + break + } + + // TODO(gri): allow clients of go/importer to provide a FileSet. + // Or, define a new standard go/types/gcexportdata package. + fset := token.NewFileSet() + + // Select appropriate importer. + if len(data) > 0 { + switch data[0] { + case 'v', 'c', 'd': + // binary: emitted by cmd/compile till go1.10; obsolete. + return nil, fmt.Errorf("binary (%c) import format is no longer supported", data[0]) + + case 'i': + // indexed: emitted by cmd/compile till go1.19; + // now used only for serializing go/types. + // See https://github.com/golang/go/issues/69491. + _, pkg, err := IImportData(fset, packages, data[1:], id) + return pkg, err + + case 'u': + // unified: emitted by cmd/compile since go1.20. + _, pkg, err := UImportData(fset, packages, data[1:size], id) + return pkg, err + + default: + l := len(data) + if l > 10 { + l = 10 + } + return nil, fmt.Errorf("unexpected export data with prefix %q for path %s", string(data[:l]), id) + } + } + + default: + err = fmt.Errorf("unknown export data header: %q", hdr) + } + + return +} + +type byPath []*types.Package + +func (a byPath) Len() int { return len(a) } +func (a byPath) Swap(i, j int) { a[i], a[j] = a[j], a[i] } +func (a byPath) Less(i, j int) bool { return a[i].Path() < a[j].Path() } diff --git a/vendor/golang.org/x/tools/internal/gcimporter/iexport.go b/vendor/golang.org/x/tools/internal/gcimporter/iexport.go new file mode 100644 index 0000000..1e19fbe --- /dev/null +++ b/vendor/golang.org/x/tools/internal/gcimporter/iexport.go @@ -0,0 +1,1568 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Indexed package export. +// +// The indexed export data format is an evolution of the previous +// binary export data format. Its chief contribution is introducing an +// index table, which allows efficient random access of individual +// declarations and inline function bodies. In turn, this allows +// avoiding unnecessary work for compilation units that import large +// packages. +// +// +// The top-level data format is structured as: +// +// Header struct { +// Tag byte // 'i' +// Version uvarint +// StringSize uvarint +// DataSize uvarint +// } +// +// Strings [StringSize]byte +// Data [DataSize]byte +// +// MainIndex []struct{ +// PkgPath stringOff +// PkgName stringOff +// PkgHeight uvarint +// +// Decls []struct{ +// Name stringOff +// Offset declOff +// } +// } +// +// Fingerprint [8]byte +// +// uvarint means a uint64 written out using uvarint encoding. +// +// []T means a uvarint followed by that many T objects. In other +// words: +// +// Len uvarint +// Elems [Len]T +// +// stringOff means a uvarint that indicates an offset within the +// Strings section. At that offset is another uvarint, followed by +// that many bytes, which form the string value. +// +// declOff means a uvarint that indicates an offset within the Data +// section where the associated declaration can be found. +// +// +// There are five kinds of declarations, distinguished by their first +// byte: +// +// type Var struct { +// Tag byte // 'V' +// Pos Pos +// Type typeOff +// } +// +// type Func struct { +// Tag byte // 'F' or 'G' +// Pos Pos +// TypeParams []typeOff // only present if Tag == 'G' +// Signature Signature +// } +// +// type Const struct { +// Tag byte // 'C' +// Pos Pos +// Value Value +// } +// +// type Type struct { +// Tag byte // 'T' or 'U' +// Pos Pos +// TypeParams []typeOff // only present if Tag == 'U' +// Underlying typeOff +// +// Methods []struct{ // omitted if Underlying is an interface type +// Pos Pos +// Name stringOff +// Recv Param +// Signature Signature +// } +// } +// +// type Alias struct { +// Tag byte // 'A' or 'B' +// Pos Pos +// TypeParams []typeOff // only present if Tag == 'B' +// Type typeOff +// } +// +// // "Automatic" declaration of each typeparam +// type TypeParam struct { +// Tag byte // 'P' +// Pos Pos +// Implicit bool +// Constraint typeOff +// } +// +// typeOff means a uvarint that either indicates a predeclared type, +// or an offset into the Data section. If the uvarint is less than +// predeclReserved, then it indicates the index into the predeclared +// types list (see predeclared in bexport.go for order). Otherwise, +// subtracting predeclReserved yields the offset of a type descriptor. +// +// Value means a type, kind, and type-specific value. See +// (*exportWriter).value for details. +// +// +// There are twelve kinds of type descriptors, distinguished by an itag: +// +// type DefinedType struct { +// Tag itag // definedType +// Name stringOff +// PkgPath stringOff +// } +// +// type PointerType struct { +// Tag itag // pointerType +// Elem typeOff +// } +// +// type SliceType struct { +// Tag itag // sliceType +// Elem typeOff +// } +// +// type ArrayType struct { +// Tag itag // arrayType +// Len uint64 +// Elem typeOff +// } +// +// type ChanType struct { +// Tag itag // chanType +// Dir uint64 // 1 RecvOnly; 2 SendOnly; 3 SendRecv +// Elem typeOff +// } +// +// type MapType struct { +// Tag itag // mapType +// Key typeOff +// Elem typeOff +// } +// +// type FuncType struct { +// Tag itag // signatureType +// PkgPath stringOff +// Signature Signature +// } +// +// type StructType struct { +// Tag itag // structType +// PkgPath stringOff +// Fields []struct { +// Pos Pos +// Name stringOff +// Type typeOff +// Embedded bool +// Note stringOff +// } +// } +// +// type InterfaceType struct { +// Tag itag // interfaceType +// PkgPath stringOff +// Embeddeds []struct { +// Pos Pos +// Type typeOff +// } +// Methods []struct { +// Pos Pos +// Name stringOff +// Signature Signature +// } +// } +// +// // Reference to a type param declaration +// type TypeParamType struct { +// Tag itag // typeParamType +// Name stringOff +// PkgPath stringOff +// } +// +// // Instantiation of a generic type (like List[T2] or List[int]) +// type InstanceType struct { +// Tag itag // instanceType +// Pos pos +// TypeArgs []typeOff +// BaseType typeOff +// } +// +// type UnionType struct { +// Tag itag // interfaceType +// Terms []struct { +// tilde bool +// Type typeOff +// } +// } +// +// +// +// type Signature struct { +// Params []Param +// Results []Param +// Variadic bool // omitted if Results is empty +// } +// +// type Param struct { +// Pos Pos +// Name stringOff +// Type typOff +// } +// +// +// Pos encodes a file:line:column triple, incorporating a simple delta +// encoding scheme within a data object. See exportWriter.pos for +// details. + +package gcimporter + +import ( + "bytes" + "encoding/binary" + "fmt" + "go/constant" + "go/token" + "go/types" + "io" + "math/big" + "reflect" + "sort" + "strconv" + "strings" + + "golang.org/x/tools/go/types/objectpath" + "golang.org/x/tools/internal/aliases" +) + +// IExportShallow encodes "shallow" export data for the specified package. +// +// No promises are made about the encoding other than that it can be decoded by +// the same version of IIExportShallow. If you plan to save export data in the +// file system, be sure to include a cryptographic digest of the executable in +// the key to avoid version skew. +// +// If the provided reportf func is non-nil, it will be used for reporting bugs +// encountered during export. +// TODO(rfindley): remove reportf when we are confident enough in the new +// objectpath encoding. +func IExportShallow(fset *token.FileSet, pkg *types.Package, reportf ReportFunc) ([]byte, error) { + // In principle this operation can only fail if out.Write fails, + // but that's impossible for bytes.Buffer---and as a matter of + // fact iexportCommon doesn't even check for I/O errors. + // TODO(adonovan): handle I/O errors properly. + // TODO(adonovan): use byte slices throughout, avoiding copying. + const bundle, shallow = false, true + var out bytes.Buffer + err := iexportCommon(&out, fset, bundle, shallow, iexportVersion, []*types.Package{pkg}) + return out.Bytes(), err +} + +// IImportShallow decodes "shallow" types.Package data encoded by +// IExportShallow in the same executable. This function cannot import data from +// cmd/compile or gcexportdata.Write. +// +// The importer calls getPackages to obtain package symbols for all +// packages mentioned in the export data, including the one being +// decoded. +// +// If the provided reportf func is non-nil, it will be used for reporting bugs +// encountered during import. +// TODO(rfindley): remove reportf when we are confident enough in the new +// objectpath encoding. +func IImportShallow(fset *token.FileSet, getPackages GetPackagesFunc, data []byte, path string, reportf ReportFunc) (*types.Package, error) { + const bundle = false + const shallow = true + pkgs, err := iimportCommon(fset, getPackages, data, bundle, path, shallow, reportf) + if err != nil { + return nil, err + } + return pkgs[0], nil +} + +// ReportFunc is the type of a function used to report formatted bugs. +type ReportFunc = func(string, ...interface{}) + +// Current bundled export format version. Increase with each format change. +// 0: initial implementation +const bundleVersion = 0 + +// IExportData writes indexed export data for pkg to out. +// +// If no file set is provided, position info will be missing. +// The package path of the top-level package will not be recorded, +// so that calls to IImportData can override with a provided package path. +func IExportData(out io.Writer, fset *token.FileSet, pkg *types.Package) error { + const bundle, shallow = false, false + return iexportCommon(out, fset, bundle, shallow, iexportVersion, []*types.Package{pkg}) +} + +// IExportBundle writes an indexed export bundle for pkgs to out. +func IExportBundle(out io.Writer, fset *token.FileSet, pkgs []*types.Package) error { + const bundle, shallow = true, false + return iexportCommon(out, fset, bundle, shallow, iexportVersion, pkgs) +} + +func iexportCommon(out io.Writer, fset *token.FileSet, bundle, shallow bool, version int, pkgs []*types.Package) (err error) { + if !debug { + defer func() { + if e := recover(); e != nil { + if ierr, ok := e.(internalError); ok { + err = ierr + return + } + // Not an internal error; panic again. + panic(e) + } + }() + } + + p := iexporter{ + fset: fset, + version: version, + shallow: shallow, + allPkgs: map[*types.Package]bool{}, + stringIndex: map[string]uint64{}, + declIndex: map[types.Object]uint64{}, + tparamNames: map[types.Object]string{}, + typIndex: map[types.Type]uint64{}, + } + if !bundle { + p.localpkg = pkgs[0] + } + + for i, pt := range predeclared() { + p.typIndex[pt] = uint64(i) + } + if len(p.typIndex) > predeclReserved { + panic(internalErrorf("too many predeclared types: %d > %d", len(p.typIndex), predeclReserved)) + } + + // Initialize work queue with exported declarations. + for _, pkg := range pkgs { + scope := pkg.Scope() + for _, name := range scope.Names() { + if token.IsExported(name) { + p.pushDecl(scope.Lookup(name)) + } + } + + if bundle { + // Ensure pkg and its imports are included in the index. + p.allPkgs[pkg] = true + for _, imp := range pkg.Imports() { + p.allPkgs[imp] = true + } + } + } + + // Loop until no more work. + for !p.declTodo.empty() { + p.doDecl(p.declTodo.popHead()) + } + + // Produce index of offset of each file record in files. + var files intWriter + var fileOffset []uint64 // fileOffset[i] is offset in files of file encoded as i + if p.shallow { + fileOffset = make([]uint64, len(p.fileInfos)) + for i, info := range p.fileInfos { + fileOffset[i] = uint64(files.Len()) + p.encodeFile(&files, info.file, info.needed) + } + } + + // Append indices to data0 section. + dataLen := uint64(p.data0.Len()) + w := p.newWriter() + w.writeIndex(p.declIndex) + + if bundle { + w.uint64(uint64(len(pkgs))) + for _, pkg := range pkgs { + w.pkg(pkg) + imps := pkg.Imports() + w.uint64(uint64(len(imps))) + for _, imp := range imps { + w.pkg(imp) + } + } + } + w.flush() + + // Assemble header. + var hdr intWriter + if bundle { + hdr.uint64(bundleVersion) + } + hdr.uint64(uint64(p.version)) + hdr.uint64(uint64(p.strings.Len())) + if p.shallow { + hdr.uint64(uint64(files.Len())) + hdr.uint64(uint64(len(fileOffset))) + for _, offset := range fileOffset { + hdr.uint64(offset) + } + } + hdr.uint64(dataLen) + + // Flush output. + io.Copy(out, &hdr) + io.Copy(out, &p.strings) + if p.shallow { + io.Copy(out, &files) + } + io.Copy(out, &p.data0) + + return nil +} + +// encodeFile writes to w a representation of the file sufficient to +// faithfully restore position information about all needed offsets. +// Mutates the needed array. +func (p *iexporter) encodeFile(w *intWriter, file *token.File, needed []uint64) { + _ = needed[0] // precondition: needed is non-empty + + w.uint64(p.stringOff(file.Name())) + + size := uint64(file.Size()) + w.uint64(size) + + // Sort the set of needed offsets. Duplicates are harmless. + sort.Slice(needed, func(i, j int) bool { return needed[i] < needed[j] }) + + lines := file.Lines() // byte offset of each line start + w.uint64(uint64(len(lines))) + + // Rather than record the entire array of line start offsets, + // we save only a sparse list of (index, offset) pairs for + // the start of each line that contains a needed position. + var sparse [][2]int // (index, offset) pairs +outer: + for i, lineStart := range lines { + lineEnd := size + if i < len(lines)-1 { + lineEnd = uint64(lines[i+1]) + } + // Does this line contains a needed offset? + if needed[0] < lineEnd { + sparse = append(sparse, [2]int{i, lineStart}) + for needed[0] < lineEnd { + needed = needed[1:] + if len(needed) == 0 { + break outer + } + } + } + } + + // Delta-encode the columns. + w.uint64(uint64(len(sparse))) + var prev [2]int + for _, pair := range sparse { + w.uint64(uint64(pair[0] - prev[0])) + w.uint64(uint64(pair[1] - prev[1])) + prev = pair + } +} + +// writeIndex writes out an object index. mainIndex indicates whether +// we're writing out the main index, which is also read by +// non-compiler tools and includes a complete package description +// (i.e., name and height). +func (w *exportWriter) writeIndex(index map[types.Object]uint64) { + type pkgObj struct { + obj types.Object + name string // qualified name; differs from obj.Name for type params + } + // Build a map from packages to objects from that package. + pkgObjs := map[*types.Package][]pkgObj{} + + // For the main index, make sure to include every package that + // we reference, even if we're not exporting (or reexporting) + // any symbols from it. + if w.p.localpkg != nil { + pkgObjs[w.p.localpkg] = nil + } + for pkg := range w.p.allPkgs { + pkgObjs[pkg] = nil + } + + for obj := range index { + name := w.p.exportName(obj) + pkgObjs[obj.Pkg()] = append(pkgObjs[obj.Pkg()], pkgObj{obj, name}) + } + + var pkgs []*types.Package + for pkg, objs := range pkgObjs { + pkgs = append(pkgs, pkg) + + sort.Slice(objs, func(i, j int) bool { + return objs[i].name < objs[j].name + }) + } + + sort.Slice(pkgs, func(i, j int) bool { + return w.exportPath(pkgs[i]) < w.exportPath(pkgs[j]) + }) + + w.uint64(uint64(len(pkgs))) + for _, pkg := range pkgs { + w.string(w.exportPath(pkg)) + w.string(pkg.Name()) + w.uint64(uint64(0)) // package height is not needed for go/types + + objs := pkgObjs[pkg] + w.uint64(uint64(len(objs))) + for _, obj := range objs { + w.string(obj.name) + w.uint64(index[obj.obj]) + } + } +} + +// exportName returns the 'exported' name of an object. It differs from +// obj.Name() only for type parameters (see tparamExportName for details). +func (p *iexporter) exportName(obj types.Object) (res string) { + if name := p.tparamNames[obj]; name != "" { + return name + } + return obj.Name() +} + +type iexporter struct { + fset *token.FileSet + out *bytes.Buffer + version int + + shallow bool // don't put types from other packages in the index + objEncoder *objectpath.Encoder // encodes objects from other packages in shallow mode; lazily allocated + localpkg *types.Package // (nil in bundle mode) + + // allPkgs tracks all packages that have been referenced by + // the export data, so we can ensure to include them in the + // main index. + allPkgs map[*types.Package]bool + + declTodo objQueue + + strings intWriter + stringIndex map[string]uint64 + + // In shallow mode, object positions are encoded as (file, offset). + // Each file is recorded as a line-number table. + // Only the lines of needed positions are saved faithfully. + fileInfo map[*token.File]uint64 // value is index in fileInfos + fileInfos []*filePositions + + data0 intWriter + declIndex map[types.Object]uint64 + tparamNames map[types.Object]string // typeparam->exported name + typIndex map[types.Type]uint64 + + indent int // for tracing support +} + +type filePositions struct { + file *token.File + needed []uint64 // unordered list of needed file offsets +} + +func (p *iexporter) trace(format string, args ...interface{}) { + if !trace { + // Call sites should also be guarded, but having this check here allows + // easily enabling/disabling debug trace statements. + return + } + fmt.Printf(strings.Repeat("..", p.indent)+format+"\n", args...) +} + +// objectpathEncoder returns the lazily allocated objectpath.Encoder to use +// when encoding objects in other packages during shallow export. +// +// Using a shared Encoder amortizes some of cost of objectpath search. +func (p *iexporter) objectpathEncoder() *objectpath.Encoder { + if p.objEncoder == nil { + p.objEncoder = new(objectpath.Encoder) + } + return p.objEncoder +} + +// stringOff returns the offset of s within the string section. +// If not already present, it's added to the end. +func (p *iexporter) stringOff(s string) uint64 { + off, ok := p.stringIndex[s] + if !ok { + off = uint64(p.strings.Len()) + p.stringIndex[s] = off + + p.strings.uint64(uint64(len(s))) + p.strings.WriteString(s) + } + return off +} + +// fileIndexAndOffset returns the index of the token.File and the byte offset of pos within it. +func (p *iexporter) fileIndexAndOffset(file *token.File, pos token.Pos) (uint64, uint64) { + index, ok := p.fileInfo[file] + if !ok { + index = uint64(len(p.fileInfo)) + p.fileInfos = append(p.fileInfos, &filePositions{file: file}) + if p.fileInfo == nil { + p.fileInfo = make(map[*token.File]uint64) + } + p.fileInfo[file] = index + } + // Record each needed offset. + info := p.fileInfos[index] + offset := uint64(file.Offset(pos)) + info.needed = append(info.needed, offset) + + return index, offset +} + +// pushDecl adds n to the declaration work queue, if not already present. +func (p *iexporter) pushDecl(obj types.Object) { + // Package unsafe is known to the compiler and predeclared. + // Caller should not ask us to do export it. + if obj.Pkg() == types.Unsafe { + panic("cannot export package unsafe") + } + + // Shallow export data: don't index decls from other packages. + if p.shallow && obj.Pkg() != p.localpkg { + return + } + + if _, ok := p.declIndex[obj]; ok { + return + } + + p.declIndex[obj] = ^uint64(0) // mark obj present in work queue + p.declTodo.pushTail(obj) +} + +// exportWriter handles writing out individual data section chunks. +type exportWriter struct { + p *iexporter + + data intWriter + prevFile string + prevLine int64 + prevColumn int64 +} + +func (w *exportWriter) exportPath(pkg *types.Package) string { + if pkg == w.p.localpkg { + return "" + } + return pkg.Path() +} + +func (p *iexporter) doDecl(obj types.Object) { + if trace { + p.trace("exporting decl %v (%T)", obj, obj) + p.indent++ + defer func() { + p.indent-- + p.trace("=> %s", obj) + }() + } + w := p.newWriter() + + switch obj := obj.(type) { + case *types.Var: + w.tag(varTag) + w.pos(obj.Pos()) + w.typ(obj.Type(), obj.Pkg()) + + case *types.Func: + sig, _ := obj.Type().(*types.Signature) + if sig.Recv() != nil { + // We shouldn't see methods in the package scope, + // but the type checker may repair "func () F() {}" + // to "func (Invalid) F()" and then treat it like "func F()", + // so allow that. See golang/go#57729. + if sig.Recv().Type() != types.Typ[types.Invalid] { + panic(internalErrorf("unexpected method: %v", sig)) + } + } + + // Function. + if sig.TypeParams().Len() == 0 { + w.tag(funcTag) + } else { + w.tag(genericFuncTag) + } + w.pos(obj.Pos()) + // The tparam list of the function type is the declaration of the type + // params. So, write out the type params right now. Then those type params + // will be referenced via their type offset (via typOff) in all other + // places in the signature and function where they are used. + // + // While importing the type parameters, tparamList computes and records + // their export name, so that it can be later used when writing the index. + if tparams := sig.TypeParams(); tparams.Len() > 0 { + w.tparamList(obj.Name(), tparams, obj.Pkg()) + } + w.signature(sig) + + case *types.Const: + w.tag(constTag) + w.pos(obj.Pos()) + w.value(obj.Type(), obj.Val()) + + case *types.TypeName: + t := obj.Type() + + if tparam, ok := types.Unalias(t).(*types.TypeParam); ok { + w.tag(typeParamTag) + w.pos(obj.Pos()) + constraint := tparam.Constraint() + if p.version >= iexportVersionGo1_18 { + implicit := false + if iface, _ := types.Unalias(constraint).(*types.Interface); iface != nil { + implicit = iface.IsImplicit() + } + w.bool(implicit) + } + w.typ(constraint, obj.Pkg()) + break + } + + if obj.IsAlias() { + alias, materialized := t.(*types.Alias) // may fail when aliases are not enabled + + var tparams *types.TypeParamList + if materialized { + tparams = aliases.TypeParams(alias) + } + if tparams.Len() == 0 { + w.tag(aliasTag) + } else { + w.tag(genericAliasTag) + } + w.pos(obj.Pos()) + if tparams.Len() > 0 { + w.tparamList(obj.Name(), tparams, obj.Pkg()) + } + if materialized { + // Preserve materialized aliases, + // even of non-exported types. + t = aliases.Rhs(alias) + } + w.typ(t, obj.Pkg()) + break + } + + // Defined type. + named, ok := t.(*types.Named) + if !ok { + panic(internalErrorf("%s is not a defined type", t)) + } + + if named.TypeParams().Len() == 0 { + w.tag(typeTag) + } else { + w.tag(genericTypeTag) + } + w.pos(obj.Pos()) + + if named.TypeParams().Len() > 0 { + // While importing the type parameters, tparamList computes and records + // their export name, so that it can be later used when writing the index. + w.tparamList(obj.Name(), named.TypeParams(), obj.Pkg()) + } + + underlying := named.Underlying() + w.typ(underlying, obj.Pkg()) + + if types.IsInterface(t) { + break + } + + n := named.NumMethods() + w.uint64(uint64(n)) + for i := 0; i < n; i++ { + m := named.Method(i) + w.pos(m.Pos()) + w.string(m.Name()) + sig, _ := m.Type().(*types.Signature) + + // Receiver type parameters are type arguments of the receiver type, so + // their name must be qualified before exporting recv. + if rparams := sig.RecvTypeParams(); rparams.Len() > 0 { + prefix := obj.Name() + "." + m.Name() + for i := 0; i < rparams.Len(); i++ { + rparam := rparams.At(i) + name := tparamExportName(prefix, rparam) + w.p.tparamNames[rparam.Obj()] = name + } + } + w.param(sig.Recv()) + w.signature(sig) + } + + default: + panic(internalErrorf("unexpected object: %v", obj)) + } + + p.declIndex[obj] = w.flush() +} + +func (w *exportWriter) tag(tag byte) { + w.data.WriteByte(tag) +} + +func (w *exportWriter) pos(pos token.Pos) { + if w.p.shallow { + w.posV2(pos) + } else if w.p.version >= iexportVersionPosCol { + w.posV1(pos) + } else { + w.posV0(pos) + } +} + +// posV2 encoding (used only in shallow mode) records positions as +// (file, offset), where file is the index in the token.File table +// (which records the file name and newline offsets) and offset is a +// byte offset. It effectively ignores //line directives. +func (w *exportWriter) posV2(pos token.Pos) { + if pos == token.NoPos { + w.uint64(0) + return + } + file := w.p.fset.File(pos) // fset must be non-nil + index, offset := w.p.fileIndexAndOffset(file, pos) + w.uint64(1 + index) + w.uint64(offset) +} + +func (w *exportWriter) posV1(pos token.Pos) { + if w.p.fset == nil { + w.int64(0) + return + } + + p := w.p.fset.Position(pos) + file := p.Filename + line := int64(p.Line) + column := int64(p.Column) + + deltaColumn := (column - w.prevColumn) << 1 + deltaLine := (line - w.prevLine) << 1 + + if file != w.prevFile { + deltaLine |= 1 + } + if deltaLine != 0 { + deltaColumn |= 1 + } + + w.int64(deltaColumn) + if deltaColumn&1 != 0 { + w.int64(deltaLine) + if deltaLine&1 != 0 { + w.string(file) + } + } + + w.prevFile = file + w.prevLine = line + w.prevColumn = column +} + +func (w *exportWriter) posV0(pos token.Pos) { + if w.p.fset == nil { + w.int64(0) + return + } + + p := w.p.fset.Position(pos) + file := p.Filename + line := int64(p.Line) + + // When file is the same as the last position (common case), + // we can save a few bytes by delta encoding just the line + // number. + // + // Note: Because data objects may be read out of order (or not + // at all), we can only apply delta encoding within a single + // object. This is handled implicitly by tracking prevFile and + // prevLine as fields of exportWriter. + + if file == w.prevFile { + delta := line - w.prevLine + w.int64(delta) + if delta == deltaNewFile { + w.int64(-1) + } + } else { + w.int64(deltaNewFile) + w.int64(line) // line >= 0 + w.string(file) + w.prevFile = file + } + w.prevLine = line +} + +func (w *exportWriter) pkg(pkg *types.Package) { + // Ensure any referenced packages are declared in the main index. + w.p.allPkgs[pkg] = true + + w.string(w.exportPath(pkg)) +} + +func (w *exportWriter) qualifiedType(obj *types.TypeName) { + name := w.p.exportName(obj) + + // Ensure any referenced declarations are written out too. + w.p.pushDecl(obj) + w.string(name) + w.pkg(obj.Pkg()) +} + +// TODO(rfindley): what does 'pkg' even mean here? It would be better to pass +// it in explicitly into signatures and structs that may use it for +// constructing fields. +func (w *exportWriter) typ(t types.Type, pkg *types.Package) { + w.data.uint64(w.p.typOff(t, pkg)) +} + +func (p *iexporter) newWriter() *exportWriter { + return &exportWriter{p: p} +} + +func (w *exportWriter) flush() uint64 { + off := uint64(w.p.data0.Len()) + io.Copy(&w.p.data0, &w.data) + return off +} + +func (p *iexporter) typOff(t types.Type, pkg *types.Package) uint64 { + off, ok := p.typIndex[t] + if !ok { + w := p.newWriter() + w.doTyp(t, pkg) + off = predeclReserved + w.flush() + p.typIndex[t] = off + } + return off +} + +func (w *exportWriter) startType(k itag) { + w.data.uint64(uint64(k)) +} + +func (w *exportWriter) doTyp(t types.Type, pkg *types.Package) { + if trace { + w.p.trace("exporting type %s (%T)", t, t) + w.p.indent++ + defer func() { + w.p.indent-- + w.p.trace("=> %s", t) + }() + } + switch t := t.(type) { + case *types.Alias: + if targs := aliases.TypeArgs(t); targs.Len() > 0 { + w.startType(instanceType) + w.pos(t.Obj().Pos()) + w.typeList(targs, pkg) + w.typ(aliases.Origin(t), pkg) + return + } + w.startType(aliasType) + w.qualifiedType(t.Obj()) + + case *types.Named: + if targs := t.TypeArgs(); targs.Len() > 0 { + w.startType(instanceType) + // TODO(rfindley): investigate if this position is correct, and if it + // matters. + w.pos(t.Obj().Pos()) + w.typeList(targs, pkg) + w.typ(t.Origin(), pkg) + return + } + w.startType(definedType) + w.qualifiedType(t.Obj()) + + case *types.TypeParam: + w.startType(typeParamType) + w.qualifiedType(t.Obj()) + + case *types.Pointer: + w.startType(pointerType) + w.typ(t.Elem(), pkg) + + case *types.Slice: + w.startType(sliceType) + w.typ(t.Elem(), pkg) + + case *types.Array: + w.startType(arrayType) + w.uint64(uint64(t.Len())) + w.typ(t.Elem(), pkg) + + case *types.Chan: + w.startType(chanType) + // 1 RecvOnly; 2 SendOnly; 3 SendRecv + var dir uint64 + switch t.Dir() { + case types.RecvOnly: + dir = 1 + case types.SendOnly: + dir = 2 + case types.SendRecv: + dir = 3 + } + w.uint64(dir) + w.typ(t.Elem(), pkg) + + case *types.Map: + w.startType(mapType) + w.typ(t.Key(), pkg) + w.typ(t.Elem(), pkg) + + case *types.Signature: + w.startType(signatureType) + w.pkg(pkg) + w.signature(t) + + case *types.Struct: + w.startType(structType) + n := t.NumFields() + // Even for struct{} we must emit some qualifying package, because that's + // what the compiler does, and thus that's what the importer expects. + fieldPkg := pkg + if n > 0 { + fieldPkg = t.Field(0).Pkg() + } + if fieldPkg == nil { + // TODO(rfindley): improve this very hacky logic. + // + // The importer expects a package to be set for all struct types, even + // those with no fields. A better encoding might be to set NumFields + // before pkg. setPkg panics with a nil package, which may be possible + // to reach with invalid packages (and perhaps valid packages, too?), so + // (arbitrarily) set the localpkg if available. + // + // Alternatively, we may be able to simply guarantee that pkg != nil, by + // reconsidering the encoding of constant values. + if w.p.shallow { + fieldPkg = w.p.localpkg + } else { + panic(internalErrorf("no package to set for empty struct")) + } + } + w.pkg(fieldPkg) + w.uint64(uint64(n)) + + for i := 0; i < n; i++ { + f := t.Field(i) + if w.p.shallow { + w.objectPath(f) + } + w.pos(f.Pos()) + w.string(f.Name()) // unexported fields implicitly qualified by prior setPkg + w.typ(f.Type(), fieldPkg) + w.bool(f.Anonymous()) + w.string(t.Tag(i)) // note (or tag) + } + + case *types.Interface: + w.startType(interfaceType) + w.pkg(pkg) + + n := t.NumEmbeddeds() + w.uint64(uint64(n)) + for i := 0; i < n; i++ { + ft := t.EmbeddedType(i) + tPkg := pkg + if named, _ := types.Unalias(ft).(*types.Named); named != nil { + w.pos(named.Obj().Pos()) + } else { + w.pos(token.NoPos) + } + w.typ(ft, tPkg) + } + + // See comment for struct fields. In shallow mode we change the encoding + // for interface methods that are promoted from other packages. + + n = t.NumExplicitMethods() + w.uint64(uint64(n)) + for i := 0; i < n; i++ { + m := t.ExplicitMethod(i) + if w.p.shallow { + w.objectPath(m) + } + w.pos(m.Pos()) + w.string(m.Name()) + sig, _ := m.Type().(*types.Signature) + w.signature(sig) + } + + case *types.Union: + w.startType(unionType) + nt := t.Len() + w.uint64(uint64(nt)) + for i := 0; i < nt; i++ { + term := t.Term(i) + w.bool(term.Tilde()) + w.typ(term.Type(), pkg) + } + + default: + panic(internalErrorf("unexpected type: %v, %v", t, reflect.TypeOf(t))) + } +} + +// objectPath writes the package and objectPath to use to look up obj in a +// different package, when encoding in "shallow" mode. +// +// When doing a shallow import, the importer creates only the local package, +// and requests package symbols for dependencies from the client. +// However, certain types defined in the local package may hold objects defined +// (perhaps deeply) within another package. +// +// For example, consider the following: +// +// package a +// func F() chan * map[string] struct { X int } +// +// package b +// import "a" +// var B = a.F() +// +// In this example, the type of b.B holds fields defined in package a. +// In order to have the correct canonical objects for the field defined in the +// type of B, they are encoded as objectPaths and later looked up in the +// importer. The same problem applies to interface methods. +func (w *exportWriter) objectPath(obj types.Object) { + if obj.Pkg() == nil || obj.Pkg() == w.p.localpkg { + // obj.Pkg() may be nil for the builtin error.Error. + // In this case, or if obj is declared in the local package, no need to + // encode. + w.string("") + return + } + objectPath, err := w.p.objectpathEncoder().For(obj) + if err != nil { + // Fall back to the empty string, which will cause the importer to create a + // new object, which matches earlier behavior. Creating a new object is + // sufficient for many purposes (such as type checking), but causes certain + // references algorithms to fail (golang/go#60819). However, we didn't + // notice this problem during months of gopls@v0.12.0 testing. + // + // TODO(golang/go#61674): this workaround is insufficient, as in the case + // where the field forwarded from an instantiated type that may not appear + // in the export data of the original package: + // + // // package a + // type A[P any] struct{ F P } + // + // // package b + // type B a.A[int] + // + // We need to update references algorithms not to depend on this + // de-duplication, at which point we may want to simply remove the + // workaround here. + w.string("") + return + } + w.string(string(objectPath)) + w.pkg(obj.Pkg()) +} + +func (w *exportWriter) signature(sig *types.Signature) { + w.paramList(sig.Params()) + w.paramList(sig.Results()) + if sig.Params().Len() > 0 { + w.bool(sig.Variadic()) + } +} + +func (w *exportWriter) typeList(ts *types.TypeList, pkg *types.Package) { + w.uint64(uint64(ts.Len())) + for i := 0; i < ts.Len(); i++ { + w.typ(ts.At(i), pkg) + } +} + +func (w *exportWriter) tparamList(prefix string, list *types.TypeParamList, pkg *types.Package) { + ll := uint64(list.Len()) + w.uint64(ll) + for i := 0; i < list.Len(); i++ { + tparam := list.At(i) + // Set the type parameter exportName before exporting its type. + exportName := tparamExportName(prefix, tparam) + w.p.tparamNames[tparam.Obj()] = exportName + w.typ(list.At(i), pkg) + } +} + +const blankMarker = "$" + +// tparamExportName returns the 'exported' name of a type parameter, which +// differs from its actual object name: it is prefixed with a qualifier, and +// blank type parameter names are disambiguated by their index in the type +// parameter list. +func tparamExportName(prefix string, tparam *types.TypeParam) string { + assert(prefix != "") + name := tparam.Obj().Name() + if name == "_" { + name = blankMarker + strconv.Itoa(tparam.Index()) + } + return prefix + "." + name +} + +// tparamName returns the real name of a type parameter, after stripping its +// qualifying prefix and reverting blank-name encoding. See tparamExportName +// for details. +func tparamName(exportName string) string { + // Remove the "path" from the type param name that makes it unique. + ix := strings.LastIndex(exportName, ".") + if ix < 0 { + errorf("malformed type parameter export name %s: missing prefix", exportName) + } + name := exportName[ix+1:] + if strings.HasPrefix(name, blankMarker) { + return "_" + } + return name +} + +func (w *exportWriter) paramList(tup *types.Tuple) { + n := tup.Len() + w.uint64(uint64(n)) + for i := 0; i < n; i++ { + w.param(tup.At(i)) + } +} + +func (w *exportWriter) param(obj types.Object) { + w.pos(obj.Pos()) + w.localIdent(obj) + w.typ(obj.Type(), obj.Pkg()) +} + +func (w *exportWriter) value(typ types.Type, v constant.Value) { + w.typ(typ, nil) + if w.p.version >= iexportVersionGo1_18 { + w.int64(int64(v.Kind())) + } + + if v.Kind() == constant.Unknown { + // golang/go#60605: treat unknown constant values as if they have invalid type + // + // This loses some fidelity over the package type-checked from source, but that + // is acceptable. + // + // TODO(rfindley): we should switch on the recorded constant kind rather + // than the constant type + return + } + + switch b := typ.Underlying().(*types.Basic); b.Info() & types.IsConstType { + case types.IsBoolean: + w.bool(constant.BoolVal(v)) + case types.IsInteger: + var i big.Int + if i64, exact := constant.Int64Val(v); exact { + i.SetInt64(i64) + } else if ui64, exact := constant.Uint64Val(v); exact { + i.SetUint64(ui64) + } else { + i.SetString(v.ExactString(), 10) + } + w.mpint(&i, typ) + case types.IsFloat: + f := constantToFloat(v) + w.mpfloat(f, typ) + case types.IsComplex: + w.mpfloat(constantToFloat(constant.Real(v)), typ) + w.mpfloat(constantToFloat(constant.Imag(v)), typ) + case types.IsString: + w.string(constant.StringVal(v)) + default: + if b.Kind() == types.Invalid { + // package contains type errors + break + } + panic(internalErrorf("unexpected type %v (%v)", typ, typ.Underlying())) + } +} + +// constantToFloat converts a constant.Value with kind constant.Float to a +// big.Float. +func constantToFloat(x constant.Value) *big.Float { + x = constant.ToFloat(x) + // Use the same floating-point precision (512) as cmd/compile + // (see Mpprec in cmd/compile/internal/gc/mpfloat.go). + const mpprec = 512 + var f big.Float + f.SetPrec(mpprec) + if v, exact := constant.Float64Val(x); exact { + // float64 + f.SetFloat64(v) + } else if num, denom := constant.Num(x), constant.Denom(x); num.Kind() == constant.Int { + // TODO(gri): add big.Rat accessor to constant.Value. + n := valueToRat(num) + d := valueToRat(denom) + f.SetRat(n.Quo(n, d)) + } else { + // Value too large to represent as a fraction => inaccessible. + // TODO(gri): add big.Float accessor to constant.Value. + _, ok := f.SetString(x.ExactString()) + assert(ok) + } + return &f +} + +func valueToRat(x constant.Value) *big.Rat { + // Convert little-endian to big-endian. + // I can't believe this is necessary. + bytes := constant.Bytes(x) + for i := 0; i < len(bytes)/2; i++ { + bytes[i], bytes[len(bytes)-1-i] = bytes[len(bytes)-1-i], bytes[i] + } + return new(big.Rat).SetInt(new(big.Int).SetBytes(bytes)) +} + +// mpint exports a multi-precision integer. +// +// For unsigned types, small values are written out as a single +// byte. Larger values are written out as a length-prefixed big-endian +// byte string, where the length prefix is encoded as its complement. +// For example, bytes 0, 1, and 2 directly represent the integer +// values 0, 1, and 2; while bytes 255, 254, and 253 indicate a 1-, +// 2-, and 3-byte big-endian string follow. +// +// Encoding for signed types use the same general approach as for +// unsigned types, except small values use zig-zag encoding and the +// bottom bit of length prefix byte for large values is reserved as a +// sign bit. +// +// The exact boundary between small and large encodings varies +// according to the maximum number of bytes needed to encode a value +// of type typ. As a special case, 8-bit types are always encoded as a +// single byte. +// +// TODO(mdempsky): Is this level of complexity really worthwhile? +func (w *exportWriter) mpint(x *big.Int, typ types.Type) { + basic, ok := typ.Underlying().(*types.Basic) + if !ok { + panic(internalErrorf("unexpected type %v (%T)", typ.Underlying(), typ.Underlying())) + } + + signed, maxBytes := intSize(basic) + + negative := x.Sign() < 0 + if !signed && negative { + panic(internalErrorf("negative unsigned integer; type %v, value %v", typ, x)) + } + + b := x.Bytes() + if len(b) > 0 && b[0] == 0 { + panic(internalErrorf("leading zeros")) + } + if uint(len(b)) > maxBytes { + panic(internalErrorf("bad mpint length: %d > %d (type %v, value %v)", len(b), maxBytes, typ, x)) + } + + maxSmall := 256 - maxBytes + if signed { + maxSmall = 256 - 2*maxBytes + } + if maxBytes == 1 { + maxSmall = 256 + } + + // Check if x can use small value encoding. + if len(b) <= 1 { + var ux uint + if len(b) == 1 { + ux = uint(b[0]) + } + if signed { + ux <<= 1 + if negative { + ux-- + } + } + if ux < maxSmall { + w.data.WriteByte(byte(ux)) + return + } + } + + n := 256 - uint(len(b)) + if signed { + n = 256 - 2*uint(len(b)) + if negative { + n |= 1 + } + } + if n < maxSmall || n >= 256 { + panic(internalErrorf("encoding mistake: %d, %v, %v => %d", len(b), signed, negative, n)) + } + + w.data.WriteByte(byte(n)) + w.data.Write(b) +} + +// mpfloat exports a multi-precision floating point number. +// +// The number's value is decomposed into mantissa × 2**exponent, where +// mantissa is an integer. The value is written out as mantissa (as a +// multi-precision integer) and then the exponent, except exponent is +// omitted if mantissa is zero. +func (w *exportWriter) mpfloat(f *big.Float, typ types.Type) { + if f.IsInf() { + panic("infinite constant") + } + + // Break into f = mant × 2**exp, with 0.5 <= mant < 1. + var mant big.Float + exp := int64(f.MantExp(&mant)) + + // Scale so that mant is an integer. + prec := mant.MinPrec() + mant.SetMantExp(&mant, int(prec)) + exp -= int64(prec) + + manti, acc := mant.Int(nil) + if acc != big.Exact { + panic(internalErrorf("mantissa scaling failed for %f (%s)", f, acc)) + } + w.mpint(manti, typ) + if manti.Sign() != 0 { + w.int64(exp) + } +} + +func (w *exportWriter) bool(b bool) bool { + var x uint64 + if b { + x = 1 + } + w.uint64(x) + return b +} + +func (w *exportWriter) int64(x int64) { w.data.int64(x) } +func (w *exportWriter) uint64(x uint64) { w.data.uint64(x) } +func (w *exportWriter) string(s string) { w.uint64(w.p.stringOff(s)) } + +func (w *exportWriter) localIdent(obj types.Object) { + // Anonymous parameters. + if obj == nil { + w.string("") + return + } + + name := obj.Name() + if name == "_" { + w.string("_") + return + } + + w.string(name) +} + +type intWriter struct { + bytes.Buffer +} + +func (w *intWriter) int64(x int64) { + var buf [binary.MaxVarintLen64]byte + n := binary.PutVarint(buf[:], x) + w.Write(buf[:n]) +} + +func (w *intWriter) uint64(x uint64) { + var buf [binary.MaxVarintLen64]byte + n := binary.PutUvarint(buf[:], x) + w.Write(buf[:n]) +} + +func assert(cond bool) { + if !cond { + panic("internal error: assertion failed") + } +} + +// The below is copied from go/src/cmd/compile/internal/gc/syntax.go. + +// objQueue is a FIFO queue of types.Object. The zero value of objQueue is +// a ready-to-use empty queue. +type objQueue struct { + ring []types.Object + head, tail int +} + +// empty returns true if q contains no Nodes. +func (q *objQueue) empty() bool { + return q.head == q.tail +} + +// pushTail appends n to the tail of the queue. +func (q *objQueue) pushTail(obj types.Object) { + if len(q.ring) == 0 { + q.ring = make([]types.Object, 16) + } else if q.head+len(q.ring) == q.tail { + // Grow the ring. + nring := make([]types.Object, len(q.ring)*2) + // Copy the old elements. + part := q.ring[q.head%len(q.ring):] + if q.tail-q.head <= len(part) { + part = part[:q.tail-q.head] + copy(nring, part) + } else { + pos := copy(nring, part) + copy(nring[pos:], q.ring[:q.tail%len(q.ring)]) + } + q.ring, q.head, q.tail = nring, 0, q.tail-q.head + } + + q.ring[q.tail%len(q.ring)] = obj + q.tail++ +} + +// popHead pops a node from the head of the queue. It panics if q is empty. +func (q *objQueue) popHead() types.Object { + if q.empty() { + panic("dequeue empty") + } + obj := q.ring[q.head%len(q.ring)] + q.head++ + return obj +} + +// internalError represents an error generated inside this package. +type internalError string + +func (e internalError) Error() string { return "gcimporter: " + string(e) } + +// TODO(adonovan): make this call panic, so that it's symmetric with errorf. +// Otherwise it's easy to forget to do anything with the error. +// +// TODO(adonovan): also, consider switching the names "errorf" and +// "internalErrorf" as the former is used for bugs, whose cause is +// internal inconsistency, whereas the latter is used for ordinary +// situations like bad input, whose cause is external. +func internalErrorf(format string, args ...interface{}) error { + return internalError(fmt.Sprintf(format, args...)) +} diff --git a/vendor/golang.org/x/tools/internal/gcimporter/iimport.go b/vendor/golang.org/x/tools/internal/gcimporter/iimport.go new file mode 100644 index 0000000..21908a1 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/gcimporter/iimport.go @@ -0,0 +1,1101 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Indexed package import. +// See iexport.go for the export data format. + +// This file is a copy of $GOROOT/src/go/internal/gcimporter/iimport.go. + +package gcimporter + +import ( + "bytes" + "encoding/binary" + "fmt" + "go/constant" + "go/token" + "go/types" + "io" + "math/big" + "sort" + "strings" + + "golang.org/x/tools/go/types/objectpath" + "golang.org/x/tools/internal/aliases" + "golang.org/x/tools/internal/typesinternal" +) + +type intReader struct { + *bytes.Reader + path string +} + +func (r *intReader) int64() int64 { + i, err := binary.ReadVarint(r.Reader) + if err != nil { + errorf("import %q: read varint error: %v", r.path, err) + } + return i +} + +func (r *intReader) uint64() uint64 { + i, err := binary.ReadUvarint(r.Reader) + if err != nil { + errorf("import %q: read varint error: %v", r.path, err) + } + return i +} + +// Keep this in sync with constants in iexport.go. +const ( + iexportVersionGo1_11 = 0 + iexportVersionPosCol = 1 + iexportVersionGo1_18 = 2 + iexportVersionGenerics = 2 + iexportVersion = iexportVersionGenerics + + iexportVersionCurrent = 2 +) + +type ident struct { + pkg *types.Package + name string +} + +const predeclReserved = 32 + +type itag uint64 + +const ( + // Types + definedType itag = iota + pointerType + sliceType + arrayType + chanType + mapType + signatureType + structType + interfaceType + typeParamType + instanceType + unionType + aliasType +) + +// Object tags +const ( + varTag = 'V' + funcTag = 'F' + genericFuncTag = 'G' + constTag = 'C' + aliasTag = 'A' + genericAliasTag = 'B' + typeParamTag = 'P' + typeTag = 'T' + genericTypeTag = 'U' +) + +// IImportData imports a package from the serialized package data +// and returns 0 and a reference to the package. +// If the export data version is not recognized or the format is otherwise +// compromised, an error is returned. +func IImportData(fset *token.FileSet, imports map[string]*types.Package, data []byte, path string) (int, *types.Package, error) { + pkgs, err := iimportCommon(fset, GetPackagesFromMap(imports), data, false, path, false, nil) + if err != nil { + return 0, nil, err + } + return 0, pkgs[0], nil +} + +// IImportBundle imports a set of packages from the serialized package bundle. +func IImportBundle(fset *token.FileSet, imports map[string]*types.Package, data []byte) ([]*types.Package, error) { + return iimportCommon(fset, GetPackagesFromMap(imports), data, true, "", false, nil) +} + +// A GetPackagesFunc function obtains the non-nil symbols for a set of +// packages, creating and recursively importing them as needed. An +// implementation should store each package symbol is in the Pkg +// field of the items array. +// +// Any error causes importing to fail. This can be used to quickly read +// the import manifest of an export data file without fully decoding it. +type GetPackagesFunc = func(items []GetPackagesItem) error + +// A GetPackagesItem is a request from the importer for the package +// symbol of the specified name and path. +type GetPackagesItem struct { + Name, Path string + Pkg *types.Package // to be filled in by GetPackagesFunc call + + // private importer state + pathOffset uint64 + nameIndex map[string]uint64 +} + +// GetPackagesFromMap returns a GetPackagesFunc that retrieves +// packages from the given map of package path to package. +// +// The returned function may mutate m: each requested package that is not +// found is created with types.NewPackage and inserted into m. +func GetPackagesFromMap(m map[string]*types.Package) GetPackagesFunc { + return func(items []GetPackagesItem) error { + for i, item := range items { + pkg, ok := m[item.Path] + if !ok { + pkg = types.NewPackage(item.Path, item.Name) + m[item.Path] = pkg + } + items[i].Pkg = pkg + } + return nil + } +} + +func iimportCommon(fset *token.FileSet, getPackages GetPackagesFunc, data []byte, bundle bool, path string, shallow bool, reportf ReportFunc) (pkgs []*types.Package, err error) { + const currentVersion = iexportVersionCurrent + version := int64(-1) + if !debug { + defer func() { + if e := recover(); e != nil { + if bundle { + err = fmt.Errorf("%v", e) + } else if version > currentVersion { + err = fmt.Errorf("cannot import %q (%v), export data is newer version - update tool", path, e) + } else { + err = fmt.Errorf("internal error while importing %q (%v); please report an issue", path, e) + } + } + }() + } + + r := &intReader{bytes.NewReader(data), path} + + if bundle { + if v := r.uint64(); v != bundleVersion { + errorf("unknown bundle format version %d", v) + } + } + + version = int64(r.uint64()) + switch version { + case iexportVersionGo1_18, iexportVersionPosCol, iexportVersionGo1_11: + default: + if version > iexportVersionGo1_18 { + errorf("unstable iexport format version %d, just rebuild compiler and std library", version) + } else { + errorf("unknown iexport format version %d", version) + } + } + + sLen := int64(r.uint64()) + var fLen int64 + var fileOffset []uint64 + if shallow { + // Shallow mode uses a different position encoding. + fLen = int64(r.uint64()) + fileOffset = make([]uint64, r.uint64()) + for i := range fileOffset { + fileOffset[i] = r.uint64() + } + } + dLen := int64(r.uint64()) + + whence, _ := r.Seek(0, io.SeekCurrent) + stringData := data[whence : whence+sLen] + fileData := data[whence+sLen : whence+sLen+fLen] + declData := data[whence+sLen+fLen : whence+sLen+fLen+dLen] + r.Seek(sLen+fLen+dLen, io.SeekCurrent) + + p := iimporter{ + version: int(version), + ipath: path, + aliases: aliases.Enabled(), + shallow: shallow, + reportf: reportf, + + stringData: stringData, + stringCache: make(map[uint64]string), + fileOffset: fileOffset, + fileData: fileData, + fileCache: make([]*token.File, len(fileOffset)), + pkgCache: make(map[uint64]*types.Package), + + declData: declData, + pkgIndex: make(map[*types.Package]map[string]uint64), + typCache: make(map[uint64]types.Type), + // Separate map for typeparams, keyed by their package and unique + // name. + tparamIndex: make(map[ident]types.Type), + + fake: fakeFileSet{ + fset: fset, + files: make(map[string]*fileInfo), + }, + } + defer p.fake.setLines() // set lines for files in fset + + for i, pt := range predeclared() { + p.typCache[uint64(i)] = pt + } + + // Gather the relevant packages from the manifest. + items := make([]GetPackagesItem, r.uint64()) + uniquePkgPaths := make(map[string]bool) + for i := range items { + pkgPathOff := r.uint64() + pkgPath := p.stringAt(pkgPathOff) + pkgName := p.stringAt(r.uint64()) + _ = r.uint64() // package height; unused by go/types + + if pkgPath == "" { + pkgPath = path + } + items[i].Name = pkgName + items[i].Path = pkgPath + items[i].pathOffset = pkgPathOff + + // Read index for package. + nameIndex := make(map[string]uint64) + nSyms := r.uint64() + // In shallow mode, only the current package (i=0) has an index. + assert(!(shallow && i > 0 && nSyms != 0)) + for ; nSyms > 0; nSyms-- { + name := p.stringAt(r.uint64()) + nameIndex[name] = r.uint64() + } + + items[i].nameIndex = nameIndex + + uniquePkgPaths[pkgPath] = true + } + // Debugging #63822; hypothesis: there are duplicate PkgPaths. + if len(uniquePkgPaths) != len(items) { + reportf("found duplicate PkgPaths while reading export data manifest: %v", items) + } + + // Request packages all at once from the client, + // enabling a parallel implementation. + if err := getPackages(items); err != nil { + return nil, err // don't wrap this error + } + + // Check the results and complete the index. + pkgList := make([]*types.Package, len(items)) + for i, item := range items { + pkg := item.Pkg + if pkg == nil { + errorf("internal error: getPackages returned nil package for %q", item.Path) + } else if pkg.Path() != item.Path { + errorf("internal error: getPackages returned wrong path %q, want %q", pkg.Path(), item.Path) + } else if pkg.Name() != item.Name { + errorf("internal error: getPackages returned wrong name %s for package %q, want %s", pkg.Name(), item.Path, item.Name) + } + p.pkgCache[item.pathOffset] = pkg + p.pkgIndex[pkg] = item.nameIndex + pkgList[i] = pkg + } + + if bundle { + pkgs = make([]*types.Package, r.uint64()) + for i := range pkgs { + pkg := p.pkgAt(r.uint64()) + imps := make([]*types.Package, r.uint64()) + for j := range imps { + imps[j] = p.pkgAt(r.uint64()) + } + pkg.SetImports(imps) + pkgs[i] = pkg + } + } else { + if len(pkgList) == 0 { + errorf("no packages found for %s", path) + panic("unreachable") + } + pkgs = pkgList[:1] + + // record all referenced packages as imports + list := append(([]*types.Package)(nil), pkgList[1:]...) + sort.Sort(byPath(list)) + pkgs[0].SetImports(list) + } + + for _, pkg := range pkgs { + if pkg.Complete() { + continue + } + + names := make([]string, 0, len(p.pkgIndex[pkg])) + for name := range p.pkgIndex[pkg] { + names = append(names, name) + } + sort.Strings(names) + for _, name := range names { + p.doDecl(pkg, name) + } + + // package was imported completely and without errors + pkg.MarkComplete() + } + + // SetConstraint can't be called if the constraint type is not yet complete. + // When type params are created in the typeParamTag case of (*importReader).obj(), + // the associated constraint type may not be complete due to recursion. + // Therefore, we defer calling SetConstraint there, and call it here instead + // after all types are complete. + for _, d := range p.later { + d.t.SetConstraint(d.constraint) + } + + for _, typ := range p.interfaceList { + typ.Complete() + } + + // Workaround for golang/go#61561. See the doc for instanceList for details. + for _, typ := range p.instanceList { + if iface, _ := typ.Underlying().(*types.Interface); iface != nil { + iface.Complete() + } + } + + return pkgs, nil +} + +type setConstraintArgs struct { + t *types.TypeParam + constraint types.Type +} + +type iimporter struct { + version int + ipath string + + aliases bool + shallow bool + reportf ReportFunc // if non-nil, used to report bugs + + stringData []byte + stringCache map[uint64]string + fileOffset []uint64 // fileOffset[i] is offset in fileData for info about file encoded as i + fileData []byte + fileCache []*token.File // memoized decoding of file encoded as i + pkgCache map[uint64]*types.Package + + declData []byte + pkgIndex map[*types.Package]map[string]uint64 + typCache map[uint64]types.Type + tparamIndex map[ident]types.Type + + fake fakeFileSet + interfaceList []*types.Interface + + // Workaround for the go/types bug golang/go#61561: instances produced during + // instantiation may contain incomplete interfaces. Here we only complete the + // underlying type of the instance, which is the most common case but doesn't + // handle parameterized interface literals defined deeper in the type. + instanceList []types.Type // instances for later completion (see golang/go#61561) + + // Arguments for calls to SetConstraint that are deferred due to recursive types + later []setConstraintArgs + + indent int // for tracing support +} + +func (p *iimporter) trace(format string, args ...interface{}) { + if !trace { + // Call sites should also be guarded, but having this check here allows + // easily enabling/disabling debug trace statements. + return + } + fmt.Printf(strings.Repeat("..", p.indent)+format+"\n", args...) +} + +func (p *iimporter) doDecl(pkg *types.Package, name string) { + if debug { + p.trace("import decl %s", name) + p.indent++ + defer func() { + p.indent-- + p.trace("=> %s", name) + }() + } + // See if we've already imported this declaration. + if obj := pkg.Scope().Lookup(name); obj != nil { + return + } + + off, ok := p.pkgIndex[pkg][name] + if !ok { + // In deep mode, the index should be complete. In shallow + // mode, we should have already recursively loaded necessary + // dependencies so the above Lookup succeeds. + errorf("%v.%v not in index", pkg, name) + } + + r := &importReader{p: p, currPkg: pkg} + r.declReader.Reset(p.declData[off:]) + + r.obj(name) +} + +func (p *iimporter) stringAt(off uint64) string { + if s, ok := p.stringCache[off]; ok { + return s + } + + slen, n := binary.Uvarint(p.stringData[off:]) + if n <= 0 { + errorf("varint failed") + } + spos := off + uint64(n) + s := string(p.stringData[spos : spos+slen]) + p.stringCache[off] = s + return s +} + +func (p *iimporter) fileAt(index uint64) *token.File { + file := p.fileCache[index] + if file == nil { + off := p.fileOffset[index] + file = p.decodeFile(intReader{bytes.NewReader(p.fileData[off:]), p.ipath}) + p.fileCache[index] = file + } + return file +} + +func (p *iimporter) decodeFile(rd intReader) *token.File { + filename := p.stringAt(rd.uint64()) + size := int(rd.uint64()) + file := p.fake.fset.AddFile(filename, -1, size) + + // SetLines requires a nondecreasing sequence. + // Because it is common for clients to derive the interval + // [start, start+len(name)] from a start position, and we + // want to ensure that the end offset is on the same line, + // we fill in the gaps of the sparse encoding with values + // that strictly increase by the largest possible amount. + // This allows us to avoid having to record the actual end + // offset of each needed line. + + lines := make([]int, int(rd.uint64())) + var index, offset int + for i, n := 0, int(rd.uint64()); i < n; i++ { + index += int(rd.uint64()) + offset += int(rd.uint64()) + lines[index] = offset + + // Ensure monotonicity between points. + for j := index - 1; j > 0 && lines[j] == 0; j-- { + lines[j] = lines[j+1] - 1 + } + } + + // Ensure monotonicity after last point. + for j := len(lines) - 1; j > 0 && lines[j] == 0; j-- { + size-- + lines[j] = size + } + + if !file.SetLines(lines) { + errorf("SetLines failed: %d", lines) // can't happen + } + return file +} + +func (p *iimporter) pkgAt(off uint64) *types.Package { + if pkg, ok := p.pkgCache[off]; ok { + return pkg + } + path := p.stringAt(off) + errorf("missing package %q in %q", path, p.ipath) + return nil +} + +func (p *iimporter) typAt(off uint64, base *types.Named) types.Type { + if t, ok := p.typCache[off]; ok && canReuse(base, t) { + return t + } + + if off < predeclReserved { + errorf("predeclared type missing from cache: %v", off) + } + + r := &importReader{p: p} + r.declReader.Reset(p.declData[off-predeclReserved:]) + t := r.doType(base) + + if canReuse(base, t) { + p.typCache[off] = t + } + return t +} + +// canReuse reports whether the type rhs on the RHS of the declaration for def +// may be re-used. +// +// Specifically, if def is non-nil and rhs is an interface type with methods, it +// may not be re-used because we have a convention of setting the receiver type +// for interface methods to def. +func canReuse(def *types.Named, rhs types.Type) bool { + if def == nil { + return true + } + iface, _ := types.Unalias(rhs).(*types.Interface) + if iface == nil { + return true + } + // Don't use iface.Empty() here as iface may not be complete. + return iface.NumEmbeddeds() == 0 && iface.NumExplicitMethods() == 0 +} + +type importReader struct { + p *iimporter + declReader bytes.Reader + currPkg *types.Package + prevFile string + prevLine int64 + prevColumn int64 +} + +func (r *importReader) obj(name string) { + tag := r.byte() + pos := r.pos() + + switch tag { + case aliasTag, genericAliasTag: + var tparams []*types.TypeParam + if tag == genericAliasTag { + tparams = r.tparamList() + } + typ := r.typ() + obj := aliases.NewAlias(r.p.aliases, pos, r.currPkg, name, typ, tparams) + r.declare(obj) + + case constTag: + typ, val := r.value() + + r.declare(types.NewConst(pos, r.currPkg, name, typ, val)) + + case funcTag, genericFuncTag: + var tparams []*types.TypeParam + if tag == genericFuncTag { + tparams = r.tparamList() + } + sig := r.signature(nil, nil, tparams) + r.declare(types.NewFunc(pos, r.currPkg, name, sig)) + + case typeTag, genericTypeTag: + // Types can be recursive. We need to setup a stub + // declaration before recursing. + obj := types.NewTypeName(pos, r.currPkg, name, nil) + named := types.NewNamed(obj, nil, nil) + // Declare obj before calling r.tparamList, so the new type name is recognized + // if used in the constraint of one of its own typeparams (see #48280). + r.declare(obj) + if tag == genericTypeTag { + tparams := r.tparamList() + named.SetTypeParams(tparams) + } + + underlying := r.p.typAt(r.uint64(), named).Underlying() + named.SetUnderlying(underlying) + + if !isInterface(underlying) { + for n := r.uint64(); n > 0; n-- { + mpos := r.pos() + mname := r.ident() + recv := r.param() + + // If the receiver has any targs, set those as the + // rparams of the method (since those are the + // typeparams being used in the method sig/body). + _, recvNamed := typesinternal.ReceiverNamed(recv) + targs := recvNamed.TypeArgs() + var rparams []*types.TypeParam + if targs.Len() > 0 { + rparams = make([]*types.TypeParam, targs.Len()) + for i := range rparams { + rparams[i] = types.Unalias(targs.At(i)).(*types.TypeParam) + } + } + msig := r.signature(recv, rparams, nil) + + named.AddMethod(types.NewFunc(mpos, r.currPkg, mname, msig)) + } + } + + case typeParamTag: + // We need to "declare" a typeparam in order to have a name that + // can be referenced recursively (if needed) in the type param's + // bound. + if r.p.version < iexportVersionGenerics { + errorf("unexpected type param type") + } + name0 := tparamName(name) + tn := types.NewTypeName(pos, r.currPkg, name0, nil) + t := types.NewTypeParam(tn, nil) + + // To handle recursive references to the typeparam within its + // bound, save the partial type in tparamIndex before reading the bounds. + id := ident{r.currPkg, name} + r.p.tparamIndex[id] = t + var implicit bool + if r.p.version >= iexportVersionGo1_18 { + implicit = r.bool() + } + constraint := r.typ() + if implicit { + iface, _ := types.Unalias(constraint).(*types.Interface) + if iface == nil { + errorf("non-interface constraint marked implicit") + } + iface.MarkImplicit() + } + // The constraint type may not be complete, if we + // are in the middle of a type recursion involving type + // constraints. So, we defer SetConstraint until we have + // completely set up all types in ImportData. + r.p.later = append(r.p.later, setConstraintArgs{t: t, constraint: constraint}) + + case varTag: + typ := r.typ() + + r.declare(types.NewVar(pos, r.currPkg, name, typ)) + + default: + errorf("unexpected tag: %v", tag) + } +} + +func (r *importReader) declare(obj types.Object) { + obj.Pkg().Scope().Insert(obj) +} + +func (r *importReader) value() (typ types.Type, val constant.Value) { + typ = r.typ() + if r.p.version >= iexportVersionGo1_18 { + // TODO: add support for using the kind. + _ = constant.Kind(r.int64()) + } + + switch b := typ.Underlying().(*types.Basic); b.Info() & types.IsConstType { + case types.IsBoolean: + val = constant.MakeBool(r.bool()) + + case types.IsString: + val = constant.MakeString(r.string()) + + case types.IsInteger: + var x big.Int + r.mpint(&x, b) + val = constant.Make(&x) + + case types.IsFloat: + val = r.mpfloat(b) + + case types.IsComplex: + re := r.mpfloat(b) + im := r.mpfloat(b) + val = constant.BinaryOp(re, token.ADD, constant.MakeImag(im)) + + default: + if b.Kind() == types.Invalid { + val = constant.MakeUnknown() + return + } + errorf("unexpected type %v", typ) // panics + panic("unreachable") + } + + return +} + +func intSize(b *types.Basic) (signed bool, maxBytes uint) { + if (b.Info() & types.IsUntyped) != 0 { + return true, 64 + } + + switch b.Kind() { + case types.Float32, types.Complex64: + return true, 3 + case types.Float64, types.Complex128: + return true, 7 + } + + signed = (b.Info() & types.IsUnsigned) == 0 + switch b.Kind() { + case types.Int8, types.Uint8: + maxBytes = 1 + case types.Int16, types.Uint16: + maxBytes = 2 + case types.Int32, types.Uint32: + maxBytes = 4 + default: + maxBytes = 8 + } + + return +} + +func (r *importReader) mpint(x *big.Int, typ *types.Basic) { + signed, maxBytes := intSize(typ) + + maxSmall := 256 - maxBytes + if signed { + maxSmall = 256 - 2*maxBytes + } + if maxBytes == 1 { + maxSmall = 256 + } + + n, _ := r.declReader.ReadByte() + if uint(n) < maxSmall { + v := int64(n) + if signed { + v >>= 1 + if n&1 != 0 { + v = ^v + } + } + x.SetInt64(v) + return + } + + v := -n + if signed { + v = -(n &^ 1) >> 1 + } + if v < 1 || uint(v) > maxBytes { + errorf("weird decoding: %v, %v => %v", n, signed, v) + } + b := make([]byte, v) + io.ReadFull(&r.declReader, b) + x.SetBytes(b) + if signed && n&1 != 0 { + x.Neg(x) + } +} + +func (r *importReader) mpfloat(typ *types.Basic) constant.Value { + var mant big.Int + r.mpint(&mant, typ) + var f big.Float + f.SetInt(&mant) + if f.Sign() != 0 { + f.SetMantExp(&f, int(r.int64())) + } + return constant.Make(&f) +} + +func (r *importReader) ident() string { + return r.string() +} + +func (r *importReader) qualifiedIdent() (*types.Package, string) { + name := r.string() + pkg := r.pkg() + return pkg, name +} + +func (r *importReader) pos() token.Pos { + if r.p.shallow { + // precise offsets are encoded only in shallow mode + return r.posv2() + } + if r.p.version >= iexportVersionPosCol { + r.posv1() + } else { + r.posv0() + } + + if r.prevFile == "" && r.prevLine == 0 && r.prevColumn == 0 { + return token.NoPos + } + return r.p.fake.pos(r.prevFile, int(r.prevLine), int(r.prevColumn)) +} + +func (r *importReader) posv0() { + delta := r.int64() + if delta != deltaNewFile { + r.prevLine += delta + } else if l := r.int64(); l == -1 { + r.prevLine += deltaNewFile + } else { + r.prevFile = r.string() + r.prevLine = l + } +} + +func (r *importReader) posv1() { + delta := r.int64() + r.prevColumn += delta >> 1 + if delta&1 != 0 { + delta = r.int64() + r.prevLine += delta >> 1 + if delta&1 != 0 { + r.prevFile = r.string() + } + } +} + +func (r *importReader) posv2() token.Pos { + file := r.uint64() + if file == 0 { + return token.NoPos + } + tf := r.p.fileAt(file - 1) + return tf.Pos(int(r.uint64())) +} + +func (r *importReader) typ() types.Type { + return r.p.typAt(r.uint64(), nil) +} + +func isInterface(t types.Type) bool { + _, ok := types.Unalias(t).(*types.Interface) + return ok +} + +func (r *importReader) pkg() *types.Package { return r.p.pkgAt(r.uint64()) } +func (r *importReader) string() string { return r.p.stringAt(r.uint64()) } + +func (r *importReader) doType(base *types.Named) (res types.Type) { + k := r.kind() + if debug { + r.p.trace("importing type %d (base: %v)", k, base) + r.p.indent++ + defer func() { + r.p.indent-- + r.p.trace("=> %s", res) + }() + } + switch k { + default: + errorf("unexpected kind tag in %q: %v", r.p.ipath, k) + return nil + + case aliasType, definedType: + pkg, name := r.qualifiedIdent() + r.p.doDecl(pkg, name) + return pkg.Scope().Lookup(name).(*types.TypeName).Type() + case pointerType: + return types.NewPointer(r.typ()) + case sliceType: + return types.NewSlice(r.typ()) + case arrayType: + n := r.uint64() + return types.NewArray(r.typ(), int64(n)) + case chanType: + dir := chanDir(int(r.uint64())) + return types.NewChan(dir, r.typ()) + case mapType: + return types.NewMap(r.typ(), r.typ()) + case signatureType: + r.currPkg = r.pkg() + return r.signature(nil, nil, nil) + + case structType: + r.currPkg = r.pkg() + + fields := make([]*types.Var, r.uint64()) + tags := make([]string, len(fields)) + for i := range fields { + var field *types.Var + if r.p.shallow { + field, _ = r.objectPathObject().(*types.Var) + } + + fpos := r.pos() + fname := r.ident() + ftyp := r.typ() + emb := r.bool() + tag := r.string() + + // Either this is not a shallow import, the field is local, or the + // encoded objectPath failed to produce an object (a bug). + // + // Even in this last, buggy case, fall back on creating a new field. As + // discussed in iexport.go, this is not correct, but mostly works and is + // preferable to failing (for now at least). + if field == nil { + field = types.NewField(fpos, r.currPkg, fname, ftyp, emb) + } + + fields[i] = field + tags[i] = tag + } + return types.NewStruct(fields, tags) + + case interfaceType: + r.currPkg = r.pkg() + + embeddeds := make([]types.Type, r.uint64()) + for i := range embeddeds { + _ = r.pos() + embeddeds[i] = r.typ() + } + + methods := make([]*types.Func, r.uint64()) + for i := range methods { + var method *types.Func + if r.p.shallow { + method, _ = r.objectPathObject().(*types.Func) + } + + mpos := r.pos() + mname := r.ident() + + // TODO(mdempsky): Matches bimport.go, but I + // don't agree with this. + var recv *types.Var + if base != nil { + recv = types.NewVar(token.NoPos, r.currPkg, "", base) + } + msig := r.signature(recv, nil, nil) + + if method == nil { + method = types.NewFunc(mpos, r.currPkg, mname, msig) + } + methods[i] = method + } + + typ := types.NewInterfaceType(methods, embeddeds) + r.p.interfaceList = append(r.p.interfaceList, typ) + return typ + + case typeParamType: + if r.p.version < iexportVersionGenerics { + errorf("unexpected type param type") + } + pkg, name := r.qualifiedIdent() + id := ident{pkg, name} + if t, ok := r.p.tparamIndex[id]; ok { + // We're already in the process of importing this typeparam. + return t + } + // Otherwise, import the definition of the typeparam now. + r.p.doDecl(pkg, name) + return r.p.tparamIndex[id] + + case instanceType: + if r.p.version < iexportVersionGenerics { + errorf("unexpected instantiation type") + } + // pos does not matter for instances: they are positioned on the original + // type. + _ = r.pos() + len := r.uint64() + targs := make([]types.Type, len) + for i := range targs { + targs[i] = r.typ() + } + baseType := r.typ() + // The imported instantiated type doesn't include any methods, so + // we must always use the methods of the base (orig) type. + // TODO provide a non-nil *Environment + t, _ := types.Instantiate(nil, baseType, targs, false) + + // Workaround for golang/go#61561. See the doc for instanceList for details. + r.p.instanceList = append(r.p.instanceList, t) + return t + + case unionType: + if r.p.version < iexportVersionGenerics { + errorf("unexpected instantiation type") + } + terms := make([]*types.Term, r.uint64()) + for i := range terms { + terms[i] = types.NewTerm(r.bool(), r.typ()) + } + return types.NewUnion(terms) + } +} + +func (r *importReader) kind() itag { + return itag(r.uint64()) +} + +// objectPathObject is the inverse of exportWriter.objectPath. +// +// In shallow mode, certain fields and methods may need to be looked up in an +// imported package. See the doc for exportWriter.objectPath for a full +// explanation. +func (r *importReader) objectPathObject() types.Object { + objPath := objectpath.Path(r.string()) + if objPath == "" { + return nil + } + pkg := r.pkg() + obj, err := objectpath.Object(pkg, objPath) + if err != nil { + if r.p.reportf != nil { + r.p.reportf("failed to find object for objectPath %q: %v", objPath, err) + } + } + return obj +} + +func (r *importReader) signature(recv *types.Var, rparams []*types.TypeParam, tparams []*types.TypeParam) *types.Signature { + params := r.paramList() + results := r.paramList() + variadic := params.Len() > 0 && r.bool() + return types.NewSignatureType(recv, rparams, tparams, params, results, variadic) +} + +func (r *importReader) tparamList() []*types.TypeParam { + n := r.uint64() + if n == 0 { + return nil + } + xs := make([]*types.TypeParam, n) + for i := range xs { + // Note: the standard library importer is tolerant of nil types here, + // though would panic in SetTypeParams. + xs[i] = types.Unalias(r.typ()).(*types.TypeParam) + } + return xs +} + +func (r *importReader) paramList() *types.Tuple { + xs := make([]*types.Var, r.uint64()) + for i := range xs { + xs[i] = r.param() + } + return types.NewTuple(xs...) +} + +func (r *importReader) param() *types.Var { + pos := r.pos() + name := r.ident() + typ := r.typ() + return types.NewParam(pos, r.currPkg, name, typ) +} + +func (r *importReader) bool() bool { + return r.uint64() != 0 +} + +func (r *importReader) int64() int64 { + n, err := binary.ReadVarint(&r.declReader) + if err != nil { + errorf("readVarint: %v", err) + } + return n +} + +func (r *importReader) uint64() uint64 { + n, err := binary.ReadUvarint(&r.declReader) + if err != nil { + errorf("readUvarint: %v", err) + } + return n +} + +func (r *importReader) byte() byte { + x, err := r.declReader.ReadByte() + if err != nil { + errorf("declReader.ReadByte: %v", err) + } + return x +} diff --git a/vendor/golang.org/x/tools/internal/gcimporter/predeclared.go b/vendor/golang.org/x/tools/internal/gcimporter/predeclared.go new file mode 100644 index 0000000..907c855 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/gcimporter/predeclared.go @@ -0,0 +1,91 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gcimporter + +import ( + "go/types" + "sync" +) + +// predecl is a cache for the predeclared types in types.Universe. +// +// Cache a distinct result based on the runtime value of any. +// The pointer value of the any type varies based on GODEBUG settings. +var predeclMu sync.Mutex +var predecl map[types.Type][]types.Type + +func predeclared() []types.Type { + anyt := types.Universe.Lookup("any").Type() + + predeclMu.Lock() + defer predeclMu.Unlock() + + if pre, ok := predecl[anyt]; ok { + return pre + } + + if predecl == nil { + predecl = make(map[types.Type][]types.Type) + } + + decls := []types.Type{ // basic types + types.Typ[types.Bool], + types.Typ[types.Int], + types.Typ[types.Int8], + types.Typ[types.Int16], + types.Typ[types.Int32], + types.Typ[types.Int64], + types.Typ[types.Uint], + types.Typ[types.Uint8], + types.Typ[types.Uint16], + types.Typ[types.Uint32], + types.Typ[types.Uint64], + types.Typ[types.Uintptr], + types.Typ[types.Float32], + types.Typ[types.Float64], + types.Typ[types.Complex64], + types.Typ[types.Complex128], + types.Typ[types.String], + + // basic type aliases + types.Universe.Lookup("byte").Type(), + types.Universe.Lookup("rune").Type(), + + // error + types.Universe.Lookup("error").Type(), + + // untyped types + types.Typ[types.UntypedBool], + types.Typ[types.UntypedInt], + types.Typ[types.UntypedRune], + types.Typ[types.UntypedFloat], + types.Typ[types.UntypedComplex], + types.Typ[types.UntypedString], + types.Typ[types.UntypedNil], + + // package unsafe + types.Typ[types.UnsafePointer], + + // invalid type + types.Typ[types.Invalid], // only appears in packages with errors + + // used internally by gc; never used by this package or in .a files + anyType{}, + + // comparable + types.Universe.Lookup("comparable").Type(), + + // any + anyt, + } + + predecl[anyt] = decls + return decls +} + +type anyType struct{} + +func (t anyType) Underlying() types.Type { return t } +func (t anyType) String() string { return "any" } diff --git a/vendor/golang.org/x/tools/internal/gcimporter/ureader_yes.go b/vendor/golang.org/x/tools/internal/gcimporter/ureader_yes.go new file mode 100644 index 0000000..1db4086 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/gcimporter/ureader_yes.go @@ -0,0 +1,754 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Derived from go/internal/gcimporter/ureader.go + +package gcimporter + +import ( + "fmt" + "go/token" + "go/types" + "sort" + "strings" + + "golang.org/x/tools/internal/aliases" + "golang.org/x/tools/internal/pkgbits" +) + +// A pkgReader holds the shared state for reading a unified IR package +// description. +type pkgReader struct { + pkgbits.PkgDecoder + + fake fakeFileSet + + ctxt *types.Context + imports map[string]*types.Package // previously imported packages, indexed by path + aliases bool // create types.Alias nodes + + // lazily initialized arrays corresponding to the unified IR + // PosBase, Pkg, and Type sections, respectively. + posBases []string // position bases (i.e., file names) + pkgs []*types.Package + typs []types.Type + + // laterFns holds functions that need to be invoked at the end of + // import reading. + laterFns []func() + // laterFors is used in case of 'type A B' to ensure that B is processed before A. + laterFors map[types.Type]int + + // ifaces holds a list of constructed Interfaces, which need to have + // Complete called after importing is done. + ifaces []*types.Interface +} + +// later adds a function to be invoked at the end of import reading. +func (pr *pkgReader) later(fn func()) { + pr.laterFns = append(pr.laterFns, fn) +} + +// See cmd/compile/internal/noder.derivedInfo. +type derivedInfo struct { + idx pkgbits.Index +} + +// See cmd/compile/internal/noder.typeInfo. +type typeInfo struct { + idx pkgbits.Index + derived bool +} + +func UImportData(fset *token.FileSet, imports map[string]*types.Package, data []byte, path string) (_ int, pkg *types.Package, err error) { + if !debug { + defer func() { + if x := recover(); x != nil { + err = fmt.Errorf("internal error in importing %q (%v); please report an issue", path, x) + } + }() + } + + s := string(data) + s = s[:strings.LastIndex(s, "\n$$\n")] + input := pkgbits.NewPkgDecoder(path, s) + pkg = readUnifiedPackage(fset, nil, imports, input) + return +} + +// laterFor adds a function to be invoked at the end of import reading, and records the type that function is finishing. +func (pr *pkgReader) laterFor(t types.Type, fn func()) { + if pr.laterFors == nil { + pr.laterFors = make(map[types.Type]int) + } + pr.laterFors[t] = len(pr.laterFns) + pr.laterFns = append(pr.laterFns, fn) +} + +// readUnifiedPackage reads a package description from the given +// unified IR export data decoder. +func readUnifiedPackage(fset *token.FileSet, ctxt *types.Context, imports map[string]*types.Package, input pkgbits.PkgDecoder) *types.Package { + pr := pkgReader{ + PkgDecoder: input, + + fake: fakeFileSet{ + fset: fset, + files: make(map[string]*fileInfo), + }, + + ctxt: ctxt, + imports: imports, + aliases: aliases.Enabled(), + + posBases: make([]string, input.NumElems(pkgbits.RelocPosBase)), + pkgs: make([]*types.Package, input.NumElems(pkgbits.RelocPkg)), + typs: make([]types.Type, input.NumElems(pkgbits.RelocType)), + } + defer pr.fake.setLines() + + r := pr.newReader(pkgbits.RelocMeta, pkgbits.PublicRootIdx, pkgbits.SyncPublic) + pkg := r.pkg() + if r.Version().Has(pkgbits.HasInit) { + r.Bool() + } + + for i, n := 0, r.Len(); i < n; i++ { + // As if r.obj(), but avoiding the Scope.Lookup call, + // to avoid eager loading of imports. + r.Sync(pkgbits.SyncObject) + if r.Version().Has(pkgbits.DerivedFuncInstance) { + assert(!r.Bool()) + } + r.p.objIdx(r.Reloc(pkgbits.RelocObj)) + assert(r.Len() == 0) + } + + r.Sync(pkgbits.SyncEOF) + + for _, fn := range pr.laterFns { + fn() + } + + for _, iface := range pr.ifaces { + iface.Complete() + } + + // Imports() of pkg are all of the transitive packages that were loaded. + var imps []*types.Package + for _, imp := range pr.pkgs { + if imp != nil && imp != pkg { + imps = append(imps, imp) + } + } + sort.Sort(byPath(imps)) + pkg.SetImports(imps) + + pkg.MarkComplete() + return pkg +} + +// A reader holds the state for reading a single unified IR element +// within a package. +type reader struct { + pkgbits.Decoder + + p *pkgReader + + dict *readerDict +} + +// A readerDict holds the state for type parameters that parameterize +// the current unified IR element. +type readerDict struct { + // bounds is a slice of typeInfos corresponding to the underlying + // bounds of the element's type parameters. + bounds []typeInfo + + // tparams is a slice of the constructed TypeParams for the element. + tparams []*types.TypeParam + + // derived is a slice of types derived from tparams, which may be + // instantiated while reading the current element. + derived []derivedInfo + derivedTypes []types.Type // lazily instantiated from derived +} + +func (pr *pkgReader) newReader(k pkgbits.RelocKind, idx pkgbits.Index, marker pkgbits.SyncMarker) *reader { + return &reader{ + Decoder: pr.NewDecoder(k, idx, marker), + p: pr, + } +} + +func (pr *pkgReader) tempReader(k pkgbits.RelocKind, idx pkgbits.Index, marker pkgbits.SyncMarker) *reader { + return &reader{ + Decoder: pr.TempDecoder(k, idx, marker), + p: pr, + } +} + +func (pr *pkgReader) retireReader(r *reader) { + pr.RetireDecoder(&r.Decoder) +} + +// @@@ Positions + +func (r *reader) pos() token.Pos { + r.Sync(pkgbits.SyncPos) + if !r.Bool() { + return token.NoPos + } + + // TODO(mdempsky): Delta encoding. + posBase := r.posBase() + line := r.Uint() + col := r.Uint() + return r.p.fake.pos(posBase, int(line), int(col)) +} + +func (r *reader) posBase() string { + return r.p.posBaseIdx(r.Reloc(pkgbits.RelocPosBase)) +} + +func (pr *pkgReader) posBaseIdx(idx pkgbits.Index) string { + if b := pr.posBases[idx]; b != "" { + return b + } + + var filename string + { + r := pr.tempReader(pkgbits.RelocPosBase, idx, pkgbits.SyncPosBase) + + // Within types2, position bases have a lot more details (e.g., + // keeping track of where //line directives appeared exactly). + // + // For go/types, we just track the file name. + + filename = r.String() + + if r.Bool() { // file base + // Was: "b = token.NewTrimmedFileBase(filename, true)" + } else { // line base + pos := r.pos() + line := r.Uint() + col := r.Uint() + + // Was: "b = token.NewLineBase(pos, filename, true, line, col)" + _, _, _ = pos, line, col + } + pr.retireReader(r) + } + b := filename + pr.posBases[idx] = b + return b +} + +// @@@ Packages + +func (r *reader) pkg() *types.Package { + r.Sync(pkgbits.SyncPkg) + return r.p.pkgIdx(r.Reloc(pkgbits.RelocPkg)) +} + +func (pr *pkgReader) pkgIdx(idx pkgbits.Index) *types.Package { + // TODO(mdempsky): Consider using some non-nil pointer to indicate + // the universe scope, so we don't need to keep re-reading it. + if pkg := pr.pkgs[idx]; pkg != nil { + return pkg + } + + pkg := pr.newReader(pkgbits.RelocPkg, idx, pkgbits.SyncPkgDef).doPkg() + pr.pkgs[idx] = pkg + return pkg +} + +func (r *reader) doPkg() *types.Package { + path := r.String() + switch path { + case "": + path = r.p.PkgPath() + case "builtin": + return nil // universe + case "unsafe": + return types.Unsafe + } + + if pkg := r.p.imports[path]; pkg != nil { + return pkg + } + + name := r.String() + + pkg := types.NewPackage(path, name) + r.p.imports[path] = pkg + + return pkg +} + +// @@@ Types + +func (r *reader) typ() types.Type { + return r.p.typIdx(r.typInfo(), r.dict) +} + +func (r *reader) typInfo() typeInfo { + r.Sync(pkgbits.SyncType) + if r.Bool() { + return typeInfo{idx: pkgbits.Index(r.Len()), derived: true} + } + return typeInfo{idx: r.Reloc(pkgbits.RelocType), derived: false} +} + +func (pr *pkgReader) typIdx(info typeInfo, dict *readerDict) types.Type { + idx := info.idx + var where *types.Type + if info.derived { + where = &dict.derivedTypes[idx] + idx = dict.derived[idx].idx + } else { + where = &pr.typs[idx] + } + + if typ := *where; typ != nil { + return typ + } + + var typ types.Type + { + r := pr.tempReader(pkgbits.RelocType, idx, pkgbits.SyncTypeIdx) + r.dict = dict + + typ = r.doTyp() + assert(typ != nil) + pr.retireReader(r) + } + // See comment in pkgReader.typIdx explaining how this happens. + if prev := *where; prev != nil { + return prev + } + + *where = typ + return typ +} + +func (r *reader) doTyp() (res types.Type) { + switch tag := pkgbits.CodeType(r.Code(pkgbits.SyncType)); tag { + default: + errorf("unhandled type tag: %v", tag) + panic("unreachable") + + case pkgbits.TypeBasic: + return types.Typ[r.Len()] + + case pkgbits.TypeNamed: + obj, targs := r.obj() + name := obj.(*types.TypeName) + if len(targs) != 0 { + t, _ := types.Instantiate(r.p.ctxt, name.Type(), targs, false) + return t + } + return name.Type() + + case pkgbits.TypeTypeParam: + return r.dict.tparams[r.Len()] + + case pkgbits.TypeArray: + len := int64(r.Uint64()) + return types.NewArray(r.typ(), len) + case pkgbits.TypeChan: + dir := types.ChanDir(r.Len()) + return types.NewChan(dir, r.typ()) + case pkgbits.TypeMap: + return types.NewMap(r.typ(), r.typ()) + case pkgbits.TypePointer: + return types.NewPointer(r.typ()) + case pkgbits.TypeSignature: + return r.signature(nil, nil, nil) + case pkgbits.TypeSlice: + return types.NewSlice(r.typ()) + case pkgbits.TypeStruct: + return r.structType() + case pkgbits.TypeInterface: + return r.interfaceType() + case pkgbits.TypeUnion: + return r.unionType() + } +} + +func (r *reader) structType() *types.Struct { + fields := make([]*types.Var, r.Len()) + var tags []string + for i := range fields { + pos := r.pos() + pkg, name := r.selector() + ftyp := r.typ() + tag := r.String() + embedded := r.Bool() + + fields[i] = types.NewField(pos, pkg, name, ftyp, embedded) + if tag != "" { + for len(tags) < i { + tags = append(tags, "") + } + tags = append(tags, tag) + } + } + return types.NewStruct(fields, tags) +} + +func (r *reader) unionType() *types.Union { + terms := make([]*types.Term, r.Len()) + for i := range terms { + terms[i] = types.NewTerm(r.Bool(), r.typ()) + } + return types.NewUnion(terms) +} + +func (r *reader) interfaceType() *types.Interface { + methods := make([]*types.Func, r.Len()) + embeddeds := make([]types.Type, r.Len()) + implicit := len(methods) == 0 && len(embeddeds) == 1 && r.Bool() + + for i := range methods { + pos := r.pos() + pkg, name := r.selector() + mtyp := r.signature(nil, nil, nil) + methods[i] = types.NewFunc(pos, pkg, name, mtyp) + } + + for i := range embeddeds { + embeddeds[i] = r.typ() + } + + iface := types.NewInterfaceType(methods, embeddeds) + if implicit { + iface.MarkImplicit() + } + + // We need to call iface.Complete(), but if there are any embedded + // defined types, then we may not have set their underlying + // interface type yet. So we need to defer calling Complete until + // after we've called SetUnderlying everywhere. + // + // TODO(mdempsky): After CL 424876 lands, it should be safe to call + // iface.Complete() immediately. + r.p.ifaces = append(r.p.ifaces, iface) + + return iface +} + +func (r *reader) signature(recv *types.Var, rtparams, tparams []*types.TypeParam) *types.Signature { + r.Sync(pkgbits.SyncSignature) + + params := r.params() + results := r.params() + variadic := r.Bool() + + return types.NewSignatureType(recv, rtparams, tparams, params, results, variadic) +} + +func (r *reader) params() *types.Tuple { + r.Sync(pkgbits.SyncParams) + + params := make([]*types.Var, r.Len()) + for i := range params { + params[i] = r.param() + } + + return types.NewTuple(params...) +} + +func (r *reader) param() *types.Var { + r.Sync(pkgbits.SyncParam) + + pos := r.pos() + pkg, name := r.localIdent() + typ := r.typ() + + return types.NewParam(pos, pkg, name, typ) +} + +// @@@ Objects + +func (r *reader) obj() (types.Object, []types.Type) { + r.Sync(pkgbits.SyncObject) + + if r.Version().Has(pkgbits.DerivedFuncInstance) { + assert(!r.Bool()) + } + + pkg, name := r.p.objIdx(r.Reloc(pkgbits.RelocObj)) + obj := pkgScope(pkg).Lookup(name) + + targs := make([]types.Type, r.Len()) + for i := range targs { + targs[i] = r.typ() + } + + return obj, targs +} + +func (pr *pkgReader) objIdx(idx pkgbits.Index) (*types.Package, string) { + + var objPkg *types.Package + var objName string + var tag pkgbits.CodeObj + { + rname := pr.tempReader(pkgbits.RelocName, idx, pkgbits.SyncObject1) + + objPkg, objName = rname.qualifiedIdent() + assert(objName != "") + + tag = pkgbits.CodeObj(rname.Code(pkgbits.SyncCodeObj)) + pr.retireReader(rname) + } + + if tag == pkgbits.ObjStub { + assert(objPkg == nil || objPkg == types.Unsafe) + return objPkg, objName + } + + // Ignore local types promoted to global scope (#55110). + if _, suffix := splitVargenSuffix(objName); suffix != "" { + return objPkg, objName + } + + if objPkg.Scope().Lookup(objName) == nil { + dict := pr.objDictIdx(idx) + + r := pr.newReader(pkgbits.RelocObj, idx, pkgbits.SyncObject1) + r.dict = dict + + declare := func(obj types.Object) { + objPkg.Scope().Insert(obj) + } + + switch tag { + default: + panic("weird") + + case pkgbits.ObjAlias: + pos := r.pos() + var tparams []*types.TypeParam + if r.Version().Has(pkgbits.AliasTypeParamNames) { + tparams = r.typeParamNames() + } + typ := r.typ() + declare(aliases.NewAlias(r.p.aliases, pos, objPkg, objName, typ, tparams)) + + case pkgbits.ObjConst: + pos := r.pos() + typ := r.typ() + val := r.Value() + declare(types.NewConst(pos, objPkg, objName, typ, val)) + + case pkgbits.ObjFunc: + pos := r.pos() + tparams := r.typeParamNames() + sig := r.signature(nil, nil, tparams) + declare(types.NewFunc(pos, objPkg, objName, sig)) + + case pkgbits.ObjType: + pos := r.pos() + + obj := types.NewTypeName(pos, objPkg, objName, nil) + named := types.NewNamed(obj, nil, nil) + declare(obj) + + named.SetTypeParams(r.typeParamNames()) + + setUnderlying := func(underlying types.Type) { + // If the underlying type is an interface, we need to + // duplicate its methods so we can replace the receiver + // parameter's type (#49906). + if iface, ok := types.Unalias(underlying).(*types.Interface); ok && iface.NumExplicitMethods() != 0 { + methods := make([]*types.Func, iface.NumExplicitMethods()) + for i := range methods { + fn := iface.ExplicitMethod(i) + sig := fn.Type().(*types.Signature) + + recv := types.NewVar(fn.Pos(), fn.Pkg(), "", named) + methods[i] = types.NewFunc(fn.Pos(), fn.Pkg(), fn.Name(), types.NewSignature(recv, sig.Params(), sig.Results(), sig.Variadic())) + } + + embeds := make([]types.Type, iface.NumEmbeddeds()) + for i := range embeds { + embeds[i] = iface.EmbeddedType(i) + } + + newIface := types.NewInterfaceType(methods, embeds) + r.p.ifaces = append(r.p.ifaces, newIface) + underlying = newIface + } + + named.SetUnderlying(underlying) + } + + // Since go.dev/cl/455279, we can assume rhs.Underlying() will + // always be non-nil. However, to temporarily support users of + // older snapshot releases, we continue to fallback to the old + // behavior for now. + // + // TODO(mdempsky): Remove fallback code and simplify after + // allowing time for snapshot users to upgrade. + rhs := r.typ() + if underlying := rhs.Underlying(); underlying != nil { + setUnderlying(underlying) + } else { + pk := r.p + pk.laterFor(named, func() { + // First be sure that the rhs is initialized, if it needs to be initialized. + delete(pk.laterFors, named) // prevent cycles + if i, ok := pk.laterFors[rhs]; ok { + f := pk.laterFns[i] + pk.laterFns[i] = func() {} // function is running now, so replace it with a no-op + f() // initialize RHS + } + setUnderlying(rhs.Underlying()) + }) + } + + for i, n := 0, r.Len(); i < n; i++ { + named.AddMethod(r.method()) + } + + case pkgbits.ObjVar: + pos := r.pos() + typ := r.typ() + declare(types.NewVar(pos, objPkg, objName, typ)) + } + } + + return objPkg, objName +} + +func (pr *pkgReader) objDictIdx(idx pkgbits.Index) *readerDict { + + var dict readerDict + + { + r := pr.tempReader(pkgbits.RelocObjDict, idx, pkgbits.SyncObject1) + if implicits := r.Len(); implicits != 0 { + errorf("unexpected object with %v implicit type parameter(s)", implicits) + } + + dict.bounds = make([]typeInfo, r.Len()) + for i := range dict.bounds { + dict.bounds[i] = r.typInfo() + } + + dict.derived = make([]derivedInfo, r.Len()) + dict.derivedTypes = make([]types.Type, len(dict.derived)) + for i := range dict.derived { + dict.derived[i] = derivedInfo{idx: r.Reloc(pkgbits.RelocType)} + if r.Version().Has(pkgbits.DerivedInfoNeeded) { + assert(!r.Bool()) + } + } + + pr.retireReader(r) + } + // function references follow, but reader doesn't need those + + return &dict +} + +func (r *reader) typeParamNames() []*types.TypeParam { + r.Sync(pkgbits.SyncTypeParamNames) + + // Note: This code assumes it only processes objects without + // implement type parameters. This is currently fine, because + // reader is only used to read in exported declarations, which are + // always package scoped. + + if len(r.dict.bounds) == 0 { + return nil + } + + // Careful: Type parameter lists may have cycles. To allow for this, + // we construct the type parameter list in two passes: first we + // create all the TypeNames and TypeParams, then we construct and + // set the bound type. + + r.dict.tparams = make([]*types.TypeParam, len(r.dict.bounds)) + for i := range r.dict.bounds { + pos := r.pos() + pkg, name := r.localIdent() + + tname := types.NewTypeName(pos, pkg, name, nil) + r.dict.tparams[i] = types.NewTypeParam(tname, nil) + } + + typs := make([]types.Type, len(r.dict.bounds)) + for i, bound := range r.dict.bounds { + typs[i] = r.p.typIdx(bound, r.dict) + } + + // TODO(mdempsky): This is subtle, elaborate further. + // + // We have to save tparams outside of the closure, because + // typeParamNames() can be called multiple times with the same + // dictionary instance. + // + // Also, this needs to happen later to make sure SetUnderlying has + // been called. + // + // TODO(mdempsky): Is it safe to have a single "later" slice or do + // we need to have multiple passes? See comments on CL 386002 and + // go.dev/issue/52104. + tparams := r.dict.tparams + r.p.later(func() { + for i, typ := range typs { + tparams[i].SetConstraint(typ) + } + }) + + return r.dict.tparams +} + +func (r *reader) method() *types.Func { + r.Sync(pkgbits.SyncMethod) + pos := r.pos() + pkg, name := r.selector() + + rparams := r.typeParamNames() + sig := r.signature(r.param(), rparams, nil) + + _ = r.pos() // TODO(mdempsky): Remove; this is a hacker for linker.go. + return types.NewFunc(pos, pkg, name, sig) +} + +func (r *reader) qualifiedIdent() (*types.Package, string) { return r.ident(pkgbits.SyncSym) } +func (r *reader) localIdent() (*types.Package, string) { return r.ident(pkgbits.SyncLocalIdent) } +func (r *reader) selector() (*types.Package, string) { return r.ident(pkgbits.SyncSelector) } + +func (r *reader) ident(marker pkgbits.SyncMarker) (*types.Package, string) { + r.Sync(marker) + return r.pkg(), r.String() +} + +// pkgScope returns pkg.Scope(). +// If pkg is nil, it returns types.Universe instead. +// +// TODO(mdempsky): Remove after x/tools can depend on Go 1.19. +func pkgScope(pkg *types.Package) *types.Scope { + if pkg != nil { + return pkg.Scope() + } + return types.Universe +} + +// See cmd/compile/internal/types.SplitVargenSuffix. +func splitVargenSuffix(name string) (base, suffix string) { + i := len(name) + for i > 0 && name[i-1] >= '0' && name[i-1] <= '9' { + i-- + } + const dot = "·" + if i >= len(dot) && name[i-len(dot):i] == dot { + i -= len(dot) + return name[:i], name[i:] + } + return name, "" +} diff --git a/vendor/golang.org/x/tools/internal/gocommand/invoke.go b/vendor/golang.org/x/tools/internal/gocommand/invoke.go index 53cf66d..e333efc 100644 --- a/vendor/golang.org/x/tools/internal/gocommand/invoke.go +++ b/vendor/golang.org/x/tools/internal/gocommand/invoke.go @@ -8,12 +8,14 @@ package gocommand import ( "bytes" "context" + "encoding/json" "errors" "fmt" "io" "log" "os" - "reflect" + "os/exec" + "path/filepath" "regexp" "runtime" "strconv" @@ -21,12 +23,9 @@ import ( "sync" "time" - exec "golang.org/x/sys/execabs" - "golang.org/x/tools/internal/event" "golang.org/x/tools/internal/event/keys" "golang.org/x/tools/internal/event/label" - "golang.org/x/tools/internal/event/tag" ) // An Runner will run go command invocations and serialize @@ -56,11 +55,14 @@ func (runner *Runner) initialize() { // 1.14: go: updating go.mod: existing contents have changed since last read var modConcurrencyError = regexp.MustCompile(`go:.*go.mod.*contents have changed`) -// verb is an event label for the go command verb. -var verb = keys.NewString("verb", "go command verb") +// event keys for go command invocations +var ( + verb = keys.NewString("verb", "go command verb") + directory = keys.NewString("directory", "") +) func invLabels(inv Invocation) []label.Label { - return []label.Label{verb.Of(inv.Verb), tag.Directory.Of(inv.WorkingDir)} + return []label.Label{verb.Of(inv.Verb), directory.Of(inv.WorkingDir)} } // Run is a convenience wrapper around RunRaw. @@ -85,6 +87,7 @@ func (runner *Runner) RunPiped(ctx context.Context, inv Invocation, stdout, stde // RunRaw runs the invocation, serializing requests only if they fight over // go.mod changes. +// Postcondition: both error results have same nilness. func (runner *Runner) RunRaw(ctx context.Context, inv Invocation) (*bytes.Buffer, *bytes.Buffer, error, error) { ctx, done := event.Start(ctx, "gocommand.Runner.RunRaw", invLabels(inv)...) defer done() @@ -95,23 +98,24 @@ func (runner *Runner) RunRaw(ctx context.Context, inv Invocation) (*bytes.Buffer stdout, stderr, friendlyErr, err := runner.runConcurrent(ctx, inv) // If we encounter a load concurrency error, we need to retry serially. - if friendlyErr == nil || !modConcurrencyError.MatchString(friendlyErr.Error()) { - return stdout, stderr, friendlyErr, err + if friendlyErr != nil && modConcurrencyError.MatchString(friendlyErr.Error()) { + event.Error(ctx, "Load concurrency error, will retry serially", err) + + // Run serially by calling runPiped. + stdout.Reset() + stderr.Reset() + friendlyErr, err = runner.runPiped(ctx, inv, stdout, stderr) } - event.Error(ctx, "Load concurrency error, will retry serially", err) - // Run serially by calling runPiped. - stdout.Reset() - stderr.Reset() - friendlyErr, err = runner.runPiped(ctx, inv, stdout, stderr) return stdout, stderr, friendlyErr, err } +// Postcondition: both error results have same nilness. func (runner *Runner) runConcurrent(ctx context.Context, inv Invocation) (*bytes.Buffer, *bytes.Buffer, error, error) { // Wait for 1 worker to become available. select { case <-ctx.Done(): - return nil, nil, nil, ctx.Err() + return nil, nil, ctx.Err(), ctx.Err() case runner.inFlight <- struct{}{}: defer func() { <-runner.inFlight }() } @@ -121,6 +125,7 @@ func (runner *Runner) runConcurrent(ctx context.Context, inv Invocation) (*bytes return stdout, stderr, friendlyErr, err } +// Postcondition: both error results have same nilness. func (runner *Runner) runPiped(ctx context.Context, inv Invocation, stdout, stderr io.Writer) (error, error) { // Make sure the runner is always initialized. runner.initialize() @@ -129,7 +134,7 @@ func (runner *Runner) runPiped(ctx context.Context, inv Invocation, stdout, stde // runPiped commands. select { case <-ctx.Done(): - return nil, ctx.Err() + return ctx.Err(), ctx.Err() case runner.serialized <- struct{}{}: defer func() { <-runner.serialized }() } @@ -139,7 +144,7 @@ func (runner *Runner) runPiped(ctx context.Context, inv Invocation, stdout, stde for i := 0; i < maxInFlight; i++ { select { case <-ctx.Done(): - return nil, ctx.Err() + return ctx.Err(), ctx.Err() case runner.inFlight <- struct{}{}: // Make sure we always "return" any workers we took. defer func() { <-runner.inFlight }() @@ -156,12 +161,17 @@ type Invocation struct { BuildFlags []string // If ModFlag is set, the go command is invoked with -mod=ModFlag. + // TODO(rfindley): remove, in favor of Args. ModFlag string // If ModFile is set, the go command is invoked with -modfile=ModFile. + // TODO(rfindley): remove, in favor of Args. ModFile string - // If Overlay is set, the go command is invoked with -overlay=Overlay. + // Overlay is the name of the JSON overlay file that describes + // unsaved editor buffers; see [WriteOverlays]. + // If set, the go command is invoked with -overlay=Overlay. + // TODO(rfindley): remove, in favor of Args. Overlay string // If CleanEnv is set, the invocation will run only with the environment @@ -172,6 +182,7 @@ type Invocation struct { Logf func(format string, args ...interface{}) } +// Postcondition: both error results have same nilness. func (i *Invocation) runWithFriendlyError(ctx context.Context, stdout, stderr io.Writer) (friendlyError error, rawError error) { rawError = i.run(ctx, stdout, stderr) if rawError != nil { @@ -188,12 +199,14 @@ func (i *Invocation) runWithFriendlyError(ctx context.Context, stdout, stderr io return } -func (i *Invocation) run(ctx context.Context, stdout, stderr io.Writer) error { - log := i.Logf - if log == nil { - log = func(string, ...interface{}) {} +// logf logs if i.Logf is non-nil. +func (i *Invocation) logf(format string, args ...any) { + if i.Logf != nil { + i.Logf(format, args...) } +} +func (i *Invocation) run(ctx context.Context, stdout, stderr io.Writer) error { goArgs := []string{i.Verb} appendModFile := func() { @@ -236,23 +249,23 @@ func (i *Invocation) run(ctx context.Context, stdout, stderr io.Writer) error { cmd.Stdout = stdout cmd.Stderr = stderr - // cmd.WaitDelay was added only in go1.20 (see #50436). - if waitDelay := reflect.ValueOf(cmd).Elem().FieldByName("WaitDelay"); waitDelay.IsValid() { - // https://go.dev/issue/59541: don't wait forever copying stderr - // after the command has exited. - // After CL 484741 we copy stdout manually, so we we'll stop reading that as - // soon as ctx is done. However, we also don't want to wait around forever - // for stderr. Give a much-longer-than-reasonable delay and then assume that - // something has wedged in the kernel or runtime. - waitDelay.Set(reflect.ValueOf(30 * time.Second)) - } - - // On darwin the cwd gets resolved to the real path, which breaks anything that - // expects the working directory to keep the original path, including the + // https://go.dev/issue/59541: don't wait forever copying stderr + // after the command has exited. + // After CL 484741 we copy stdout manually, so we we'll stop reading that as + // soon as ctx is done. However, we also don't want to wait around forever + // for stderr. Give a much-longer-than-reasonable delay and then assume that + // something has wedged in the kernel or runtime. + cmd.WaitDelay = 30 * time.Second + + // The cwd gets resolved to the real path. On Darwin, where + // /tmp is a symlink, this breaks anything that expects the + // working directory to keep the original path, including the // go command when dealing with modules. - // The Go stdlib has a special feature where if the cwd and the PWD are the - // same node then it trusts the PWD, so by setting it in the env for the child - // process we fix up all the paths returned by the go command. + // + // os.Getwd has a special feature where if the cwd and the PWD + // are the same node then it trusts the PWD, so by setting it + // in the env for the child process we fix up all the paths + // returned by the go command. if !i.CleanEnv { cmd.Env = os.Environ() } @@ -262,7 +275,12 @@ func (i *Invocation) run(ctx context.Context, stdout, stderr io.Writer) error { cmd.Dir = i.WorkingDir } - defer func(start time.Time) { log("%s for %v", time.Since(start), cmdDebugStr(cmd)) }(time.Now()) + debugStr := cmdDebugStr(cmd) + i.logf("starting %v", debugStr) + start := time.Now() + defer func() { + i.logf("%s for %v", time.Since(start), debugStr) + }() return runCmdContext(ctx, cmd) } @@ -343,6 +361,7 @@ func runCmdContext(ctx context.Context, cmd *exec.Cmd) (err error) { } } + startTime := time.Now() err = cmd.Start() if stdoutW != nil { // The child process has inherited the pipe file, @@ -369,7 +388,7 @@ func runCmdContext(ctx context.Context, cmd *exec.Cmd) (err error) { case err := <-resChan: return err case <-timer.C: - HandleHangingGoCommand(cmd.Process) + HandleHangingGoCommand(startTime, cmd) case <-ctx.Done(): } } else { @@ -403,7 +422,7 @@ func runCmdContext(ctx context.Context, cmd *exec.Cmd) (err error) { return <-resChan } -func HandleHangingGoCommand(proc *os.Process) { +func HandleHangingGoCommand(start time.Time, cmd *exec.Cmd) { switch runtime.GOOS { case "linux", "darwin", "freebsd", "netbsd": fmt.Fprintln(os.Stderr, `DETECTED A HANGING GO COMMAND @@ -436,7 +455,7 @@ See golang/go#54461 for more details.`) panic(fmt.Sprintf("running %s: %v", listFiles, err)) } } - panic(fmt.Sprintf("detected hanging go command (pid %d): see golang/go#54461 for more details", proc.Pid)) + panic(fmt.Sprintf("detected hanging go command (golang/go#54461); waited %s\n\tcommand:%s\n\tpid:%d", time.Since(start), cmd, cmd.Process.Pid)) } func cmdDebugStr(cmd *exec.Cmd) string { @@ -460,3 +479,73 @@ func cmdDebugStr(cmd *exec.Cmd) string { } return fmt.Sprintf("GOROOT=%v GOPATH=%v GO111MODULE=%v GOPROXY=%v PWD=%v %v", env["GOROOT"], env["GOPATH"], env["GO111MODULE"], env["GOPROXY"], env["PWD"], strings.Join(args, " ")) } + +// WriteOverlays writes each value in the overlay (see the Overlay +// field of go/packages.Config) to a temporary file and returns the name +// of a JSON file describing the mapping that is suitable for the "go +// list -overlay" flag. +// +// On success, the caller must call the cleanup function exactly once +// when the files are no longer needed. +func WriteOverlays(overlay map[string][]byte) (filename string, cleanup func(), err error) { + // Do nothing if there are no overlays in the config. + if len(overlay) == 0 { + return "", func() {}, nil + } + + dir, err := os.MkdirTemp("", "gocommand-*") + if err != nil { + return "", nil, err + } + + // The caller must clean up this directory, + // unless this function returns an error. + // (The cleanup operand of each return + // statement below is ignored.) + defer func() { + cleanup = func() { + os.RemoveAll(dir) + } + if err != nil { + cleanup() + cleanup = nil + } + }() + + // Write each map entry to a temporary file. + overlays := make(map[string]string) + for k, v := range overlay { + // Use a unique basename for each file (001-foo.go), + // to avoid creating nested directories. + base := fmt.Sprintf("%d-%s", 1+len(overlays), filepath.Base(k)) + filename := filepath.Join(dir, base) + err := os.WriteFile(filename, v, 0666) + if err != nil { + return "", nil, err + } + overlays[k] = filename + } + + // Write the JSON overlay file that maps logical file names to temp files. + // + // OverlayJSON is the format overlay files are expected to be in. + // The Replace map maps from overlaid paths to replacement paths: + // the Go command will forward all reads trying to open + // each overlaid path to its replacement path, or consider the overlaid + // path not to exist if the replacement path is empty. + // + // From golang/go#39958. + type OverlayJSON struct { + Replace map[string]string `json:"replace,omitempty"` + } + b, err := json.Marshal(OverlayJSON{Replace: overlays}) + if err != nil { + return "", nil, err + } + filename = filepath.Join(dir, "overlay.json") + if err := os.WriteFile(filename, b, 0666); err != nil { + return "", nil, err + } + + return filename, nil, nil +} diff --git a/vendor/golang.org/x/tools/internal/gocommand/vendor.go b/vendor/golang.org/x/tools/internal/gocommand/vendor.go index 2d3d408..e38d1fb 100644 --- a/vendor/golang.org/x/tools/internal/gocommand/vendor.go +++ b/vendor/golang.org/x/tools/internal/gocommand/vendor.go @@ -107,3 +107,57 @@ func getMainModuleAnd114(ctx context.Context, inv Invocation, r *Runner) (*Modul } return mod, lines[4] == "go1.14", nil } + +// WorkspaceVendorEnabled reports whether workspace vendoring is enabled. It takes a *Runner to execute Go commands +// with the supplied context.Context and Invocation. The Invocation can contain pre-defined fields, +// of which only Verb and Args are modified to run the appropriate Go command. +// Inspired by setDefaultBuildMod in modload/init.go +func WorkspaceVendorEnabled(ctx context.Context, inv Invocation, r *Runner) (bool, []*ModuleJSON, error) { + inv.Verb = "env" + inv.Args = []string{"GOWORK"} + stdout, err := r.Run(ctx, inv) + if err != nil { + return false, nil, err + } + goWork := string(bytes.TrimSpace(stdout.Bytes())) + if fi, err := os.Stat(filepath.Join(filepath.Dir(goWork), "vendor")); err == nil && fi.IsDir() { + mainMods, err := getWorkspaceMainModules(ctx, inv, r) + if err != nil { + return false, nil, err + } + return true, mainMods, nil + } + return false, nil, nil +} + +// getWorkspaceMainModules gets the main modules' information. +// This is the information needed to figure out if vendoring should be enabled. +func getWorkspaceMainModules(ctx context.Context, inv Invocation, r *Runner) ([]*ModuleJSON, error) { + const format = `{{.Path}} +{{.Dir}} +{{.GoMod}} +{{.GoVersion}} +` + inv.Verb = "list" + inv.Args = []string{"-m", "-f", format} + stdout, err := r.Run(ctx, inv) + if err != nil { + return nil, err + } + + lines := strings.Split(strings.TrimSuffix(stdout.String(), "\n"), "\n") + if len(lines) < 4 { + return nil, fmt.Errorf("unexpected stdout: %q", stdout.String()) + } + mods := make([]*ModuleJSON, 0, len(lines)/4) + for i := 0; i < len(lines); i += 4 { + mods = append(mods, &ModuleJSON{ + Path: lines[i], + Dir: lines[i+1], + GoMod: lines[i+2], + GoVersion: lines[i+3], + Main: true, + }) + } + return mods, nil +} diff --git a/vendor/golang.org/x/tools/internal/gopathwalk/walk.go b/vendor/golang.org/x/tools/internal/gopathwalk/walk.go index 452e342..8361515 100644 --- a/vendor/golang.org/x/tools/internal/gopathwalk/walk.go +++ b/vendor/golang.org/x/tools/internal/gopathwalk/walk.go @@ -9,21 +9,27 @@ package gopathwalk import ( "bufio" "bytes" - "log" + "io" + "io/fs" "os" "path/filepath" + "runtime" "strings" + "sync" "time" - - "golang.org/x/tools/internal/fastwalk" ) // Options controls the behavior of a Walk call. type Options struct { // If Logf is non-nil, debug logging is enabled through this function. Logf func(format string, args ...interface{}) + // Search module caches. Also disables legacy goimports ignore rules. ModulesEnabled bool + + // Maximum number of concurrent calls to user-provided callbacks, + // or 0 for GOMAXPROCS. + Concurrency int } // RootType indicates the type of a Root. @@ -44,22 +50,28 @@ type Root struct { Type RootType } -// Walk walks Go source directories ($GOROOT, $GOPATH, etc) to find packages. -// For each package found, add will be called (concurrently) with the absolute +// Walk concurrently walks Go source directories ($GOROOT, $GOPATH, etc) to find packages. +// +// For each package found, add will be called with the absolute // paths of the containing source directory and the package directory. -// add will be called concurrently. +// +// Unlike filepath.WalkDir, Walk follows symbolic links +// (while guarding against cycles). func Walk(roots []Root, add func(root Root, dir string), opts Options) { WalkSkip(roots, add, func(Root, string) bool { return false }, opts) } -// WalkSkip walks Go source directories ($GOROOT, $GOPATH, etc) to find packages. -// For each package found, add will be called (concurrently) with the absolute +// WalkSkip concurrently walks Go source directories ($GOROOT, $GOPATH, etc) to +// find packages. +// +// For each package found, add will be called with the absolute // paths of the containing source directory and the package directory. -// For each directory that will be scanned, skip will be called (concurrently) +// For each directory that will be scanned, skip will be called // with the absolute paths of the containing source directory and the directory. // If skip returns false on a directory it will be processed. -// add will be called concurrently. -// skip will be called concurrently. +// +// Unlike filepath.WalkDir, WalkSkip follows symbolic links +// (while guarding against cycles). func WalkSkip(roots []Root, add func(root Root, dir string), skip func(root Root, dir string) bool, opts Options) { for _, root := range roots { walkDir(root, add, skip, opts) @@ -68,34 +80,51 @@ func WalkSkip(roots []Root, add func(root Root, dir string), skip func(root Root // walkDir creates a walker and starts fastwalk with this walker. func walkDir(root Root, add func(Root, string), skip func(root Root, dir string) bool, opts Options) { + if opts.Logf == nil { + opts.Logf = func(format string, args ...interface{}) {} + } if _, err := os.Stat(root.Path); os.IsNotExist(err) { - if opts.Logf != nil { - opts.Logf("skipping nonexistent directory: %v", root.Path) - } + opts.Logf("skipping nonexistent directory: %v", root.Path) return } start := time.Now() - if opts.Logf != nil { - opts.Logf("scanning %s", root.Path) + opts.Logf("scanning %s", root.Path) + + concurrency := opts.Concurrency + if concurrency == 0 { + // The walk be either CPU-bound or I/O-bound, depending on what the + // caller-supplied add function does and the details of the user's platform + // and machine. Rather than trying to fine-tune the concurrency level for a + // specific environment, we default to GOMAXPROCS: it is likely to be a good + // choice for a CPU-bound add function, and if it is instead I/O-bound, then + // dealing with I/O saturation is arguably the job of the kernel and/or + // runtime. (Oversaturating I/O seems unlikely to harm performance as badly + // as failing to saturate would.) + concurrency = runtime.GOMAXPROCS(0) } w := &walker{ root: root, add: add, skip: skip, opts: opts, + sem: make(chan struct{}, concurrency), } w.init() - if err := fastwalk.Walk(root.Path, w.walk); err != nil { - logf := opts.Logf - if logf == nil { - logf = log.Printf - } - logf("scanning directory %v: %v", root.Path, err) - } - if opts.Logf != nil { - opts.Logf("scanned %s in %v", root.Path, time.Since(start)) + w.sem <- struct{}{} + path := root.Path + if path == "" { + path = "." } + if fi, err := os.Lstat(path); err == nil { + w.walk(path, nil, fs.FileInfoToDirEntry(fi)) + } else { + w.opts.Logf("scanning directory %v: %v", root.Path, err) + } + <-w.sem + w.walking.Wait() + + opts.Logf("scanned %s in %v", root.Path, time.Since(start)) } // walker is the callback for fastwalk.Walk. @@ -105,7 +134,18 @@ type walker struct { skip func(Root, string) bool // The callback that will be invoked for every dir. dir is skipped if it returns true. opts Options // Options passed to Walk by the user. - ignoredDirs []os.FileInfo // The ignored directories, loaded from .goimportsignore files. + walking sync.WaitGroup + sem chan struct{} // Channel of semaphore tokens; send to acquire, receive to release. + ignoredDirs []string + + added sync.Map // map[string]bool +} + +// A symlinkList is a linked list of os.FileInfos for parent directories +// reached via symlinks. +type symlinkList struct { + info os.FileInfo + prev *symlinkList } // init initializes the walker based on its Options @@ -121,14 +161,8 @@ func (w *walker) init() { for _, p := range ignoredPaths { full := filepath.Join(w.root.Path, p) - if fi, err := os.Stat(full); err == nil { - w.ignoredDirs = append(w.ignoredDirs, fi) - if w.opts.Logf != nil { - w.opts.Logf("Directory added to ignore list: %s", full) - } - } else if w.opts.Logf != nil { - w.opts.Logf("Error statting ignored directory: %v", err) - } + w.ignoredDirs = append(w.ignoredDirs, full) + w.opts.Logf("Directory added to ignore list: %s", full) } } @@ -138,12 +172,10 @@ func (w *walker) init() { func (w *walker) getIgnoredDirs(path string) []string { file := filepath.Join(path, ".goimportsignore") slurp, err := os.ReadFile(file) - if w.opts.Logf != nil { - if err != nil { - w.opts.Logf("%v", err) - } else { - w.opts.Logf("Read %s", file) - } + if err != nil { + w.opts.Logf("%v", err) + } else { + w.opts.Logf("Read %s", file) } if err != nil { return nil @@ -162,9 +194,9 @@ func (w *walker) getIgnoredDirs(path string) []string { } // shouldSkipDir reports whether the file should be skipped or not. -func (w *walker) shouldSkipDir(fi os.FileInfo, dir string) bool { +func (w *walker) shouldSkipDir(dir string) bool { for _, ignoredDir := range w.ignoredDirs { - if os.SameFile(fi, ignoredDir) { + if dir == ignoredDir { return true } } @@ -176,85 +208,130 @@ func (w *walker) shouldSkipDir(fi os.FileInfo, dir string) bool { } // walk walks through the given path. -func (w *walker) walk(path string, typ os.FileMode) error { - if typ.IsRegular() { - dir := filepath.Dir(path) - if dir == w.root.Path && (w.root.Type == RootGOROOT || w.root.Type == RootGOPATH) { - // Doesn't make sense to have regular files - // directly in your $GOPATH/src or $GOROOT/src. - return fastwalk.ErrSkipFiles - } - if !strings.HasSuffix(path, ".go") { - return nil +// +// Errors are logged if w.opts.Logf is non-nil, but otherwise ignored. +func (w *walker) walk(path string, pathSymlinks *symlinkList, d fs.DirEntry) { + if d.Type()&os.ModeSymlink != 0 { + // Walk the symlink's target rather than the symlink itself. + // + // (Note that os.Stat, unlike the lower-lever os.Readlink, + // follows arbitrarily many layers of symlinks, so it will eventually + // reach either a non-symlink or a nonexistent target.) + // + // TODO(bcmills): 'go list all' itself ignores symlinks within GOROOT/src + // and GOPATH/src. Do we really need to traverse them here? If so, why? + + fi, err := os.Stat(path) + if err != nil { + w.opts.Logf("%v", err) + return } - w.add(w.root, dir) - return fastwalk.ErrSkipFiles - } - if typ == os.ModeDir { - base := filepath.Base(path) - if base == "" || base[0] == '.' || base[0] == '_' || - base == "testdata" || - (w.root.Type == RootGOROOT && w.opts.ModulesEnabled && base == "vendor") || - (!w.opts.ModulesEnabled && base == "node_modules") { - return filepath.SkipDir + // Avoid walking symlink cycles: if we have already followed a symlink to + // this directory as a parent of itself, don't follow it again. + // + // This doesn't catch the first time through a cycle, but it also minimizes + // the number of extra stat calls we make if we *don't* encounter a cycle. + // Since we don't actually expect to encounter symlink cycles in practice, + // this seems like the right tradeoff. + for parent := pathSymlinks; parent != nil; parent = parent.prev { + if os.SameFile(fi, parent.info) { + return + } } - fi, err := os.Lstat(path) - if err == nil && w.shouldSkipDir(fi, path) { - return filepath.SkipDir + + pathSymlinks = &symlinkList{ + info: fi, + prev: pathSymlinks, } - return nil + d = fs.FileInfoToDirEntry(fi) } - if typ == os.ModeSymlink { - base := filepath.Base(path) - if strings.HasPrefix(base, ".#") { - // Emacs noise. - return nil + + if d.Type().IsRegular() { + if !strings.HasSuffix(path, ".go") { + return } - if w.shouldTraverse(path) { - return fastwalk.ErrTraverseLink + + dir := filepath.Dir(path) + if dir == w.root.Path && (w.root.Type == RootGOROOT || w.root.Type == RootGOPATH) { + // Doesn't make sense to have regular files + // directly in your $GOPATH/src or $GOROOT/src. + // + // TODO(bcmills): there are many levels of directory within + // RootModuleCache where this also wouldn't make sense, + // Can we generalize this to any directory without a corresponding + // import path? + return } - } - return nil -} -// shouldTraverse reports whether the symlink fi, found in dir, -// should be followed. It makes sure symlinks were never visited -// before to avoid symlink loops. -func (w *walker) shouldTraverse(path string) bool { - ts, err := os.Stat(path) - if err != nil { - logf := w.opts.Logf - if logf == nil { - logf = log.Printf + if _, dup := w.added.LoadOrStore(dir, true); !dup { + w.add(w.root, dir) } - logf("%v", err) - return false } - if !ts.IsDir() { - return false + + if !d.IsDir() { + return + } + + base := filepath.Base(path) + if base == "" || base[0] == '.' || base[0] == '_' || + base == "testdata" || + (w.root.Type == RootGOROOT && w.opts.ModulesEnabled && base == "vendor") || + (!w.opts.ModulesEnabled && base == "node_modules") || + w.shouldSkipDir(path) { + return } - if w.shouldSkipDir(ts, filepath.Dir(path)) { - return false + + // Read the directory and walk its entries. + + f, err := os.Open(path) + if err != nil { + w.opts.Logf("%v", err) + return } - // Check for symlink loops by statting each directory component - // and seeing if any are the same file as ts. + defer f.Close() + for { - parent := filepath.Dir(path) - if parent == path { - // Made it to the root without seeing a cycle. - // Use this symlink. - return true - } - parentInfo, err := os.Stat(parent) + // We impose an arbitrary limit on the number of ReadDir results per + // directory to limit the amount of memory consumed for stale or upcoming + // directory entries. The limit trades off CPU (number of syscalls to read + // the whole directory) against RAM (reachable directory entries other than + // the one currently being processed). + // + // Since we process the directories recursively, we will end up maintaining + // a slice of entries for each level of the directory tree. + // (Compare https://go.dev/issue/36197.) + ents, err := f.ReadDir(1024) if err != nil { - return false + if err != io.EOF { + w.opts.Logf("%v", err) + } + break } - if os.SameFile(ts, parentInfo) { - // Cycle. Don't traverse. - return false + + for _, d := range ents { + nextPath := filepath.Join(path, d.Name()) + if d.IsDir() { + select { + case w.sem <- struct{}{}: + // Got a new semaphore token, so we can traverse the directory concurrently. + d := d + w.walking.Add(1) + go func() { + defer func() { + <-w.sem + w.walking.Done() + }() + w.walk(nextPath, pathSymlinks, d) + }() + continue + + default: + // No tokens available, so traverse serially. + } + } + + w.walk(nextPath, pathSymlinks, d) } - path = parent } - } diff --git a/vendor/golang.org/x/tools/internal/imports/fix.go b/vendor/golang.org/x/tools/internal/imports/fix.go index d4f1b4e..c151081 100644 --- a/vendor/golang.org/x/tools/internal/imports/fix.go +++ b/vendor/golang.org/x/tools/internal/imports/fix.go @@ -13,6 +13,8 @@ import ( "go/build" "go/parser" "go/token" + "go/types" + "io/fs" "io/ioutil" "os" "path" @@ -25,10 +27,12 @@ import ( "unicode" "unicode/utf8" + "golang.org/x/sync/errgroup" "golang.org/x/tools/go/ast/astutil" "golang.org/x/tools/internal/event" "golang.org/x/tools/internal/gocommand" "golang.org/x/tools/internal/gopathwalk" + "golang.org/x/tools/internal/stdlib" ) // importToGroup is a list of functions which map from an import path to @@ -101,19 +105,25 @@ type packageInfo struct { // parseOtherFiles parses all the Go files in srcDir except filename, including // test files if filename looks like a test. -func parseOtherFiles(fset *token.FileSet, srcDir, filename string) []*ast.File { +// +// It returns an error only if ctx is cancelled. Files with parse errors are +// ignored. +func parseOtherFiles(ctx context.Context, fset *token.FileSet, srcDir, filename string) ([]*ast.File, error) { // This could use go/packages but it doesn't buy much, and it fails // with https://golang.org/issue/26296 in LoadFiles mode in some cases. considerTests := strings.HasSuffix(filename, "_test.go") fileBase := filepath.Base(filename) - packageFileInfos, err := ioutil.ReadDir(srcDir) + packageFileInfos, err := os.ReadDir(srcDir) if err != nil { - return nil + return nil, ctx.Err() } var files []*ast.File for _, fi := range packageFileInfos { + if ctx.Err() != nil { + return nil, ctx.Err() + } if fi.Name() == fileBase || !strings.HasSuffix(fi.Name(), ".go") { continue } @@ -121,7 +131,7 @@ func parseOtherFiles(fset *token.FileSet, srcDir, filename string) []*ast.File { continue } - f, err := parser.ParseFile(fset, filepath.Join(srcDir, fi.Name()), nil, 0) + f, err := parser.ParseFile(fset, filepath.Join(srcDir, fi.Name()), nil, parser.SkipObjectResolution) if err != nil { continue } @@ -129,7 +139,7 @@ func parseOtherFiles(fset *token.FileSet, srcDir, filename string) []*ast.File { files = append(files, f) } - return files + return files, ctx.Err() } // addGlobals puts the names of package vars into the provided map. @@ -253,7 +263,7 @@ type pass struct { otherFiles []*ast.File // sibling files. // Intermediate state, generated by load. - existingImports map[string]*ImportInfo + existingImports map[string][]*ImportInfo allRefs references missingRefs references @@ -298,6 +308,20 @@ func (p *pass) loadPackageNames(imports []*ImportInfo) error { return nil } +// if there is a trailing major version, remove it +func withoutVersion(nm string) string { + if v := path.Base(nm); len(v) > 0 && v[0] == 'v' { + if _, err := strconv.Atoi(v[1:]); err == nil { + // this is, for instance, called with rand/v2 and returns rand + if len(v) < len(nm) { + xnm := nm[:len(nm)-len(v)-1] + return path.Base(xnm) + } + } + } + return nm +} + // importIdentifier returns the identifier that imp will introduce. It will // guess if the package name has not been loaded, e.g. because the source // is not available. @@ -307,7 +331,7 @@ func (p *pass) importIdentifier(imp *ImportInfo) string { } known := p.knownPackages[imp.ImportPath] if known != nil && known.name != "" { - return known.name + return withoutVersion(known.name) } return ImportPathToAssumedName(imp.ImportPath) } @@ -318,7 +342,7 @@ func (p *pass) importIdentifier(imp *ImportInfo) string { func (p *pass) load() ([]*ImportFix, bool) { p.knownPackages = map[string]*packageInfo{} p.missingRefs = references{} - p.existingImports = map[string]*ImportInfo{} + p.existingImports = map[string][]*ImportInfo{} // Load basic information about the file in question. p.allRefs = collectReferences(p.f) @@ -342,14 +366,12 @@ func (p *pass) load() ([]*ImportFix, bool) { if p.loadRealPackageNames { err := p.loadPackageNames(append(imports, p.candidates...)) if err != nil { - if p.env.Logf != nil { - p.env.Logf("loading package names: %v", err) - } + p.env.logf("loading package names: %v", err) return nil, false } } for _, imp := range imports { - p.existingImports[p.importIdentifier(imp)] = imp + p.existingImports[p.importIdentifier(imp)] = append(p.existingImports[p.importIdentifier(imp)], imp) } // Find missing references. @@ -388,31 +410,33 @@ func (p *pass) fix() ([]*ImportFix, bool) { // Found everything, or giving up. Add the new imports and remove any unused. var fixes []*ImportFix - for _, imp := range p.existingImports { - // We deliberately ignore globals here, because we can't be sure - // they're in the same package. People do things like put multiple - // main packages in the same directory, and we don't want to - // remove imports if they happen to have the same name as a var in - // a different package. - if _, ok := p.allRefs[p.importIdentifier(imp)]; !ok { - fixes = append(fixes, &ImportFix{ - StmtInfo: *imp, - IdentName: p.importIdentifier(imp), - FixType: DeleteImport, - }) - continue - } + for _, identifierImports := range p.existingImports { + for _, imp := range identifierImports { + // We deliberately ignore globals here, because we can't be sure + // they're in the same package. People do things like put multiple + // main packages in the same directory, and we don't want to + // remove imports if they happen to have the same name as a var in + // a different package. + if _, ok := p.allRefs[p.importIdentifier(imp)]; !ok { + fixes = append(fixes, &ImportFix{ + StmtInfo: *imp, + IdentName: p.importIdentifier(imp), + FixType: DeleteImport, + }) + continue + } - // An existing import may need to update its import name to be correct. - if name := p.importSpecName(imp); name != imp.Name { - fixes = append(fixes, &ImportFix{ - StmtInfo: ImportInfo{ - Name: name, - ImportPath: imp.ImportPath, - }, - IdentName: p.importIdentifier(imp), - FixType: SetImportName, - }) + // An existing import may need to update its import name to be correct. + if name := p.importSpecName(imp); name != imp.Name { + fixes = append(fixes, &ImportFix{ + StmtInfo: ImportInfo{ + Name: name, + ImportPath: imp.ImportPath, + }, + IdentName: p.importIdentifier(imp), + FixType: SetImportName, + }) + } } } // Collecting fixes involved map iteration, so sort for stability. See @@ -507,9 +531,9 @@ func (p *pass) assumeSiblingImportsValid() { } for left, rights := range refs { if imp, ok := importsByName[left]; ok { - if m, ok := stdlib[imp.ImportPath]; ok { + if m, ok := stdlib.PackageSymbols[imp.ImportPath]; ok { // We have the stdlib in memory; no need to guess. - rights = copyExports(m) + rights = symbolNameSet(m) } p.addCandidate(imp, &packageInfo{ // no name; we already know it. @@ -541,6 +565,8 @@ func (p *pass) addCandidate(imp *ImportInfo, pkg *packageInfo) { // // This is declared as a variable rather than a function so goimports can // easily be extended by adding a file with an init function. +// +// DO NOT REMOVE: used internally at Google. var fixImports = fixImportsDefault func fixImportsDefault(fset *token.FileSet, f *ast.File, filename string, env *ProcessEnv) error { @@ -560,9 +586,7 @@ func getFixes(ctx context.Context, fset *token.FileSet, f *ast.File, filename st return nil, err } srcDir := filepath.Dir(abs) - if env.Logf != nil { - env.Logf("fixImports(filename=%q), abs=%q, srcDir=%q ...", filename, abs, srcDir) - } + env.logf("fixImports(filename=%q), abs=%q, srcDir=%q ...", filename, abs, srcDir) // First pass: looking only at f, and using the naive algorithm to // derive package names from import paths, see if the file is already @@ -573,7 +597,10 @@ func getFixes(ctx context.Context, fset *token.FileSet, f *ast.File, filename st return fixes, nil } - otherFiles := parseOtherFiles(fset, srcDir, filename) + otherFiles, err := parseOtherFiles(ctx, fset, srcDir, filename) + if err != nil { + return nil, err + } // Second pass: add information from other files in the same package, // like their package vars and imports. @@ -637,7 +664,7 @@ func getCandidatePkgs(ctx context.Context, wrappedCallback *scanCallback, filena dupCheck := map[string]struct{}{} // Start off with the standard library. - for importPath, exports := range stdlib { + for importPath, symbols := range stdlib.PackageSymbols { p := &pkg{ dir: filepath.Join(goenv["GOROOT"], "src", importPath), importPathShort: importPath, @@ -646,6 +673,13 @@ func getCandidatePkgs(ctx context.Context, wrappedCallback *scanCallback, filena } dupCheck[importPath] = struct{}{} if notSelf(p) && wrappedCallback.dirFound(p) && wrappedCallback.packageNameLoaded(p) { + var exports []stdlib.Symbol + for _, sym := range symbols { + switch sym.Kind { + case stdlib.Func, stdlib.Type, stdlib.Var, stdlib.Const: + exports = append(exports, sym) + } + } wrappedCallback.exportsLoaded(p, exports) } } @@ -666,7 +700,7 @@ func getCandidatePkgs(ctx context.Context, wrappedCallback *scanCallback, filena dupCheck[pkg.importPathShort] = struct{}{} return notSelf(pkg) && wrappedCallback.packageNameLoaded(pkg) }, - exportsLoaded: func(pkg *pkg, exports []string) { + exportsLoaded: func(pkg *pkg, exports []stdlib.Symbol) { // If we're an x_test, load the package under test's test variant. if strings.HasSuffix(filePkg, "_test") && pkg.dir == filepath.Dir(filename) { var err error @@ -697,20 +731,21 @@ func ScoreImportPaths(ctx context.Context, env *ProcessEnv, paths []string) (map return result, nil } -func PrimeCache(ctx context.Context, env *ProcessEnv) error { +func PrimeCache(ctx context.Context, resolver Resolver) error { // Fully scan the disk for directories, but don't actually read any Go files. callback := &scanCallback{ - rootFound: func(gopathwalk.Root) bool { - return true + rootFound: func(root gopathwalk.Root) bool { + // See getCandidatePkgs: walking GOROOT is apparently expensive and + // unnecessary. + return root.Type != gopathwalk.RootGOROOT }, dirFound: func(pkg *pkg) bool { return false }, - packageNameLoaded: func(pkg *pkg) bool { - return false - }, + // packageNameLoaded and exportsLoaded must never be called. } - return getCandidatePkgs(ctx, callback, "", "", env) + + return resolver.scan(ctx, callback) } func candidateImportName(pkg *pkg) string { @@ -790,7 +825,7 @@ func GetImportPaths(ctx context.Context, wrapped func(ImportFix), searchPrefix, // A PackageExport is a package and its exports. type PackageExport struct { Fix *ImportFix - Exports []string + Exports []stdlib.Symbol } // GetPackageExports returns all known packages with name pkg and their exports. @@ -805,8 +840,8 @@ func GetPackageExports(ctx context.Context, wrapped func(PackageExport), searchP packageNameLoaded: func(pkg *pkg) bool { return pkg.packageName == searchPkg }, - exportsLoaded: func(pkg *pkg, exports []string) { - sort.Strings(exports) + exportsLoaded: func(pkg *pkg, exports []stdlib.Symbol) { + sortSymbols(exports) wrapped(PackageExport{ Fix: &ImportFix{ StmtInfo: ImportInfo{ @@ -824,16 +859,45 @@ func GetPackageExports(ctx context.Context, wrapped func(PackageExport), searchP return getCandidatePkgs(ctx, callback, filename, filePkg, env) } -var requiredGoEnvVars = []string{"GO111MODULE", "GOFLAGS", "GOINSECURE", "GOMOD", "GOMODCACHE", "GONOPROXY", "GONOSUMDB", "GOPATH", "GOPROXY", "GOROOT", "GOSUMDB", "GOWORK"} +// TODO(rfindley): we should depend on GOOS and GOARCH, to provide accurate +// imports when doing cross-platform development. +var requiredGoEnvVars = []string{ + "GO111MODULE", + "GOFLAGS", + "GOINSECURE", + "GOMOD", + "GOMODCACHE", + "GONOPROXY", + "GONOSUMDB", + "GOPATH", + "GOPROXY", + "GOROOT", + "GOSUMDB", + "GOWORK", +} // ProcessEnv contains environment variables and settings that affect the use of // the go command, the go/build package, etc. +// +// ...a ProcessEnv *also* overwrites its Env along with derived state in the +// form of the resolver. And because it is lazily initialized, an env may just +// be broken and unusable, but there is no way for the caller to detect that: +// all queries will just fail. +// +// TODO(rfindley): refactor this package so that this type (perhaps renamed to +// just Env or Config) is an immutable configuration struct, to be exchanged +// for an initialized object via a constructor that returns an error. Perhaps +// the signature should be `func NewResolver(*Env) (*Resolver, error)`, where +// resolver is a concrete type used for resolving imports. Via this +// refactoring, we can avoid the need to call ProcessEnv.init and +// ProcessEnv.GoEnv everywhere, and implicitly fix all the places where this +// these are misused. Also, we'd delegate the caller the decision of how to +// handle a broken environment. type ProcessEnv struct { GocmdRunner *gocommand.Runner BuildFlags []string ModFlag string - ModFile string // SkipPathInScan returns true if the path should be skipped from scans of // the RootCurrentModule root type. The function argument is a clean, @@ -843,7 +907,7 @@ type ProcessEnv struct { // Env overrides the OS environment, and can be used to specify // GOPROXY, GO111MODULE, etc. PATH cannot be set here, because // exec.Command will not honor it. - // Specifying all of RequiredGoEnvVars avoids a call to `go env`. + // Specifying all of requiredGoEnvVars avoids a call to `go env`. Env map[string]string WorkingDir string @@ -851,9 +915,17 @@ type ProcessEnv struct { // If Logf is non-nil, debug logging is enabled through this function. Logf func(format string, args ...interface{}) - initialized bool + // If set, ModCache holds a shared cache of directory info to use across + // multiple ProcessEnvs. + ModCache *DirInfoCache + + initialized bool // see TODO above - resolver Resolver + // resolver and resolverErr are lazily evaluated (see GetResolver). + // This is unclean, but see the big TODO in the docstring for ProcessEnv + // above: for now, we can't be sure that the ProcessEnv is fully initialized. + resolver Resolver + resolverErr error } func (e *ProcessEnv) goEnv() (map[string]string, error) { @@ -933,20 +1005,43 @@ func (e *ProcessEnv) env() []string { } func (e *ProcessEnv) GetResolver() (Resolver, error) { - if e.resolver != nil { - return e.resolver, nil - } if err := e.init(); err != nil { return nil, err } - if len(e.Env["GOMOD"]) == 0 && len(e.Env["GOWORK"]) == 0 { - e.resolver = newGopathResolver(e) - return e.resolver, nil + + if e.resolver == nil && e.resolverErr == nil { + // TODO(rfindley): we should only use a gopathResolver here if the working + // directory is actually *in* GOPATH. (I seem to recall an open gopls issue + // for this behavior, but I can't find it). + // + // For gopls, we can optionally explicitly choose a resolver type, since we + // already know the view type. + if len(e.Env["GOMOD"]) == 0 && len(e.Env["GOWORK"]) == 0 { + e.resolver = newGopathResolver(e) + e.logf("created gopath resolver") + } else if r, err := newModuleResolver(e, e.ModCache); err != nil { + e.resolverErr = err + e.logf("failed to create module resolver: %v", err) + } else { + e.resolver = Resolver(r) + e.logf("created module resolver") + } + } + + return e.resolver, e.resolverErr +} + +// logf logs if e.Logf is non-nil. +func (e *ProcessEnv) logf(format string, args ...any) { + if e.Logf != nil { + e.Logf(format, args...) } - e.resolver = newModuleResolver(e) - return e.resolver, nil } +// buildContext returns the build.Context to use for matching files. +// +// TODO(rfindley): support dynamic GOOS, GOARCH here, when doing cross-platform +// development. func (e *ProcessEnv) buildContext() (*build.Context, error) { ctx := build.Default goenv, err := e.goEnv() @@ -996,24 +1091,40 @@ func addStdlibCandidates(pass *pass, refs references) error { if err != nil { return err } + localbase := func(nm string) string { + ans := path.Base(nm) + if ans[0] == 'v' { + // this is called, for instance, with math/rand/v2 and returns rand/v2 + if _, err := strconv.Atoi(ans[1:]); err == nil { + ix := strings.LastIndex(nm, ans) + more := path.Base(nm[:ix]) + ans = path.Join(more, ans) + } + } + return ans + } add := func(pkg string) { // Prevent self-imports. if path.Base(pkg) == pass.f.Name.Name && filepath.Join(goenv["GOROOT"], "src", pkg) == pass.srcDir { return } - exports := copyExports(stdlib[pkg]) + exports := symbolNameSet(stdlib.PackageSymbols[pkg]) pass.addCandidate( &ImportInfo{ImportPath: pkg}, - &packageInfo{name: path.Base(pkg), exports: exports}) + &packageInfo{name: localbase(pkg), exports: exports}) } for left := range refs { if left == "rand" { - // Make sure we try crypto/rand before math/rand. + // Make sure we try crypto/rand before any version of math/rand as both have Int() + // and our policy is to recommend crypto add("crypto/rand") - add("math/rand") + // if the user's no later than go1.21, this should be "math/rand" + // but we have no way of figuring out what the user is using + // TODO: investigate using the toolchain version to disambiguate in the stdlib + add("math/rand/v2") continue } - for importPath := range stdlib { + for importPath := range stdlib.PackageSymbols { if path.Base(importPath) == left { add(importPath) } @@ -1026,15 +1137,23 @@ func addStdlibCandidates(pass *pass, refs references) error { type Resolver interface { // loadPackageNames loads the package names in importPaths. loadPackageNames(importPaths []string, srcDir string) (map[string]string, error) + // scan works with callback to search for packages. See scanCallback for details. scan(ctx context.Context, callback *scanCallback) error - // loadExports returns the set of exported symbols in the package at dir. - // loadExports may be called concurrently. - loadExports(ctx context.Context, pkg *pkg, includeTest bool) (string, []string, error) + + // loadExports returns the package name and set of exported symbols in the + // package at dir. loadExports may be called concurrently. + loadExports(ctx context.Context, pkg *pkg, includeTest bool) (string, []stdlib.Symbol, error) + // scoreImportPath returns the relevance for an import path. scoreImportPath(ctx context.Context, path string) float64 - ClearForNewScan() + // ClearForNewScan returns a new Resolver based on the receiver that has + // cleared its internal caches of directory contents. + // + // The new resolver should be primed and then set via + // [ProcessEnv.UpdateResolver]. + ClearForNewScan() Resolver } // A scanCallback controls a call to scan and receives its results. @@ -1053,7 +1172,7 @@ type scanCallback struct { // If it returns true, the package's exports will be loaded. packageNameLoaded func(pkg *pkg) bool // exportsLoaded is called when a package's exports have been loaded. - exportsLoaded func(pkg *pkg, exports []string) + exportsLoaded func(pkg *pkg, exports []stdlib.Symbol) } func addExternalCandidates(ctx context.Context, pass *pass, refs references, filename string) error { @@ -1091,7 +1210,7 @@ func addExternalCandidates(ctx context.Context, pass *pass, refs references, fil if err != nil { return err } - if err = resolver.scan(context.Background(), callback); err != nil { + if err = resolver.scan(ctx, callback); err != nil { return err } @@ -1100,57 +1219,66 @@ func addExternalCandidates(ctx context.Context, pass *pass, refs references, fil imp *ImportInfo pkg *packageInfo } - results := make(chan result, len(refs)) + results := make([]*result, len(refs)) - ctx, cancel := context.WithCancel(context.TODO()) - var wg sync.WaitGroup - defer func() { - cancel() - wg.Wait() - }() - var ( - firstErr error - firstErrOnce sync.Once - ) - for pkgName, symbols := range refs { - wg.Add(1) - go func(pkgName string, symbols map[string]bool) { - defer wg.Done() + g, ctx := errgroup.WithContext(ctx) + + searcher := symbolSearcher{ + logf: pass.env.logf, + srcDir: pass.srcDir, + xtest: strings.HasSuffix(pass.f.Name.Name, "_test"), + loadExports: resolver.loadExports, + } - found, err := findImport(ctx, pass, found[pkgName], pkgName, symbols, filename) + i := 0 + for pkgName, symbols := range refs { + index := i // claim an index in results + i++ + pkgName := pkgName + symbols := symbols + g.Go(func() error { + found, err := searcher.search(ctx, found[pkgName], pkgName, symbols) if err != nil { - firstErrOnce.Do(func() { - firstErr = err - cancel() - }) - return + return err } - if found == nil { - return // No matching package. + return nil // No matching package. } imp := &ImportInfo{ ImportPath: found.importPathShort, } - pkg := &packageInfo{ name: pkgName, exports: symbols, } - results <- result{imp, pkg} - }(pkgName, symbols) + results[index] = &result{imp, pkg} + return nil + }) + } + if err := g.Wait(); err != nil { + return err } - go func() { - wg.Wait() - close(results) - }() - for result := range results { + for _, result := range results { + if result == nil { + continue + } + // Don't offer completions that would shadow predeclared + // names, such as github.com/coreos/etcd/error. + if types.Universe.Lookup(result.pkg.name) != nil { // predeclared + // Ideally we would skip this candidate only + // if the predeclared name is actually + // referenced by the file, but that's a lot + // trickier to compute and would still create + // an import that is likely to surprise the + // user before long. + continue + } pass.addCandidate(result.imp, result.pkg) } - return firstErr + return nil } // notIdentifier reports whether ch is an invalid identifier character. @@ -1190,31 +1318,22 @@ func ImportPathToAssumedName(importPath string) string { type gopathResolver struct { env *ProcessEnv walked bool - cache *dirInfoCache + cache *DirInfoCache scanSema chan struct{} // scanSema prevents concurrent scans. } func newGopathResolver(env *ProcessEnv) *gopathResolver { r := &gopathResolver{ - env: env, - cache: &dirInfoCache{ - dirs: map[string]*directoryPackageInfo{}, - listeners: map[*int]cacheListener{}, - }, + env: env, + cache: NewDirInfoCache(), scanSema: make(chan struct{}, 1), } r.scanSema <- struct{}{} return r } -func (r *gopathResolver) ClearForNewScan() { - <-r.scanSema - r.cache = &dirInfoCache{ - dirs: map[string]*directoryPackageInfo{}, - listeners: map[*int]cacheListener{}, - } - r.walked = false - r.scanSema <- struct{}{} +func (r *gopathResolver) ClearForNewScan() Resolver { + return newGopathResolver(r.env) } func (r *gopathResolver) loadPackageNames(importPaths []string, srcDir string) (map[string]string, error) { @@ -1232,7 +1351,7 @@ func (r *gopathResolver) loadPackageNames(importPaths []string, srcDir string) ( // importPathToName finds out the actual package name, as declared in its .go files. func importPathToName(bctx *build.Context, importPath, srcDir string) string { // Fast path for standard library without going to disk. - if _, ok := stdlib[importPath]; ok { + if stdlib.HasPackage(importPath) { return path.Base(importPath) // stdlib packages always match their paths. } @@ -1430,7 +1549,7 @@ func (r *gopathResolver) scan(ctx context.Context, callback *scanCallback) error } func (r *gopathResolver) scoreImportPath(ctx context.Context, path string) float64 { - if _, ok := stdlib[path]; ok { + if stdlib.HasPackage(path) { return MaxRelevance } return MaxRelevance - 1 @@ -1447,7 +1566,7 @@ func filterRoots(roots []gopathwalk.Root, include func(gopathwalk.Root) bool) [] return result } -func (r *gopathResolver) loadExports(ctx context.Context, pkg *pkg, includeTest bool) (string, []string, error) { +func (r *gopathResolver) loadExports(ctx context.Context, pkg *pkg, includeTest bool) (string, []stdlib.Symbol, error) { if info, ok := r.cache.Load(pkg.dir); ok && !includeTest { return r.cache.CacheExports(ctx, r.env, info) } @@ -1467,13 +1586,13 @@ func VendorlessPath(ipath string) string { return ipath } -func loadExportsFromFiles(ctx context.Context, env *ProcessEnv, dir string, includeTest bool) (string, []string, error) { +func loadExportsFromFiles(ctx context.Context, env *ProcessEnv, dir string, includeTest bool) (string, []stdlib.Symbol, error) { // Look for non-test, buildable .go files which could provide exports. - all, err := ioutil.ReadDir(dir) + all, err := os.ReadDir(dir) if err != nil { return "", nil, err } - var files []os.FileInfo + var files []fs.DirEntry for _, fi := range all { name := fi.Name() if !strings.HasSuffix(name, ".go") || (!includeTest && strings.HasSuffix(name, "_test.go")) { @@ -1491,7 +1610,7 @@ func loadExportsFromFiles(ctx context.Context, env *ProcessEnv, dir string, incl } var pkgName string - var exports []string + var exports []stdlib.Symbol fset := token.NewFileSet() for _, fi := range files { select { @@ -1501,11 +1620,10 @@ func loadExportsFromFiles(ctx context.Context, env *ProcessEnv, dir string, incl } fullFile := filepath.Join(dir, fi.Name()) + // Legacy ast.Object resolution is needed here. f, err := parser.ParseFile(fset, fullFile, nil, 0) if err != nil { - if env.Logf != nil { - env.Logf("error parsing %v: %v", fullFile, err) - } + env.logf("error parsing %v: %v", fullFile, err) continue } if f.Name.Name == "documentation" { @@ -1518,40 +1636,72 @@ func loadExportsFromFiles(ctx context.Context, env *ProcessEnv, dir string, incl continue } pkgName = f.Name.Name - for name := range f.Scope.Objects { + for name, obj := range f.Scope.Objects { if ast.IsExported(name) { - exports = append(exports, name) + var kind stdlib.Kind + switch obj.Kind { + case ast.Con: + kind = stdlib.Const + case ast.Typ: + kind = stdlib.Type + case ast.Var: + kind = stdlib.Var + case ast.Fun: + kind = stdlib.Func + } + exports = append(exports, stdlib.Symbol{ + Name: name, + Kind: kind, + Version: 0, // unknown; be permissive + }) } } } + sortSymbols(exports) - if env.Logf != nil { - sortedExports := append([]string(nil), exports...) - sort.Strings(sortedExports) - env.Logf("loaded exports in dir %v (package %v): %v", dir, pkgName, strings.Join(sortedExports, ", ")) - } + env.logf("loaded exports in dir %v (package %v): %v", dir, pkgName, exports) return pkgName, exports, nil } -// findImport searches for a package with the given symbols. -// If no package is found, findImport returns ("", false, nil) -func findImport(ctx context.Context, pass *pass, candidates []pkgDistance, pkgName string, symbols map[string]bool, filename string) (*pkg, error) { +func sortSymbols(syms []stdlib.Symbol) { + sort.Slice(syms, func(i, j int) bool { + return syms[i].Name < syms[j].Name + }) +} + +// A symbolSearcher searches for a package with a set of symbols, among a set +// of candidates. See [symbolSearcher.search]. +// +// The search occurs within the scope of a single file, with context captured +// in srcDir and xtest. +type symbolSearcher struct { + logf func(string, ...any) + srcDir string // directory containing the file + xtest bool // if set, the file containing is an x_test file + loadExports func(ctx context.Context, pkg *pkg, includeTest bool) (string, []stdlib.Symbol, error) +} + +// search searches the provided candidates for a package containing all +// exported symbols. +// +// If successful, returns the resulting package. +func (s *symbolSearcher) search(ctx context.Context, candidates []pkgDistance, pkgName string, symbols map[string]bool) (*pkg, error) { // Sort the candidates by their import package length, // assuming that shorter package names are better than long // ones. Note that this sorts by the de-vendored name, so // there's no "penalty" for vendoring. sort.Sort(byDistanceOrImportPathShortLength(candidates)) - if pass.env.Logf != nil { + if s.logf != nil { for i, c := range candidates { - pass.env.Logf("%s candidate %d/%d: %v in %v", pkgName, i+1, len(candidates), c.pkg.importPathShort, c.pkg.dir) + s.logf("%s candidate %d/%d: %v in %v", pkgName, i+1, len(candidates), c.pkg.importPathShort, c.pkg.dir) } } - resolver, err := pass.env.GetResolver() - if err != nil { - return nil, err - } - // Collect exports for packages with matching names. + // Arrange rescv so that we can we can await results in order of relevance + // and exit as soon as we find the first match. + // + // Search with bounded concurrency, returning as soon as the first result + // among rescv is non-nil. rescv := make([]chan *pkg, len(candidates)) for i := range candidates { rescv[i] = make(chan *pkg, 1) @@ -1559,6 +1709,7 @@ func findImport(ctx context.Context, pass *pass, candidates []pkgDistance, pkgNa const maxConcurrentPackageImport = 4 loadExportsSem := make(chan struct{}, maxConcurrentPackageImport) + // Ensure that all work is completed at exit. ctx, cancel := context.WithCancel(ctx) var wg sync.WaitGroup defer func() { @@ -1566,6 +1717,7 @@ func findImport(ctx context.Context, pass *pass, candidates []pkgDistance, pkgNa wg.Wait() }() + // Start the search. wg.Add(1) go func() { defer wg.Done() @@ -1576,55 +1728,67 @@ func findImport(ctx context.Context, pass *pass, candidates []pkgDistance, pkgNa return } + i := i + c := c wg.Add(1) - go func(c pkgDistance, resc chan<- *pkg) { + go func() { defer func() { <-loadExportsSem wg.Done() }() - - if pass.env.Logf != nil { - pass.env.Logf("loading exports in dir %s (seeking package %s)", c.pkg.dir, pkgName) + if s.logf != nil { + s.logf("loading exports in dir %s (seeking package %s)", c.pkg.dir, pkgName) } - // If we're an x_test, load the package under test's test variant. - includeTest := strings.HasSuffix(pass.f.Name.Name, "_test") && c.pkg.dir == pass.srcDir - _, exports, err := resolver.loadExports(ctx, c.pkg, includeTest) + pkg, err := s.searchOne(ctx, c, symbols) if err != nil { - if pass.env.Logf != nil { - pass.env.Logf("loading exports in dir %s (seeking package %s): %v", c.pkg.dir, pkgName, err) - } - resc <- nil - return - } - - exportsMap := make(map[string]bool, len(exports)) - for _, sym := range exports { - exportsMap[sym] = true - } - - // If it doesn't have the right - // symbols, send nil to mean no match. - for symbol := range symbols { - if !exportsMap[symbol] { - resc <- nil - return + if s.logf != nil && ctx.Err() == nil { + s.logf("loading exports in dir %s (seeking package %s): %v", c.pkg.dir, pkgName, err) } + pkg = nil } - resc <- c.pkg - }(c, rescv[i]) + rescv[i] <- pkg // may be nil + }() } }() + // Await the first (best) result. for _, resc := range rescv { - pkg := <-resc - if pkg == nil { - continue + select { + case r := <-resc: + if r != nil { + return r, nil + } + case <-ctx.Done(): + return nil, ctx.Err() } - return pkg, nil } return nil, nil } +func (s *symbolSearcher) searchOne(ctx context.Context, c pkgDistance, symbols map[string]bool) (*pkg, error) { + if ctx.Err() != nil { + return nil, ctx.Err() + } + // If we're considering the package under test from an x_test, load the + // test variant. + includeTest := s.xtest && c.pkg.dir == s.srcDir + _, exports, err := s.loadExports(ctx, c.pkg, includeTest) + if err != nil { + return nil, err + } + + exportsMap := make(map[string]bool, len(exports)) + for _, sym := range exports { + exportsMap[sym.Name] = true + } + for symbol := range symbols { + if !exportsMap[symbol] { + return nil, nil // no match + } + } + return c.pkg, nil +} + // pkgIsCandidate reports whether pkg is a candidate for satisfying the // finding which package pkgIdent in the file named by filename is trying // to refer to. @@ -1644,58 +1808,24 @@ func pkgIsCandidate(filename string, refs references, pkg *pkg) bool { } // Speed optimization to minimize disk I/O: - // the last two components on disk must contain the - // package name somewhere. // - // This permits mismatch naming like directory - // "go-foo" being package "foo", or "pkg.v3" being "pkg", - // or directory "google.golang.org/api/cloudbilling/v1" - // being package "cloudbilling", but doesn't - // permit a directory "foo" to be package - // "bar", which is strongly discouraged - // anyway. There's no reason goimports needs - // to be slow just to accommodate that. + // Use the matchesPath heuristic to filter to package paths that could + // reasonably match a dangling reference. + // + // This permits mismatch naming like directory "go-foo" being package "foo", + // or "pkg.v3" being "pkg", or directory + // "google.golang.org/api/cloudbilling/v1" being package "cloudbilling", but + // doesn't permit a directory "foo" to be package "bar", which is strongly + // discouraged anyway. There's no reason goimports needs to be slow just to + // accommodate that. for pkgIdent := range refs { - lastTwo := lastTwoComponents(pkg.importPathShort) - if strings.Contains(lastTwo, pkgIdent) { + if matchesPath(pkgIdent, pkg.importPathShort) { return true } - if hasHyphenOrUpperASCII(lastTwo) && !hasHyphenOrUpperASCII(pkgIdent) { - lastTwo = lowerASCIIAndRemoveHyphen(lastTwo) - if strings.Contains(lastTwo, pkgIdent) { - return true - } - } } return false } -func hasHyphenOrUpperASCII(s string) bool { - for i := 0; i < len(s); i++ { - b := s[i] - if b == '-' || ('A' <= b && b <= 'Z') { - return true - } - } - return false -} - -func lowerASCIIAndRemoveHyphen(s string) (ret string) { - buf := make([]byte, 0, len(s)) - for i := 0; i < len(s); i++ { - b := s[i] - switch { - case b == '-': - continue - case 'A' <= b && b <= 'Z': - buf = append(buf, b+('a'-'A')) - default: - buf = append(buf, b) - } - } - return string(buf) -} - // canUse reports whether the package in dir is usable from filename, // respecting the Go "internal" and "vendor" visibility rules. func canUse(filename, dir string) bool { @@ -1736,19 +1866,84 @@ func canUse(filename, dir string) bool { return !strings.Contains(relSlash, "/vendor/") && !strings.Contains(relSlash, "/internal/") && !strings.HasSuffix(relSlash, "/internal") } -// lastTwoComponents returns at most the last two path components -// of v, using either / or \ as the path separator. -func lastTwoComponents(v string) string { +// matchesPath reports whether ident may match a potential package name +// referred to by path, using heuristics to filter out unidiomatic package +// names. +// +// Specifically, it checks whether either of the last two '/'- or '\'-delimited +// path segments matches the identifier. The segment-matching heuristic must +// allow for various conventions around segment naming, including go-foo, +// foo-go, and foo.v3. To handle all of these, matching considers both (1) the +// entire segment, ignoring '-' and '.', as well as (2) the last subsegment +// separated by '-' or '.'. So the segment foo-go matches all of the following +// identifiers: foo, go, and foogo. All matches are case insensitive (for ASCII +// identifiers). +// +// See the docstring for [pkgIsCandidate] for an explanation of how this +// heuristic filters potential candidate packages. +func matchesPath(ident, path string) bool { + // Ignore case, for ASCII. + lowerIfASCII := func(b byte) byte { + if 'A' <= b && b <= 'Z' { + return b + ('a' - 'A') + } + return b + } + + // match reports whether path[start:end] matches ident, ignoring [.-]. + match := func(start, end int) bool { + ii := len(ident) - 1 // current byte in ident + pi := end - 1 // current byte in path + for ; pi >= start && ii >= 0; pi-- { + pb := path[pi] + if pb == '-' || pb == '.' { + continue + } + pb = lowerIfASCII(pb) + ib := lowerIfASCII(ident[ii]) + if pb != ib { + return false + } + ii-- + } + return ii < 0 && pi < start // all bytes matched + } + + // segmentEnd and subsegmentEnd hold the end points of the current segment + // and subsegment intervals. + segmentEnd := len(path) + subsegmentEnd := len(path) + + // Count slashes; we only care about the last two segments. nslash := 0 - for i := len(v) - 1; i >= 0; i-- { - if v[i] == '/' || v[i] == '\\' { + + for i := len(path) - 1; i >= 0; i-- { + switch b := path[i]; b { + // TODO(rfindley): we handle backlashes here only because the previous + // heuristic handled backslashes. This is perhaps overly defensive, but is + // the result of many lessons regarding Chesterton's fence and the + // goimports codebase. + // + // However, this function is only ever called with something called an + // 'importPath'. Is it possible that this is a real import path, and + // therefore we need only consider forward slashes? + case '/', '\\': + if match(i+1, segmentEnd) || match(i+1, subsegmentEnd) { + return true + } nslash++ if nslash == 2 { - return v[i:] + return false // did not match above + } + segmentEnd, subsegmentEnd = i, i // reset + case '-', '.': + if match(i+1, subsegmentEnd) { + return true } + subsegmentEnd = i } } - return v + return match(0, segmentEnd) || match(0, subsegmentEnd) } type visitFn func(node ast.Node) ast.Visitor @@ -1757,10 +1952,13 @@ func (fn visitFn) Visit(node ast.Node) ast.Visitor { return fn(node) } -func copyExports(pkg []string) map[string]bool { - m := make(map[string]bool, len(pkg)) - for _, v := range pkg { - m[v] = true +func symbolNameSet(symbols []stdlib.Symbol) map[string]bool { + names := make(map[string]bool) + for _, sym := range symbols { + switch sym.Kind { + case stdlib.Const, stdlib.Var, stdlib.Type, stdlib.Func: + names[sym.Name] = true + } } - return m + return names } diff --git a/vendor/golang.org/x/tools/internal/imports/imports.go b/vendor/golang.org/x/tools/internal/imports/imports.go index 58e637b..ff6b59a 100644 --- a/vendor/golang.org/x/tools/internal/imports/imports.go +++ b/vendor/golang.org/x/tools/internal/imports/imports.go @@ -2,8 +2,6 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:generate go run mkstdlib.go - // Package imports implements a Go pretty-printer (like package "go/format") // that also adds or removes import statements as necessary. package imports @@ -88,7 +86,7 @@ func ApplyFixes(fixes []*ImportFix, filename string, src []byte, opt *Options, e // Don't use parse() -- we don't care about fragments or statement lists // here, and we need to work with unparseable files. fileSet := token.NewFileSet() - parserMode := parser.Mode(0) + parserMode := parser.SkipObjectResolution if opt.Comments { parserMode |= parser.ParseComments } @@ -109,7 +107,7 @@ func ApplyFixes(fixes []*ImportFix, filename string, src []byte, opt *Options, e } // formatFile formats the file syntax tree. -// It may mutate the token.FileSet. +// It may mutate the token.FileSet and the ast.File. // // If an adjust function is provided, it is called after formatting // with the original source (formatFile's src parameter) and the @@ -167,7 +165,7 @@ func formatFile(fset *token.FileSet, file *ast.File, src []byte, adjust func(ori // parse parses src, which was read from filename, // as a Go source file or statement list. func parse(fset *token.FileSet, filename string, src []byte, opt *Options) (*ast.File, func(orig, src []byte) []byte, error) { - parserMode := parser.Mode(0) + var parserMode parser.Mode // legacy ast.Object resolution is required here if opt.Comments { parserMode |= parser.ParseComments } @@ -236,7 +234,7 @@ func parse(fset *token.FileSet, filename string, src []byte, opt *Options) (*ast src = src[:len(src)-len("}\n")] // Gofmt has also indented the function body one level. // Remove that indent. - src = bytes.Replace(src, []byte("\n\t"), []byte("\n"), -1) + src = bytes.ReplaceAll(src, []byte("\n\t"), []byte("\n")) return matchSpace(orig, src) } return file, adjust, nil diff --git a/vendor/golang.org/x/tools/internal/imports/mod.go b/vendor/golang.org/x/tools/internal/imports/mod.go index 977d238..8555e3f 100644 --- a/vendor/golang.org/x/tools/internal/imports/mod.go +++ b/vendor/golang.org/x/tools/internal/imports/mod.go @@ -9,7 +9,6 @@ import ( "context" "encoding/json" "fmt" - "io/ioutil" "os" "path" "path/filepath" @@ -22,78 +21,138 @@ import ( "golang.org/x/tools/internal/event" "golang.org/x/tools/internal/gocommand" "golang.org/x/tools/internal/gopathwalk" + "golang.org/x/tools/internal/stdlib" ) -// ModuleResolver implements resolver for modules using the go command as little -// as feasible. +// Notes(rfindley): ModuleResolver appears to be heavily optimized for scanning +// as fast as possible, which is desirable for a call to goimports from the +// command line, but it doesn't work as well for gopls, where it suffers from +// slow startup (golang/go#44863) and intermittent hanging (golang/go#59216), +// both caused by populating the cache, albeit in slightly different ways. +// +// A high level list of TODOs: +// - Optimize the scan itself, as there is some redundancy statting and +// reading go.mod files. +// - Invert the relationship between ProcessEnv and Resolver (see the +// docstring of ProcessEnv). +// - Make it easier to use an external resolver implementation. +// +// Smaller TODOs are annotated in the code below. + +// ModuleResolver implements the Resolver interface for a workspace using +// modules. +// +// A goal of the ModuleResolver is to invoke the Go command as little as +// possible. To this end, it runs the Go command only for listing module +// information (i.e. `go list -m -e -json ...`). Package scanning, the process +// of loading package information for the modules, is implemented internally +// via the scan method. +// +// It has two types of state: the state derived from the go command, which +// is populated by init, and the state derived from scans, which is populated +// via scan. A root is considered scanned if it has been walked to discover +// directories. However, if the scan did not require additional information +// from the directory (such as package name or exports), the directory +// information itself may be partially populated. It will be lazily filled in +// as needed by scans, using the scanCallback. type ModuleResolver struct { - env *ProcessEnv - moduleCacheDir string - dummyVendorMod *gocommand.ModuleJSON // If vendoring is enabled, the pseudo-module that represents the /vendor directory. - roots []gopathwalk.Root - scanSema chan struct{} // scanSema prevents concurrent scans and guards scannedRoots. - scannedRoots map[gopathwalk.Root]bool - - initialized bool - mains []*gocommand.ModuleJSON - mainByDir map[string]*gocommand.ModuleJSON - modsByModPath []*gocommand.ModuleJSON // All modules, ordered by # of path components in module Path... - modsByDir []*gocommand.ModuleJSON // ...or number of path components in their Dir. - - // moduleCacheCache stores information about the module cache. - moduleCacheCache *dirInfoCache - otherCache *dirInfoCache + env *ProcessEnv + + // Module state, populated during construction + dummyVendorMod *gocommand.ModuleJSON // if vendoring is enabled, a pseudo-module to represent the /vendor directory + moduleCacheDir string // GOMODCACHE, inferred from GOPATH if unset + roots []gopathwalk.Root // roots to scan, in approximate order of importance + mains []*gocommand.ModuleJSON // main modules + mainByDir map[string]*gocommand.ModuleJSON // module information by dir, to join with roots + modsByModPath []*gocommand.ModuleJSON // all modules, ordered by # of path components in their module path + modsByDir []*gocommand.ModuleJSON // ...or by the number of path components in their Dir. + + // Scanning state, populated by scan + + // scanSema prevents concurrent scans, and guards scannedRoots and the cache + // fields below (though the caches themselves are concurrency safe). + // Receive to acquire, send to release. + scanSema chan struct{} + scannedRoots map[gopathwalk.Root]bool // if true, root has been walked + + // Caches of directory info, populated by scans and scan callbacks + // + // moduleCacheCache stores cached information about roots in the module + // cache, which are immutable and therefore do not need to be invalidated. + // + // otherCache stores information about all other roots (even GOROOT), which + // may change. + moduleCacheCache *DirInfoCache + otherCache *DirInfoCache } -func newModuleResolver(e *ProcessEnv) *ModuleResolver { +// newModuleResolver returns a new module-aware goimports resolver. +// +// Note: use caution when modifying this constructor: changes must also be +// reflected in ModuleResolver.ClearForNewScan. +func newModuleResolver(e *ProcessEnv, moduleCacheCache *DirInfoCache) (*ModuleResolver, error) { r := &ModuleResolver{ env: e, scanSema: make(chan struct{}, 1), } - r.scanSema <- struct{}{} - return r -} - -func (r *ModuleResolver) init() error { - if r.initialized { - return nil - } + r.scanSema <- struct{}{} // release goenv, err := r.env.goEnv() if err != nil { - return err + return nil, err } + + // TODO(rfindley): can we refactor to share logic with r.env.invokeGo? inv := gocommand.Invocation{ BuildFlags: r.env.BuildFlags, ModFlag: r.env.ModFlag, - ModFile: r.env.ModFile, Env: r.env.env(), Logf: r.env.Logf, WorkingDir: r.env.WorkingDir, } vendorEnabled := false - var mainModVendor *gocommand.ModuleJSON - - // Module vendor directories are ignored in workspace mode: - // https://go.googlesource.com/proposal/+/master/design/45713-workspace.md - if len(r.env.Env["GOWORK"]) == 0 { + var mainModVendor *gocommand.ModuleJSON // for module vendoring + var mainModsVendor []*gocommand.ModuleJSON // for workspace vendoring + + goWork := r.env.Env["GOWORK"] + if len(goWork) == 0 { + // TODO(rfindley): VendorEnabled runs the go command to get GOFLAGS, but + // they should be available from the ProcessEnv. Can we avoid the redundant + // invocation? vendorEnabled, mainModVendor, err = gocommand.VendorEnabled(context.TODO(), inv, r.env.GocmdRunner) if err != nil { - return err + return nil, err + } + } else { + vendorEnabled, mainModsVendor, err = gocommand.WorkspaceVendorEnabled(context.Background(), inv, r.env.GocmdRunner) + if err != nil { + return nil, err } } - if mainModVendor != nil && vendorEnabled { - // Vendor mode is on, so all the non-Main modules are irrelevant, - // and we need to search /vendor for everything. - r.mains = []*gocommand.ModuleJSON{mainModVendor} - r.dummyVendorMod = &gocommand.ModuleJSON{ - Path: "", - Dir: filepath.Join(mainModVendor.Dir, "vendor"), + if vendorEnabled { + if mainModVendor != nil { + // Module vendor mode is on, so all the non-Main modules are irrelevant, + // and we need to search /vendor for everything. + r.mains = []*gocommand.ModuleJSON{mainModVendor} + r.dummyVendorMod = &gocommand.ModuleJSON{ + Path: "", + Dir: filepath.Join(mainModVendor.Dir, "vendor"), + } + r.modsByModPath = []*gocommand.ModuleJSON{mainModVendor, r.dummyVendorMod} + r.modsByDir = []*gocommand.ModuleJSON{mainModVendor, r.dummyVendorMod} + } else { + // Workspace vendor mode is on, so all the non-Main modules are irrelevant, + // and we need to search /vendor for everything. + r.mains = mainModsVendor + r.dummyVendorMod = &gocommand.ModuleJSON{ + Path: "", + Dir: filepath.Join(filepath.Dir(goWork), "vendor"), + } + r.modsByModPath = append(append([]*gocommand.ModuleJSON{}, mainModsVendor...), r.dummyVendorMod) + r.modsByDir = append(append([]*gocommand.ModuleJSON{}, mainModsVendor...), r.dummyVendorMod) } - r.modsByModPath = []*gocommand.ModuleJSON{mainModVendor, r.dummyVendorMod} - r.modsByDir = []*gocommand.ModuleJSON{mainModVendor, r.dummyVendorMod} } else { // Vendor mode is off, so run go list -m ... to find everything. err := r.initAllMods() @@ -101,19 +160,14 @@ func (r *ModuleResolver) init() error { // GO111MODULE=on. Other errors are fatal. if err != nil { if errMsg := err.Error(); !strings.Contains(errMsg, "working directory is not part of a module") && !strings.Contains(errMsg, "go.mod file not found") { - return err + return nil, err } } } - if gmc := r.env.Env["GOMODCACHE"]; gmc != "" { - r.moduleCacheDir = gmc - } else { - gopaths := filepath.SplitList(goenv["GOPATH"]) - if len(gopaths) == 0 { - return fmt.Errorf("empty GOPATH") - } - r.moduleCacheDir = filepath.Join(gopaths[0], "/pkg/mod") + r.moduleCacheDir = gomodcacheForEnv(goenv) + if r.moduleCacheDir == "" { + return nil, fmt.Errorf("cannot resolve GOMODCACHE") } sort.Slice(r.modsByModPath, func(i, j int) bool { @@ -129,8 +183,9 @@ func (r *ModuleResolver) init() error { return count(j) < count(i) // descending order }) - r.roots = []gopathwalk.Root{ - {Path: filepath.Join(goenv["GOROOT"], "/src"), Type: gopathwalk.RootGOROOT}, + r.roots = []gopathwalk.Root{} + if goenv["GOROOT"] != "" { // "" happens in tests + r.roots = append(r.roots, gopathwalk.Root{Path: filepath.Join(goenv["GOROOT"], "/src"), Type: gopathwalk.RootGOROOT}) } r.mainByDir = make(map[string]*gocommand.ModuleJSON) for _, main := range r.mains { @@ -142,7 +197,11 @@ func (r *ModuleResolver) init() error { } else { addDep := func(mod *gocommand.ModuleJSON) { if mod.Replace == nil { - // This is redundant with the cache, but we'll skip it cheaply enough. + // This is redundant with the cache, but we'll skip it cheaply enough + // when we encounter it in the module cache scan. + // + // Including it at a lower index in r.roots than the module cache dir + // helps prioritize matches from within existing dependencies. r.roots = append(r.roots, gopathwalk.Root{Path: mod.Dir, Type: gopathwalk.RootModuleCache}) } else { r.roots = append(r.roots, gopathwalk.Root{Path: mod.Dir, Type: gopathwalk.RootOther}) @@ -159,24 +218,43 @@ func (r *ModuleResolver) init() error { addDep(mod) } } + // If provided, share the moduleCacheCache. + // + // TODO(rfindley): The module cache is immutable. However, the loaded + // exports do depend on GOOS and GOARCH. Fortunately, the + // ProcessEnv.buildContext does not adjust these from build.DefaultContext + // (even though it should). So for now, this is OK to share, but we need to + // add logic for handling GOOS/GOARCH. + r.moduleCacheCache = moduleCacheCache r.roots = append(r.roots, gopathwalk.Root{Path: r.moduleCacheDir, Type: gopathwalk.RootModuleCache}) } r.scannedRoots = map[gopathwalk.Root]bool{} if r.moduleCacheCache == nil { - r.moduleCacheCache = &dirInfoCache{ - dirs: map[string]*directoryPackageInfo{}, - listeners: map[*int]cacheListener{}, - } + r.moduleCacheCache = NewDirInfoCache() } - if r.otherCache == nil { - r.otherCache = &dirInfoCache{ - dirs: map[string]*directoryPackageInfo{}, - listeners: map[*int]cacheListener{}, - } - } - r.initialized = true - return nil + r.otherCache = NewDirInfoCache() + return r, nil +} + +// gomodcacheForEnv returns the GOMODCACHE value to use based on the given env +// map, which must have GOMODCACHE and GOPATH populated. +// +// TODO(rfindley): this is defensive refactoring. +// 1. Is this even relevant anymore? Can't we just read GOMODCACHE. +// 2. Use this to separate module cache scanning from other scanning. +func gomodcacheForEnv(goenv map[string]string) string { + if gmc := goenv["GOMODCACHE"]; gmc != "" { + // golang/go#67156: ensure that the module cache is clean, since it is + // assumed as a prefix to directories scanned by gopathwalk, which are + // themselves clean. + return filepath.Clean(gmc) + } + gopaths := filepath.SplitList(goenv["GOPATH"]) + if len(gopaths) == 0 { + return "" + } + return filepath.Join(gopaths[0], "/pkg/mod") } func (r *ModuleResolver) initAllMods() error { @@ -190,9 +268,7 @@ func (r *ModuleResolver) initAllMods() error { return err } if mod.Dir == "" { - if r.env.Logf != nil { - r.env.Logf("module %v has not been downloaded and will be ignored", mod.Path) - } + r.env.logf("module %v has not been downloaded and will be ignored", mod.Path) // Can't do anything with a module that's not downloaded. continue } @@ -207,30 +283,86 @@ func (r *ModuleResolver) initAllMods() error { return nil } -func (r *ModuleResolver) ClearForNewScan() { - <-r.scanSema - r.scannedRoots = map[gopathwalk.Root]bool{} - r.otherCache = &dirInfoCache{ - dirs: map[string]*directoryPackageInfo{}, - listeners: map[*int]cacheListener{}, +// ClearForNewScan invalidates the last scan. +// +// It preserves the set of roots, but forgets about the set of directories. +// Though it forgets the set of module cache directories, it remembers their +// contents, since they are assumed to be immutable. +func (r *ModuleResolver) ClearForNewScan() Resolver { + <-r.scanSema // acquire r, to guard scannedRoots + r2 := &ModuleResolver{ + env: r.env, + dummyVendorMod: r.dummyVendorMod, + moduleCacheDir: r.moduleCacheDir, + roots: r.roots, + mains: r.mains, + mainByDir: r.mainByDir, + modsByModPath: r.modsByModPath, + + scanSema: make(chan struct{}, 1), + scannedRoots: make(map[gopathwalk.Root]bool), + otherCache: NewDirInfoCache(), + moduleCacheCache: r.moduleCacheCache, + } + r2.scanSema <- struct{}{} // r2 must start released + // Invalidate root scans. We don't need to invalidate module cache roots, + // because they are immutable. + // (We don't support a use case where GOMODCACHE is cleaned in the middle of + // e.g. a gopls session: the user must restart gopls to get accurate + // imports.) + // + // Scanning for new directories in GOMODCACHE should be handled elsewhere, + // via a call to ScanModuleCache. + for _, root := range r.roots { + if root.Type == gopathwalk.RootModuleCache && r.scannedRoots[root] { + r2.scannedRoots[root] = true + } } - r.scanSema <- struct{}{} + r.scanSema <- struct{}{} // release r + return r2 } -func (r *ModuleResolver) ClearForNewMod() { - <-r.scanSema - *r = ModuleResolver{ - env: r.env, - moduleCacheCache: r.moduleCacheCache, - otherCache: r.otherCache, - scanSema: r.scanSema, +// ClearModuleInfo invalidates resolver state that depends on go.mod file +// contents (essentially, the output of go list -m -json ...). +// +// Notably, it does not forget directory contents, which are reset +// asynchronously via ClearForNewScan. +// +// If the ProcessEnv is a GOPATH environment, ClearModuleInfo is a no op. +// +// TODO(rfindley): move this to a new env.go, consolidating ProcessEnv methods. +func (e *ProcessEnv) ClearModuleInfo() { + if r, ok := e.resolver.(*ModuleResolver); ok { + resolver, err := newModuleResolver(e, e.ModCache) + if err != nil { + e.resolver = nil + e.resolverErr = err + return + } + + <-r.scanSema // acquire (guards caches) + resolver.moduleCacheCache = r.moduleCacheCache + resolver.otherCache = r.otherCache + r.scanSema <- struct{}{} // release + + e.UpdateResolver(resolver) } - r.init() - r.scanSema <- struct{}{} } -// findPackage returns the module and directory that contains the package at -// the given import path, or returns nil, "" if no module is in scope. +// UpdateResolver sets the resolver for the ProcessEnv to use in imports +// operations. Only for use with the result of [Resolver.ClearForNewScan]. +// +// TODO(rfindley): this awkward API is a result of the (arguably) inverted +// relationship between configuration and state described in the doc comment +// for [ProcessEnv]. +func (e *ProcessEnv) UpdateResolver(r Resolver) { + e.resolver = r + e.resolverErr = nil +} + +// findPackage returns the module and directory from within the main modules +// and their dependencies that contains the package at the given import path, +// or returns nil, "" if no module is in scope. func (r *ModuleResolver) findPackage(importPath string) (*gocommand.ModuleJSON, string) { // This can't find packages in the stdlib, but that's harmless for all // the existing code paths. @@ -265,7 +397,7 @@ func (r *ModuleResolver) findPackage(importPath string) (*gocommand.ModuleJSON, } // Not cached. Read the filesystem. - pkgFiles, err := ioutil.ReadDir(pkgDir) + pkgFiles, err := os.ReadDir(pkgDir) if err != nil { continue } @@ -296,10 +428,6 @@ func (r *ModuleResolver) cacheStore(info directoryPackageInfo) { } } -func (r *ModuleResolver) cacheKeys() []string { - return append(r.moduleCacheCache.Keys(), r.otherCache.Keys()...) -} - // cachePackageName caches the package name for a dir already in the cache. func (r *ModuleResolver) cachePackageName(info directoryPackageInfo) (string, error) { if info.rootType == gopathwalk.RootModuleCache { @@ -308,7 +436,7 @@ func (r *ModuleResolver) cachePackageName(info directoryPackageInfo) (string, er return r.otherCache.CachePackageName(info) } -func (r *ModuleResolver) cacheExports(ctx context.Context, env *ProcessEnv, info directoryPackageInfo) (string, []string, error) { +func (r *ModuleResolver) cacheExports(ctx context.Context, env *ProcessEnv, info directoryPackageInfo) (string, []stdlib.Symbol, error) { if info.rootType == gopathwalk.RootModuleCache { return r.moduleCacheCache.CacheExports(ctx, env, info) } @@ -368,15 +496,15 @@ func (r *ModuleResolver) dirIsNestedModule(dir string, mod *gocommand.ModuleJSON return modDir != mod.Dir } -func (r *ModuleResolver) modInfo(dir string) (modDir string, modName string) { - readModName := func(modFile string) string { - modBytes, err := ioutil.ReadFile(modFile) - if err != nil { - return "" - } - return modulePath(modBytes) +func readModName(modFile string) string { + modBytes, err := os.ReadFile(modFile) + if err != nil { + return "" } + return modulePath(modBytes) +} +func (r *ModuleResolver) modInfo(dir string) (modDir, modName string) { if r.dirInModuleCache(dir) { if matches := modCacheRegexp.FindStringSubmatch(dir); len(matches) == 3 { index := strings.Index(dir, matches[1]+"@"+matches[2]) @@ -410,11 +538,9 @@ func (r *ModuleResolver) dirInModuleCache(dir string) bool { } func (r *ModuleResolver) loadPackageNames(importPaths []string, srcDir string) (map[string]string, error) { - if err := r.init(); err != nil { - return nil, err - } names := map[string]string{} for _, path := range importPaths { + // TODO(rfindley): shouldn't this use the dirInfoCache? _, packageDir := r.findPackage(path) if packageDir == "" { continue @@ -432,10 +558,6 @@ func (r *ModuleResolver) scan(ctx context.Context, callback *scanCallback) error ctx, done := event.Start(ctx, "imports.ModuleResolver.scan") defer done() - if err := r.init(); err != nil { - return err - } - processDir := func(info directoryPackageInfo) { // Skip this directory if we were not able to get the package information successfully. if scanned, err := info.reachedStatus(directoryScanned); !scanned || err != nil { @@ -445,18 +567,18 @@ func (r *ModuleResolver) scan(ctx context.Context, callback *scanCallback) error if err != nil { return } - if !callback.dirFound(pkg) { return } + pkg.packageName, err = r.cachePackageName(info) if err != nil { return } - if !callback.packageNameLoaded(pkg) { return } + _, exports, err := r.loadExports(ctx, pkg, false) if err != nil { return @@ -495,7 +617,6 @@ func (r *ModuleResolver) scan(ctx context.Context, callback *scanCallback) error return packageScanned } - // Add anything new to the cache, and process it if we're still listening. add := func(root gopathwalk.Root, dir string) { r.cacheStore(r.scanDirForPackage(root, dir)) } @@ -510,9 +631,9 @@ func (r *ModuleResolver) scan(ctx context.Context, callback *scanCallback) error select { case <-ctx.Done(): return - case <-r.scanSema: + case <-r.scanSema: // acquire } - defer func() { r.scanSema <- struct{}{} }() + defer func() { r.scanSema <- struct{}{} }() // release // We have the lock on r.scannedRoots, and no other scans can run. for _, root := range roots { if ctx.Err() != nil { @@ -535,7 +656,7 @@ func (r *ModuleResolver) scan(ctx context.Context, callback *scanCallback) error } func (r *ModuleResolver) scoreImportPath(ctx context.Context, path string) float64 { - if _, ok := stdlib[path]; ok { + if stdlib.HasPackage(path) { return MaxRelevance } mod, _ := r.findPackage(path) @@ -613,10 +734,7 @@ func (r *ModuleResolver) canonicalize(info directoryPackageInfo) (*pkg, error) { return res, nil } -func (r *ModuleResolver) loadExports(ctx context.Context, pkg *pkg, includeTest bool) (string, []string, error) { - if err := r.init(); err != nil { - return "", nil, err - } +func (r *ModuleResolver) loadExports(ctx context.Context, pkg *pkg, includeTest bool) (string, []stdlib.Symbol, error) { if info, ok := r.cacheLoad(pkg.dir); ok && !includeTest { return r.cacheExports(ctx, r.env, info) } @@ -625,8 +743,8 @@ func (r *ModuleResolver) loadExports(ctx context.Context, pkg *pkg, includeTest func (r *ModuleResolver) scanDirForPackage(root gopathwalk.Root, dir string) directoryPackageInfo { subdir := "" - if dir != root.Path { - subdir = dir[len(root.Path)+len("/"):] + if prefix := root.Path + string(filepath.Separator); strings.HasPrefix(dir, prefix) { + subdir = dir[len(prefix):] } importPath := filepath.ToSlash(subdir) if strings.HasPrefix(importPath, "vendor/") { @@ -649,9 +767,7 @@ func (r *ModuleResolver) scanDirForPackage(root gopathwalk.Root, dir string) dir } modPath, err := module.UnescapePath(filepath.ToSlash(matches[1])) if err != nil { - if r.env.Logf != nil { - r.env.Logf("decoding module cache path %q: %v", subdir, err) - } + r.env.logf("decoding module cache path %q: %v", subdir, err) return directoryPackageInfo{ status: directoryScanned, err: fmt.Errorf("decoding module cache path %q: %v", subdir, err), diff --git a/vendor/golang.org/x/tools/internal/imports/mod_cache.go b/vendor/golang.org/x/tools/internal/imports/mod_cache.go index 45690ab..b119269 100644 --- a/vendor/golang.org/x/tools/internal/imports/mod_cache.go +++ b/vendor/golang.org/x/tools/internal/imports/mod_cache.go @@ -7,9 +7,14 @@ package imports import ( "context" "fmt" + "path" + "path/filepath" + "strings" "sync" + "golang.org/x/mod/module" "golang.org/x/tools/internal/gopathwalk" + "golang.org/x/tools/internal/stdlib" ) // To find packages to import, the resolver needs to know about all of @@ -39,6 +44,8 @@ const ( exportsLoaded ) +// directoryPackageInfo holds (possibly incomplete) information about packages +// contained in a given directory. type directoryPackageInfo struct { // status indicates the extent to which this struct has been filled in. status directoryPackageStatus @@ -63,8 +70,11 @@ type directoryPackageInfo struct { packageName string // the package name, as declared in the source. // Set when status >= exportsLoaded. - - exports []string + // TODO(rfindley): it's hard to see this, but exports depend implicitly on + // the default build context GOOS and GOARCH. + // + // We can make this explicit, and key exports by GOOS, GOARCH. + exports []stdlib.Symbol } // reachedStatus returns true when info has a status at least target and any error associated with @@ -79,7 +89,7 @@ func (info *directoryPackageInfo) reachedStatus(target directoryPackageStatus) ( return true, nil } -// dirInfoCache is a concurrency safe map for storing information about +// DirInfoCache is a concurrency-safe map for storing information about // directories that may contain packages. // // The information in this cache is built incrementally. Entries are initialized in scan. @@ -92,21 +102,26 @@ func (info *directoryPackageInfo) reachedStatus(target directoryPackageStatus) ( // The information in the cache is not expected to change for the cache's // lifetime, so there is no protection against competing writes. Users should // take care not to hold the cache across changes to the underlying files. -// -// TODO(suzmue): consider other concurrency strategies and data structures (RWLocks, sync.Map, etc) -type dirInfoCache struct { +type DirInfoCache struct { mu sync.Mutex // dirs stores information about packages in directories, keyed by absolute path. dirs map[string]*directoryPackageInfo listeners map[*int]cacheListener } +func NewDirInfoCache() *DirInfoCache { + return &DirInfoCache{ + dirs: make(map[string]*directoryPackageInfo), + listeners: make(map[*int]cacheListener), + } +} + type cacheListener func(directoryPackageInfo) // ScanAndListen calls listener on all the items in the cache, and on anything // newly added. The returned stop function waits for all in-flight callbacks to // finish and blocks new ones. -func (d *dirInfoCache) ScanAndListen(ctx context.Context, listener cacheListener) func() { +func (d *DirInfoCache) ScanAndListen(ctx context.Context, listener cacheListener) func() { ctx, cancel := context.WithCancel(ctx) // Flushing out all the callbacks is tricky without knowing how many there @@ -162,8 +177,10 @@ func (d *dirInfoCache) ScanAndListen(ctx context.Context, listener cacheListener } // Store stores the package info for dir. -func (d *dirInfoCache) Store(dir string, info directoryPackageInfo) { +func (d *DirInfoCache) Store(dir string, info directoryPackageInfo) { d.mu.Lock() + // TODO(rfindley, golang/go#59216): should we overwrite an existing entry? + // That seems incorrect as the cache should be idempotent. _, old := d.dirs[dir] d.dirs[dir] = &info var listeners []cacheListener @@ -180,7 +197,7 @@ func (d *dirInfoCache) Store(dir string, info directoryPackageInfo) { } // Load returns a copy of the directoryPackageInfo for absolute directory dir. -func (d *dirInfoCache) Load(dir string) (directoryPackageInfo, bool) { +func (d *DirInfoCache) Load(dir string) (directoryPackageInfo, bool) { d.mu.Lock() defer d.mu.Unlock() info, ok := d.dirs[dir] @@ -191,7 +208,7 @@ func (d *dirInfoCache) Load(dir string) (directoryPackageInfo, bool) { } // Keys returns the keys currently present in d. -func (d *dirInfoCache) Keys() (keys []string) { +func (d *DirInfoCache) Keys() (keys []string) { d.mu.Lock() defer d.mu.Unlock() for key := range d.dirs { @@ -200,7 +217,7 @@ func (d *dirInfoCache) Keys() (keys []string) { return keys } -func (d *dirInfoCache) CachePackageName(info directoryPackageInfo) (string, error) { +func (d *DirInfoCache) CachePackageName(info directoryPackageInfo) (string, error) { if loaded, err := info.reachedStatus(nameLoaded); loaded { return info.packageName, err } @@ -213,7 +230,7 @@ func (d *dirInfoCache) CachePackageName(info directoryPackageInfo) (string, erro return info.packageName, info.err } -func (d *dirInfoCache) CacheExports(ctx context.Context, env *ProcessEnv, info directoryPackageInfo) (string, []string, error) { +func (d *DirInfoCache) CacheExports(ctx context.Context, env *ProcessEnv, info directoryPackageInfo) (string, []stdlib.Symbol, error) { if reached, _ := info.reachedStatus(exportsLoaded); reached { return info.packageName, info.exports, info.err } @@ -234,3 +251,81 @@ func (d *dirInfoCache) CacheExports(ctx context.Context, env *ProcessEnv, info d d.Store(info.dir, info) return info.packageName, info.exports, info.err } + +// ScanModuleCache walks the given directory, which must be a GOMODCACHE value, +// for directory package information, storing the results in cache. +func ScanModuleCache(dir string, cache *DirInfoCache, logf func(string, ...any)) { + // Note(rfindley): it's hard to see, but this function attempts to implement + // just the side effects on cache of calling PrimeCache with a ProcessEnv + // that has the given dir as its GOMODCACHE. + // + // Teasing out the control flow, we see that we can avoid any handling of + // vendor/ and can infer module info entirely from the path, simplifying the + // logic here. + + root := gopathwalk.Root{ + Path: filepath.Clean(dir), + Type: gopathwalk.RootModuleCache, + } + + directoryInfo := func(root gopathwalk.Root, dir string) directoryPackageInfo { + // This is a copy of ModuleResolver.scanDirForPackage, trimmed down to + // logic that applies to a module cache directory. + + subdir := "" + if dir != root.Path { + subdir = dir[len(root.Path)+len("/"):] + } + + matches := modCacheRegexp.FindStringSubmatch(subdir) + if len(matches) == 0 { + return directoryPackageInfo{ + status: directoryScanned, + err: fmt.Errorf("invalid module cache path: %v", subdir), + } + } + modPath, err := module.UnescapePath(filepath.ToSlash(matches[1])) + if err != nil { + if logf != nil { + logf("decoding module cache path %q: %v", subdir, err) + } + return directoryPackageInfo{ + status: directoryScanned, + err: fmt.Errorf("decoding module cache path %q: %v", subdir, err), + } + } + importPath := path.Join(modPath, filepath.ToSlash(matches[3])) + index := strings.Index(dir, matches[1]+"@"+matches[2]) + modDir := filepath.Join(dir[:index], matches[1]+"@"+matches[2]) + modName := readModName(filepath.Join(modDir, "go.mod")) + return directoryPackageInfo{ + status: directoryScanned, + dir: dir, + rootType: root.Type, + nonCanonicalImportPath: importPath, + moduleDir: modDir, + moduleName: modName, + } + } + + add := func(root gopathwalk.Root, dir string) { + info := directoryInfo(root, dir) + cache.Store(info.dir, info) + } + + skip := func(_ gopathwalk.Root, dir string) bool { + // Skip directories that have already been scanned. + // + // Note that gopathwalk only adds "package" directories, which must contain + // a .go file, and all such package directories in the module cache are + // immutable. So if we can load a dir, it can be skipped. + info, ok := cache.Load(dir) + if !ok { + return false + } + packageScanned, _ := info.reachedStatus(directoryScanned) + return packageScanned + } + + gopathwalk.WalkSkip([]gopathwalk.Root{root}, add, skip, gopathwalk.Options{Logf: logf, ModulesEnabled: true}) +} diff --git a/vendor/golang.org/x/tools/internal/imports/sortimports.go b/vendor/golang.org/x/tools/internal/imports/sortimports.go index 1a0a7eb..da8194f 100644 --- a/vendor/golang.org/x/tools/internal/imports/sortimports.go +++ b/vendor/golang.org/x/tools/internal/imports/sortimports.go @@ -18,7 +18,7 @@ import ( // sortImports sorts runs of consecutive import lines in import blocks in f. // It also removes duplicate imports when it is possible to do so without data loss. // -// It may mutate the token.File. +// It may mutate the token.File and the ast.File. func sortImports(localPrefix string, tokFile *token.File, f *ast.File) { for i, d := range f.Decls { d, ok := d.(*ast.GenDecl) diff --git a/vendor/golang.org/x/tools/internal/imports/zstdlib.go b/vendor/golang.org/x/tools/internal/imports/zstdlib.go deleted file mode 100644 index 9f992c2..0000000 --- a/vendor/golang.org/x/tools/internal/imports/zstdlib.go +++ /dev/null @@ -1,11345 +0,0 @@ -// Copyright 2022 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Code generated by mkstdlib.go. DO NOT EDIT. - -package imports - -var stdlib = map[string][]string{ - "archive/tar": { - "ErrFieldTooLong", - "ErrHeader", - "ErrInsecurePath", - "ErrWriteAfterClose", - "ErrWriteTooLong", - "FileInfoHeader", - "Format", - "FormatGNU", - "FormatPAX", - "FormatUSTAR", - "FormatUnknown", - "Header", - "NewReader", - "NewWriter", - "Reader", - "TypeBlock", - "TypeChar", - "TypeCont", - "TypeDir", - "TypeFifo", - "TypeGNULongLink", - "TypeGNULongName", - "TypeGNUSparse", - "TypeLink", - "TypeReg", - "TypeRegA", - "TypeSymlink", - "TypeXGlobalHeader", - "TypeXHeader", - "Writer", - }, - "archive/zip": { - "Compressor", - "Decompressor", - "Deflate", - "ErrAlgorithm", - "ErrChecksum", - "ErrFormat", - "ErrInsecurePath", - "File", - "FileHeader", - "FileInfoHeader", - "NewReader", - "NewWriter", - "OpenReader", - "ReadCloser", - "Reader", - "RegisterCompressor", - "RegisterDecompressor", - "Store", - "Writer", - }, - "bufio": { - "ErrAdvanceTooFar", - "ErrBadReadCount", - "ErrBufferFull", - "ErrFinalToken", - "ErrInvalidUnreadByte", - "ErrInvalidUnreadRune", - "ErrNegativeAdvance", - "ErrNegativeCount", - "ErrTooLong", - "MaxScanTokenSize", - "NewReadWriter", - "NewReader", - "NewReaderSize", - "NewScanner", - "NewWriter", - "NewWriterSize", - "ReadWriter", - "Reader", - "ScanBytes", - "ScanLines", - "ScanRunes", - "ScanWords", - "Scanner", - "SplitFunc", - "Writer", - }, - "bytes": { - "Buffer", - "Clone", - "Compare", - "Contains", - "ContainsAny", - "ContainsFunc", - "ContainsRune", - "Count", - "Cut", - "CutPrefix", - "CutSuffix", - "Equal", - "EqualFold", - "ErrTooLarge", - "Fields", - "FieldsFunc", - "HasPrefix", - "HasSuffix", - "Index", - "IndexAny", - "IndexByte", - "IndexFunc", - "IndexRune", - "Join", - "LastIndex", - "LastIndexAny", - "LastIndexByte", - "LastIndexFunc", - "Map", - "MinRead", - "NewBuffer", - "NewBufferString", - "NewReader", - "Reader", - "Repeat", - "Replace", - "ReplaceAll", - "Runes", - "Split", - "SplitAfter", - "SplitAfterN", - "SplitN", - "Title", - "ToLower", - "ToLowerSpecial", - "ToTitle", - "ToTitleSpecial", - "ToUpper", - "ToUpperSpecial", - "ToValidUTF8", - "Trim", - "TrimFunc", - "TrimLeft", - "TrimLeftFunc", - "TrimPrefix", - "TrimRight", - "TrimRightFunc", - "TrimSpace", - "TrimSuffix", - }, - "cmp": { - "Compare", - "Less", - "Ordered", - }, - "compress/bzip2": { - "NewReader", - "StructuralError", - }, - "compress/flate": { - "BestCompression", - "BestSpeed", - "CorruptInputError", - "DefaultCompression", - "HuffmanOnly", - "InternalError", - "NewReader", - "NewReaderDict", - "NewWriter", - "NewWriterDict", - "NoCompression", - "ReadError", - "Reader", - "Resetter", - "WriteError", - "Writer", - }, - "compress/gzip": { - "BestCompression", - "BestSpeed", - "DefaultCompression", - "ErrChecksum", - "ErrHeader", - "Header", - "HuffmanOnly", - "NewReader", - "NewWriter", - "NewWriterLevel", - "NoCompression", - "Reader", - "Writer", - }, - "compress/lzw": { - "LSB", - "MSB", - "NewReader", - "NewWriter", - "Order", - "Reader", - "Writer", - }, - "compress/zlib": { - "BestCompression", - "BestSpeed", - "DefaultCompression", - "ErrChecksum", - "ErrDictionary", - "ErrHeader", - "HuffmanOnly", - "NewReader", - "NewReaderDict", - "NewWriter", - "NewWriterLevel", - "NewWriterLevelDict", - "NoCompression", - "Resetter", - "Writer", - }, - "container/heap": { - "Fix", - "Init", - "Interface", - "Pop", - "Push", - "Remove", - }, - "container/list": { - "Element", - "List", - "New", - }, - "container/ring": { - "New", - "Ring", - }, - "context": { - "AfterFunc", - "Background", - "CancelCauseFunc", - "CancelFunc", - "Canceled", - "Cause", - "Context", - "DeadlineExceeded", - "TODO", - "WithCancel", - "WithCancelCause", - "WithDeadline", - "WithDeadlineCause", - "WithTimeout", - "WithTimeoutCause", - "WithValue", - "WithoutCancel", - }, - "crypto": { - "BLAKE2b_256", - "BLAKE2b_384", - "BLAKE2b_512", - "BLAKE2s_256", - "Decrypter", - "DecrypterOpts", - "Hash", - "MD4", - "MD5", - "MD5SHA1", - "PrivateKey", - "PublicKey", - "RIPEMD160", - "RegisterHash", - "SHA1", - "SHA224", - "SHA256", - "SHA384", - "SHA3_224", - "SHA3_256", - "SHA3_384", - "SHA3_512", - "SHA512", - "SHA512_224", - "SHA512_256", - "Signer", - "SignerOpts", - }, - "crypto/aes": { - "BlockSize", - "KeySizeError", - "NewCipher", - }, - "crypto/cipher": { - "AEAD", - "Block", - "BlockMode", - "NewCBCDecrypter", - "NewCBCEncrypter", - "NewCFBDecrypter", - "NewCFBEncrypter", - "NewCTR", - "NewGCM", - "NewGCMWithNonceSize", - "NewGCMWithTagSize", - "NewOFB", - "Stream", - "StreamReader", - "StreamWriter", - }, - "crypto/des": { - "BlockSize", - "KeySizeError", - "NewCipher", - "NewTripleDESCipher", - }, - "crypto/dsa": { - "ErrInvalidPublicKey", - "GenerateKey", - "GenerateParameters", - "L1024N160", - "L2048N224", - "L2048N256", - "L3072N256", - "ParameterSizes", - "Parameters", - "PrivateKey", - "PublicKey", - "Sign", - "Verify", - }, - "crypto/ecdh": { - "Curve", - "P256", - "P384", - "P521", - "PrivateKey", - "PublicKey", - "X25519", - }, - "crypto/ecdsa": { - "GenerateKey", - "PrivateKey", - "PublicKey", - "Sign", - "SignASN1", - "Verify", - "VerifyASN1", - }, - "crypto/ed25519": { - "GenerateKey", - "NewKeyFromSeed", - "Options", - "PrivateKey", - "PrivateKeySize", - "PublicKey", - "PublicKeySize", - "SeedSize", - "Sign", - "SignatureSize", - "Verify", - "VerifyWithOptions", - }, - "crypto/elliptic": { - "Curve", - "CurveParams", - "GenerateKey", - "Marshal", - "MarshalCompressed", - "P224", - "P256", - "P384", - "P521", - "Unmarshal", - "UnmarshalCompressed", - }, - "crypto/hmac": { - "Equal", - "New", - }, - "crypto/md5": { - "BlockSize", - "New", - "Size", - "Sum", - }, - "crypto/rand": { - "Int", - "Prime", - "Read", - "Reader", - }, - "crypto/rc4": { - "Cipher", - "KeySizeError", - "NewCipher", - }, - "crypto/rsa": { - "CRTValue", - "DecryptOAEP", - "DecryptPKCS1v15", - "DecryptPKCS1v15SessionKey", - "EncryptOAEP", - "EncryptPKCS1v15", - "ErrDecryption", - "ErrMessageTooLong", - "ErrVerification", - "GenerateKey", - "GenerateMultiPrimeKey", - "OAEPOptions", - "PKCS1v15DecryptOptions", - "PSSOptions", - "PSSSaltLengthAuto", - "PSSSaltLengthEqualsHash", - "PrecomputedValues", - "PrivateKey", - "PublicKey", - "SignPKCS1v15", - "SignPSS", - "VerifyPKCS1v15", - "VerifyPSS", - }, - "crypto/sha1": { - "BlockSize", - "New", - "Size", - "Sum", - }, - "crypto/sha256": { - "BlockSize", - "New", - "New224", - "Size", - "Size224", - "Sum224", - "Sum256", - }, - "crypto/sha512": { - "BlockSize", - "New", - "New384", - "New512_224", - "New512_256", - "Size", - "Size224", - "Size256", - "Size384", - "Sum384", - "Sum512", - "Sum512_224", - "Sum512_256", - }, - "crypto/subtle": { - "ConstantTimeByteEq", - "ConstantTimeCompare", - "ConstantTimeCopy", - "ConstantTimeEq", - "ConstantTimeLessOrEq", - "ConstantTimeSelect", - "XORBytes", - }, - "crypto/tls": { - "AlertError", - "Certificate", - "CertificateRequestInfo", - "CertificateVerificationError", - "CipherSuite", - "CipherSuiteName", - "CipherSuites", - "Client", - "ClientAuthType", - "ClientHelloInfo", - "ClientSessionCache", - "ClientSessionState", - "Config", - "Conn", - "ConnectionState", - "CurveID", - "CurveP256", - "CurveP384", - "CurveP521", - "Dial", - "DialWithDialer", - "Dialer", - "ECDSAWithP256AndSHA256", - "ECDSAWithP384AndSHA384", - "ECDSAWithP521AndSHA512", - "ECDSAWithSHA1", - "Ed25519", - "InsecureCipherSuites", - "Listen", - "LoadX509KeyPair", - "NewLRUClientSessionCache", - "NewListener", - "NewResumptionState", - "NoClientCert", - "PKCS1WithSHA1", - "PKCS1WithSHA256", - "PKCS1WithSHA384", - "PKCS1WithSHA512", - "PSSWithSHA256", - "PSSWithSHA384", - "PSSWithSHA512", - "ParseSessionState", - "QUICClient", - "QUICConfig", - "QUICConn", - "QUICEncryptionLevel", - "QUICEncryptionLevelApplication", - "QUICEncryptionLevelEarly", - "QUICEncryptionLevelHandshake", - "QUICEncryptionLevelInitial", - "QUICEvent", - "QUICEventKind", - "QUICHandshakeDone", - "QUICNoEvent", - "QUICRejectedEarlyData", - "QUICServer", - "QUICSessionTicketOptions", - "QUICSetReadSecret", - "QUICSetWriteSecret", - "QUICTransportParameters", - "QUICTransportParametersRequired", - "QUICWriteData", - "RecordHeaderError", - "RenegotiateFreelyAsClient", - "RenegotiateNever", - "RenegotiateOnceAsClient", - "RenegotiationSupport", - "RequestClientCert", - "RequireAndVerifyClientCert", - "RequireAnyClientCert", - "Server", - "SessionState", - "SignatureScheme", - "TLS_AES_128_GCM_SHA256", - "TLS_AES_256_GCM_SHA384", - "TLS_CHACHA20_POLY1305_SHA256", - "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA", - "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256", - "TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256", - "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA", - "TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384", - "TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305", - "TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256", - "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", - "TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", - "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", - "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256", - "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256", - "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", - "TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384", - "TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305", - "TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256", - "TLS_ECDHE_RSA_WITH_RC4_128_SHA", - "TLS_FALLBACK_SCSV", - "TLS_RSA_WITH_3DES_EDE_CBC_SHA", - "TLS_RSA_WITH_AES_128_CBC_SHA", - "TLS_RSA_WITH_AES_128_CBC_SHA256", - "TLS_RSA_WITH_AES_128_GCM_SHA256", - "TLS_RSA_WITH_AES_256_CBC_SHA", - "TLS_RSA_WITH_AES_256_GCM_SHA384", - "TLS_RSA_WITH_RC4_128_SHA", - "VerifyClientCertIfGiven", - "VersionName", - "VersionSSL30", - "VersionTLS10", - "VersionTLS11", - "VersionTLS12", - "VersionTLS13", - "X25519", - "X509KeyPair", - }, - "crypto/x509": { - "CANotAuthorizedForExtKeyUsage", - "CANotAuthorizedForThisName", - "CertPool", - "Certificate", - "CertificateInvalidError", - "CertificateRequest", - "ConstraintViolationError", - "CreateCertificate", - "CreateCertificateRequest", - "CreateRevocationList", - "DSA", - "DSAWithSHA1", - "DSAWithSHA256", - "DecryptPEMBlock", - "ECDSA", - "ECDSAWithSHA1", - "ECDSAWithSHA256", - "ECDSAWithSHA384", - "ECDSAWithSHA512", - "Ed25519", - "EncryptPEMBlock", - "ErrUnsupportedAlgorithm", - "Expired", - "ExtKeyUsage", - "ExtKeyUsageAny", - "ExtKeyUsageClientAuth", - "ExtKeyUsageCodeSigning", - "ExtKeyUsageEmailProtection", - "ExtKeyUsageIPSECEndSystem", - "ExtKeyUsageIPSECTunnel", - "ExtKeyUsageIPSECUser", - "ExtKeyUsageMicrosoftCommercialCodeSigning", - "ExtKeyUsageMicrosoftKernelCodeSigning", - "ExtKeyUsageMicrosoftServerGatedCrypto", - "ExtKeyUsageNetscapeServerGatedCrypto", - "ExtKeyUsageOCSPSigning", - "ExtKeyUsageServerAuth", - "ExtKeyUsageTimeStamping", - "HostnameError", - "IncompatibleUsage", - "IncorrectPasswordError", - "InsecureAlgorithmError", - "InvalidReason", - "IsEncryptedPEMBlock", - "KeyUsage", - "KeyUsageCRLSign", - "KeyUsageCertSign", - "KeyUsageContentCommitment", - "KeyUsageDataEncipherment", - "KeyUsageDecipherOnly", - "KeyUsageDigitalSignature", - "KeyUsageEncipherOnly", - "KeyUsageKeyAgreement", - "KeyUsageKeyEncipherment", - "MD2WithRSA", - "MD5WithRSA", - "MarshalECPrivateKey", - "MarshalPKCS1PrivateKey", - "MarshalPKCS1PublicKey", - "MarshalPKCS8PrivateKey", - "MarshalPKIXPublicKey", - "NameConstraintsWithoutSANs", - "NameMismatch", - "NewCertPool", - "NotAuthorizedToSign", - "PEMCipher", - "PEMCipher3DES", - "PEMCipherAES128", - "PEMCipherAES192", - "PEMCipherAES256", - "PEMCipherDES", - "ParseCRL", - "ParseCertificate", - "ParseCertificateRequest", - "ParseCertificates", - "ParseDERCRL", - "ParseECPrivateKey", - "ParsePKCS1PrivateKey", - "ParsePKCS1PublicKey", - "ParsePKCS8PrivateKey", - "ParsePKIXPublicKey", - "ParseRevocationList", - "PublicKeyAlgorithm", - "PureEd25519", - "RSA", - "RevocationList", - "RevocationListEntry", - "SHA1WithRSA", - "SHA256WithRSA", - "SHA256WithRSAPSS", - "SHA384WithRSA", - "SHA384WithRSAPSS", - "SHA512WithRSA", - "SHA512WithRSAPSS", - "SetFallbackRoots", - "SignatureAlgorithm", - "SystemCertPool", - "SystemRootsError", - "TooManyConstraints", - "TooManyIntermediates", - "UnconstrainedName", - "UnhandledCriticalExtension", - "UnknownAuthorityError", - "UnknownPublicKeyAlgorithm", - "UnknownSignatureAlgorithm", - "VerifyOptions", - }, - "crypto/x509/pkix": { - "AlgorithmIdentifier", - "AttributeTypeAndValue", - "AttributeTypeAndValueSET", - "CertificateList", - "Extension", - "Name", - "RDNSequence", - "RelativeDistinguishedNameSET", - "RevokedCertificate", - "TBSCertificateList", - }, - "database/sql": { - "ColumnType", - "Conn", - "DB", - "DBStats", - "Drivers", - "ErrConnDone", - "ErrNoRows", - "ErrTxDone", - "IsolationLevel", - "LevelDefault", - "LevelLinearizable", - "LevelReadCommitted", - "LevelReadUncommitted", - "LevelRepeatableRead", - "LevelSerializable", - "LevelSnapshot", - "LevelWriteCommitted", - "Named", - "NamedArg", - "NullBool", - "NullByte", - "NullFloat64", - "NullInt16", - "NullInt32", - "NullInt64", - "NullString", - "NullTime", - "Open", - "OpenDB", - "Out", - "RawBytes", - "Register", - "Result", - "Row", - "Rows", - "Scanner", - "Stmt", - "Tx", - "TxOptions", - }, - "database/sql/driver": { - "Bool", - "ColumnConverter", - "Conn", - "ConnBeginTx", - "ConnPrepareContext", - "Connector", - "DefaultParameterConverter", - "Driver", - "DriverContext", - "ErrBadConn", - "ErrRemoveArgument", - "ErrSkip", - "Execer", - "ExecerContext", - "Int32", - "IsScanValue", - "IsValue", - "IsolationLevel", - "NamedValue", - "NamedValueChecker", - "NotNull", - "Null", - "Pinger", - "Queryer", - "QueryerContext", - "Result", - "ResultNoRows", - "Rows", - "RowsAffected", - "RowsColumnTypeDatabaseTypeName", - "RowsColumnTypeLength", - "RowsColumnTypeNullable", - "RowsColumnTypePrecisionScale", - "RowsColumnTypeScanType", - "RowsNextResultSet", - "SessionResetter", - "Stmt", - "StmtExecContext", - "StmtQueryContext", - "String", - "Tx", - "TxOptions", - "Validator", - "Value", - "ValueConverter", - "Valuer", - }, - "debug/buildinfo": { - "BuildInfo", - "Read", - "ReadFile", - }, - "debug/dwarf": { - "AddrType", - "ArrayType", - "Attr", - "AttrAbstractOrigin", - "AttrAccessibility", - "AttrAddrBase", - "AttrAddrClass", - "AttrAlignment", - "AttrAllocated", - "AttrArtificial", - "AttrAssociated", - "AttrBaseTypes", - "AttrBinaryScale", - "AttrBitOffset", - "AttrBitSize", - "AttrByteSize", - "AttrCallAllCalls", - "AttrCallAllSourceCalls", - "AttrCallAllTailCalls", - "AttrCallColumn", - "AttrCallDataLocation", - "AttrCallDataValue", - "AttrCallFile", - "AttrCallLine", - "AttrCallOrigin", - "AttrCallPC", - "AttrCallParameter", - "AttrCallReturnPC", - "AttrCallTailCall", - "AttrCallTarget", - "AttrCallTargetClobbered", - "AttrCallValue", - "AttrCalling", - "AttrCommonRef", - "AttrCompDir", - "AttrConstExpr", - "AttrConstValue", - "AttrContainingType", - "AttrCount", - "AttrDataBitOffset", - "AttrDataLocation", - "AttrDataMemberLoc", - "AttrDecimalScale", - "AttrDecimalSign", - "AttrDeclColumn", - "AttrDeclFile", - "AttrDeclLine", - "AttrDeclaration", - "AttrDefaultValue", - "AttrDefaulted", - "AttrDeleted", - "AttrDescription", - "AttrDigitCount", - "AttrDiscr", - "AttrDiscrList", - "AttrDiscrValue", - "AttrDwoName", - "AttrElemental", - "AttrEncoding", - "AttrEndianity", - "AttrEntrypc", - "AttrEnumClass", - "AttrExplicit", - "AttrExportSymbols", - "AttrExtension", - "AttrExternal", - "AttrFrameBase", - "AttrFriend", - "AttrHighpc", - "AttrIdentifierCase", - "AttrImport", - "AttrInline", - "AttrIsOptional", - "AttrLanguage", - "AttrLinkageName", - "AttrLocation", - "AttrLoclistsBase", - "AttrLowerBound", - "AttrLowpc", - "AttrMacroInfo", - "AttrMacros", - "AttrMainSubprogram", - "AttrMutable", - "AttrName", - "AttrNamelistItem", - "AttrNoreturn", - "AttrObjectPointer", - "AttrOrdering", - "AttrPictureString", - "AttrPriority", - "AttrProducer", - "AttrPrototyped", - "AttrPure", - "AttrRanges", - "AttrRank", - "AttrRecursive", - "AttrReference", - "AttrReturnAddr", - "AttrRnglistsBase", - "AttrRvalueReference", - "AttrSegment", - "AttrSibling", - "AttrSignature", - "AttrSmall", - "AttrSpecification", - "AttrStartScope", - "AttrStaticLink", - "AttrStmtList", - "AttrStrOffsetsBase", - "AttrStride", - "AttrStrideSize", - "AttrStringLength", - "AttrStringLengthBitSize", - "AttrStringLengthByteSize", - "AttrThreadsScaled", - "AttrTrampoline", - "AttrType", - "AttrUpperBound", - "AttrUseLocation", - "AttrUseUTF8", - "AttrVarParam", - "AttrVirtuality", - "AttrVisibility", - "AttrVtableElemLoc", - "BasicType", - "BoolType", - "CharType", - "Class", - "ClassAddrPtr", - "ClassAddress", - "ClassBlock", - "ClassConstant", - "ClassExprLoc", - "ClassFlag", - "ClassLinePtr", - "ClassLocList", - "ClassLocListPtr", - "ClassMacPtr", - "ClassRangeListPtr", - "ClassReference", - "ClassReferenceAlt", - "ClassReferenceSig", - "ClassRngList", - "ClassRngListsPtr", - "ClassStrOffsetsPtr", - "ClassString", - "ClassStringAlt", - "ClassUnknown", - "CommonType", - "ComplexType", - "Data", - "DecodeError", - "DotDotDotType", - "Entry", - "EnumType", - "EnumValue", - "ErrUnknownPC", - "Field", - "FloatType", - "FuncType", - "IntType", - "LineEntry", - "LineFile", - "LineReader", - "LineReaderPos", - "New", - "Offset", - "PtrType", - "QualType", - "Reader", - "StructField", - "StructType", - "Tag", - "TagAccessDeclaration", - "TagArrayType", - "TagAtomicType", - "TagBaseType", - "TagCallSite", - "TagCallSiteParameter", - "TagCatchDwarfBlock", - "TagClassType", - "TagCoarrayType", - "TagCommonDwarfBlock", - "TagCommonInclusion", - "TagCompileUnit", - "TagCondition", - "TagConstType", - "TagConstant", - "TagDwarfProcedure", - "TagDynamicType", - "TagEntryPoint", - "TagEnumerationType", - "TagEnumerator", - "TagFileType", - "TagFormalParameter", - "TagFriend", - "TagGenericSubrange", - "TagImmutableType", - "TagImportedDeclaration", - "TagImportedModule", - "TagImportedUnit", - "TagInheritance", - "TagInlinedSubroutine", - "TagInterfaceType", - "TagLabel", - "TagLexDwarfBlock", - "TagMember", - "TagModule", - "TagMutableType", - "TagNamelist", - "TagNamelistItem", - "TagNamespace", - "TagPackedType", - "TagPartialUnit", - "TagPointerType", - "TagPtrToMemberType", - "TagReferenceType", - "TagRestrictType", - "TagRvalueReferenceType", - "TagSetType", - "TagSharedType", - "TagSkeletonUnit", - "TagStringType", - "TagStructType", - "TagSubprogram", - "TagSubrangeType", - "TagSubroutineType", - "TagTemplateAlias", - "TagTemplateTypeParameter", - "TagTemplateValueParameter", - "TagThrownType", - "TagTryDwarfBlock", - "TagTypeUnit", - "TagTypedef", - "TagUnionType", - "TagUnspecifiedParameters", - "TagUnspecifiedType", - "TagVariable", - "TagVariant", - "TagVariantPart", - "TagVolatileType", - "TagWithStmt", - "Type", - "TypedefType", - "UcharType", - "UintType", - "UnspecifiedType", - "UnsupportedType", - "VoidType", - }, - "debug/elf": { - "ARM_MAGIC_TRAMP_NUMBER", - "COMPRESS_HIOS", - "COMPRESS_HIPROC", - "COMPRESS_LOOS", - "COMPRESS_LOPROC", - "COMPRESS_ZLIB", - "COMPRESS_ZSTD", - "Chdr32", - "Chdr64", - "Class", - "CompressionType", - "DF_1_CONFALT", - "DF_1_DIRECT", - "DF_1_DISPRELDNE", - "DF_1_DISPRELPND", - "DF_1_EDITED", - "DF_1_ENDFILTEE", - "DF_1_GLOBAL", - "DF_1_GLOBAUDIT", - "DF_1_GROUP", - "DF_1_IGNMULDEF", - "DF_1_INITFIRST", - "DF_1_INTERPOSE", - "DF_1_KMOD", - "DF_1_LOADFLTR", - "DF_1_NOCOMMON", - "DF_1_NODEFLIB", - "DF_1_NODELETE", - "DF_1_NODIRECT", - "DF_1_NODUMP", - "DF_1_NOHDR", - "DF_1_NOKSYMS", - "DF_1_NOOPEN", - "DF_1_NORELOC", - "DF_1_NOW", - "DF_1_ORIGIN", - "DF_1_PIE", - "DF_1_SINGLETON", - "DF_1_STUB", - "DF_1_SYMINTPOSE", - "DF_1_TRANS", - "DF_1_WEAKFILTER", - "DF_BIND_NOW", - "DF_ORIGIN", - "DF_STATIC_TLS", - "DF_SYMBOLIC", - "DF_TEXTREL", - "DT_ADDRRNGHI", - "DT_ADDRRNGLO", - "DT_AUDIT", - "DT_AUXILIARY", - "DT_BIND_NOW", - "DT_CHECKSUM", - "DT_CONFIG", - "DT_DEBUG", - "DT_DEPAUDIT", - "DT_ENCODING", - "DT_FEATURE", - "DT_FILTER", - "DT_FINI", - "DT_FINI_ARRAY", - "DT_FINI_ARRAYSZ", - "DT_FLAGS", - "DT_FLAGS_1", - "DT_GNU_CONFLICT", - "DT_GNU_CONFLICTSZ", - "DT_GNU_HASH", - "DT_GNU_LIBLIST", - "DT_GNU_LIBLISTSZ", - "DT_GNU_PRELINKED", - "DT_HASH", - "DT_HIOS", - "DT_HIPROC", - "DT_INIT", - "DT_INIT_ARRAY", - "DT_INIT_ARRAYSZ", - "DT_JMPREL", - "DT_LOOS", - "DT_LOPROC", - "DT_MIPS_AUX_DYNAMIC", - "DT_MIPS_BASE_ADDRESS", - "DT_MIPS_COMPACT_SIZE", - "DT_MIPS_CONFLICT", - "DT_MIPS_CONFLICTNO", - "DT_MIPS_CXX_FLAGS", - "DT_MIPS_DELTA_CLASS", - "DT_MIPS_DELTA_CLASSSYM", - "DT_MIPS_DELTA_CLASSSYM_NO", - "DT_MIPS_DELTA_CLASS_NO", - "DT_MIPS_DELTA_INSTANCE", - "DT_MIPS_DELTA_INSTANCE_NO", - "DT_MIPS_DELTA_RELOC", - "DT_MIPS_DELTA_RELOC_NO", - "DT_MIPS_DELTA_SYM", - "DT_MIPS_DELTA_SYM_NO", - "DT_MIPS_DYNSTR_ALIGN", - "DT_MIPS_FLAGS", - "DT_MIPS_GOTSYM", - "DT_MIPS_GP_VALUE", - "DT_MIPS_HIDDEN_GOTIDX", - "DT_MIPS_HIPAGENO", - "DT_MIPS_ICHECKSUM", - "DT_MIPS_INTERFACE", - "DT_MIPS_INTERFACE_SIZE", - "DT_MIPS_IVERSION", - "DT_MIPS_LIBLIST", - "DT_MIPS_LIBLISTNO", - "DT_MIPS_LOCALPAGE_GOTIDX", - "DT_MIPS_LOCAL_GOTIDX", - "DT_MIPS_LOCAL_GOTNO", - "DT_MIPS_MSYM", - "DT_MIPS_OPTIONS", - "DT_MIPS_PERF_SUFFIX", - "DT_MIPS_PIXIE_INIT", - "DT_MIPS_PLTGOT", - "DT_MIPS_PROTECTED_GOTIDX", - "DT_MIPS_RLD_MAP", - "DT_MIPS_RLD_MAP_REL", - "DT_MIPS_RLD_TEXT_RESOLVE_ADDR", - "DT_MIPS_RLD_VERSION", - "DT_MIPS_RWPLT", - "DT_MIPS_SYMBOL_LIB", - "DT_MIPS_SYMTABNO", - "DT_MIPS_TIME_STAMP", - "DT_MIPS_UNREFEXTNO", - "DT_MOVEENT", - "DT_MOVESZ", - "DT_MOVETAB", - "DT_NEEDED", - "DT_NULL", - "DT_PLTGOT", - "DT_PLTPAD", - "DT_PLTPADSZ", - "DT_PLTREL", - "DT_PLTRELSZ", - "DT_POSFLAG_1", - "DT_PPC64_GLINK", - "DT_PPC64_OPD", - "DT_PPC64_OPDSZ", - "DT_PPC64_OPT", - "DT_PPC_GOT", - "DT_PPC_OPT", - "DT_PREINIT_ARRAY", - "DT_PREINIT_ARRAYSZ", - "DT_REL", - "DT_RELA", - "DT_RELACOUNT", - "DT_RELAENT", - "DT_RELASZ", - "DT_RELCOUNT", - "DT_RELENT", - "DT_RELSZ", - "DT_RPATH", - "DT_RUNPATH", - "DT_SONAME", - "DT_SPARC_REGISTER", - "DT_STRSZ", - "DT_STRTAB", - "DT_SYMBOLIC", - "DT_SYMENT", - "DT_SYMINENT", - "DT_SYMINFO", - "DT_SYMINSZ", - "DT_SYMTAB", - "DT_SYMTAB_SHNDX", - "DT_TEXTREL", - "DT_TLSDESC_GOT", - "DT_TLSDESC_PLT", - "DT_USED", - "DT_VALRNGHI", - "DT_VALRNGLO", - "DT_VERDEF", - "DT_VERDEFNUM", - "DT_VERNEED", - "DT_VERNEEDNUM", - "DT_VERSYM", - "Data", - "Dyn32", - "Dyn64", - "DynFlag", - "DynFlag1", - "DynTag", - "EI_ABIVERSION", - "EI_CLASS", - "EI_DATA", - "EI_NIDENT", - "EI_OSABI", - "EI_PAD", - "EI_VERSION", - "ELFCLASS32", - "ELFCLASS64", - "ELFCLASSNONE", - "ELFDATA2LSB", - "ELFDATA2MSB", - "ELFDATANONE", - "ELFMAG", - "ELFOSABI_86OPEN", - "ELFOSABI_AIX", - "ELFOSABI_ARM", - "ELFOSABI_AROS", - "ELFOSABI_CLOUDABI", - "ELFOSABI_FENIXOS", - "ELFOSABI_FREEBSD", - "ELFOSABI_HPUX", - "ELFOSABI_HURD", - "ELFOSABI_IRIX", - "ELFOSABI_LINUX", - "ELFOSABI_MODESTO", - "ELFOSABI_NETBSD", - "ELFOSABI_NONE", - "ELFOSABI_NSK", - "ELFOSABI_OPENBSD", - "ELFOSABI_OPENVMS", - "ELFOSABI_SOLARIS", - "ELFOSABI_STANDALONE", - "ELFOSABI_TRU64", - "EM_386", - "EM_486", - "EM_56800EX", - "EM_68HC05", - "EM_68HC08", - "EM_68HC11", - "EM_68HC12", - "EM_68HC16", - "EM_68K", - "EM_78KOR", - "EM_8051", - "EM_860", - "EM_88K", - "EM_960", - "EM_AARCH64", - "EM_ALPHA", - "EM_ALPHA_STD", - "EM_ALTERA_NIOS2", - "EM_AMDGPU", - "EM_ARC", - "EM_ARCA", - "EM_ARC_COMPACT", - "EM_ARC_COMPACT2", - "EM_ARM", - "EM_AVR", - "EM_AVR32", - "EM_BA1", - "EM_BA2", - "EM_BLACKFIN", - "EM_BPF", - "EM_C166", - "EM_CDP", - "EM_CE", - "EM_CLOUDSHIELD", - "EM_COGE", - "EM_COLDFIRE", - "EM_COOL", - "EM_COREA_1ST", - "EM_COREA_2ND", - "EM_CR", - "EM_CR16", - "EM_CRAYNV2", - "EM_CRIS", - "EM_CRX", - "EM_CSR_KALIMBA", - "EM_CUDA", - "EM_CYPRESS_M8C", - "EM_D10V", - "EM_D30V", - "EM_DSP24", - "EM_DSPIC30F", - "EM_DXP", - "EM_ECOG1", - "EM_ECOG16", - "EM_ECOG1X", - "EM_ECOG2", - "EM_ETPU", - "EM_EXCESS", - "EM_F2MC16", - "EM_FIREPATH", - "EM_FR20", - "EM_FR30", - "EM_FT32", - "EM_FX66", - "EM_H8S", - "EM_H8_300", - "EM_H8_300H", - "EM_H8_500", - "EM_HUANY", - "EM_IA_64", - "EM_INTEL205", - "EM_INTEL206", - "EM_INTEL207", - "EM_INTEL208", - "EM_INTEL209", - "EM_IP2K", - "EM_JAVELIN", - "EM_K10M", - "EM_KM32", - "EM_KMX16", - "EM_KMX32", - "EM_KMX8", - "EM_KVARC", - "EM_L10M", - "EM_LANAI", - "EM_LATTICEMICO32", - "EM_LOONGARCH", - "EM_M16C", - "EM_M32", - "EM_M32C", - "EM_M32R", - "EM_MANIK", - "EM_MAX", - "EM_MAXQ30", - "EM_MCHP_PIC", - "EM_MCST_ELBRUS", - "EM_ME16", - "EM_METAG", - "EM_MICROBLAZE", - "EM_MIPS", - "EM_MIPS_RS3_LE", - "EM_MIPS_RS4_BE", - "EM_MIPS_X", - "EM_MMA", - "EM_MMDSP_PLUS", - "EM_MMIX", - "EM_MN10200", - "EM_MN10300", - "EM_MOXIE", - "EM_MSP430", - "EM_NCPU", - "EM_NDR1", - "EM_NDS32", - "EM_NONE", - "EM_NORC", - "EM_NS32K", - "EM_OPEN8", - "EM_OPENRISC", - "EM_PARISC", - "EM_PCP", - "EM_PDP10", - "EM_PDP11", - "EM_PDSP", - "EM_PJ", - "EM_PPC", - "EM_PPC64", - "EM_PRISM", - "EM_QDSP6", - "EM_R32C", - "EM_RCE", - "EM_RH32", - "EM_RISCV", - "EM_RL78", - "EM_RS08", - "EM_RX", - "EM_S370", - "EM_S390", - "EM_SCORE7", - "EM_SEP", - "EM_SE_C17", - "EM_SE_C33", - "EM_SH", - "EM_SHARC", - "EM_SLE9X", - "EM_SNP1K", - "EM_SPARC", - "EM_SPARC32PLUS", - "EM_SPARCV9", - "EM_ST100", - "EM_ST19", - "EM_ST200", - "EM_ST7", - "EM_ST9PLUS", - "EM_STARCORE", - "EM_STM8", - "EM_STXP7X", - "EM_SVX", - "EM_TILE64", - "EM_TILEGX", - "EM_TILEPRO", - "EM_TINYJ", - "EM_TI_ARP32", - "EM_TI_C2000", - "EM_TI_C5500", - "EM_TI_C6000", - "EM_TI_PRU", - "EM_TMM_GPP", - "EM_TPC", - "EM_TRICORE", - "EM_TRIMEDIA", - "EM_TSK3000", - "EM_UNICORE", - "EM_V800", - "EM_V850", - "EM_VAX", - "EM_VIDEOCORE", - "EM_VIDEOCORE3", - "EM_VIDEOCORE5", - "EM_VISIUM", - "EM_VPP500", - "EM_X86_64", - "EM_XCORE", - "EM_XGATE", - "EM_XIMO16", - "EM_XTENSA", - "EM_Z80", - "EM_ZSP", - "ET_CORE", - "ET_DYN", - "ET_EXEC", - "ET_HIOS", - "ET_HIPROC", - "ET_LOOS", - "ET_LOPROC", - "ET_NONE", - "ET_REL", - "EV_CURRENT", - "EV_NONE", - "ErrNoSymbols", - "File", - "FileHeader", - "FormatError", - "Header32", - "Header64", - "ImportedSymbol", - "Machine", - "NT_FPREGSET", - "NT_PRPSINFO", - "NT_PRSTATUS", - "NType", - "NewFile", - "OSABI", - "Open", - "PF_MASKOS", - "PF_MASKPROC", - "PF_R", - "PF_W", - "PF_X", - "PT_AARCH64_ARCHEXT", - "PT_AARCH64_UNWIND", - "PT_ARM_ARCHEXT", - "PT_ARM_EXIDX", - "PT_DYNAMIC", - "PT_GNU_EH_FRAME", - "PT_GNU_MBIND_HI", - "PT_GNU_MBIND_LO", - "PT_GNU_PROPERTY", - "PT_GNU_RELRO", - "PT_GNU_STACK", - "PT_HIOS", - "PT_HIPROC", - "PT_INTERP", - "PT_LOAD", - "PT_LOOS", - "PT_LOPROC", - "PT_MIPS_ABIFLAGS", - "PT_MIPS_OPTIONS", - "PT_MIPS_REGINFO", - "PT_MIPS_RTPROC", - "PT_NOTE", - "PT_NULL", - "PT_OPENBSD_BOOTDATA", - "PT_OPENBSD_RANDOMIZE", - "PT_OPENBSD_WXNEEDED", - "PT_PAX_FLAGS", - "PT_PHDR", - "PT_S390_PGSTE", - "PT_SHLIB", - "PT_SUNWSTACK", - "PT_SUNW_EH_FRAME", - "PT_TLS", - "Prog", - "Prog32", - "Prog64", - "ProgFlag", - "ProgHeader", - "ProgType", - "R_386", - "R_386_16", - "R_386_32", - "R_386_32PLT", - "R_386_8", - "R_386_COPY", - "R_386_GLOB_DAT", - "R_386_GOT32", - "R_386_GOT32X", - "R_386_GOTOFF", - "R_386_GOTPC", - "R_386_IRELATIVE", - "R_386_JMP_SLOT", - "R_386_NONE", - "R_386_PC16", - "R_386_PC32", - "R_386_PC8", - "R_386_PLT32", - "R_386_RELATIVE", - "R_386_SIZE32", - "R_386_TLS_DESC", - "R_386_TLS_DESC_CALL", - "R_386_TLS_DTPMOD32", - "R_386_TLS_DTPOFF32", - "R_386_TLS_GD", - "R_386_TLS_GD_32", - "R_386_TLS_GD_CALL", - "R_386_TLS_GD_POP", - "R_386_TLS_GD_PUSH", - "R_386_TLS_GOTDESC", - "R_386_TLS_GOTIE", - "R_386_TLS_IE", - "R_386_TLS_IE_32", - "R_386_TLS_LDM", - "R_386_TLS_LDM_32", - "R_386_TLS_LDM_CALL", - "R_386_TLS_LDM_POP", - "R_386_TLS_LDM_PUSH", - "R_386_TLS_LDO_32", - "R_386_TLS_LE", - "R_386_TLS_LE_32", - "R_386_TLS_TPOFF", - "R_386_TLS_TPOFF32", - "R_390", - "R_390_12", - "R_390_16", - "R_390_20", - "R_390_32", - "R_390_64", - "R_390_8", - "R_390_COPY", - "R_390_GLOB_DAT", - "R_390_GOT12", - "R_390_GOT16", - "R_390_GOT20", - "R_390_GOT32", - "R_390_GOT64", - "R_390_GOTENT", - "R_390_GOTOFF", - "R_390_GOTOFF16", - "R_390_GOTOFF64", - "R_390_GOTPC", - "R_390_GOTPCDBL", - "R_390_GOTPLT12", - "R_390_GOTPLT16", - "R_390_GOTPLT20", - "R_390_GOTPLT32", - "R_390_GOTPLT64", - "R_390_GOTPLTENT", - "R_390_GOTPLTOFF16", - "R_390_GOTPLTOFF32", - "R_390_GOTPLTOFF64", - "R_390_JMP_SLOT", - "R_390_NONE", - "R_390_PC16", - "R_390_PC16DBL", - "R_390_PC32", - "R_390_PC32DBL", - "R_390_PC64", - "R_390_PLT16DBL", - "R_390_PLT32", - "R_390_PLT32DBL", - "R_390_PLT64", - "R_390_RELATIVE", - "R_390_TLS_DTPMOD", - "R_390_TLS_DTPOFF", - "R_390_TLS_GD32", - "R_390_TLS_GD64", - "R_390_TLS_GDCALL", - "R_390_TLS_GOTIE12", - "R_390_TLS_GOTIE20", - "R_390_TLS_GOTIE32", - "R_390_TLS_GOTIE64", - "R_390_TLS_IE32", - "R_390_TLS_IE64", - "R_390_TLS_IEENT", - "R_390_TLS_LDCALL", - "R_390_TLS_LDM32", - "R_390_TLS_LDM64", - "R_390_TLS_LDO32", - "R_390_TLS_LDO64", - "R_390_TLS_LE32", - "R_390_TLS_LE64", - "R_390_TLS_LOAD", - "R_390_TLS_TPOFF", - "R_AARCH64", - "R_AARCH64_ABS16", - "R_AARCH64_ABS32", - "R_AARCH64_ABS64", - "R_AARCH64_ADD_ABS_LO12_NC", - "R_AARCH64_ADR_GOT_PAGE", - "R_AARCH64_ADR_PREL_LO21", - "R_AARCH64_ADR_PREL_PG_HI21", - "R_AARCH64_ADR_PREL_PG_HI21_NC", - "R_AARCH64_CALL26", - "R_AARCH64_CONDBR19", - "R_AARCH64_COPY", - "R_AARCH64_GLOB_DAT", - "R_AARCH64_GOT_LD_PREL19", - "R_AARCH64_IRELATIVE", - "R_AARCH64_JUMP26", - "R_AARCH64_JUMP_SLOT", - "R_AARCH64_LD64_GOTOFF_LO15", - "R_AARCH64_LD64_GOTPAGE_LO15", - "R_AARCH64_LD64_GOT_LO12_NC", - "R_AARCH64_LDST128_ABS_LO12_NC", - "R_AARCH64_LDST16_ABS_LO12_NC", - "R_AARCH64_LDST32_ABS_LO12_NC", - "R_AARCH64_LDST64_ABS_LO12_NC", - "R_AARCH64_LDST8_ABS_LO12_NC", - "R_AARCH64_LD_PREL_LO19", - "R_AARCH64_MOVW_SABS_G0", - "R_AARCH64_MOVW_SABS_G1", - "R_AARCH64_MOVW_SABS_G2", - "R_AARCH64_MOVW_UABS_G0", - "R_AARCH64_MOVW_UABS_G0_NC", - "R_AARCH64_MOVW_UABS_G1", - "R_AARCH64_MOVW_UABS_G1_NC", - "R_AARCH64_MOVW_UABS_G2", - "R_AARCH64_MOVW_UABS_G2_NC", - "R_AARCH64_MOVW_UABS_G3", - "R_AARCH64_NONE", - "R_AARCH64_NULL", - "R_AARCH64_P32_ABS16", - "R_AARCH64_P32_ABS32", - "R_AARCH64_P32_ADD_ABS_LO12_NC", - "R_AARCH64_P32_ADR_GOT_PAGE", - "R_AARCH64_P32_ADR_PREL_LO21", - "R_AARCH64_P32_ADR_PREL_PG_HI21", - "R_AARCH64_P32_CALL26", - "R_AARCH64_P32_CONDBR19", - "R_AARCH64_P32_COPY", - "R_AARCH64_P32_GLOB_DAT", - "R_AARCH64_P32_GOT_LD_PREL19", - "R_AARCH64_P32_IRELATIVE", - "R_AARCH64_P32_JUMP26", - "R_AARCH64_P32_JUMP_SLOT", - "R_AARCH64_P32_LD32_GOT_LO12_NC", - "R_AARCH64_P32_LDST128_ABS_LO12_NC", - "R_AARCH64_P32_LDST16_ABS_LO12_NC", - "R_AARCH64_P32_LDST32_ABS_LO12_NC", - "R_AARCH64_P32_LDST64_ABS_LO12_NC", - "R_AARCH64_P32_LDST8_ABS_LO12_NC", - "R_AARCH64_P32_LD_PREL_LO19", - "R_AARCH64_P32_MOVW_SABS_G0", - "R_AARCH64_P32_MOVW_UABS_G0", - "R_AARCH64_P32_MOVW_UABS_G0_NC", - "R_AARCH64_P32_MOVW_UABS_G1", - "R_AARCH64_P32_PREL16", - "R_AARCH64_P32_PREL32", - "R_AARCH64_P32_RELATIVE", - "R_AARCH64_P32_TLSDESC", - "R_AARCH64_P32_TLSDESC_ADD_LO12_NC", - "R_AARCH64_P32_TLSDESC_ADR_PAGE21", - "R_AARCH64_P32_TLSDESC_ADR_PREL21", - "R_AARCH64_P32_TLSDESC_CALL", - "R_AARCH64_P32_TLSDESC_LD32_LO12_NC", - "R_AARCH64_P32_TLSDESC_LD_PREL19", - "R_AARCH64_P32_TLSGD_ADD_LO12_NC", - "R_AARCH64_P32_TLSGD_ADR_PAGE21", - "R_AARCH64_P32_TLSIE_ADR_GOTTPREL_PAGE21", - "R_AARCH64_P32_TLSIE_LD32_GOTTPREL_LO12_NC", - "R_AARCH64_P32_TLSIE_LD_GOTTPREL_PREL19", - "R_AARCH64_P32_TLSLE_ADD_TPREL_HI12", - "R_AARCH64_P32_TLSLE_ADD_TPREL_LO12", - "R_AARCH64_P32_TLSLE_ADD_TPREL_LO12_NC", - "R_AARCH64_P32_TLSLE_MOVW_TPREL_G0", - "R_AARCH64_P32_TLSLE_MOVW_TPREL_G0_NC", - "R_AARCH64_P32_TLSLE_MOVW_TPREL_G1", - "R_AARCH64_P32_TLS_DTPMOD", - "R_AARCH64_P32_TLS_DTPREL", - "R_AARCH64_P32_TLS_TPREL", - "R_AARCH64_P32_TSTBR14", - "R_AARCH64_PREL16", - "R_AARCH64_PREL32", - "R_AARCH64_PREL64", - "R_AARCH64_RELATIVE", - "R_AARCH64_TLSDESC", - "R_AARCH64_TLSDESC_ADD", - "R_AARCH64_TLSDESC_ADD_LO12_NC", - "R_AARCH64_TLSDESC_ADR_PAGE21", - "R_AARCH64_TLSDESC_ADR_PREL21", - "R_AARCH64_TLSDESC_CALL", - "R_AARCH64_TLSDESC_LD64_LO12_NC", - "R_AARCH64_TLSDESC_LDR", - "R_AARCH64_TLSDESC_LD_PREL19", - "R_AARCH64_TLSDESC_OFF_G0_NC", - "R_AARCH64_TLSDESC_OFF_G1", - "R_AARCH64_TLSGD_ADD_LO12_NC", - "R_AARCH64_TLSGD_ADR_PAGE21", - "R_AARCH64_TLSGD_ADR_PREL21", - "R_AARCH64_TLSGD_MOVW_G0_NC", - "R_AARCH64_TLSGD_MOVW_G1", - "R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21", - "R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC", - "R_AARCH64_TLSIE_LD_GOTTPREL_PREL19", - "R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC", - "R_AARCH64_TLSIE_MOVW_GOTTPREL_G1", - "R_AARCH64_TLSLD_ADR_PAGE21", - "R_AARCH64_TLSLD_ADR_PREL21", - "R_AARCH64_TLSLD_LDST128_DTPREL_LO12", - "R_AARCH64_TLSLD_LDST128_DTPREL_LO12_NC", - "R_AARCH64_TLSLE_ADD_TPREL_HI12", - "R_AARCH64_TLSLE_ADD_TPREL_LO12", - "R_AARCH64_TLSLE_ADD_TPREL_LO12_NC", - "R_AARCH64_TLSLE_LDST128_TPREL_LO12", - "R_AARCH64_TLSLE_LDST128_TPREL_LO12_NC", - "R_AARCH64_TLSLE_MOVW_TPREL_G0", - "R_AARCH64_TLSLE_MOVW_TPREL_G0_NC", - "R_AARCH64_TLSLE_MOVW_TPREL_G1", - "R_AARCH64_TLSLE_MOVW_TPREL_G1_NC", - "R_AARCH64_TLSLE_MOVW_TPREL_G2", - "R_AARCH64_TLS_DTPMOD64", - "R_AARCH64_TLS_DTPREL64", - "R_AARCH64_TLS_TPREL64", - "R_AARCH64_TSTBR14", - "R_ALPHA", - "R_ALPHA_BRADDR", - "R_ALPHA_COPY", - "R_ALPHA_GLOB_DAT", - "R_ALPHA_GPDISP", - "R_ALPHA_GPREL32", - "R_ALPHA_GPRELHIGH", - "R_ALPHA_GPRELLOW", - "R_ALPHA_GPVALUE", - "R_ALPHA_HINT", - "R_ALPHA_IMMED_BR_HI32", - "R_ALPHA_IMMED_GP_16", - "R_ALPHA_IMMED_GP_HI32", - "R_ALPHA_IMMED_LO32", - "R_ALPHA_IMMED_SCN_HI32", - "R_ALPHA_JMP_SLOT", - "R_ALPHA_LITERAL", - "R_ALPHA_LITUSE", - "R_ALPHA_NONE", - "R_ALPHA_OP_PRSHIFT", - "R_ALPHA_OP_PSUB", - "R_ALPHA_OP_PUSH", - "R_ALPHA_OP_STORE", - "R_ALPHA_REFLONG", - "R_ALPHA_REFQUAD", - "R_ALPHA_RELATIVE", - "R_ALPHA_SREL16", - "R_ALPHA_SREL32", - "R_ALPHA_SREL64", - "R_ARM", - "R_ARM_ABS12", - "R_ARM_ABS16", - "R_ARM_ABS32", - "R_ARM_ABS32_NOI", - "R_ARM_ABS8", - "R_ARM_ALU_PCREL_15_8", - "R_ARM_ALU_PCREL_23_15", - "R_ARM_ALU_PCREL_7_0", - "R_ARM_ALU_PC_G0", - "R_ARM_ALU_PC_G0_NC", - "R_ARM_ALU_PC_G1", - "R_ARM_ALU_PC_G1_NC", - "R_ARM_ALU_PC_G2", - "R_ARM_ALU_SBREL_19_12_NC", - "R_ARM_ALU_SBREL_27_20_CK", - "R_ARM_ALU_SB_G0", - "R_ARM_ALU_SB_G0_NC", - "R_ARM_ALU_SB_G1", - "R_ARM_ALU_SB_G1_NC", - "R_ARM_ALU_SB_G2", - "R_ARM_AMP_VCALL9", - "R_ARM_BASE_ABS", - "R_ARM_CALL", - "R_ARM_COPY", - "R_ARM_GLOB_DAT", - "R_ARM_GNU_VTENTRY", - "R_ARM_GNU_VTINHERIT", - "R_ARM_GOT32", - "R_ARM_GOTOFF", - "R_ARM_GOTOFF12", - "R_ARM_GOTPC", - "R_ARM_GOTRELAX", - "R_ARM_GOT_ABS", - "R_ARM_GOT_BREL12", - "R_ARM_GOT_PREL", - "R_ARM_IRELATIVE", - "R_ARM_JUMP24", - "R_ARM_JUMP_SLOT", - "R_ARM_LDC_PC_G0", - "R_ARM_LDC_PC_G1", - "R_ARM_LDC_PC_G2", - "R_ARM_LDC_SB_G0", - "R_ARM_LDC_SB_G1", - "R_ARM_LDC_SB_G2", - "R_ARM_LDRS_PC_G0", - "R_ARM_LDRS_PC_G1", - "R_ARM_LDRS_PC_G2", - "R_ARM_LDRS_SB_G0", - "R_ARM_LDRS_SB_G1", - "R_ARM_LDRS_SB_G2", - "R_ARM_LDR_PC_G1", - "R_ARM_LDR_PC_G2", - "R_ARM_LDR_SBREL_11_10_NC", - "R_ARM_LDR_SB_G0", - "R_ARM_LDR_SB_G1", - "R_ARM_LDR_SB_G2", - "R_ARM_ME_TOO", - "R_ARM_MOVT_ABS", - "R_ARM_MOVT_BREL", - "R_ARM_MOVT_PREL", - "R_ARM_MOVW_ABS_NC", - "R_ARM_MOVW_BREL", - "R_ARM_MOVW_BREL_NC", - "R_ARM_MOVW_PREL_NC", - "R_ARM_NONE", - "R_ARM_PC13", - "R_ARM_PC24", - "R_ARM_PLT32", - "R_ARM_PLT32_ABS", - "R_ARM_PREL31", - "R_ARM_PRIVATE_0", - "R_ARM_PRIVATE_1", - "R_ARM_PRIVATE_10", - "R_ARM_PRIVATE_11", - "R_ARM_PRIVATE_12", - "R_ARM_PRIVATE_13", - "R_ARM_PRIVATE_14", - "R_ARM_PRIVATE_15", - "R_ARM_PRIVATE_2", - "R_ARM_PRIVATE_3", - "R_ARM_PRIVATE_4", - "R_ARM_PRIVATE_5", - "R_ARM_PRIVATE_6", - "R_ARM_PRIVATE_7", - "R_ARM_PRIVATE_8", - "R_ARM_PRIVATE_9", - "R_ARM_RABS32", - "R_ARM_RBASE", - "R_ARM_REL32", - "R_ARM_REL32_NOI", - "R_ARM_RELATIVE", - "R_ARM_RPC24", - "R_ARM_RREL32", - "R_ARM_RSBREL32", - "R_ARM_RXPC25", - "R_ARM_SBREL31", - "R_ARM_SBREL32", - "R_ARM_SWI24", - "R_ARM_TARGET1", - "R_ARM_TARGET2", - "R_ARM_THM_ABS5", - "R_ARM_THM_ALU_ABS_G0_NC", - "R_ARM_THM_ALU_ABS_G1_NC", - "R_ARM_THM_ALU_ABS_G2_NC", - "R_ARM_THM_ALU_ABS_G3", - "R_ARM_THM_ALU_PREL_11_0", - "R_ARM_THM_GOT_BREL12", - "R_ARM_THM_JUMP11", - "R_ARM_THM_JUMP19", - "R_ARM_THM_JUMP24", - "R_ARM_THM_JUMP6", - "R_ARM_THM_JUMP8", - "R_ARM_THM_MOVT_ABS", - "R_ARM_THM_MOVT_BREL", - "R_ARM_THM_MOVT_PREL", - "R_ARM_THM_MOVW_ABS_NC", - "R_ARM_THM_MOVW_BREL", - "R_ARM_THM_MOVW_BREL_NC", - "R_ARM_THM_MOVW_PREL_NC", - "R_ARM_THM_PC12", - "R_ARM_THM_PC22", - "R_ARM_THM_PC8", - "R_ARM_THM_RPC22", - "R_ARM_THM_SWI8", - "R_ARM_THM_TLS_CALL", - "R_ARM_THM_TLS_DESCSEQ16", - "R_ARM_THM_TLS_DESCSEQ32", - "R_ARM_THM_XPC22", - "R_ARM_TLS_CALL", - "R_ARM_TLS_DESCSEQ", - "R_ARM_TLS_DTPMOD32", - "R_ARM_TLS_DTPOFF32", - "R_ARM_TLS_GD32", - "R_ARM_TLS_GOTDESC", - "R_ARM_TLS_IE12GP", - "R_ARM_TLS_IE32", - "R_ARM_TLS_LDM32", - "R_ARM_TLS_LDO12", - "R_ARM_TLS_LDO32", - "R_ARM_TLS_LE12", - "R_ARM_TLS_LE32", - "R_ARM_TLS_TPOFF32", - "R_ARM_V4BX", - "R_ARM_XPC25", - "R_INFO", - "R_INFO32", - "R_LARCH", - "R_LARCH_32", - "R_LARCH_32_PCREL", - "R_LARCH_64", - "R_LARCH_ABS64_HI12", - "R_LARCH_ABS64_LO20", - "R_LARCH_ABS_HI20", - "R_LARCH_ABS_LO12", - "R_LARCH_ADD16", - "R_LARCH_ADD24", - "R_LARCH_ADD32", - "R_LARCH_ADD64", - "R_LARCH_ADD8", - "R_LARCH_B16", - "R_LARCH_B21", - "R_LARCH_B26", - "R_LARCH_COPY", - "R_LARCH_GNU_VTENTRY", - "R_LARCH_GNU_VTINHERIT", - "R_LARCH_GOT64_HI12", - "R_LARCH_GOT64_LO20", - "R_LARCH_GOT64_PC_HI12", - "R_LARCH_GOT64_PC_LO20", - "R_LARCH_GOT_HI20", - "R_LARCH_GOT_LO12", - "R_LARCH_GOT_PC_HI20", - "R_LARCH_GOT_PC_LO12", - "R_LARCH_IRELATIVE", - "R_LARCH_JUMP_SLOT", - "R_LARCH_MARK_LA", - "R_LARCH_MARK_PCREL", - "R_LARCH_NONE", - "R_LARCH_PCALA64_HI12", - "R_LARCH_PCALA64_LO20", - "R_LARCH_PCALA_HI20", - "R_LARCH_PCALA_LO12", - "R_LARCH_RELATIVE", - "R_LARCH_RELAX", - "R_LARCH_SOP_ADD", - "R_LARCH_SOP_AND", - "R_LARCH_SOP_ASSERT", - "R_LARCH_SOP_IF_ELSE", - "R_LARCH_SOP_NOT", - "R_LARCH_SOP_POP_32_S_0_10_10_16_S2", - "R_LARCH_SOP_POP_32_S_0_5_10_16_S2", - "R_LARCH_SOP_POP_32_S_10_12", - "R_LARCH_SOP_POP_32_S_10_16", - "R_LARCH_SOP_POP_32_S_10_16_S2", - "R_LARCH_SOP_POP_32_S_10_5", - "R_LARCH_SOP_POP_32_S_5_20", - "R_LARCH_SOP_POP_32_U", - "R_LARCH_SOP_POP_32_U_10_12", - "R_LARCH_SOP_PUSH_ABSOLUTE", - "R_LARCH_SOP_PUSH_DUP", - "R_LARCH_SOP_PUSH_GPREL", - "R_LARCH_SOP_PUSH_PCREL", - "R_LARCH_SOP_PUSH_PLT_PCREL", - "R_LARCH_SOP_PUSH_TLS_GD", - "R_LARCH_SOP_PUSH_TLS_GOT", - "R_LARCH_SOP_PUSH_TLS_TPREL", - "R_LARCH_SOP_SL", - "R_LARCH_SOP_SR", - "R_LARCH_SOP_SUB", - "R_LARCH_SUB16", - "R_LARCH_SUB24", - "R_LARCH_SUB32", - "R_LARCH_SUB64", - "R_LARCH_SUB8", - "R_LARCH_TLS_DTPMOD32", - "R_LARCH_TLS_DTPMOD64", - "R_LARCH_TLS_DTPREL32", - "R_LARCH_TLS_DTPREL64", - "R_LARCH_TLS_GD_HI20", - "R_LARCH_TLS_GD_PC_HI20", - "R_LARCH_TLS_IE64_HI12", - "R_LARCH_TLS_IE64_LO20", - "R_LARCH_TLS_IE64_PC_HI12", - "R_LARCH_TLS_IE64_PC_LO20", - "R_LARCH_TLS_IE_HI20", - "R_LARCH_TLS_IE_LO12", - "R_LARCH_TLS_IE_PC_HI20", - "R_LARCH_TLS_IE_PC_LO12", - "R_LARCH_TLS_LD_HI20", - "R_LARCH_TLS_LD_PC_HI20", - "R_LARCH_TLS_LE64_HI12", - "R_LARCH_TLS_LE64_LO20", - "R_LARCH_TLS_LE_HI20", - "R_LARCH_TLS_LE_LO12", - "R_LARCH_TLS_TPREL32", - "R_LARCH_TLS_TPREL64", - "R_MIPS", - "R_MIPS_16", - "R_MIPS_26", - "R_MIPS_32", - "R_MIPS_64", - "R_MIPS_ADD_IMMEDIATE", - "R_MIPS_CALL16", - "R_MIPS_CALL_HI16", - "R_MIPS_CALL_LO16", - "R_MIPS_DELETE", - "R_MIPS_GOT16", - "R_MIPS_GOT_DISP", - "R_MIPS_GOT_HI16", - "R_MIPS_GOT_LO16", - "R_MIPS_GOT_OFST", - "R_MIPS_GOT_PAGE", - "R_MIPS_GPREL16", - "R_MIPS_GPREL32", - "R_MIPS_HI16", - "R_MIPS_HIGHER", - "R_MIPS_HIGHEST", - "R_MIPS_INSERT_A", - "R_MIPS_INSERT_B", - "R_MIPS_JALR", - "R_MIPS_LITERAL", - "R_MIPS_LO16", - "R_MIPS_NONE", - "R_MIPS_PC16", - "R_MIPS_PJUMP", - "R_MIPS_REL16", - "R_MIPS_REL32", - "R_MIPS_RELGOT", - "R_MIPS_SCN_DISP", - "R_MIPS_SHIFT5", - "R_MIPS_SHIFT6", - "R_MIPS_SUB", - "R_MIPS_TLS_DTPMOD32", - "R_MIPS_TLS_DTPMOD64", - "R_MIPS_TLS_DTPREL32", - "R_MIPS_TLS_DTPREL64", - "R_MIPS_TLS_DTPREL_HI16", - "R_MIPS_TLS_DTPREL_LO16", - "R_MIPS_TLS_GD", - "R_MIPS_TLS_GOTTPREL", - "R_MIPS_TLS_LDM", - "R_MIPS_TLS_TPREL32", - "R_MIPS_TLS_TPREL64", - "R_MIPS_TLS_TPREL_HI16", - "R_MIPS_TLS_TPREL_LO16", - "R_PPC", - "R_PPC64", - "R_PPC64_ADDR14", - "R_PPC64_ADDR14_BRNTAKEN", - "R_PPC64_ADDR14_BRTAKEN", - "R_PPC64_ADDR16", - "R_PPC64_ADDR16_DS", - "R_PPC64_ADDR16_HA", - "R_PPC64_ADDR16_HI", - "R_PPC64_ADDR16_HIGH", - "R_PPC64_ADDR16_HIGHA", - "R_PPC64_ADDR16_HIGHER", - "R_PPC64_ADDR16_HIGHER34", - "R_PPC64_ADDR16_HIGHERA", - "R_PPC64_ADDR16_HIGHERA34", - "R_PPC64_ADDR16_HIGHEST", - "R_PPC64_ADDR16_HIGHEST34", - "R_PPC64_ADDR16_HIGHESTA", - "R_PPC64_ADDR16_HIGHESTA34", - "R_PPC64_ADDR16_LO", - "R_PPC64_ADDR16_LO_DS", - "R_PPC64_ADDR24", - "R_PPC64_ADDR32", - "R_PPC64_ADDR64", - "R_PPC64_ADDR64_LOCAL", - "R_PPC64_COPY", - "R_PPC64_D28", - "R_PPC64_D34", - "R_PPC64_D34_HA30", - "R_PPC64_D34_HI30", - "R_PPC64_D34_LO", - "R_PPC64_DTPMOD64", - "R_PPC64_DTPREL16", - "R_PPC64_DTPREL16_DS", - "R_PPC64_DTPREL16_HA", - "R_PPC64_DTPREL16_HI", - "R_PPC64_DTPREL16_HIGH", - "R_PPC64_DTPREL16_HIGHA", - "R_PPC64_DTPREL16_HIGHER", - "R_PPC64_DTPREL16_HIGHERA", - "R_PPC64_DTPREL16_HIGHEST", - "R_PPC64_DTPREL16_HIGHESTA", - "R_PPC64_DTPREL16_LO", - "R_PPC64_DTPREL16_LO_DS", - "R_PPC64_DTPREL34", - "R_PPC64_DTPREL64", - "R_PPC64_ENTRY", - "R_PPC64_GLOB_DAT", - "R_PPC64_GNU_VTENTRY", - "R_PPC64_GNU_VTINHERIT", - "R_PPC64_GOT16", - "R_PPC64_GOT16_DS", - "R_PPC64_GOT16_HA", - "R_PPC64_GOT16_HI", - "R_PPC64_GOT16_LO", - "R_PPC64_GOT16_LO_DS", - "R_PPC64_GOT_DTPREL16_DS", - "R_PPC64_GOT_DTPREL16_HA", - "R_PPC64_GOT_DTPREL16_HI", - "R_PPC64_GOT_DTPREL16_LO_DS", - "R_PPC64_GOT_DTPREL_PCREL34", - "R_PPC64_GOT_PCREL34", - "R_PPC64_GOT_TLSGD16", - "R_PPC64_GOT_TLSGD16_HA", - "R_PPC64_GOT_TLSGD16_HI", - "R_PPC64_GOT_TLSGD16_LO", - "R_PPC64_GOT_TLSGD_PCREL34", - "R_PPC64_GOT_TLSLD16", - "R_PPC64_GOT_TLSLD16_HA", - "R_PPC64_GOT_TLSLD16_HI", - "R_PPC64_GOT_TLSLD16_LO", - "R_PPC64_GOT_TLSLD_PCREL34", - "R_PPC64_GOT_TPREL16_DS", - "R_PPC64_GOT_TPREL16_HA", - "R_PPC64_GOT_TPREL16_HI", - "R_PPC64_GOT_TPREL16_LO_DS", - "R_PPC64_GOT_TPREL_PCREL34", - "R_PPC64_IRELATIVE", - "R_PPC64_JMP_IREL", - "R_PPC64_JMP_SLOT", - "R_PPC64_NONE", - "R_PPC64_PCREL28", - "R_PPC64_PCREL34", - "R_PPC64_PCREL_OPT", - "R_PPC64_PLT16_HA", - "R_PPC64_PLT16_HI", - "R_PPC64_PLT16_LO", - "R_PPC64_PLT16_LO_DS", - "R_PPC64_PLT32", - "R_PPC64_PLT64", - "R_PPC64_PLTCALL", - "R_PPC64_PLTCALL_NOTOC", - "R_PPC64_PLTGOT16", - "R_PPC64_PLTGOT16_DS", - "R_PPC64_PLTGOT16_HA", - "R_PPC64_PLTGOT16_HI", - "R_PPC64_PLTGOT16_LO", - "R_PPC64_PLTGOT_LO_DS", - "R_PPC64_PLTREL32", - "R_PPC64_PLTREL64", - "R_PPC64_PLTSEQ", - "R_PPC64_PLTSEQ_NOTOC", - "R_PPC64_PLT_PCREL34", - "R_PPC64_PLT_PCREL34_NOTOC", - "R_PPC64_REL14", - "R_PPC64_REL14_BRNTAKEN", - "R_PPC64_REL14_BRTAKEN", - "R_PPC64_REL16", - "R_PPC64_REL16DX_HA", - "R_PPC64_REL16_HA", - "R_PPC64_REL16_HI", - "R_PPC64_REL16_HIGH", - "R_PPC64_REL16_HIGHA", - "R_PPC64_REL16_HIGHER", - "R_PPC64_REL16_HIGHER34", - "R_PPC64_REL16_HIGHERA", - "R_PPC64_REL16_HIGHERA34", - "R_PPC64_REL16_HIGHEST", - "R_PPC64_REL16_HIGHEST34", - "R_PPC64_REL16_HIGHESTA", - "R_PPC64_REL16_HIGHESTA34", - "R_PPC64_REL16_LO", - "R_PPC64_REL24", - "R_PPC64_REL24_NOTOC", - "R_PPC64_REL24_P9NOTOC", - "R_PPC64_REL30", - "R_PPC64_REL32", - "R_PPC64_REL64", - "R_PPC64_RELATIVE", - "R_PPC64_SECTOFF", - "R_PPC64_SECTOFF_DS", - "R_PPC64_SECTOFF_HA", - "R_PPC64_SECTOFF_HI", - "R_PPC64_SECTOFF_LO", - "R_PPC64_SECTOFF_LO_DS", - "R_PPC64_TLS", - "R_PPC64_TLSGD", - "R_PPC64_TLSLD", - "R_PPC64_TOC", - "R_PPC64_TOC16", - "R_PPC64_TOC16_DS", - "R_PPC64_TOC16_HA", - "R_PPC64_TOC16_HI", - "R_PPC64_TOC16_LO", - "R_PPC64_TOC16_LO_DS", - "R_PPC64_TOCSAVE", - "R_PPC64_TPREL16", - "R_PPC64_TPREL16_DS", - "R_PPC64_TPREL16_HA", - "R_PPC64_TPREL16_HI", - "R_PPC64_TPREL16_HIGH", - "R_PPC64_TPREL16_HIGHA", - "R_PPC64_TPREL16_HIGHER", - "R_PPC64_TPREL16_HIGHERA", - "R_PPC64_TPREL16_HIGHEST", - "R_PPC64_TPREL16_HIGHESTA", - "R_PPC64_TPREL16_LO", - "R_PPC64_TPREL16_LO_DS", - "R_PPC64_TPREL34", - "R_PPC64_TPREL64", - "R_PPC64_UADDR16", - "R_PPC64_UADDR32", - "R_PPC64_UADDR64", - "R_PPC_ADDR14", - "R_PPC_ADDR14_BRNTAKEN", - "R_PPC_ADDR14_BRTAKEN", - "R_PPC_ADDR16", - "R_PPC_ADDR16_HA", - "R_PPC_ADDR16_HI", - "R_PPC_ADDR16_LO", - "R_PPC_ADDR24", - "R_PPC_ADDR32", - "R_PPC_COPY", - "R_PPC_DTPMOD32", - "R_PPC_DTPREL16", - "R_PPC_DTPREL16_HA", - "R_PPC_DTPREL16_HI", - "R_PPC_DTPREL16_LO", - "R_PPC_DTPREL32", - "R_PPC_EMB_BIT_FLD", - "R_PPC_EMB_MRKREF", - "R_PPC_EMB_NADDR16", - "R_PPC_EMB_NADDR16_HA", - "R_PPC_EMB_NADDR16_HI", - "R_PPC_EMB_NADDR16_LO", - "R_PPC_EMB_NADDR32", - "R_PPC_EMB_RELSDA", - "R_PPC_EMB_RELSEC16", - "R_PPC_EMB_RELST_HA", - "R_PPC_EMB_RELST_HI", - "R_PPC_EMB_RELST_LO", - "R_PPC_EMB_SDA21", - "R_PPC_EMB_SDA2I16", - "R_PPC_EMB_SDA2REL", - "R_PPC_EMB_SDAI16", - "R_PPC_GLOB_DAT", - "R_PPC_GOT16", - "R_PPC_GOT16_HA", - "R_PPC_GOT16_HI", - "R_PPC_GOT16_LO", - "R_PPC_GOT_TLSGD16", - "R_PPC_GOT_TLSGD16_HA", - "R_PPC_GOT_TLSGD16_HI", - "R_PPC_GOT_TLSGD16_LO", - "R_PPC_GOT_TLSLD16", - "R_PPC_GOT_TLSLD16_HA", - "R_PPC_GOT_TLSLD16_HI", - "R_PPC_GOT_TLSLD16_LO", - "R_PPC_GOT_TPREL16", - "R_PPC_GOT_TPREL16_HA", - "R_PPC_GOT_TPREL16_HI", - "R_PPC_GOT_TPREL16_LO", - "R_PPC_JMP_SLOT", - "R_PPC_LOCAL24PC", - "R_PPC_NONE", - "R_PPC_PLT16_HA", - "R_PPC_PLT16_HI", - "R_PPC_PLT16_LO", - "R_PPC_PLT32", - "R_PPC_PLTREL24", - "R_PPC_PLTREL32", - "R_PPC_REL14", - "R_PPC_REL14_BRNTAKEN", - "R_PPC_REL14_BRTAKEN", - "R_PPC_REL24", - "R_PPC_REL32", - "R_PPC_RELATIVE", - "R_PPC_SDAREL16", - "R_PPC_SECTOFF", - "R_PPC_SECTOFF_HA", - "R_PPC_SECTOFF_HI", - "R_PPC_SECTOFF_LO", - "R_PPC_TLS", - "R_PPC_TPREL16", - "R_PPC_TPREL16_HA", - "R_PPC_TPREL16_HI", - "R_PPC_TPREL16_LO", - "R_PPC_TPREL32", - "R_PPC_UADDR16", - "R_PPC_UADDR32", - "R_RISCV", - "R_RISCV_32", - "R_RISCV_32_PCREL", - "R_RISCV_64", - "R_RISCV_ADD16", - "R_RISCV_ADD32", - "R_RISCV_ADD64", - "R_RISCV_ADD8", - "R_RISCV_ALIGN", - "R_RISCV_BRANCH", - "R_RISCV_CALL", - "R_RISCV_CALL_PLT", - "R_RISCV_COPY", - "R_RISCV_GNU_VTENTRY", - "R_RISCV_GNU_VTINHERIT", - "R_RISCV_GOT_HI20", - "R_RISCV_GPREL_I", - "R_RISCV_GPREL_S", - "R_RISCV_HI20", - "R_RISCV_JAL", - "R_RISCV_JUMP_SLOT", - "R_RISCV_LO12_I", - "R_RISCV_LO12_S", - "R_RISCV_NONE", - "R_RISCV_PCREL_HI20", - "R_RISCV_PCREL_LO12_I", - "R_RISCV_PCREL_LO12_S", - "R_RISCV_RELATIVE", - "R_RISCV_RELAX", - "R_RISCV_RVC_BRANCH", - "R_RISCV_RVC_JUMP", - "R_RISCV_RVC_LUI", - "R_RISCV_SET16", - "R_RISCV_SET32", - "R_RISCV_SET6", - "R_RISCV_SET8", - "R_RISCV_SUB16", - "R_RISCV_SUB32", - "R_RISCV_SUB6", - "R_RISCV_SUB64", - "R_RISCV_SUB8", - "R_RISCV_TLS_DTPMOD32", - "R_RISCV_TLS_DTPMOD64", - "R_RISCV_TLS_DTPREL32", - "R_RISCV_TLS_DTPREL64", - "R_RISCV_TLS_GD_HI20", - "R_RISCV_TLS_GOT_HI20", - "R_RISCV_TLS_TPREL32", - "R_RISCV_TLS_TPREL64", - "R_RISCV_TPREL_ADD", - "R_RISCV_TPREL_HI20", - "R_RISCV_TPREL_I", - "R_RISCV_TPREL_LO12_I", - "R_RISCV_TPREL_LO12_S", - "R_RISCV_TPREL_S", - "R_SPARC", - "R_SPARC_10", - "R_SPARC_11", - "R_SPARC_13", - "R_SPARC_16", - "R_SPARC_22", - "R_SPARC_32", - "R_SPARC_5", - "R_SPARC_6", - "R_SPARC_64", - "R_SPARC_7", - "R_SPARC_8", - "R_SPARC_COPY", - "R_SPARC_DISP16", - "R_SPARC_DISP32", - "R_SPARC_DISP64", - "R_SPARC_DISP8", - "R_SPARC_GLOB_DAT", - "R_SPARC_GLOB_JMP", - "R_SPARC_GOT10", - "R_SPARC_GOT13", - "R_SPARC_GOT22", - "R_SPARC_H44", - "R_SPARC_HH22", - "R_SPARC_HI22", - "R_SPARC_HIPLT22", - "R_SPARC_HIX22", - "R_SPARC_HM10", - "R_SPARC_JMP_SLOT", - "R_SPARC_L44", - "R_SPARC_LM22", - "R_SPARC_LO10", - "R_SPARC_LOPLT10", - "R_SPARC_LOX10", - "R_SPARC_M44", - "R_SPARC_NONE", - "R_SPARC_OLO10", - "R_SPARC_PC10", - "R_SPARC_PC22", - "R_SPARC_PCPLT10", - "R_SPARC_PCPLT22", - "R_SPARC_PCPLT32", - "R_SPARC_PC_HH22", - "R_SPARC_PC_HM10", - "R_SPARC_PC_LM22", - "R_SPARC_PLT32", - "R_SPARC_PLT64", - "R_SPARC_REGISTER", - "R_SPARC_RELATIVE", - "R_SPARC_UA16", - "R_SPARC_UA32", - "R_SPARC_UA64", - "R_SPARC_WDISP16", - "R_SPARC_WDISP19", - "R_SPARC_WDISP22", - "R_SPARC_WDISP30", - "R_SPARC_WPLT30", - "R_SYM32", - "R_SYM64", - "R_TYPE32", - "R_TYPE64", - "R_X86_64", - "R_X86_64_16", - "R_X86_64_32", - "R_X86_64_32S", - "R_X86_64_64", - "R_X86_64_8", - "R_X86_64_COPY", - "R_X86_64_DTPMOD64", - "R_X86_64_DTPOFF32", - "R_X86_64_DTPOFF64", - "R_X86_64_GLOB_DAT", - "R_X86_64_GOT32", - "R_X86_64_GOT64", - "R_X86_64_GOTOFF64", - "R_X86_64_GOTPC32", - "R_X86_64_GOTPC32_TLSDESC", - "R_X86_64_GOTPC64", - "R_X86_64_GOTPCREL", - "R_X86_64_GOTPCREL64", - "R_X86_64_GOTPCRELX", - "R_X86_64_GOTPLT64", - "R_X86_64_GOTTPOFF", - "R_X86_64_IRELATIVE", - "R_X86_64_JMP_SLOT", - "R_X86_64_NONE", - "R_X86_64_PC16", - "R_X86_64_PC32", - "R_X86_64_PC32_BND", - "R_X86_64_PC64", - "R_X86_64_PC8", - "R_X86_64_PLT32", - "R_X86_64_PLT32_BND", - "R_X86_64_PLTOFF64", - "R_X86_64_RELATIVE", - "R_X86_64_RELATIVE64", - "R_X86_64_REX_GOTPCRELX", - "R_X86_64_SIZE32", - "R_X86_64_SIZE64", - "R_X86_64_TLSDESC", - "R_X86_64_TLSDESC_CALL", - "R_X86_64_TLSGD", - "R_X86_64_TLSLD", - "R_X86_64_TPOFF32", - "R_X86_64_TPOFF64", - "Rel32", - "Rel64", - "Rela32", - "Rela64", - "SHF_ALLOC", - "SHF_COMPRESSED", - "SHF_EXECINSTR", - "SHF_GROUP", - "SHF_INFO_LINK", - "SHF_LINK_ORDER", - "SHF_MASKOS", - "SHF_MASKPROC", - "SHF_MERGE", - "SHF_OS_NONCONFORMING", - "SHF_STRINGS", - "SHF_TLS", - "SHF_WRITE", - "SHN_ABS", - "SHN_COMMON", - "SHN_HIOS", - "SHN_HIPROC", - "SHN_HIRESERVE", - "SHN_LOOS", - "SHN_LOPROC", - "SHN_LORESERVE", - "SHN_UNDEF", - "SHN_XINDEX", - "SHT_DYNAMIC", - "SHT_DYNSYM", - "SHT_FINI_ARRAY", - "SHT_GNU_ATTRIBUTES", - "SHT_GNU_HASH", - "SHT_GNU_LIBLIST", - "SHT_GNU_VERDEF", - "SHT_GNU_VERNEED", - "SHT_GNU_VERSYM", - "SHT_GROUP", - "SHT_HASH", - "SHT_HIOS", - "SHT_HIPROC", - "SHT_HIUSER", - "SHT_INIT_ARRAY", - "SHT_LOOS", - "SHT_LOPROC", - "SHT_LOUSER", - "SHT_MIPS_ABIFLAGS", - "SHT_NOBITS", - "SHT_NOTE", - "SHT_NULL", - "SHT_PREINIT_ARRAY", - "SHT_PROGBITS", - "SHT_REL", - "SHT_RELA", - "SHT_SHLIB", - "SHT_STRTAB", - "SHT_SYMTAB", - "SHT_SYMTAB_SHNDX", - "STB_GLOBAL", - "STB_HIOS", - "STB_HIPROC", - "STB_LOCAL", - "STB_LOOS", - "STB_LOPROC", - "STB_WEAK", - "STT_COMMON", - "STT_FILE", - "STT_FUNC", - "STT_HIOS", - "STT_HIPROC", - "STT_LOOS", - "STT_LOPROC", - "STT_NOTYPE", - "STT_OBJECT", - "STT_SECTION", - "STT_TLS", - "STV_DEFAULT", - "STV_HIDDEN", - "STV_INTERNAL", - "STV_PROTECTED", - "ST_BIND", - "ST_INFO", - "ST_TYPE", - "ST_VISIBILITY", - "Section", - "Section32", - "Section64", - "SectionFlag", - "SectionHeader", - "SectionIndex", - "SectionType", - "Sym32", - "Sym32Size", - "Sym64", - "Sym64Size", - "SymBind", - "SymType", - "SymVis", - "Symbol", - "Type", - "Version", - }, - "debug/gosym": { - "DecodingError", - "Func", - "LineTable", - "NewLineTable", - "NewTable", - "Obj", - "Sym", - "Table", - "UnknownFileError", - "UnknownLineError", - }, - "debug/macho": { - "ARM64_RELOC_ADDEND", - "ARM64_RELOC_BRANCH26", - "ARM64_RELOC_GOT_LOAD_PAGE21", - "ARM64_RELOC_GOT_LOAD_PAGEOFF12", - "ARM64_RELOC_PAGE21", - "ARM64_RELOC_PAGEOFF12", - "ARM64_RELOC_POINTER_TO_GOT", - "ARM64_RELOC_SUBTRACTOR", - "ARM64_RELOC_TLVP_LOAD_PAGE21", - "ARM64_RELOC_TLVP_LOAD_PAGEOFF12", - "ARM64_RELOC_UNSIGNED", - "ARM_RELOC_BR24", - "ARM_RELOC_HALF", - "ARM_RELOC_HALF_SECTDIFF", - "ARM_RELOC_LOCAL_SECTDIFF", - "ARM_RELOC_PAIR", - "ARM_RELOC_PB_LA_PTR", - "ARM_RELOC_SECTDIFF", - "ARM_RELOC_VANILLA", - "ARM_THUMB_32BIT_BRANCH", - "ARM_THUMB_RELOC_BR22", - "Cpu", - "Cpu386", - "CpuAmd64", - "CpuArm", - "CpuArm64", - "CpuPpc", - "CpuPpc64", - "Dylib", - "DylibCmd", - "Dysymtab", - "DysymtabCmd", - "ErrNotFat", - "FatArch", - "FatArchHeader", - "FatFile", - "File", - "FileHeader", - "FlagAllModsBound", - "FlagAllowStackExecution", - "FlagAppExtensionSafe", - "FlagBindAtLoad", - "FlagBindsToWeak", - "FlagCanonical", - "FlagDeadStrippableDylib", - "FlagDyldLink", - "FlagForceFlat", - "FlagHasTLVDescriptors", - "FlagIncrLink", - "FlagLazyInit", - "FlagNoFixPrebinding", - "FlagNoHeapExecution", - "FlagNoMultiDefs", - "FlagNoReexportedDylibs", - "FlagNoUndefs", - "FlagPIE", - "FlagPrebindable", - "FlagPrebound", - "FlagRootSafe", - "FlagSetuidSafe", - "FlagSplitSegs", - "FlagSubsectionsViaSymbols", - "FlagTwoLevel", - "FlagWeakDefines", - "FormatError", - "GENERIC_RELOC_LOCAL_SECTDIFF", - "GENERIC_RELOC_PAIR", - "GENERIC_RELOC_PB_LA_PTR", - "GENERIC_RELOC_SECTDIFF", - "GENERIC_RELOC_TLV", - "GENERIC_RELOC_VANILLA", - "Load", - "LoadBytes", - "LoadCmd", - "LoadCmdDylib", - "LoadCmdDylinker", - "LoadCmdDysymtab", - "LoadCmdRpath", - "LoadCmdSegment", - "LoadCmdSegment64", - "LoadCmdSymtab", - "LoadCmdThread", - "LoadCmdUnixThread", - "Magic32", - "Magic64", - "MagicFat", - "NewFatFile", - "NewFile", - "Nlist32", - "Nlist64", - "Open", - "OpenFat", - "Regs386", - "RegsAMD64", - "Reloc", - "RelocTypeARM", - "RelocTypeARM64", - "RelocTypeGeneric", - "RelocTypeX86_64", - "Rpath", - "RpathCmd", - "Section", - "Section32", - "Section64", - "SectionHeader", - "Segment", - "Segment32", - "Segment64", - "SegmentHeader", - "Symbol", - "Symtab", - "SymtabCmd", - "Thread", - "Type", - "TypeBundle", - "TypeDylib", - "TypeExec", - "TypeObj", - "X86_64_RELOC_BRANCH", - "X86_64_RELOC_GOT", - "X86_64_RELOC_GOT_LOAD", - "X86_64_RELOC_SIGNED", - "X86_64_RELOC_SIGNED_1", - "X86_64_RELOC_SIGNED_2", - "X86_64_RELOC_SIGNED_4", - "X86_64_RELOC_SUBTRACTOR", - "X86_64_RELOC_TLV", - "X86_64_RELOC_UNSIGNED", - }, - "debug/pe": { - "COFFSymbol", - "COFFSymbolAuxFormat5", - "COFFSymbolSize", - "DataDirectory", - "File", - "FileHeader", - "FormatError", - "IMAGE_COMDAT_SELECT_ANY", - "IMAGE_COMDAT_SELECT_ASSOCIATIVE", - "IMAGE_COMDAT_SELECT_EXACT_MATCH", - "IMAGE_COMDAT_SELECT_LARGEST", - "IMAGE_COMDAT_SELECT_NODUPLICATES", - "IMAGE_COMDAT_SELECT_SAME_SIZE", - "IMAGE_DIRECTORY_ENTRY_ARCHITECTURE", - "IMAGE_DIRECTORY_ENTRY_BASERELOC", - "IMAGE_DIRECTORY_ENTRY_BOUND_IMPORT", - "IMAGE_DIRECTORY_ENTRY_COM_DESCRIPTOR", - "IMAGE_DIRECTORY_ENTRY_DEBUG", - "IMAGE_DIRECTORY_ENTRY_DELAY_IMPORT", - "IMAGE_DIRECTORY_ENTRY_EXCEPTION", - "IMAGE_DIRECTORY_ENTRY_EXPORT", - "IMAGE_DIRECTORY_ENTRY_GLOBALPTR", - "IMAGE_DIRECTORY_ENTRY_IAT", - "IMAGE_DIRECTORY_ENTRY_IMPORT", - "IMAGE_DIRECTORY_ENTRY_LOAD_CONFIG", - "IMAGE_DIRECTORY_ENTRY_RESOURCE", - "IMAGE_DIRECTORY_ENTRY_SECURITY", - "IMAGE_DIRECTORY_ENTRY_TLS", - "IMAGE_DLLCHARACTERISTICS_APPCONTAINER", - "IMAGE_DLLCHARACTERISTICS_DYNAMIC_BASE", - "IMAGE_DLLCHARACTERISTICS_FORCE_INTEGRITY", - "IMAGE_DLLCHARACTERISTICS_GUARD_CF", - "IMAGE_DLLCHARACTERISTICS_HIGH_ENTROPY_VA", - "IMAGE_DLLCHARACTERISTICS_NO_BIND", - "IMAGE_DLLCHARACTERISTICS_NO_ISOLATION", - "IMAGE_DLLCHARACTERISTICS_NO_SEH", - "IMAGE_DLLCHARACTERISTICS_NX_COMPAT", - "IMAGE_DLLCHARACTERISTICS_TERMINAL_SERVER_AWARE", - "IMAGE_DLLCHARACTERISTICS_WDM_DRIVER", - "IMAGE_FILE_32BIT_MACHINE", - "IMAGE_FILE_AGGRESIVE_WS_TRIM", - "IMAGE_FILE_BYTES_REVERSED_HI", - "IMAGE_FILE_BYTES_REVERSED_LO", - "IMAGE_FILE_DEBUG_STRIPPED", - "IMAGE_FILE_DLL", - "IMAGE_FILE_EXECUTABLE_IMAGE", - "IMAGE_FILE_LARGE_ADDRESS_AWARE", - "IMAGE_FILE_LINE_NUMS_STRIPPED", - "IMAGE_FILE_LOCAL_SYMS_STRIPPED", - "IMAGE_FILE_MACHINE_AM33", - "IMAGE_FILE_MACHINE_AMD64", - "IMAGE_FILE_MACHINE_ARM", - "IMAGE_FILE_MACHINE_ARM64", - "IMAGE_FILE_MACHINE_ARMNT", - "IMAGE_FILE_MACHINE_EBC", - "IMAGE_FILE_MACHINE_I386", - "IMAGE_FILE_MACHINE_IA64", - "IMAGE_FILE_MACHINE_LOONGARCH32", - "IMAGE_FILE_MACHINE_LOONGARCH64", - "IMAGE_FILE_MACHINE_M32R", - "IMAGE_FILE_MACHINE_MIPS16", - "IMAGE_FILE_MACHINE_MIPSFPU", - "IMAGE_FILE_MACHINE_MIPSFPU16", - "IMAGE_FILE_MACHINE_POWERPC", - "IMAGE_FILE_MACHINE_POWERPCFP", - "IMAGE_FILE_MACHINE_R4000", - "IMAGE_FILE_MACHINE_RISCV128", - "IMAGE_FILE_MACHINE_RISCV32", - "IMAGE_FILE_MACHINE_RISCV64", - "IMAGE_FILE_MACHINE_SH3", - "IMAGE_FILE_MACHINE_SH3DSP", - "IMAGE_FILE_MACHINE_SH4", - "IMAGE_FILE_MACHINE_SH5", - "IMAGE_FILE_MACHINE_THUMB", - "IMAGE_FILE_MACHINE_UNKNOWN", - "IMAGE_FILE_MACHINE_WCEMIPSV2", - "IMAGE_FILE_NET_RUN_FROM_SWAP", - "IMAGE_FILE_RELOCS_STRIPPED", - "IMAGE_FILE_REMOVABLE_RUN_FROM_SWAP", - "IMAGE_FILE_SYSTEM", - "IMAGE_FILE_UP_SYSTEM_ONLY", - "IMAGE_SCN_CNT_CODE", - "IMAGE_SCN_CNT_INITIALIZED_DATA", - "IMAGE_SCN_CNT_UNINITIALIZED_DATA", - "IMAGE_SCN_LNK_COMDAT", - "IMAGE_SCN_MEM_DISCARDABLE", - "IMAGE_SCN_MEM_EXECUTE", - "IMAGE_SCN_MEM_READ", - "IMAGE_SCN_MEM_WRITE", - "IMAGE_SUBSYSTEM_EFI_APPLICATION", - "IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER", - "IMAGE_SUBSYSTEM_EFI_ROM", - "IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER", - "IMAGE_SUBSYSTEM_NATIVE", - "IMAGE_SUBSYSTEM_NATIVE_WINDOWS", - "IMAGE_SUBSYSTEM_OS2_CUI", - "IMAGE_SUBSYSTEM_POSIX_CUI", - "IMAGE_SUBSYSTEM_UNKNOWN", - "IMAGE_SUBSYSTEM_WINDOWS_BOOT_APPLICATION", - "IMAGE_SUBSYSTEM_WINDOWS_CE_GUI", - "IMAGE_SUBSYSTEM_WINDOWS_CUI", - "IMAGE_SUBSYSTEM_WINDOWS_GUI", - "IMAGE_SUBSYSTEM_XBOX", - "ImportDirectory", - "NewFile", - "Open", - "OptionalHeader32", - "OptionalHeader64", - "Reloc", - "Section", - "SectionHeader", - "SectionHeader32", - "StringTable", - "Symbol", - }, - "debug/plan9obj": { - "ErrNoSymbols", - "File", - "FileHeader", - "Magic386", - "Magic64", - "MagicAMD64", - "MagicARM", - "NewFile", - "Open", - "Section", - "SectionHeader", - "Sym", - }, - "embed": { - "FS", - }, - "encoding": { - "BinaryMarshaler", - "BinaryUnmarshaler", - "TextMarshaler", - "TextUnmarshaler", - }, - "encoding/ascii85": { - "CorruptInputError", - "Decode", - "Encode", - "MaxEncodedLen", - "NewDecoder", - "NewEncoder", - }, - "encoding/asn1": { - "BitString", - "ClassApplication", - "ClassContextSpecific", - "ClassPrivate", - "ClassUniversal", - "Enumerated", - "Flag", - "Marshal", - "MarshalWithParams", - "NullBytes", - "NullRawValue", - "ObjectIdentifier", - "RawContent", - "RawValue", - "StructuralError", - "SyntaxError", - "TagBMPString", - "TagBitString", - "TagBoolean", - "TagEnum", - "TagGeneralString", - "TagGeneralizedTime", - "TagIA5String", - "TagInteger", - "TagNull", - "TagNumericString", - "TagOID", - "TagOctetString", - "TagPrintableString", - "TagSequence", - "TagSet", - "TagT61String", - "TagUTCTime", - "TagUTF8String", - "Unmarshal", - "UnmarshalWithParams", - }, - "encoding/base32": { - "CorruptInputError", - "Encoding", - "HexEncoding", - "NewDecoder", - "NewEncoder", - "NewEncoding", - "NoPadding", - "StdEncoding", - "StdPadding", - }, - "encoding/base64": { - "CorruptInputError", - "Encoding", - "NewDecoder", - "NewEncoder", - "NewEncoding", - "NoPadding", - "RawStdEncoding", - "RawURLEncoding", - "StdEncoding", - "StdPadding", - "URLEncoding", - }, - "encoding/binary": { - "AppendByteOrder", - "AppendUvarint", - "AppendVarint", - "BigEndian", - "ByteOrder", - "LittleEndian", - "MaxVarintLen16", - "MaxVarintLen32", - "MaxVarintLen64", - "NativeEndian", - "PutUvarint", - "PutVarint", - "Read", - "ReadUvarint", - "ReadVarint", - "Size", - "Uvarint", - "Varint", - "Write", - }, - "encoding/csv": { - "ErrBareQuote", - "ErrFieldCount", - "ErrQuote", - "ErrTrailingComma", - "NewReader", - "NewWriter", - "ParseError", - "Reader", - "Writer", - }, - "encoding/gob": { - "CommonType", - "Decoder", - "Encoder", - "GobDecoder", - "GobEncoder", - "NewDecoder", - "NewEncoder", - "Register", - "RegisterName", - }, - "encoding/hex": { - "Decode", - "DecodeString", - "DecodedLen", - "Dump", - "Dumper", - "Encode", - "EncodeToString", - "EncodedLen", - "ErrLength", - "InvalidByteError", - "NewDecoder", - "NewEncoder", - }, - "encoding/json": { - "Compact", - "Decoder", - "Delim", - "Encoder", - "HTMLEscape", - "Indent", - "InvalidUTF8Error", - "InvalidUnmarshalError", - "Marshal", - "MarshalIndent", - "Marshaler", - "MarshalerError", - "NewDecoder", - "NewEncoder", - "Number", - "RawMessage", - "SyntaxError", - "Token", - "Unmarshal", - "UnmarshalFieldError", - "UnmarshalTypeError", - "Unmarshaler", - "UnsupportedTypeError", - "UnsupportedValueError", - "Valid", - }, - "encoding/pem": { - "Block", - "Decode", - "Encode", - "EncodeToMemory", - }, - "encoding/xml": { - "Attr", - "CharData", - "Comment", - "CopyToken", - "Decoder", - "Directive", - "Encoder", - "EndElement", - "Escape", - "EscapeText", - "HTMLAutoClose", - "HTMLEntity", - "Header", - "Marshal", - "MarshalIndent", - "Marshaler", - "MarshalerAttr", - "Name", - "NewDecoder", - "NewEncoder", - "NewTokenDecoder", - "ProcInst", - "StartElement", - "SyntaxError", - "TagPathError", - "Token", - "TokenReader", - "Unmarshal", - "UnmarshalError", - "Unmarshaler", - "UnmarshalerAttr", - "UnsupportedTypeError", - }, - "errors": { - "As", - "ErrUnsupported", - "Is", - "Join", - "New", - "Unwrap", - }, - "expvar": { - "Do", - "Float", - "Func", - "Get", - "Handler", - "Int", - "KeyValue", - "Map", - "NewFloat", - "NewInt", - "NewMap", - "NewString", - "Publish", - "String", - "Var", - }, - "flag": { - "Arg", - "Args", - "Bool", - "BoolFunc", - "BoolVar", - "CommandLine", - "ContinueOnError", - "Duration", - "DurationVar", - "ErrHelp", - "ErrorHandling", - "ExitOnError", - "Flag", - "FlagSet", - "Float64", - "Float64Var", - "Func", - "Getter", - "Int", - "Int64", - "Int64Var", - "IntVar", - "Lookup", - "NArg", - "NFlag", - "NewFlagSet", - "PanicOnError", - "Parse", - "Parsed", - "PrintDefaults", - "Set", - "String", - "StringVar", - "TextVar", - "Uint", - "Uint64", - "Uint64Var", - "UintVar", - "UnquoteUsage", - "Usage", - "Value", - "Var", - "Visit", - "VisitAll", - }, - "fmt": { - "Append", - "Appendf", - "Appendln", - "Errorf", - "FormatString", - "Formatter", - "Fprint", - "Fprintf", - "Fprintln", - "Fscan", - "Fscanf", - "Fscanln", - "GoStringer", - "Print", - "Printf", - "Println", - "Scan", - "ScanState", - "Scanf", - "Scanln", - "Scanner", - "Sprint", - "Sprintf", - "Sprintln", - "Sscan", - "Sscanf", - "Sscanln", - "State", - "Stringer", - }, - "go/ast": { - "ArrayType", - "AssignStmt", - "Bad", - "BadDecl", - "BadExpr", - "BadStmt", - "BasicLit", - "BinaryExpr", - "BlockStmt", - "BranchStmt", - "CallExpr", - "CaseClause", - "ChanDir", - "ChanType", - "CommClause", - "Comment", - "CommentGroup", - "CommentMap", - "CompositeLit", - "Con", - "Decl", - "DeclStmt", - "DeferStmt", - "Ellipsis", - "EmptyStmt", - "Expr", - "ExprStmt", - "Field", - "FieldFilter", - "FieldList", - "File", - "FileExports", - "Filter", - "FilterDecl", - "FilterFile", - "FilterFuncDuplicates", - "FilterImportDuplicates", - "FilterPackage", - "FilterUnassociatedComments", - "ForStmt", - "Fprint", - "Fun", - "FuncDecl", - "FuncLit", - "FuncType", - "GenDecl", - "GoStmt", - "Ident", - "IfStmt", - "ImportSpec", - "Importer", - "IncDecStmt", - "IndexExpr", - "IndexListExpr", - "Inspect", - "InterfaceType", - "IsExported", - "IsGenerated", - "KeyValueExpr", - "LabeledStmt", - "Lbl", - "MapType", - "MergeMode", - "MergePackageFiles", - "NewCommentMap", - "NewIdent", - "NewObj", - "NewPackage", - "NewScope", - "Node", - "NotNilFilter", - "ObjKind", - "Object", - "Package", - "PackageExports", - "ParenExpr", - "Pkg", - "Print", - "RECV", - "RangeStmt", - "ReturnStmt", - "SEND", - "Scope", - "SelectStmt", - "SelectorExpr", - "SendStmt", - "SliceExpr", - "SortImports", - "Spec", - "StarExpr", - "Stmt", - "StructType", - "SwitchStmt", - "Typ", - "TypeAssertExpr", - "TypeSpec", - "TypeSwitchStmt", - "UnaryExpr", - "ValueSpec", - "Var", - "Visitor", - "Walk", - }, - "go/build": { - "AllowBinary", - "ArchChar", - "Context", - "Default", - "Directive", - "FindOnly", - "IgnoreVendor", - "Import", - "ImportComment", - "ImportDir", - "ImportMode", - "IsLocalImport", - "MultiplePackageError", - "NoGoError", - "Package", - "ToolDir", - }, - "go/build/constraint": { - "AndExpr", - "Expr", - "GoVersion", - "IsGoBuild", - "IsPlusBuild", - "NotExpr", - "OrExpr", - "Parse", - "PlusBuildLines", - "SyntaxError", - "TagExpr", - }, - "go/constant": { - "BinaryOp", - "BitLen", - "Bool", - "BoolVal", - "Bytes", - "Compare", - "Complex", - "Denom", - "Float", - "Float32Val", - "Float64Val", - "Imag", - "Int", - "Int64Val", - "Kind", - "Make", - "MakeBool", - "MakeFloat64", - "MakeFromBytes", - "MakeFromLiteral", - "MakeImag", - "MakeInt64", - "MakeString", - "MakeUint64", - "MakeUnknown", - "Num", - "Real", - "Shift", - "Sign", - "String", - "StringVal", - "ToComplex", - "ToFloat", - "ToInt", - "Uint64Val", - "UnaryOp", - "Unknown", - "Val", - "Value", - }, - "go/doc": { - "AllDecls", - "AllMethods", - "Example", - "Examples", - "Filter", - "Func", - "IllegalPrefixes", - "IsPredeclared", - "Mode", - "New", - "NewFromFiles", - "Note", - "Package", - "PreserveAST", - "Synopsis", - "ToHTML", - "ToText", - "Type", - "Value", - }, - "go/doc/comment": { - "Block", - "Code", - "DefaultLookupPackage", - "Doc", - "DocLink", - "Heading", - "Italic", - "Link", - "LinkDef", - "List", - "ListItem", - "Paragraph", - "Parser", - "Plain", - "Printer", - "Text", - }, - "go/format": { - "Node", - "Source", - }, - "go/importer": { - "Default", - "For", - "ForCompiler", - "Lookup", - }, - "go/parser": { - "AllErrors", - "DeclarationErrors", - "ImportsOnly", - "Mode", - "PackageClauseOnly", - "ParseComments", - "ParseDir", - "ParseExpr", - "ParseExprFrom", - "ParseFile", - "SkipObjectResolution", - "SpuriousErrors", - "Trace", - }, - "go/printer": { - "CommentedNode", - "Config", - "Fprint", - "Mode", - "RawFormat", - "SourcePos", - "TabIndent", - "UseSpaces", - }, - "go/scanner": { - "Error", - "ErrorHandler", - "ErrorList", - "Mode", - "PrintError", - "ScanComments", - "Scanner", - }, - "go/token": { - "ADD", - "ADD_ASSIGN", - "AND", - "AND_ASSIGN", - "AND_NOT", - "AND_NOT_ASSIGN", - "ARROW", - "ASSIGN", - "BREAK", - "CASE", - "CHAN", - "CHAR", - "COLON", - "COMMA", - "COMMENT", - "CONST", - "CONTINUE", - "DEC", - "DEFAULT", - "DEFER", - "DEFINE", - "ELLIPSIS", - "ELSE", - "EOF", - "EQL", - "FALLTHROUGH", - "FLOAT", - "FOR", - "FUNC", - "File", - "FileSet", - "GEQ", - "GO", - "GOTO", - "GTR", - "HighestPrec", - "IDENT", - "IF", - "ILLEGAL", - "IMAG", - "IMPORT", - "INC", - "INT", - "INTERFACE", - "IsExported", - "IsIdentifier", - "IsKeyword", - "LAND", - "LBRACE", - "LBRACK", - "LEQ", - "LOR", - "LPAREN", - "LSS", - "Lookup", - "LowestPrec", - "MAP", - "MUL", - "MUL_ASSIGN", - "NEQ", - "NOT", - "NewFileSet", - "NoPos", - "OR", - "OR_ASSIGN", - "PACKAGE", - "PERIOD", - "Pos", - "Position", - "QUO", - "QUO_ASSIGN", - "RANGE", - "RBRACE", - "RBRACK", - "REM", - "REM_ASSIGN", - "RETURN", - "RPAREN", - "SELECT", - "SEMICOLON", - "SHL", - "SHL_ASSIGN", - "SHR", - "SHR_ASSIGN", - "STRING", - "STRUCT", - "SUB", - "SUB_ASSIGN", - "SWITCH", - "TILDE", - "TYPE", - "Token", - "UnaryPrec", - "VAR", - "XOR", - "XOR_ASSIGN", - }, - "go/types": { - "ArgumentError", - "Array", - "AssertableTo", - "AssignableTo", - "Basic", - "BasicInfo", - "BasicKind", - "Bool", - "Builtin", - "Byte", - "Chan", - "ChanDir", - "CheckExpr", - "Checker", - "Comparable", - "Complex128", - "Complex64", - "Config", - "Const", - "Context", - "ConvertibleTo", - "DefPredeclaredTestFuncs", - "Default", - "Error", - "Eval", - "ExprString", - "FieldVal", - "Float32", - "Float64", - "Func", - "Id", - "Identical", - "IdenticalIgnoreTags", - "Implements", - "ImportMode", - "Importer", - "ImporterFrom", - "Info", - "Initializer", - "Instance", - "Instantiate", - "Int", - "Int16", - "Int32", - "Int64", - "Int8", - "Interface", - "Invalid", - "IsBoolean", - "IsComplex", - "IsConstType", - "IsFloat", - "IsInteger", - "IsInterface", - "IsNumeric", - "IsOrdered", - "IsString", - "IsUnsigned", - "IsUntyped", - "Label", - "LookupFieldOrMethod", - "Map", - "MethodExpr", - "MethodSet", - "MethodVal", - "MissingMethod", - "Named", - "NewArray", - "NewChan", - "NewChecker", - "NewConst", - "NewContext", - "NewField", - "NewFunc", - "NewInterface", - "NewInterfaceType", - "NewLabel", - "NewMap", - "NewMethodSet", - "NewNamed", - "NewPackage", - "NewParam", - "NewPkgName", - "NewPointer", - "NewScope", - "NewSignature", - "NewSignatureType", - "NewSlice", - "NewStruct", - "NewTerm", - "NewTuple", - "NewTypeName", - "NewTypeParam", - "NewUnion", - "NewVar", - "Nil", - "Object", - "ObjectString", - "Package", - "PkgName", - "Pointer", - "Qualifier", - "RecvOnly", - "RelativeTo", - "Rune", - "Satisfies", - "Scope", - "Selection", - "SelectionKind", - "SelectionString", - "SendOnly", - "SendRecv", - "Signature", - "Sizes", - "SizesFor", - "Slice", - "StdSizes", - "String", - "Struct", - "Term", - "Tuple", - "Typ", - "Type", - "TypeAndValue", - "TypeList", - "TypeName", - "TypeParam", - "TypeParamList", - "TypeString", - "Uint", - "Uint16", - "Uint32", - "Uint64", - "Uint8", - "Uintptr", - "Union", - "Universe", - "Unsafe", - "UnsafePointer", - "UntypedBool", - "UntypedComplex", - "UntypedFloat", - "UntypedInt", - "UntypedNil", - "UntypedRune", - "UntypedString", - "Var", - "WriteExpr", - "WriteSignature", - "WriteType", - }, - "hash": { - "Hash", - "Hash32", - "Hash64", - }, - "hash/adler32": { - "Checksum", - "New", - "Size", - }, - "hash/crc32": { - "Castagnoli", - "Checksum", - "ChecksumIEEE", - "IEEE", - "IEEETable", - "Koopman", - "MakeTable", - "New", - "NewIEEE", - "Size", - "Table", - "Update", - }, - "hash/crc64": { - "Checksum", - "ECMA", - "ISO", - "MakeTable", - "New", - "Size", - "Table", - "Update", - }, - "hash/fnv": { - "New128", - "New128a", - "New32", - "New32a", - "New64", - "New64a", - }, - "hash/maphash": { - "Bytes", - "Hash", - "MakeSeed", - "Seed", - "String", - }, - "html": { - "EscapeString", - "UnescapeString", - }, - "html/template": { - "CSS", - "ErrAmbigContext", - "ErrBadHTML", - "ErrBranchEnd", - "ErrEndContext", - "ErrJSTemplate", - "ErrNoSuchTemplate", - "ErrOutputContext", - "ErrPartialCharset", - "ErrPartialEscape", - "ErrPredefinedEscaper", - "ErrRangeLoopReentry", - "ErrSlashAmbig", - "Error", - "ErrorCode", - "FuncMap", - "HTML", - "HTMLAttr", - "HTMLEscape", - "HTMLEscapeString", - "HTMLEscaper", - "IsTrue", - "JS", - "JSEscape", - "JSEscapeString", - "JSEscaper", - "JSStr", - "Must", - "New", - "OK", - "ParseFS", - "ParseFiles", - "ParseGlob", - "Srcset", - "Template", - "URL", - "URLQueryEscaper", - }, - "image": { - "Alpha", - "Alpha16", - "Black", - "CMYK", - "Config", - "Decode", - "DecodeConfig", - "ErrFormat", - "Gray", - "Gray16", - "Image", - "NRGBA", - "NRGBA64", - "NYCbCrA", - "NewAlpha", - "NewAlpha16", - "NewCMYK", - "NewGray", - "NewGray16", - "NewNRGBA", - "NewNRGBA64", - "NewNYCbCrA", - "NewPaletted", - "NewRGBA", - "NewRGBA64", - "NewUniform", - "NewYCbCr", - "Opaque", - "Paletted", - "PalettedImage", - "Point", - "Pt", - "RGBA", - "RGBA64", - "RGBA64Image", - "Rect", - "Rectangle", - "RegisterFormat", - "Transparent", - "Uniform", - "White", - "YCbCr", - "YCbCrSubsampleRatio", - "YCbCrSubsampleRatio410", - "YCbCrSubsampleRatio411", - "YCbCrSubsampleRatio420", - "YCbCrSubsampleRatio422", - "YCbCrSubsampleRatio440", - "YCbCrSubsampleRatio444", - "ZP", - "ZR", - }, - "image/color": { - "Alpha", - "Alpha16", - "Alpha16Model", - "AlphaModel", - "Black", - "CMYK", - "CMYKModel", - "CMYKToRGB", - "Color", - "Gray", - "Gray16", - "Gray16Model", - "GrayModel", - "Model", - "ModelFunc", - "NRGBA", - "NRGBA64", - "NRGBA64Model", - "NRGBAModel", - "NYCbCrA", - "NYCbCrAModel", - "Opaque", - "Palette", - "RGBA", - "RGBA64", - "RGBA64Model", - "RGBAModel", - "RGBToCMYK", - "RGBToYCbCr", - "Transparent", - "White", - "YCbCr", - "YCbCrModel", - "YCbCrToRGB", - }, - "image/color/palette": { - "Plan9", - "WebSafe", - }, - "image/draw": { - "Draw", - "DrawMask", - "Drawer", - "FloydSteinberg", - "Image", - "Op", - "Over", - "Quantizer", - "RGBA64Image", - "Src", - }, - "image/gif": { - "Decode", - "DecodeAll", - "DecodeConfig", - "DisposalBackground", - "DisposalNone", - "DisposalPrevious", - "Encode", - "EncodeAll", - "GIF", - "Options", - }, - "image/jpeg": { - "Decode", - "DecodeConfig", - "DefaultQuality", - "Encode", - "FormatError", - "Options", - "Reader", - "UnsupportedError", - }, - "image/png": { - "BestCompression", - "BestSpeed", - "CompressionLevel", - "Decode", - "DecodeConfig", - "DefaultCompression", - "Encode", - "Encoder", - "EncoderBuffer", - "EncoderBufferPool", - "FormatError", - "NoCompression", - "UnsupportedError", - }, - "index/suffixarray": { - "Index", - "New", - }, - "io": { - "ByteReader", - "ByteScanner", - "ByteWriter", - "Closer", - "Copy", - "CopyBuffer", - "CopyN", - "Discard", - "EOF", - "ErrClosedPipe", - "ErrNoProgress", - "ErrShortBuffer", - "ErrShortWrite", - "ErrUnexpectedEOF", - "LimitReader", - "LimitedReader", - "MultiReader", - "MultiWriter", - "NewOffsetWriter", - "NewSectionReader", - "NopCloser", - "OffsetWriter", - "Pipe", - "PipeReader", - "PipeWriter", - "ReadAll", - "ReadAtLeast", - "ReadCloser", - "ReadFull", - "ReadSeekCloser", - "ReadSeeker", - "ReadWriteCloser", - "ReadWriteSeeker", - "ReadWriter", - "Reader", - "ReaderAt", - "ReaderFrom", - "RuneReader", - "RuneScanner", - "SectionReader", - "SeekCurrent", - "SeekEnd", - "SeekStart", - "Seeker", - "StringWriter", - "TeeReader", - "WriteCloser", - "WriteSeeker", - "WriteString", - "Writer", - "WriterAt", - "WriterTo", - }, - "io/fs": { - "DirEntry", - "ErrClosed", - "ErrExist", - "ErrInvalid", - "ErrNotExist", - "ErrPermission", - "FS", - "File", - "FileInfo", - "FileInfoToDirEntry", - "FileMode", - "FormatDirEntry", - "FormatFileInfo", - "Glob", - "GlobFS", - "ModeAppend", - "ModeCharDevice", - "ModeDevice", - "ModeDir", - "ModeExclusive", - "ModeIrregular", - "ModeNamedPipe", - "ModePerm", - "ModeSetgid", - "ModeSetuid", - "ModeSocket", - "ModeSticky", - "ModeSymlink", - "ModeTemporary", - "ModeType", - "PathError", - "ReadDir", - "ReadDirFS", - "ReadDirFile", - "ReadFile", - "ReadFileFS", - "SkipAll", - "SkipDir", - "Stat", - "StatFS", - "Sub", - "SubFS", - "ValidPath", - "WalkDir", - "WalkDirFunc", - }, - "io/ioutil": { - "Discard", - "NopCloser", - "ReadAll", - "ReadDir", - "ReadFile", - "TempDir", - "TempFile", - "WriteFile", - }, - "log": { - "Default", - "Fatal", - "Fatalf", - "Fatalln", - "Flags", - "LUTC", - "Ldate", - "Llongfile", - "Lmicroseconds", - "Lmsgprefix", - "Logger", - "Lshortfile", - "LstdFlags", - "Ltime", - "New", - "Output", - "Panic", - "Panicf", - "Panicln", - "Prefix", - "Print", - "Printf", - "Println", - "SetFlags", - "SetOutput", - "SetPrefix", - "Writer", - }, - "log/slog": { - "Any", - "AnyValue", - "Attr", - "Bool", - "BoolValue", - "Debug", - "DebugContext", - "Default", - "Duration", - "DurationValue", - "Error", - "ErrorContext", - "Float64", - "Float64Value", - "Group", - "GroupValue", - "Handler", - "HandlerOptions", - "Info", - "InfoContext", - "Int", - "Int64", - "Int64Value", - "IntValue", - "JSONHandler", - "Kind", - "KindAny", - "KindBool", - "KindDuration", - "KindFloat64", - "KindGroup", - "KindInt64", - "KindLogValuer", - "KindString", - "KindTime", - "KindUint64", - "Level", - "LevelDebug", - "LevelError", - "LevelInfo", - "LevelKey", - "LevelVar", - "LevelWarn", - "Leveler", - "Log", - "LogAttrs", - "LogValuer", - "Logger", - "MessageKey", - "New", - "NewJSONHandler", - "NewLogLogger", - "NewRecord", - "NewTextHandler", - "Record", - "SetDefault", - "Source", - "SourceKey", - "String", - "StringValue", - "TextHandler", - "Time", - "TimeKey", - "TimeValue", - "Uint64", - "Uint64Value", - "Value", - "Warn", - "WarnContext", - "With", - }, - "log/syslog": { - "Dial", - "LOG_ALERT", - "LOG_AUTH", - "LOG_AUTHPRIV", - "LOG_CRIT", - "LOG_CRON", - "LOG_DAEMON", - "LOG_DEBUG", - "LOG_EMERG", - "LOG_ERR", - "LOG_FTP", - "LOG_INFO", - "LOG_KERN", - "LOG_LOCAL0", - "LOG_LOCAL1", - "LOG_LOCAL2", - "LOG_LOCAL3", - "LOG_LOCAL4", - "LOG_LOCAL5", - "LOG_LOCAL6", - "LOG_LOCAL7", - "LOG_LPR", - "LOG_MAIL", - "LOG_NEWS", - "LOG_NOTICE", - "LOG_SYSLOG", - "LOG_USER", - "LOG_UUCP", - "LOG_WARNING", - "New", - "NewLogger", - "Priority", - "Writer", - }, - "maps": { - "Clone", - "Copy", - "DeleteFunc", - "Equal", - "EqualFunc", - }, - "math": { - "Abs", - "Acos", - "Acosh", - "Asin", - "Asinh", - "Atan", - "Atan2", - "Atanh", - "Cbrt", - "Ceil", - "Copysign", - "Cos", - "Cosh", - "Dim", - "E", - "Erf", - "Erfc", - "Erfcinv", - "Erfinv", - "Exp", - "Exp2", - "Expm1", - "FMA", - "Float32bits", - "Float32frombits", - "Float64bits", - "Float64frombits", - "Floor", - "Frexp", - "Gamma", - "Hypot", - "Ilogb", - "Inf", - "IsInf", - "IsNaN", - "J0", - "J1", - "Jn", - "Ldexp", - "Lgamma", - "Ln10", - "Ln2", - "Log", - "Log10", - "Log10E", - "Log1p", - "Log2", - "Log2E", - "Logb", - "Max", - "MaxFloat32", - "MaxFloat64", - "MaxInt", - "MaxInt16", - "MaxInt32", - "MaxInt64", - "MaxInt8", - "MaxUint", - "MaxUint16", - "MaxUint32", - "MaxUint64", - "MaxUint8", - "Min", - "MinInt", - "MinInt16", - "MinInt32", - "MinInt64", - "MinInt8", - "Mod", - "Modf", - "NaN", - "Nextafter", - "Nextafter32", - "Phi", - "Pi", - "Pow", - "Pow10", - "Remainder", - "Round", - "RoundToEven", - "Signbit", - "Sin", - "Sincos", - "Sinh", - "SmallestNonzeroFloat32", - "SmallestNonzeroFloat64", - "Sqrt", - "Sqrt2", - "SqrtE", - "SqrtPhi", - "SqrtPi", - "Tan", - "Tanh", - "Trunc", - "Y0", - "Y1", - "Yn", - }, - "math/big": { - "Above", - "Accuracy", - "AwayFromZero", - "Below", - "ErrNaN", - "Exact", - "Float", - "Int", - "Jacobi", - "MaxBase", - "MaxExp", - "MaxPrec", - "MinExp", - "NewFloat", - "NewInt", - "NewRat", - "ParseFloat", - "Rat", - "RoundingMode", - "ToNearestAway", - "ToNearestEven", - "ToNegativeInf", - "ToPositiveInf", - "ToZero", - "Word", - }, - "math/bits": { - "Add", - "Add32", - "Add64", - "Div", - "Div32", - "Div64", - "LeadingZeros", - "LeadingZeros16", - "LeadingZeros32", - "LeadingZeros64", - "LeadingZeros8", - "Len", - "Len16", - "Len32", - "Len64", - "Len8", - "Mul", - "Mul32", - "Mul64", - "OnesCount", - "OnesCount16", - "OnesCount32", - "OnesCount64", - "OnesCount8", - "Rem", - "Rem32", - "Rem64", - "Reverse", - "Reverse16", - "Reverse32", - "Reverse64", - "Reverse8", - "ReverseBytes", - "ReverseBytes16", - "ReverseBytes32", - "ReverseBytes64", - "RotateLeft", - "RotateLeft16", - "RotateLeft32", - "RotateLeft64", - "RotateLeft8", - "Sub", - "Sub32", - "Sub64", - "TrailingZeros", - "TrailingZeros16", - "TrailingZeros32", - "TrailingZeros64", - "TrailingZeros8", - "UintSize", - }, - "math/cmplx": { - "Abs", - "Acos", - "Acosh", - "Asin", - "Asinh", - "Atan", - "Atanh", - "Conj", - "Cos", - "Cosh", - "Cot", - "Exp", - "Inf", - "IsInf", - "IsNaN", - "Log", - "Log10", - "NaN", - "Phase", - "Polar", - "Pow", - "Rect", - "Sin", - "Sinh", - "Sqrt", - "Tan", - "Tanh", - }, - "math/rand": { - "ExpFloat64", - "Float32", - "Float64", - "Int", - "Int31", - "Int31n", - "Int63", - "Int63n", - "Intn", - "New", - "NewSource", - "NewZipf", - "NormFloat64", - "Perm", - "Rand", - "Read", - "Seed", - "Shuffle", - "Source", - "Source64", - "Uint32", - "Uint64", - "Zipf", - }, - "mime": { - "AddExtensionType", - "BEncoding", - "ErrInvalidMediaParameter", - "ExtensionsByType", - "FormatMediaType", - "ParseMediaType", - "QEncoding", - "TypeByExtension", - "WordDecoder", - "WordEncoder", - }, - "mime/multipart": { - "ErrMessageTooLarge", - "File", - "FileHeader", - "Form", - "NewReader", - "NewWriter", - "Part", - "Reader", - "Writer", - }, - "mime/quotedprintable": { - "NewReader", - "NewWriter", - "Reader", - "Writer", - }, - "net": { - "Addr", - "AddrError", - "Buffers", - "CIDRMask", - "Conn", - "DNSConfigError", - "DNSError", - "DefaultResolver", - "Dial", - "DialIP", - "DialTCP", - "DialTimeout", - "DialUDP", - "DialUnix", - "Dialer", - "ErrClosed", - "ErrWriteToConnected", - "Error", - "FileConn", - "FileListener", - "FilePacketConn", - "FlagBroadcast", - "FlagLoopback", - "FlagMulticast", - "FlagPointToPoint", - "FlagRunning", - "FlagUp", - "Flags", - "HardwareAddr", - "IP", - "IPAddr", - "IPConn", - "IPMask", - "IPNet", - "IPv4", - "IPv4Mask", - "IPv4allrouter", - "IPv4allsys", - "IPv4bcast", - "IPv4len", - "IPv4zero", - "IPv6interfacelocalallnodes", - "IPv6len", - "IPv6linklocalallnodes", - "IPv6linklocalallrouters", - "IPv6loopback", - "IPv6unspecified", - "IPv6zero", - "Interface", - "InterfaceAddrs", - "InterfaceByIndex", - "InterfaceByName", - "Interfaces", - "InvalidAddrError", - "JoinHostPort", - "Listen", - "ListenConfig", - "ListenIP", - "ListenMulticastUDP", - "ListenPacket", - "ListenTCP", - "ListenUDP", - "ListenUnix", - "ListenUnixgram", - "Listener", - "LookupAddr", - "LookupCNAME", - "LookupHost", - "LookupIP", - "LookupMX", - "LookupNS", - "LookupPort", - "LookupSRV", - "LookupTXT", - "MX", - "NS", - "OpError", - "PacketConn", - "ParseCIDR", - "ParseError", - "ParseIP", - "ParseMAC", - "Pipe", - "ResolveIPAddr", - "ResolveTCPAddr", - "ResolveUDPAddr", - "ResolveUnixAddr", - "Resolver", - "SRV", - "SplitHostPort", - "TCPAddr", - "TCPAddrFromAddrPort", - "TCPConn", - "TCPListener", - "UDPAddr", - "UDPAddrFromAddrPort", - "UDPConn", - "UnixAddr", - "UnixConn", - "UnixListener", - "UnknownNetworkError", - }, - "net/http": { - "AllowQuerySemicolons", - "CanonicalHeaderKey", - "Client", - "CloseNotifier", - "ConnState", - "Cookie", - "CookieJar", - "DefaultClient", - "DefaultMaxHeaderBytes", - "DefaultMaxIdleConnsPerHost", - "DefaultServeMux", - "DefaultTransport", - "DetectContentType", - "Dir", - "ErrAbortHandler", - "ErrBodyNotAllowed", - "ErrBodyReadAfterClose", - "ErrContentLength", - "ErrHandlerTimeout", - "ErrHeaderTooLong", - "ErrHijacked", - "ErrLineTooLong", - "ErrMissingBoundary", - "ErrMissingContentLength", - "ErrMissingFile", - "ErrNoCookie", - "ErrNoLocation", - "ErrNotMultipart", - "ErrNotSupported", - "ErrSchemeMismatch", - "ErrServerClosed", - "ErrShortBody", - "ErrSkipAltProtocol", - "ErrUnexpectedTrailer", - "ErrUseLastResponse", - "ErrWriteAfterFlush", - "Error", - "FS", - "File", - "FileServer", - "FileSystem", - "Flusher", - "Get", - "Handle", - "HandleFunc", - "Handler", - "HandlerFunc", - "Head", - "Header", - "Hijacker", - "ListenAndServe", - "ListenAndServeTLS", - "LocalAddrContextKey", - "MaxBytesError", - "MaxBytesHandler", - "MaxBytesReader", - "MethodConnect", - "MethodDelete", - "MethodGet", - "MethodHead", - "MethodOptions", - "MethodPatch", - "MethodPost", - "MethodPut", - "MethodTrace", - "NewFileTransport", - "NewRequest", - "NewRequestWithContext", - "NewResponseController", - "NewServeMux", - "NoBody", - "NotFound", - "NotFoundHandler", - "ParseHTTPVersion", - "ParseTime", - "Post", - "PostForm", - "ProtocolError", - "ProxyFromEnvironment", - "ProxyURL", - "PushOptions", - "Pusher", - "ReadRequest", - "ReadResponse", - "Redirect", - "RedirectHandler", - "Request", - "Response", - "ResponseController", - "ResponseWriter", - "RoundTripper", - "SameSite", - "SameSiteDefaultMode", - "SameSiteLaxMode", - "SameSiteNoneMode", - "SameSiteStrictMode", - "Serve", - "ServeContent", - "ServeFile", - "ServeMux", - "ServeTLS", - "Server", - "ServerContextKey", - "SetCookie", - "StateActive", - "StateClosed", - "StateHijacked", - "StateIdle", - "StateNew", - "StatusAccepted", - "StatusAlreadyReported", - "StatusBadGateway", - "StatusBadRequest", - "StatusConflict", - "StatusContinue", - "StatusCreated", - "StatusEarlyHints", - "StatusExpectationFailed", - "StatusFailedDependency", - "StatusForbidden", - "StatusFound", - "StatusGatewayTimeout", - "StatusGone", - "StatusHTTPVersionNotSupported", - "StatusIMUsed", - "StatusInsufficientStorage", - "StatusInternalServerError", - "StatusLengthRequired", - "StatusLocked", - "StatusLoopDetected", - "StatusMethodNotAllowed", - "StatusMisdirectedRequest", - "StatusMovedPermanently", - "StatusMultiStatus", - "StatusMultipleChoices", - "StatusNetworkAuthenticationRequired", - "StatusNoContent", - "StatusNonAuthoritativeInfo", - "StatusNotAcceptable", - "StatusNotExtended", - "StatusNotFound", - "StatusNotImplemented", - "StatusNotModified", - "StatusOK", - "StatusPartialContent", - "StatusPaymentRequired", - "StatusPermanentRedirect", - "StatusPreconditionFailed", - "StatusPreconditionRequired", - "StatusProcessing", - "StatusProxyAuthRequired", - "StatusRequestEntityTooLarge", - "StatusRequestHeaderFieldsTooLarge", - "StatusRequestTimeout", - "StatusRequestURITooLong", - "StatusRequestedRangeNotSatisfiable", - "StatusResetContent", - "StatusSeeOther", - "StatusServiceUnavailable", - "StatusSwitchingProtocols", - "StatusTeapot", - "StatusTemporaryRedirect", - "StatusText", - "StatusTooEarly", - "StatusTooManyRequests", - "StatusUnauthorized", - "StatusUnavailableForLegalReasons", - "StatusUnprocessableEntity", - "StatusUnsupportedMediaType", - "StatusUpgradeRequired", - "StatusUseProxy", - "StatusVariantAlsoNegotiates", - "StripPrefix", - "TimeFormat", - "TimeoutHandler", - "TrailerPrefix", - "Transport", - }, - "net/http/cgi": { - "Handler", - "Request", - "RequestFromMap", - "Serve", - }, - "net/http/cookiejar": { - "Jar", - "New", - "Options", - "PublicSuffixList", - }, - "net/http/fcgi": { - "ErrConnClosed", - "ErrRequestAborted", - "ProcessEnv", - "Serve", - }, - "net/http/httptest": { - "DefaultRemoteAddr", - "NewRecorder", - "NewRequest", - "NewServer", - "NewTLSServer", - "NewUnstartedServer", - "ResponseRecorder", - "Server", - }, - "net/http/httptrace": { - "ClientTrace", - "ContextClientTrace", - "DNSDoneInfo", - "DNSStartInfo", - "GotConnInfo", - "WithClientTrace", - "WroteRequestInfo", - }, - "net/http/httputil": { - "BufferPool", - "ClientConn", - "DumpRequest", - "DumpRequestOut", - "DumpResponse", - "ErrClosed", - "ErrLineTooLong", - "ErrPersistEOF", - "ErrPipeline", - "NewChunkedReader", - "NewChunkedWriter", - "NewClientConn", - "NewProxyClientConn", - "NewServerConn", - "NewSingleHostReverseProxy", - "ProxyRequest", - "ReverseProxy", - "ServerConn", - }, - "net/http/pprof": { - "Cmdline", - "Handler", - "Index", - "Profile", - "Symbol", - "Trace", - }, - "net/mail": { - "Address", - "AddressParser", - "ErrHeaderNotPresent", - "Header", - "Message", - "ParseAddress", - "ParseAddressList", - "ParseDate", - "ReadMessage", - }, - "net/netip": { - "Addr", - "AddrFrom16", - "AddrFrom4", - "AddrFromSlice", - "AddrPort", - "AddrPortFrom", - "IPv4Unspecified", - "IPv6LinkLocalAllNodes", - "IPv6LinkLocalAllRouters", - "IPv6Loopback", - "IPv6Unspecified", - "MustParseAddr", - "MustParseAddrPort", - "MustParsePrefix", - "ParseAddr", - "ParseAddrPort", - "ParsePrefix", - "Prefix", - "PrefixFrom", - }, - "net/rpc": { - "Accept", - "Call", - "Client", - "ClientCodec", - "DefaultDebugPath", - "DefaultRPCPath", - "DefaultServer", - "Dial", - "DialHTTP", - "DialHTTPPath", - "ErrShutdown", - "HandleHTTP", - "NewClient", - "NewClientWithCodec", - "NewServer", - "Register", - "RegisterName", - "Request", - "Response", - "ServeCodec", - "ServeConn", - "ServeRequest", - "Server", - "ServerCodec", - "ServerError", - }, - "net/rpc/jsonrpc": { - "Dial", - "NewClient", - "NewClientCodec", - "NewServerCodec", - "ServeConn", - }, - "net/smtp": { - "Auth", - "CRAMMD5Auth", - "Client", - "Dial", - "NewClient", - "PlainAuth", - "SendMail", - "ServerInfo", - }, - "net/textproto": { - "CanonicalMIMEHeaderKey", - "Conn", - "Dial", - "Error", - "MIMEHeader", - "NewConn", - "NewReader", - "NewWriter", - "Pipeline", - "ProtocolError", - "Reader", - "TrimBytes", - "TrimString", - "Writer", - }, - "net/url": { - "Error", - "EscapeError", - "InvalidHostError", - "JoinPath", - "Parse", - "ParseQuery", - "ParseRequestURI", - "PathEscape", - "PathUnescape", - "QueryEscape", - "QueryUnescape", - "URL", - "User", - "UserPassword", - "Userinfo", - "Values", - }, - "os": { - "Args", - "Chdir", - "Chmod", - "Chown", - "Chtimes", - "Clearenv", - "Create", - "CreateTemp", - "DevNull", - "DirEntry", - "DirFS", - "Environ", - "ErrClosed", - "ErrDeadlineExceeded", - "ErrExist", - "ErrInvalid", - "ErrNoDeadline", - "ErrNotExist", - "ErrPermission", - "ErrProcessDone", - "Executable", - "Exit", - "Expand", - "ExpandEnv", - "File", - "FileInfo", - "FileMode", - "FindProcess", - "Getegid", - "Getenv", - "Geteuid", - "Getgid", - "Getgroups", - "Getpagesize", - "Getpid", - "Getppid", - "Getuid", - "Getwd", - "Hostname", - "Interrupt", - "IsExist", - "IsNotExist", - "IsPathSeparator", - "IsPermission", - "IsTimeout", - "Kill", - "Lchown", - "Link", - "LinkError", - "LookupEnv", - "Lstat", - "Mkdir", - "MkdirAll", - "MkdirTemp", - "ModeAppend", - "ModeCharDevice", - "ModeDevice", - "ModeDir", - "ModeExclusive", - "ModeIrregular", - "ModeNamedPipe", - "ModePerm", - "ModeSetgid", - "ModeSetuid", - "ModeSocket", - "ModeSticky", - "ModeSymlink", - "ModeTemporary", - "ModeType", - "NewFile", - "NewSyscallError", - "O_APPEND", - "O_CREATE", - "O_EXCL", - "O_RDONLY", - "O_RDWR", - "O_SYNC", - "O_TRUNC", - "O_WRONLY", - "Open", - "OpenFile", - "PathError", - "PathListSeparator", - "PathSeparator", - "Pipe", - "ProcAttr", - "Process", - "ProcessState", - "ReadDir", - "ReadFile", - "Readlink", - "Remove", - "RemoveAll", - "Rename", - "SEEK_CUR", - "SEEK_END", - "SEEK_SET", - "SameFile", - "Setenv", - "Signal", - "StartProcess", - "Stat", - "Stderr", - "Stdin", - "Stdout", - "Symlink", - "SyscallError", - "TempDir", - "Truncate", - "Unsetenv", - "UserCacheDir", - "UserConfigDir", - "UserHomeDir", - "WriteFile", - }, - "os/exec": { - "Cmd", - "Command", - "CommandContext", - "ErrDot", - "ErrNotFound", - "ErrWaitDelay", - "Error", - "ExitError", - "LookPath", - }, - "os/signal": { - "Ignore", - "Ignored", - "Notify", - "NotifyContext", - "Reset", - "Stop", - }, - "os/user": { - "Current", - "Group", - "Lookup", - "LookupGroup", - "LookupGroupId", - "LookupId", - "UnknownGroupError", - "UnknownGroupIdError", - "UnknownUserError", - "UnknownUserIdError", - "User", - }, - "path": { - "Base", - "Clean", - "Dir", - "ErrBadPattern", - "Ext", - "IsAbs", - "Join", - "Match", - "Split", - }, - "path/filepath": { - "Abs", - "Base", - "Clean", - "Dir", - "ErrBadPattern", - "EvalSymlinks", - "Ext", - "FromSlash", - "Glob", - "HasPrefix", - "IsAbs", - "IsLocal", - "Join", - "ListSeparator", - "Match", - "Rel", - "Separator", - "SkipAll", - "SkipDir", - "Split", - "SplitList", - "ToSlash", - "VolumeName", - "Walk", - "WalkDir", - "WalkFunc", - }, - "plugin": { - "Open", - "Plugin", - "Symbol", - }, - "reflect": { - "Append", - "AppendSlice", - "Array", - "ArrayOf", - "Bool", - "BothDir", - "Chan", - "ChanDir", - "ChanOf", - "Complex128", - "Complex64", - "Copy", - "DeepEqual", - "Float32", - "Float64", - "Func", - "FuncOf", - "Indirect", - "Int", - "Int16", - "Int32", - "Int64", - "Int8", - "Interface", - "Invalid", - "Kind", - "MakeChan", - "MakeFunc", - "MakeMap", - "MakeMapWithSize", - "MakeSlice", - "Map", - "MapIter", - "MapOf", - "Method", - "New", - "NewAt", - "Pointer", - "PointerTo", - "Ptr", - "PtrTo", - "RecvDir", - "Select", - "SelectCase", - "SelectDefault", - "SelectDir", - "SelectRecv", - "SelectSend", - "SendDir", - "Slice", - "SliceHeader", - "SliceOf", - "String", - "StringHeader", - "Struct", - "StructField", - "StructOf", - "StructTag", - "Swapper", - "Type", - "TypeOf", - "Uint", - "Uint16", - "Uint32", - "Uint64", - "Uint8", - "Uintptr", - "UnsafePointer", - "Value", - "ValueError", - "ValueOf", - "VisibleFields", - "Zero", - }, - "regexp": { - "Compile", - "CompilePOSIX", - "Match", - "MatchReader", - "MatchString", - "MustCompile", - "MustCompilePOSIX", - "QuoteMeta", - "Regexp", - }, - "regexp/syntax": { - "ClassNL", - "Compile", - "DotNL", - "EmptyBeginLine", - "EmptyBeginText", - "EmptyEndLine", - "EmptyEndText", - "EmptyNoWordBoundary", - "EmptyOp", - "EmptyOpContext", - "EmptyWordBoundary", - "ErrInternalError", - "ErrInvalidCharClass", - "ErrInvalidCharRange", - "ErrInvalidEscape", - "ErrInvalidNamedCapture", - "ErrInvalidPerlOp", - "ErrInvalidRepeatOp", - "ErrInvalidRepeatSize", - "ErrInvalidUTF8", - "ErrLarge", - "ErrMissingBracket", - "ErrMissingParen", - "ErrMissingRepeatArgument", - "ErrNestingDepth", - "ErrTrailingBackslash", - "ErrUnexpectedParen", - "Error", - "ErrorCode", - "Flags", - "FoldCase", - "Inst", - "InstAlt", - "InstAltMatch", - "InstCapture", - "InstEmptyWidth", - "InstFail", - "InstMatch", - "InstNop", - "InstOp", - "InstRune", - "InstRune1", - "InstRuneAny", - "InstRuneAnyNotNL", - "IsWordChar", - "Literal", - "MatchNL", - "NonGreedy", - "OneLine", - "Op", - "OpAlternate", - "OpAnyChar", - "OpAnyCharNotNL", - "OpBeginLine", - "OpBeginText", - "OpCapture", - "OpCharClass", - "OpConcat", - "OpEmptyMatch", - "OpEndLine", - "OpEndText", - "OpLiteral", - "OpNoMatch", - "OpNoWordBoundary", - "OpPlus", - "OpQuest", - "OpRepeat", - "OpStar", - "OpWordBoundary", - "POSIX", - "Parse", - "Perl", - "PerlX", - "Prog", - "Regexp", - "Simple", - "UnicodeGroups", - "WasDollar", - }, - "runtime": { - "BlockProfile", - "BlockProfileRecord", - "Breakpoint", - "CPUProfile", - "Caller", - "Callers", - "CallersFrames", - "Compiler", - "Error", - "Frame", - "Frames", - "Func", - "FuncForPC", - "GC", - "GOARCH", - "GOMAXPROCS", - "GOOS", - "GOROOT", - "Goexit", - "GoroutineProfile", - "Gosched", - "KeepAlive", - "LockOSThread", - "MemProfile", - "MemProfileRate", - "MemProfileRecord", - "MemStats", - "MutexProfile", - "NumCPU", - "NumCgoCall", - "NumGoroutine", - "PanicNilError", - "Pinner", - "ReadMemStats", - "ReadTrace", - "SetBlockProfileRate", - "SetCPUProfileRate", - "SetCgoTraceback", - "SetFinalizer", - "SetMutexProfileFraction", - "Stack", - "StackRecord", - "StartTrace", - "StopTrace", - "ThreadCreateProfile", - "TypeAssertionError", - "UnlockOSThread", - "Version", - }, - "runtime/cgo": { - "Handle", - "Incomplete", - "NewHandle", - }, - "runtime/coverage": { - "ClearCounters", - "WriteCounters", - "WriteCountersDir", - "WriteMeta", - "WriteMetaDir", - }, - "runtime/debug": { - "BuildInfo", - "BuildSetting", - "FreeOSMemory", - "GCStats", - "Module", - "ParseBuildInfo", - "PrintStack", - "ReadBuildInfo", - "ReadGCStats", - "SetGCPercent", - "SetMaxStack", - "SetMaxThreads", - "SetMemoryLimit", - "SetPanicOnFault", - "SetTraceback", - "Stack", - "WriteHeapDump", - }, - "runtime/metrics": { - "All", - "Description", - "Float64Histogram", - "KindBad", - "KindFloat64", - "KindFloat64Histogram", - "KindUint64", - "Read", - "Sample", - "Value", - "ValueKind", - }, - "runtime/pprof": { - "Do", - "ForLabels", - "Label", - "LabelSet", - "Labels", - "Lookup", - "NewProfile", - "Profile", - "Profiles", - "SetGoroutineLabels", - "StartCPUProfile", - "StopCPUProfile", - "WithLabels", - "WriteHeapProfile", - }, - "runtime/trace": { - "IsEnabled", - "Log", - "Logf", - "NewTask", - "Region", - "Start", - "StartRegion", - "Stop", - "Task", - "WithRegion", - }, - "slices": { - "BinarySearch", - "BinarySearchFunc", - "Clip", - "Clone", - "Compact", - "CompactFunc", - "Compare", - "CompareFunc", - "Contains", - "ContainsFunc", - "Delete", - "DeleteFunc", - "Equal", - "EqualFunc", - "Grow", - "Index", - "IndexFunc", - "Insert", - "IsSorted", - "IsSortedFunc", - "Max", - "MaxFunc", - "Min", - "MinFunc", - "Replace", - "Reverse", - "Sort", - "SortFunc", - "SortStableFunc", - }, - "sort": { - "Find", - "Float64Slice", - "Float64s", - "Float64sAreSorted", - "IntSlice", - "Interface", - "Ints", - "IntsAreSorted", - "IsSorted", - "Reverse", - "Search", - "SearchFloat64s", - "SearchInts", - "SearchStrings", - "Slice", - "SliceIsSorted", - "SliceStable", - "Sort", - "Stable", - "StringSlice", - "Strings", - "StringsAreSorted", - }, - "strconv": { - "AppendBool", - "AppendFloat", - "AppendInt", - "AppendQuote", - "AppendQuoteRune", - "AppendQuoteRuneToASCII", - "AppendQuoteRuneToGraphic", - "AppendQuoteToASCII", - "AppendQuoteToGraphic", - "AppendUint", - "Atoi", - "CanBackquote", - "ErrRange", - "ErrSyntax", - "FormatBool", - "FormatComplex", - "FormatFloat", - "FormatInt", - "FormatUint", - "IntSize", - "IsGraphic", - "IsPrint", - "Itoa", - "NumError", - "ParseBool", - "ParseComplex", - "ParseFloat", - "ParseInt", - "ParseUint", - "Quote", - "QuoteRune", - "QuoteRuneToASCII", - "QuoteRuneToGraphic", - "QuoteToASCII", - "QuoteToGraphic", - "QuotedPrefix", - "Unquote", - "UnquoteChar", - }, - "strings": { - "Builder", - "Clone", - "Compare", - "Contains", - "ContainsAny", - "ContainsFunc", - "ContainsRune", - "Count", - "Cut", - "CutPrefix", - "CutSuffix", - "EqualFold", - "Fields", - "FieldsFunc", - "HasPrefix", - "HasSuffix", - "Index", - "IndexAny", - "IndexByte", - "IndexFunc", - "IndexRune", - "Join", - "LastIndex", - "LastIndexAny", - "LastIndexByte", - "LastIndexFunc", - "Map", - "NewReader", - "NewReplacer", - "Reader", - "Repeat", - "Replace", - "ReplaceAll", - "Replacer", - "Split", - "SplitAfter", - "SplitAfterN", - "SplitN", - "Title", - "ToLower", - "ToLowerSpecial", - "ToTitle", - "ToTitleSpecial", - "ToUpper", - "ToUpperSpecial", - "ToValidUTF8", - "Trim", - "TrimFunc", - "TrimLeft", - "TrimLeftFunc", - "TrimPrefix", - "TrimRight", - "TrimRightFunc", - "TrimSpace", - "TrimSuffix", - }, - "sync": { - "Cond", - "Locker", - "Map", - "Mutex", - "NewCond", - "Once", - "OnceFunc", - "OnceValue", - "OnceValues", - "Pool", - "RWMutex", - "WaitGroup", - }, - "sync/atomic": { - "AddInt32", - "AddInt64", - "AddUint32", - "AddUint64", - "AddUintptr", - "Bool", - "CompareAndSwapInt32", - "CompareAndSwapInt64", - "CompareAndSwapPointer", - "CompareAndSwapUint32", - "CompareAndSwapUint64", - "CompareAndSwapUintptr", - "Int32", - "Int64", - "LoadInt32", - "LoadInt64", - "LoadPointer", - "LoadUint32", - "LoadUint64", - "LoadUintptr", - "Pointer", - "StoreInt32", - "StoreInt64", - "StorePointer", - "StoreUint32", - "StoreUint64", - "StoreUintptr", - "SwapInt32", - "SwapInt64", - "SwapPointer", - "SwapUint32", - "SwapUint64", - "SwapUintptr", - "Uint32", - "Uint64", - "Uintptr", - "Value", - }, - "syscall": { - "AF_ALG", - "AF_APPLETALK", - "AF_ARP", - "AF_ASH", - "AF_ATM", - "AF_ATMPVC", - "AF_ATMSVC", - "AF_AX25", - "AF_BLUETOOTH", - "AF_BRIDGE", - "AF_CAIF", - "AF_CAN", - "AF_CCITT", - "AF_CHAOS", - "AF_CNT", - "AF_COIP", - "AF_DATAKIT", - "AF_DECnet", - "AF_DLI", - "AF_E164", - "AF_ECMA", - "AF_ECONET", - "AF_ENCAP", - "AF_FILE", - "AF_HYLINK", - "AF_IEEE80211", - "AF_IEEE802154", - "AF_IMPLINK", - "AF_INET", - "AF_INET6", - "AF_INET6_SDP", - "AF_INET_SDP", - "AF_IPX", - "AF_IRDA", - "AF_ISDN", - "AF_ISO", - "AF_IUCV", - "AF_KEY", - "AF_LAT", - "AF_LINK", - "AF_LLC", - "AF_LOCAL", - "AF_MAX", - "AF_MPLS", - "AF_NATM", - "AF_NDRV", - "AF_NETBEUI", - "AF_NETBIOS", - "AF_NETGRAPH", - "AF_NETLINK", - "AF_NETROM", - "AF_NS", - "AF_OROUTE", - "AF_OSI", - "AF_PACKET", - "AF_PHONET", - "AF_PPP", - "AF_PPPOX", - "AF_PUP", - "AF_RDS", - "AF_RESERVED_36", - "AF_ROSE", - "AF_ROUTE", - "AF_RXRPC", - "AF_SCLUSTER", - "AF_SECURITY", - "AF_SIP", - "AF_SLOW", - "AF_SNA", - "AF_SYSTEM", - "AF_TIPC", - "AF_UNIX", - "AF_UNSPEC", - "AF_UTUN", - "AF_VENDOR00", - "AF_VENDOR01", - "AF_VENDOR02", - "AF_VENDOR03", - "AF_VENDOR04", - "AF_VENDOR05", - "AF_VENDOR06", - "AF_VENDOR07", - "AF_VENDOR08", - "AF_VENDOR09", - "AF_VENDOR10", - "AF_VENDOR11", - "AF_VENDOR12", - "AF_VENDOR13", - "AF_VENDOR14", - "AF_VENDOR15", - "AF_VENDOR16", - "AF_VENDOR17", - "AF_VENDOR18", - "AF_VENDOR19", - "AF_VENDOR20", - "AF_VENDOR21", - "AF_VENDOR22", - "AF_VENDOR23", - "AF_VENDOR24", - "AF_VENDOR25", - "AF_VENDOR26", - "AF_VENDOR27", - "AF_VENDOR28", - "AF_VENDOR29", - "AF_VENDOR30", - "AF_VENDOR31", - "AF_VENDOR32", - "AF_VENDOR33", - "AF_VENDOR34", - "AF_VENDOR35", - "AF_VENDOR36", - "AF_VENDOR37", - "AF_VENDOR38", - "AF_VENDOR39", - "AF_VENDOR40", - "AF_VENDOR41", - "AF_VENDOR42", - "AF_VENDOR43", - "AF_VENDOR44", - "AF_VENDOR45", - "AF_VENDOR46", - "AF_VENDOR47", - "AF_WANPIPE", - "AF_X25", - "AI_CANONNAME", - "AI_NUMERICHOST", - "AI_PASSIVE", - "APPLICATION_ERROR", - "ARPHRD_ADAPT", - "ARPHRD_APPLETLK", - "ARPHRD_ARCNET", - "ARPHRD_ASH", - "ARPHRD_ATM", - "ARPHRD_AX25", - "ARPHRD_BIF", - "ARPHRD_CHAOS", - "ARPHRD_CISCO", - "ARPHRD_CSLIP", - "ARPHRD_CSLIP6", - "ARPHRD_DDCMP", - "ARPHRD_DLCI", - "ARPHRD_ECONET", - "ARPHRD_EETHER", - "ARPHRD_ETHER", - "ARPHRD_EUI64", - "ARPHRD_FCAL", - "ARPHRD_FCFABRIC", - "ARPHRD_FCPL", - "ARPHRD_FCPP", - "ARPHRD_FDDI", - "ARPHRD_FRAD", - "ARPHRD_FRELAY", - "ARPHRD_HDLC", - "ARPHRD_HIPPI", - "ARPHRD_HWX25", - "ARPHRD_IEEE1394", - "ARPHRD_IEEE802", - "ARPHRD_IEEE80211", - "ARPHRD_IEEE80211_PRISM", - "ARPHRD_IEEE80211_RADIOTAP", - "ARPHRD_IEEE802154", - "ARPHRD_IEEE802154_PHY", - "ARPHRD_IEEE802_TR", - "ARPHRD_INFINIBAND", - "ARPHRD_IPDDP", - "ARPHRD_IPGRE", - "ARPHRD_IRDA", - "ARPHRD_LAPB", - "ARPHRD_LOCALTLK", - "ARPHRD_LOOPBACK", - "ARPHRD_METRICOM", - "ARPHRD_NETROM", - "ARPHRD_NONE", - "ARPHRD_PIMREG", - "ARPHRD_PPP", - "ARPHRD_PRONET", - "ARPHRD_RAWHDLC", - "ARPHRD_ROSE", - "ARPHRD_RSRVD", - "ARPHRD_SIT", - "ARPHRD_SKIP", - "ARPHRD_SLIP", - "ARPHRD_SLIP6", - "ARPHRD_STRIP", - "ARPHRD_TUNNEL", - "ARPHRD_TUNNEL6", - "ARPHRD_VOID", - "ARPHRD_X25", - "AUTHTYPE_CLIENT", - "AUTHTYPE_SERVER", - "Accept", - "Accept4", - "AcceptEx", - "Access", - "Acct", - "AddrinfoW", - "Adjtime", - "Adjtimex", - "AllThreadsSyscall", - "AllThreadsSyscall6", - "AttachLsf", - "B0", - "B1000000", - "B110", - "B115200", - "B1152000", - "B1200", - "B134", - "B14400", - "B150", - "B1500000", - "B1800", - "B19200", - "B200", - "B2000000", - "B230400", - "B2400", - "B2500000", - "B28800", - "B300", - "B3000000", - "B3500000", - "B38400", - "B4000000", - "B460800", - "B4800", - "B50", - "B500000", - "B57600", - "B576000", - "B600", - "B7200", - "B75", - "B76800", - "B921600", - "B9600", - "BASE_PROTOCOL", - "BIOCFEEDBACK", - "BIOCFLUSH", - "BIOCGBLEN", - "BIOCGDIRECTION", - "BIOCGDIRFILT", - "BIOCGDLT", - "BIOCGDLTLIST", - "BIOCGETBUFMODE", - "BIOCGETIF", - "BIOCGETZMAX", - "BIOCGFEEDBACK", - "BIOCGFILDROP", - "BIOCGHDRCMPLT", - "BIOCGRSIG", - "BIOCGRTIMEOUT", - "BIOCGSEESENT", - "BIOCGSTATS", - "BIOCGSTATSOLD", - "BIOCGTSTAMP", - "BIOCIMMEDIATE", - "BIOCLOCK", - "BIOCPROMISC", - "BIOCROTZBUF", - "BIOCSBLEN", - "BIOCSDIRECTION", - "BIOCSDIRFILT", - "BIOCSDLT", - "BIOCSETBUFMODE", - "BIOCSETF", - "BIOCSETFNR", - "BIOCSETIF", - "BIOCSETWF", - "BIOCSETZBUF", - "BIOCSFEEDBACK", - "BIOCSFILDROP", - "BIOCSHDRCMPLT", - "BIOCSRSIG", - "BIOCSRTIMEOUT", - "BIOCSSEESENT", - "BIOCSTCPF", - "BIOCSTSTAMP", - "BIOCSUDPF", - "BIOCVERSION", - "BPF_A", - "BPF_ABS", - "BPF_ADD", - "BPF_ALIGNMENT", - "BPF_ALIGNMENT32", - "BPF_ALU", - "BPF_AND", - "BPF_B", - "BPF_BUFMODE_BUFFER", - "BPF_BUFMODE_ZBUF", - "BPF_DFLTBUFSIZE", - "BPF_DIRECTION_IN", - "BPF_DIRECTION_OUT", - "BPF_DIV", - "BPF_H", - "BPF_IMM", - "BPF_IND", - "BPF_JA", - "BPF_JEQ", - "BPF_JGE", - "BPF_JGT", - "BPF_JMP", - "BPF_JSET", - "BPF_K", - "BPF_LD", - "BPF_LDX", - "BPF_LEN", - "BPF_LSH", - "BPF_MAJOR_VERSION", - "BPF_MAXBUFSIZE", - "BPF_MAXINSNS", - "BPF_MEM", - "BPF_MEMWORDS", - "BPF_MINBUFSIZE", - "BPF_MINOR_VERSION", - "BPF_MISC", - "BPF_MSH", - "BPF_MUL", - "BPF_NEG", - "BPF_OR", - "BPF_RELEASE", - "BPF_RET", - "BPF_RSH", - "BPF_ST", - "BPF_STX", - "BPF_SUB", - "BPF_TAX", - "BPF_TXA", - "BPF_T_BINTIME", - "BPF_T_BINTIME_FAST", - "BPF_T_BINTIME_MONOTONIC", - "BPF_T_BINTIME_MONOTONIC_FAST", - "BPF_T_FAST", - "BPF_T_FLAG_MASK", - "BPF_T_FORMAT_MASK", - "BPF_T_MICROTIME", - "BPF_T_MICROTIME_FAST", - "BPF_T_MICROTIME_MONOTONIC", - "BPF_T_MICROTIME_MONOTONIC_FAST", - "BPF_T_MONOTONIC", - "BPF_T_MONOTONIC_FAST", - "BPF_T_NANOTIME", - "BPF_T_NANOTIME_FAST", - "BPF_T_NANOTIME_MONOTONIC", - "BPF_T_NANOTIME_MONOTONIC_FAST", - "BPF_T_NONE", - "BPF_T_NORMAL", - "BPF_W", - "BPF_X", - "BRKINT", - "Bind", - "BindToDevice", - "BpfBuflen", - "BpfDatalink", - "BpfHdr", - "BpfHeadercmpl", - "BpfInsn", - "BpfInterface", - "BpfJump", - "BpfProgram", - "BpfStat", - "BpfStats", - "BpfStmt", - "BpfTimeout", - "BpfTimeval", - "BpfVersion", - "BpfZbuf", - "BpfZbufHeader", - "ByHandleFileInformation", - "BytePtrFromString", - "ByteSliceFromString", - "CCR0_FLUSH", - "CERT_CHAIN_POLICY_AUTHENTICODE", - "CERT_CHAIN_POLICY_AUTHENTICODE_TS", - "CERT_CHAIN_POLICY_BASE", - "CERT_CHAIN_POLICY_BASIC_CONSTRAINTS", - "CERT_CHAIN_POLICY_EV", - "CERT_CHAIN_POLICY_MICROSOFT_ROOT", - "CERT_CHAIN_POLICY_NT_AUTH", - "CERT_CHAIN_POLICY_SSL", - "CERT_E_CN_NO_MATCH", - "CERT_E_EXPIRED", - "CERT_E_PURPOSE", - "CERT_E_ROLE", - "CERT_E_UNTRUSTEDROOT", - "CERT_STORE_ADD_ALWAYS", - "CERT_STORE_DEFER_CLOSE_UNTIL_LAST_FREE_FLAG", - "CERT_STORE_PROV_MEMORY", - "CERT_TRUST_HAS_EXCLUDED_NAME_CONSTRAINT", - "CERT_TRUST_HAS_NOT_DEFINED_NAME_CONSTRAINT", - "CERT_TRUST_HAS_NOT_PERMITTED_NAME_CONSTRAINT", - "CERT_TRUST_HAS_NOT_SUPPORTED_CRITICAL_EXT", - "CERT_TRUST_HAS_NOT_SUPPORTED_NAME_CONSTRAINT", - "CERT_TRUST_INVALID_BASIC_CONSTRAINTS", - "CERT_TRUST_INVALID_EXTENSION", - "CERT_TRUST_INVALID_NAME_CONSTRAINTS", - "CERT_TRUST_INVALID_POLICY_CONSTRAINTS", - "CERT_TRUST_IS_CYCLIC", - "CERT_TRUST_IS_EXPLICIT_DISTRUST", - "CERT_TRUST_IS_NOT_SIGNATURE_VALID", - "CERT_TRUST_IS_NOT_TIME_VALID", - "CERT_TRUST_IS_NOT_VALID_FOR_USAGE", - "CERT_TRUST_IS_OFFLINE_REVOCATION", - "CERT_TRUST_IS_REVOKED", - "CERT_TRUST_IS_UNTRUSTED_ROOT", - "CERT_TRUST_NO_ERROR", - "CERT_TRUST_NO_ISSUANCE_CHAIN_POLICY", - "CERT_TRUST_REVOCATION_STATUS_UNKNOWN", - "CFLUSH", - "CLOCAL", - "CLONE_CHILD_CLEARTID", - "CLONE_CHILD_SETTID", - "CLONE_CLEAR_SIGHAND", - "CLONE_CSIGNAL", - "CLONE_DETACHED", - "CLONE_FILES", - "CLONE_FS", - "CLONE_INTO_CGROUP", - "CLONE_IO", - "CLONE_NEWCGROUP", - "CLONE_NEWIPC", - "CLONE_NEWNET", - "CLONE_NEWNS", - "CLONE_NEWPID", - "CLONE_NEWTIME", - "CLONE_NEWUSER", - "CLONE_NEWUTS", - "CLONE_PARENT", - "CLONE_PARENT_SETTID", - "CLONE_PID", - "CLONE_PIDFD", - "CLONE_PTRACE", - "CLONE_SETTLS", - "CLONE_SIGHAND", - "CLONE_SYSVSEM", - "CLONE_THREAD", - "CLONE_UNTRACED", - "CLONE_VFORK", - "CLONE_VM", - "CPUID_CFLUSH", - "CREAD", - "CREATE_ALWAYS", - "CREATE_NEW", - "CREATE_NEW_PROCESS_GROUP", - "CREATE_UNICODE_ENVIRONMENT", - "CRYPT_DEFAULT_CONTAINER_OPTIONAL", - "CRYPT_DELETEKEYSET", - "CRYPT_MACHINE_KEYSET", - "CRYPT_NEWKEYSET", - "CRYPT_SILENT", - "CRYPT_VERIFYCONTEXT", - "CS5", - "CS6", - "CS7", - "CS8", - "CSIZE", - "CSTART", - "CSTATUS", - "CSTOP", - "CSTOPB", - "CSUSP", - "CTL_MAXNAME", - "CTL_NET", - "CTL_QUERY", - "CTRL_BREAK_EVENT", - "CTRL_CLOSE_EVENT", - "CTRL_C_EVENT", - "CTRL_LOGOFF_EVENT", - "CTRL_SHUTDOWN_EVENT", - "CancelIo", - "CancelIoEx", - "CertAddCertificateContextToStore", - "CertChainContext", - "CertChainElement", - "CertChainPara", - "CertChainPolicyPara", - "CertChainPolicyStatus", - "CertCloseStore", - "CertContext", - "CertCreateCertificateContext", - "CertEnhKeyUsage", - "CertEnumCertificatesInStore", - "CertFreeCertificateChain", - "CertFreeCertificateContext", - "CertGetCertificateChain", - "CertInfo", - "CertOpenStore", - "CertOpenSystemStore", - "CertRevocationCrlInfo", - "CertRevocationInfo", - "CertSimpleChain", - "CertTrustListInfo", - "CertTrustStatus", - "CertUsageMatch", - "CertVerifyCertificateChainPolicy", - "Chdir", - "CheckBpfVersion", - "Chflags", - "Chmod", - "Chown", - "Chroot", - "Clearenv", - "Close", - "CloseHandle", - "CloseOnExec", - "Closesocket", - "CmsgLen", - "CmsgSpace", - "Cmsghdr", - "CommandLineToArgv", - "ComputerName", - "Conn", - "Connect", - "ConnectEx", - "ConvertSidToStringSid", - "ConvertStringSidToSid", - "CopySid", - "Creat", - "CreateDirectory", - "CreateFile", - "CreateFileMapping", - "CreateHardLink", - "CreateIoCompletionPort", - "CreatePipe", - "CreateProcess", - "CreateProcessAsUser", - "CreateSymbolicLink", - "CreateToolhelp32Snapshot", - "Credential", - "CryptAcquireContext", - "CryptGenRandom", - "CryptReleaseContext", - "DIOCBSFLUSH", - "DIOCOSFPFLUSH", - "DLL", - "DLLError", - "DLT_A429", - "DLT_A653_ICM", - "DLT_AIRONET_HEADER", - "DLT_AOS", - "DLT_APPLE_IP_OVER_IEEE1394", - "DLT_ARCNET", - "DLT_ARCNET_LINUX", - "DLT_ATM_CLIP", - "DLT_ATM_RFC1483", - "DLT_AURORA", - "DLT_AX25", - "DLT_AX25_KISS", - "DLT_BACNET_MS_TP", - "DLT_BLUETOOTH_HCI_H4", - "DLT_BLUETOOTH_HCI_H4_WITH_PHDR", - "DLT_CAN20B", - "DLT_CAN_SOCKETCAN", - "DLT_CHAOS", - "DLT_CHDLC", - "DLT_CISCO_IOS", - "DLT_C_HDLC", - "DLT_C_HDLC_WITH_DIR", - "DLT_DBUS", - "DLT_DECT", - "DLT_DOCSIS", - "DLT_DVB_CI", - "DLT_ECONET", - "DLT_EN10MB", - "DLT_EN3MB", - "DLT_ENC", - "DLT_ERF", - "DLT_ERF_ETH", - "DLT_ERF_POS", - "DLT_FC_2", - "DLT_FC_2_WITH_FRAME_DELIMS", - "DLT_FDDI", - "DLT_FLEXRAY", - "DLT_FRELAY", - "DLT_FRELAY_WITH_DIR", - "DLT_GCOM_SERIAL", - "DLT_GCOM_T1E1", - "DLT_GPF_F", - "DLT_GPF_T", - "DLT_GPRS_LLC", - "DLT_GSMTAP_ABIS", - "DLT_GSMTAP_UM", - "DLT_HDLC", - "DLT_HHDLC", - "DLT_HIPPI", - "DLT_IBM_SN", - "DLT_IBM_SP", - "DLT_IEEE802", - "DLT_IEEE802_11", - "DLT_IEEE802_11_RADIO", - "DLT_IEEE802_11_RADIO_AVS", - "DLT_IEEE802_15_4", - "DLT_IEEE802_15_4_LINUX", - "DLT_IEEE802_15_4_NOFCS", - "DLT_IEEE802_15_4_NONASK_PHY", - "DLT_IEEE802_16_MAC_CPS", - "DLT_IEEE802_16_MAC_CPS_RADIO", - "DLT_IPFILTER", - "DLT_IPMB", - "DLT_IPMB_LINUX", - "DLT_IPNET", - "DLT_IPOIB", - "DLT_IPV4", - "DLT_IPV6", - "DLT_IP_OVER_FC", - "DLT_JUNIPER_ATM1", - "DLT_JUNIPER_ATM2", - "DLT_JUNIPER_ATM_CEMIC", - "DLT_JUNIPER_CHDLC", - "DLT_JUNIPER_ES", - "DLT_JUNIPER_ETHER", - "DLT_JUNIPER_FIBRECHANNEL", - "DLT_JUNIPER_FRELAY", - "DLT_JUNIPER_GGSN", - "DLT_JUNIPER_ISM", - "DLT_JUNIPER_MFR", - "DLT_JUNIPER_MLFR", - "DLT_JUNIPER_MLPPP", - "DLT_JUNIPER_MONITOR", - "DLT_JUNIPER_PIC_PEER", - "DLT_JUNIPER_PPP", - "DLT_JUNIPER_PPPOE", - "DLT_JUNIPER_PPPOE_ATM", - "DLT_JUNIPER_SERVICES", - "DLT_JUNIPER_SRX_E2E", - "DLT_JUNIPER_ST", - "DLT_JUNIPER_VP", - "DLT_JUNIPER_VS", - "DLT_LAPB_WITH_DIR", - "DLT_LAPD", - "DLT_LIN", - "DLT_LINUX_EVDEV", - "DLT_LINUX_IRDA", - "DLT_LINUX_LAPD", - "DLT_LINUX_PPP_WITHDIRECTION", - "DLT_LINUX_SLL", - "DLT_LOOP", - "DLT_LTALK", - "DLT_MATCHING_MAX", - "DLT_MATCHING_MIN", - "DLT_MFR", - "DLT_MOST", - "DLT_MPEG_2_TS", - "DLT_MPLS", - "DLT_MTP2", - "DLT_MTP2_WITH_PHDR", - "DLT_MTP3", - "DLT_MUX27010", - "DLT_NETANALYZER", - "DLT_NETANALYZER_TRANSPARENT", - "DLT_NFC_LLCP", - "DLT_NFLOG", - "DLT_NG40", - "DLT_NULL", - "DLT_PCI_EXP", - "DLT_PFLOG", - "DLT_PFSYNC", - "DLT_PPI", - "DLT_PPP", - "DLT_PPP_BSDOS", - "DLT_PPP_ETHER", - "DLT_PPP_PPPD", - "DLT_PPP_SERIAL", - "DLT_PPP_WITH_DIR", - "DLT_PPP_WITH_DIRECTION", - "DLT_PRISM_HEADER", - "DLT_PRONET", - "DLT_RAIF1", - "DLT_RAW", - "DLT_RAWAF_MASK", - "DLT_RIO", - "DLT_SCCP", - "DLT_SITA", - "DLT_SLIP", - "DLT_SLIP_BSDOS", - "DLT_STANAG_5066_D_PDU", - "DLT_SUNATM", - "DLT_SYMANTEC_FIREWALL", - "DLT_TZSP", - "DLT_USB", - "DLT_USB_LINUX", - "DLT_USB_LINUX_MMAPPED", - "DLT_USER0", - "DLT_USER1", - "DLT_USER10", - "DLT_USER11", - "DLT_USER12", - "DLT_USER13", - "DLT_USER14", - "DLT_USER15", - "DLT_USER2", - "DLT_USER3", - "DLT_USER4", - "DLT_USER5", - "DLT_USER6", - "DLT_USER7", - "DLT_USER8", - "DLT_USER9", - "DLT_WIHART", - "DLT_X2E_SERIAL", - "DLT_X2E_XORAYA", - "DNSMXData", - "DNSPTRData", - "DNSRecord", - "DNSSRVData", - "DNSTXTData", - "DNS_INFO_NO_RECORDS", - "DNS_TYPE_A", - "DNS_TYPE_A6", - "DNS_TYPE_AAAA", - "DNS_TYPE_ADDRS", - "DNS_TYPE_AFSDB", - "DNS_TYPE_ALL", - "DNS_TYPE_ANY", - "DNS_TYPE_ATMA", - "DNS_TYPE_AXFR", - "DNS_TYPE_CERT", - "DNS_TYPE_CNAME", - "DNS_TYPE_DHCID", - "DNS_TYPE_DNAME", - "DNS_TYPE_DNSKEY", - "DNS_TYPE_DS", - "DNS_TYPE_EID", - "DNS_TYPE_GID", - "DNS_TYPE_GPOS", - "DNS_TYPE_HINFO", - "DNS_TYPE_ISDN", - "DNS_TYPE_IXFR", - "DNS_TYPE_KEY", - "DNS_TYPE_KX", - "DNS_TYPE_LOC", - "DNS_TYPE_MAILA", - "DNS_TYPE_MAILB", - "DNS_TYPE_MB", - "DNS_TYPE_MD", - "DNS_TYPE_MF", - "DNS_TYPE_MG", - "DNS_TYPE_MINFO", - "DNS_TYPE_MR", - "DNS_TYPE_MX", - "DNS_TYPE_NAPTR", - "DNS_TYPE_NBSTAT", - "DNS_TYPE_NIMLOC", - "DNS_TYPE_NS", - "DNS_TYPE_NSAP", - "DNS_TYPE_NSAPPTR", - "DNS_TYPE_NSEC", - "DNS_TYPE_NULL", - "DNS_TYPE_NXT", - "DNS_TYPE_OPT", - "DNS_TYPE_PTR", - "DNS_TYPE_PX", - "DNS_TYPE_RP", - "DNS_TYPE_RRSIG", - "DNS_TYPE_RT", - "DNS_TYPE_SIG", - "DNS_TYPE_SINK", - "DNS_TYPE_SOA", - "DNS_TYPE_SRV", - "DNS_TYPE_TEXT", - "DNS_TYPE_TKEY", - "DNS_TYPE_TSIG", - "DNS_TYPE_UID", - "DNS_TYPE_UINFO", - "DNS_TYPE_UNSPEC", - "DNS_TYPE_WINS", - "DNS_TYPE_WINSR", - "DNS_TYPE_WKS", - "DNS_TYPE_X25", - "DT_BLK", - "DT_CHR", - "DT_DIR", - "DT_FIFO", - "DT_LNK", - "DT_REG", - "DT_SOCK", - "DT_UNKNOWN", - "DT_WHT", - "DUPLICATE_CLOSE_SOURCE", - "DUPLICATE_SAME_ACCESS", - "DeleteFile", - "DetachLsf", - "DeviceIoControl", - "Dirent", - "DnsNameCompare", - "DnsQuery", - "DnsRecordListFree", - "DnsSectionAdditional", - "DnsSectionAnswer", - "DnsSectionAuthority", - "DnsSectionQuestion", - "Dup", - "Dup2", - "Dup3", - "DuplicateHandle", - "E2BIG", - "EACCES", - "EADDRINUSE", - "EADDRNOTAVAIL", - "EADV", - "EAFNOSUPPORT", - "EAGAIN", - "EALREADY", - "EAUTH", - "EBADARCH", - "EBADE", - "EBADEXEC", - "EBADF", - "EBADFD", - "EBADMACHO", - "EBADMSG", - "EBADR", - "EBADRPC", - "EBADRQC", - "EBADSLT", - "EBFONT", - "EBUSY", - "ECANCELED", - "ECAPMODE", - "ECHILD", - "ECHO", - "ECHOCTL", - "ECHOE", - "ECHOK", - "ECHOKE", - "ECHONL", - "ECHOPRT", - "ECHRNG", - "ECOMM", - "ECONNABORTED", - "ECONNREFUSED", - "ECONNRESET", - "EDEADLK", - "EDEADLOCK", - "EDESTADDRREQ", - "EDEVERR", - "EDOM", - "EDOOFUS", - "EDOTDOT", - "EDQUOT", - "EEXIST", - "EFAULT", - "EFBIG", - "EFER_LMA", - "EFER_LME", - "EFER_NXE", - "EFER_SCE", - "EFTYPE", - "EHOSTDOWN", - "EHOSTUNREACH", - "EHWPOISON", - "EIDRM", - "EILSEQ", - "EINPROGRESS", - "EINTR", - "EINVAL", - "EIO", - "EIPSEC", - "EISCONN", - "EISDIR", - "EISNAM", - "EKEYEXPIRED", - "EKEYREJECTED", - "EKEYREVOKED", - "EL2HLT", - "EL2NSYNC", - "EL3HLT", - "EL3RST", - "ELAST", - "ELF_NGREG", - "ELF_PRARGSZ", - "ELIBACC", - "ELIBBAD", - "ELIBEXEC", - "ELIBMAX", - "ELIBSCN", - "ELNRNG", - "ELOOP", - "EMEDIUMTYPE", - "EMFILE", - "EMLINK", - "EMSGSIZE", - "EMT_TAGOVF", - "EMULTIHOP", - "EMUL_ENABLED", - "EMUL_LINUX", - "EMUL_LINUX32", - "EMUL_MAXID", - "EMUL_NATIVE", - "ENAMETOOLONG", - "ENAVAIL", - "ENDRUNDISC", - "ENEEDAUTH", - "ENETDOWN", - "ENETRESET", - "ENETUNREACH", - "ENFILE", - "ENOANO", - "ENOATTR", - "ENOBUFS", - "ENOCSI", - "ENODATA", - "ENODEV", - "ENOENT", - "ENOEXEC", - "ENOKEY", - "ENOLCK", - "ENOLINK", - "ENOMEDIUM", - "ENOMEM", - "ENOMSG", - "ENONET", - "ENOPKG", - "ENOPOLICY", - "ENOPROTOOPT", - "ENOSPC", - "ENOSR", - "ENOSTR", - "ENOSYS", - "ENOTBLK", - "ENOTCAPABLE", - "ENOTCONN", - "ENOTDIR", - "ENOTEMPTY", - "ENOTNAM", - "ENOTRECOVERABLE", - "ENOTSOCK", - "ENOTSUP", - "ENOTTY", - "ENOTUNIQ", - "ENXIO", - "EN_SW_CTL_INF", - "EN_SW_CTL_PREC", - "EN_SW_CTL_ROUND", - "EN_SW_DATACHAIN", - "EN_SW_DENORM", - "EN_SW_INVOP", - "EN_SW_OVERFLOW", - "EN_SW_PRECLOSS", - "EN_SW_UNDERFLOW", - "EN_SW_ZERODIV", - "EOPNOTSUPP", - "EOVERFLOW", - "EOWNERDEAD", - "EPERM", - "EPFNOSUPPORT", - "EPIPE", - "EPOLLERR", - "EPOLLET", - "EPOLLHUP", - "EPOLLIN", - "EPOLLMSG", - "EPOLLONESHOT", - "EPOLLOUT", - "EPOLLPRI", - "EPOLLRDBAND", - "EPOLLRDHUP", - "EPOLLRDNORM", - "EPOLLWRBAND", - "EPOLLWRNORM", - "EPOLL_CLOEXEC", - "EPOLL_CTL_ADD", - "EPOLL_CTL_DEL", - "EPOLL_CTL_MOD", - "EPOLL_NONBLOCK", - "EPROCLIM", - "EPROCUNAVAIL", - "EPROGMISMATCH", - "EPROGUNAVAIL", - "EPROTO", - "EPROTONOSUPPORT", - "EPROTOTYPE", - "EPWROFF", - "EQFULL", - "ERANGE", - "EREMCHG", - "EREMOTE", - "EREMOTEIO", - "ERESTART", - "ERFKILL", - "EROFS", - "ERPCMISMATCH", - "ERROR_ACCESS_DENIED", - "ERROR_ALREADY_EXISTS", - "ERROR_BROKEN_PIPE", - "ERROR_BUFFER_OVERFLOW", - "ERROR_DIR_NOT_EMPTY", - "ERROR_ENVVAR_NOT_FOUND", - "ERROR_FILE_EXISTS", - "ERROR_FILE_NOT_FOUND", - "ERROR_HANDLE_EOF", - "ERROR_INSUFFICIENT_BUFFER", - "ERROR_IO_PENDING", - "ERROR_MOD_NOT_FOUND", - "ERROR_MORE_DATA", - "ERROR_NETNAME_DELETED", - "ERROR_NOT_FOUND", - "ERROR_NO_MORE_FILES", - "ERROR_OPERATION_ABORTED", - "ERROR_PATH_NOT_FOUND", - "ERROR_PRIVILEGE_NOT_HELD", - "ERROR_PROC_NOT_FOUND", - "ESHLIBVERS", - "ESHUTDOWN", - "ESOCKTNOSUPPORT", - "ESPIPE", - "ESRCH", - "ESRMNT", - "ESTALE", - "ESTRPIPE", - "ETHERCAP_JUMBO_MTU", - "ETHERCAP_VLAN_HWTAGGING", - "ETHERCAP_VLAN_MTU", - "ETHERMIN", - "ETHERMTU", - "ETHERMTU_JUMBO", - "ETHERTYPE_8023", - "ETHERTYPE_AARP", - "ETHERTYPE_ACCTON", - "ETHERTYPE_AEONIC", - "ETHERTYPE_ALPHA", - "ETHERTYPE_AMBER", - "ETHERTYPE_AMOEBA", - "ETHERTYPE_AOE", - "ETHERTYPE_APOLLO", - "ETHERTYPE_APOLLODOMAIN", - "ETHERTYPE_APPLETALK", - "ETHERTYPE_APPLITEK", - "ETHERTYPE_ARGONAUT", - "ETHERTYPE_ARP", - "ETHERTYPE_AT", - "ETHERTYPE_ATALK", - "ETHERTYPE_ATOMIC", - "ETHERTYPE_ATT", - "ETHERTYPE_ATTSTANFORD", - "ETHERTYPE_AUTOPHON", - "ETHERTYPE_AXIS", - "ETHERTYPE_BCLOOP", - "ETHERTYPE_BOFL", - "ETHERTYPE_CABLETRON", - "ETHERTYPE_CHAOS", - "ETHERTYPE_COMDESIGN", - "ETHERTYPE_COMPUGRAPHIC", - "ETHERTYPE_COUNTERPOINT", - "ETHERTYPE_CRONUS", - "ETHERTYPE_CRONUSVLN", - "ETHERTYPE_DCA", - "ETHERTYPE_DDE", - "ETHERTYPE_DEBNI", - "ETHERTYPE_DECAM", - "ETHERTYPE_DECCUST", - "ETHERTYPE_DECDIAG", - "ETHERTYPE_DECDNS", - "ETHERTYPE_DECDTS", - "ETHERTYPE_DECEXPER", - "ETHERTYPE_DECLAST", - "ETHERTYPE_DECLTM", - "ETHERTYPE_DECMUMPS", - "ETHERTYPE_DECNETBIOS", - "ETHERTYPE_DELTACON", - "ETHERTYPE_DIDDLE", - "ETHERTYPE_DLOG1", - "ETHERTYPE_DLOG2", - "ETHERTYPE_DN", - "ETHERTYPE_DOGFIGHT", - "ETHERTYPE_DSMD", - "ETHERTYPE_ECMA", - "ETHERTYPE_ENCRYPT", - "ETHERTYPE_ES", - "ETHERTYPE_EXCELAN", - "ETHERTYPE_EXPERDATA", - "ETHERTYPE_FLIP", - "ETHERTYPE_FLOWCONTROL", - "ETHERTYPE_FRARP", - "ETHERTYPE_GENDYN", - "ETHERTYPE_HAYES", - "ETHERTYPE_HIPPI_FP", - "ETHERTYPE_HITACHI", - "ETHERTYPE_HP", - "ETHERTYPE_IEEEPUP", - "ETHERTYPE_IEEEPUPAT", - "ETHERTYPE_IMLBL", - "ETHERTYPE_IMLBLDIAG", - "ETHERTYPE_IP", - "ETHERTYPE_IPAS", - "ETHERTYPE_IPV6", - "ETHERTYPE_IPX", - "ETHERTYPE_IPXNEW", - "ETHERTYPE_KALPANA", - "ETHERTYPE_LANBRIDGE", - "ETHERTYPE_LANPROBE", - "ETHERTYPE_LAT", - "ETHERTYPE_LBACK", - "ETHERTYPE_LITTLE", - "ETHERTYPE_LLDP", - "ETHERTYPE_LOGICRAFT", - "ETHERTYPE_LOOPBACK", - "ETHERTYPE_MATRA", - "ETHERTYPE_MAX", - "ETHERTYPE_MERIT", - "ETHERTYPE_MICP", - "ETHERTYPE_MOPDL", - "ETHERTYPE_MOPRC", - "ETHERTYPE_MOTOROLA", - "ETHERTYPE_MPLS", - "ETHERTYPE_MPLS_MCAST", - "ETHERTYPE_MUMPS", - "ETHERTYPE_NBPCC", - "ETHERTYPE_NBPCLAIM", - "ETHERTYPE_NBPCLREQ", - "ETHERTYPE_NBPCLRSP", - "ETHERTYPE_NBPCREQ", - "ETHERTYPE_NBPCRSP", - "ETHERTYPE_NBPDG", - "ETHERTYPE_NBPDGB", - "ETHERTYPE_NBPDLTE", - "ETHERTYPE_NBPRAR", - "ETHERTYPE_NBPRAS", - "ETHERTYPE_NBPRST", - "ETHERTYPE_NBPSCD", - "ETHERTYPE_NBPVCD", - "ETHERTYPE_NBS", - "ETHERTYPE_NCD", - "ETHERTYPE_NESTAR", - "ETHERTYPE_NETBEUI", - "ETHERTYPE_NOVELL", - "ETHERTYPE_NS", - "ETHERTYPE_NSAT", - "ETHERTYPE_NSCOMPAT", - "ETHERTYPE_NTRAILER", - "ETHERTYPE_OS9", - "ETHERTYPE_OS9NET", - "ETHERTYPE_PACER", - "ETHERTYPE_PAE", - "ETHERTYPE_PCS", - "ETHERTYPE_PLANNING", - "ETHERTYPE_PPP", - "ETHERTYPE_PPPOE", - "ETHERTYPE_PPPOEDISC", - "ETHERTYPE_PRIMENTS", - "ETHERTYPE_PUP", - "ETHERTYPE_PUPAT", - "ETHERTYPE_QINQ", - "ETHERTYPE_RACAL", - "ETHERTYPE_RATIONAL", - "ETHERTYPE_RAWFR", - "ETHERTYPE_RCL", - "ETHERTYPE_RDP", - "ETHERTYPE_RETIX", - "ETHERTYPE_REVARP", - "ETHERTYPE_SCA", - "ETHERTYPE_SECTRA", - "ETHERTYPE_SECUREDATA", - "ETHERTYPE_SGITW", - "ETHERTYPE_SG_BOUNCE", - "ETHERTYPE_SG_DIAG", - "ETHERTYPE_SG_NETGAMES", - "ETHERTYPE_SG_RESV", - "ETHERTYPE_SIMNET", - "ETHERTYPE_SLOW", - "ETHERTYPE_SLOWPROTOCOLS", - "ETHERTYPE_SNA", - "ETHERTYPE_SNMP", - "ETHERTYPE_SONIX", - "ETHERTYPE_SPIDER", - "ETHERTYPE_SPRITE", - "ETHERTYPE_STP", - "ETHERTYPE_TALARIS", - "ETHERTYPE_TALARISMC", - "ETHERTYPE_TCPCOMP", - "ETHERTYPE_TCPSM", - "ETHERTYPE_TEC", - "ETHERTYPE_TIGAN", - "ETHERTYPE_TRAIL", - "ETHERTYPE_TRANSETHER", - "ETHERTYPE_TYMSHARE", - "ETHERTYPE_UBBST", - "ETHERTYPE_UBDEBUG", - "ETHERTYPE_UBDIAGLOOP", - "ETHERTYPE_UBDL", - "ETHERTYPE_UBNIU", - "ETHERTYPE_UBNMC", - "ETHERTYPE_VALID", - "ETHERTYPE_VARIAN", - "ETHERTYPE_VAXELN", - "ETHERTYPE_VEECO", - "ETHERTYPE_VEXP", - "ETHERTYPE_VGLAB", - "ETHERTYPE_VINES", - "ETHERTYPE_VINESECHO", - "ETHERTYPE_VINESLOOP", - "ETHERTYPE_VITAL", - "ETHERTYPE_VLAN", - "ETHERTYPE_VLTLMAN", - "ETHERTYPE_VPROD", - "ETHERTYPE_VURESERVED", - "ETHERTYPE_WATERLOO", - "ETHERTYPE_WELLFLEET", - "ETHERTYPE_X25", - "ETHERTYPE_X75", - "ETHERTYPE_XNSSM", - "ETHERTYPE_XTP", - "ETHER_ADDR_LEN", - "ETHER_ALIGN", - "ETHER_CRC_LEN", - "ETHER_CRC_POLY_BE", - "ETHER_CRC_POLY_LE", - "ETHER_HDR_LEN", - "ETHER_MAX_DIX_LEN", - "ETHER_MAX_LEN", - "ETHER_MAX_LEN_JUMBO", - "ETHER_MIN_LEN", - "ETHER_PPPOE_ENCAP_LEN", - "ETHER_TYPE_LEN", - "ETHER_VLAN_ENCAP_LEN", - "ETH_P_1588", - "ETH_P_8021Q", - "ETH_P_802_2", - "ETH_P_802_3", - "ETH_P_AARP", - "ETH_P_ALL", - "ETH_P_AOE", - "ETH_P_ARCNET", - "ETH_P_ARP", - "ETH_P_ATALK", - "ETH_P_ATMFATE", - "ETH_P_ATMMPOA", - "ETH_P_AX25", - "ETH_P_BPQ", - "ETH_P_CAIF", - "ETH_P_CAN", - "ETH_P_CONTROL", - "ETH_P_CUST", - "ETH_P_DDCMP", - "ETH_P_DEC", - "ETH_P_DIAG", - "ETH_P_DNA_DL", - "ETH_P_DNA_RC", - "ETH_P_DNA_RT", - "ETH_P_DSA", - "ETH_P_ECONET", - "ETH_P_EDSA", - "ETH_P_FCOE", - "ETH_P_FIP", - "ETH_P_HDLC", - "ETH_P_IEEE802154", - "ETH_P_IEEEPUP", - "ETH_P_IEEEPUPAT", - "ETH_P_IP", - "ETH_P_IPV6", - "ETH_P_IPX", - "ETH_P_IRDA", - "ETH_P_LAT", - "ETH_P_LINK_CTL", - "ETH_P_LOCALTALK", - "ETH_P_LOOP", - "ETH_P_MOBITEX", - "ETH_P_MPLS_MC", - "ETH_P_MPLS_UC", - "ETH_P_PAE", - "ETH_P_PAUSE", - "ETH_P_PHONET", - "ETH_P_PPPTALK", - "ETH_P_PPP_DISC", - "ETH_P_PPP_MP", - "ETH_P_PPP_SES", - "ETH_P_PUP", - "ETH_P_PUPAT", - "ETH_P_RARP", - "ETH_P_SCA", - "ETH_P_SLOW", - "ETH_P_SNAP", - "ETH_P_TEB", - "ETH_P_TIPC", - "ETH_P_TRAILER", - "ETH_P_TR_802_2", - "ETH_P_WAN_PPP", - "ETH_P_WCCP", - "ETH_P_X25", - "ETIME", - "ETIMEDOUT", - "ETOOMANYREFS", - "ETXTBSY", - "EUCLEAN", - "EUNATCH", - "EUSERS", - "EVFILT_AIO", - "EVFILT_FS", - "EVFILT_LIO", - "EVFILT_MACHPORT", - "EVFILT_PROC", - "EVFILT_READ", - "EVFILT_SIGNAL", - "EVFILT_SYSCOUNT", - "EVFILT_THREADMARKER", - "EVFILT_TIMER", - "EVFILT_USER", - "EVFILT_VM", - "EVFILT_VNODE", - "EVFILT_WRITE", - "EV_ADD", - "EV_CLEAR", - "EV_DELETE", - "EV_DISABLE", - "EV_DISPATCH", - "EV_DROP", - "EV_ENABLE", - "EV_EOF", - "EV_ERROR", - "EV_FLAG0", - "EV_FLAG1", - "EV_ONESHOT", - "EV_OOBAND", - "EV_POLL", - "EV_RECEIPT", - "EV_SYSFLAGS", - "EWINDOWS", - "EWOULDBLOCK", - "EXDEV", - "EXFULL", - "EXTA", - "EXTB", - "EXTPROC", - "Environ", - "EpollCreate", - "EpollCreate1", - "EpollCtl", - "EpollEvent", - "EpollWait", - "Errno", - "EscapeArg", - "Exchangedata", - "Exec", - "Exit", - "ExitProcess", - "FD_CLOEXEC", - "FD_SETSIZE", - "FILE_ACTION_ADDED", - "FILE_ACTION_MODIFIED", - "FILE_ACTION_REMOVED", - "FILE_ACTION_RENAMED_NEW_NAME", - "FILE_ACTION_RENAMED_OLD_NAME", - "FILE_APPEND_DATA", - "FILE_ATTRIBUTE_ARCHIVE", - "FILE_ATTRIBUTE_DIRECTORY", - "FILE_ATTRIBUTE_HIDDEN", - "FILE_ATTRIBUTE_NORMAL", - "FILE_ATTRIBUTE_READONLY", - "FILE_ATTRIBUTE_REPARSE_POINT", - "FILE_ATTRIBUTE_SYSTEM", - "FILE_BEGIN", - "FILE_CURRENT", - "FILE_END", - "FILE_FLAG_BACKUP_SEMANTICS", - "FILE_FLAG_OPEN_REPARSE_POINT", - "FILE_FLAG_OVERLAPPED", - "FILE_LIST_DIRECTORY", - "FILE_MAP_COPY", - "FILE_MAP_EXECUTE", - "FILE_MAP_READ", - "FILE_MAP_WRITE", - "FILE_NOTIFY_CHANGE_ATTRIBUTES", - "FILE_NOTIFY_CHANGE_CREATION", - "FILE_NOTIFY_CHANGE_DIR_NAME", - "FILE_NOTIFY_CHANGE_FILE_NAME", - "FILE_NOTIFY_CHANGE_LAST_ACCESS", - "FILE_NOTIFY_CHANGE_LAST_WRITE", - "FILE_NOTIFY_CHANGE_SIZE", - "FILE_SHARE_DELETE", - "FILE_SHARE_READ", - "FILE_SHARE_WRITE", - "FILE_SKIP_COMPLETION_PORT_ON_SUCCESS", - "FILE_SKIP_SET_EVENT_ON_HANDLE", - "FILE_TYPE_CHAR", - "FILE_TYPE_DISK", - "FILE_TYPE_PIPE", - "FILE_TYPE_REMOTE", - "FILE_TYPE_UNKNOWN", - "FILE_WRITE_ATTRIBUTES", - "FLUSHO", - "FORMAT_MESSAGE_ALLOCATE_BUFFER", - "FORMAT_MESSAGE_ARGUMENT_ARRAY", - "FORMAT_MESSAGE_FROM_HMODULE", - "FORMAT_MESSAGE_FROM_STRING", - "FORMAT_MESSAGE_FROM_SYSTEM", - "FORMAT_MESSAGE_IGNORE_INSERTS", - "FORMAT_MESSAGE_MAX_WIDTH_MASK", - "FSCTL_GET_REPARSE_POINT", - "F_ADDFILESIGS", - "F_ADDSIGS", - "F_ALLOCATEALL", - "F_ALLOCATECONTIG", - "F_CANCEL", - "F_CHKCLEAN", - "F_CLOSEM", - "F_DUP2FD", - "F_DUP2FD_CLOEXEC", - "F_DUPFD", - "F_DUPFD_CLOEXEC", - "F_EXLCK", - "F_FINDSIGS", - "F_FLUSH_DATA", - "F_FREEZE_FS", - "F_FSCTL", - "F_FSDIRMASK", - "F_FSIN", - "F_FSINOUT", - "F_FSOUT", - "F_FSPRIV", - "F_FSVOID", - "F_FULLFSYNC", - "F_GETCODEDIR", - "F_GETFD", - "F_GETFL", - "F_GETLEASE", - "F_GETLK", - "F_GETLK64", - "F_GETLKPID", - "F_GETNOSIGPIPE", - "F_GETOWN", - "F_GETOWN_EX", - "F_GETPATH", - "F_GETPATH_MTMINFO", - "F_GETPIPE_SZ", - "F_GETPROTECTIONCLASS", - "F_GETPROTECTIONLEVEL", - "F_GETSIG", - "F_GLOBAL_NOCACHE", - "F_LOCK", - "F_LOG2PHYS", - "F_LOG2PHYS_EXT", - "F_MARKDEPENDENCY", - "F_MAXFD", - "F_NOCACHE", - "F_NODIRECT", - "F_NOTIFY", - "F_OGETLK", - "F_OK", - "F_OSETLK", - "F_OSETLKW", - "F_PARAM_MASK", - "F_PARAM_MAX", - "F_PATHPKG_CHECK", - "F_PEOFPOSMODE", - "F_PREALLOCATE", - "F_RDADVISE", - "F_RDAHEAD", - "F_RDLCK", - "F_READAHEAD", - "F_READBOOTSTRAP", - "F_SETBACKINGSTORE", - "F_SETFD", - "F_SETFL", - "F_SETLEASE", - "F_SETLK", - "F_SETLK64", - "F_SETLKW", - "F_SETLKW64", - "F_SETLKWTIMEOUT", - "F_SETLK_REMOTE", - "F_SETNOSIGPIPE", - "F_SETOWN", - "F_SETOWN_EX", - "F_SETPIPE_SZ", - "F_SETPROTECTIONCLASS", - "F_SETSIG", - "F_SETSIZE", - "F_SHLCK", - "F_SINGLE_WRITER", - "F_TEST", - "F_THAW_FS", - "F_TLOCK", - "F_TRANSCODEKEY", - "F_ULOCK", - "F_UNLCK", - "F_UNLCKSYS", - "F_VOLPOSMODE", - "F_WRITEBOOTSTRAP", - "F_WRLCK", - "Faccessat", - "Fallocate", - "Fbootstraptransfer_t", - "Fchdir", - "Fchflags", - "Fchmod", - "Fchmodat", - "Fchown", - "Fchownat", - "FcntlFlock", - "FdSet", - "Fdatasync", - "FileNotifyInformation", - "Filetime", - "FindClose", - "FindFirstFile", - "FindNextFile", - "Flock", - "Flock_t", - "FlushBpf", - "FlushFileBuffers", - "FlushViewOfFile", - "ForkExec", - "ForkLock", - "FormatMessage", - "Fpathconf", - "FreeAddrInfoW", - "FreeEnvironmentStrings", - "FreeLibrary", - "Fsid", - "Fstat", - "Fstatat", - "Fstatfs", - "Fstore_t", - "Fsync", - "Ftruncate", - "FullPath", - "Futimes", - "Futimesat", - "GENERIC_ALL", - "GENERIC_EXECUTE", - "GENERIC_READ", - "GENERIC_WRITE", - "GUID", - "GetAcceptExSockaddrs", - "GetAdaptersInfo", - "GetAddrInfoW", - "GetCommandLine", - "GetComputerName", - "GetConsoleMode", - "GetCurrentDirectory", - "GetCurrentProcess", - "GetEnvironmentStrings", - "GetEnvironmentVariable", - "GetExitCodeProcess", - "GetFileAttributes", - "GetFileAttributesEx", - "GetFileExInfoStandard", - "GetFileExMaxInfoLevel", - "GetFileInformationByHandle", - "GetFileType", - "GetFullPathName", - "GetHostByName", - "GetIfEntry", - "GetLastError", - "GetLengthSid", - "GetLongPathName", - "GetProcAddress", - "GetProcessTimes", - "GetProtoByName", - "GetQueuedCompletionStatus", - "GetServByName", - "GetShortPathName", - "GetStartupInfo", - "GetStdHandle", - "GetSystemTimeAsFileTime", - "GetTempPath", - "GetTimeZoneInformation", - "GetTokenInformation", - "GetUserNameEx", - "GetUserProfileDirectory", - "GetVersion", - "Getcwd", - "Getdents", - "Getdirentries", - "Getdtablesize", - "Getegid", - "Getenv", - "Geteuid", - "Getfsstat", - "Getgid", - "Getgroups", - "Getpagesize", - "Getpeername", - "Getpgid", - "Getpgrp", - "Getpid", - "Getppid", - "Getpriority", - "Getrlimit", - "Getrusage", - "Getsid", - "Getsockname", - "Getsockopt", - "GetsockoptByte", - "GetsockoptICMPv6Filter", - "GetsockoptIPMreq", - "GetsockoptIPMreqn", - "GetsockoptIPv6MTUInfo", - "GetsockoptIPv6Mreq", - "GetsockoptInet4Addr", - "GetsockoptInt", - "GetsockoptUcred", - "Gettid", - "Gettimeofday", - "Getuid", - "Getwd", - "Getxattr", - "HANDLE_FLAG_INHERIT", - "HKEY_CLASSES_ROOT", - "HKEY_CURRENT_CONFIG", - "HKEY_CURRENT_USER", - "HKEY_DYN_DATA", - "HKEY_LOCAL_MACHINE", - "HKEY_PERFORMANCE_DATA", - "HKEY_USERS", - "HUPCL", - "Handle", - "Hostent", - "ICANON", - "ICMP6_FILTER", - "ICMPV6_FILTER", - "ICMPv6Filter", - "ICRNL", - "IEXTEN", - "IFAN_ARRIVAL", - "IFAN_DEPARTURE", - "IFA_ADDRESS", - "IFA_ANYCAST", - "IFA_BROADCAST", - "IFA_CACHEINFO", - "IFA_F_DADFAILED", - "IFA_F_DEPRECATED", - "IFA_F_HOMEADDRESS", - "IFA_F_NODAD", - "IFA_F_OPTIMISTIC", - "IFA_F_PERMANENT", - "IFA_F_SECONDARY", - "IFA_F_TEMPORARY", - "IFA_F_TENTATIVE", - "IFA_LABEL", - "IFA_LOCAL", - "IFA_MAX", - "IFA_MULTICAST", - "IFA_ROUTE", - "IFA_UNSPEC", - "IFF_ALLMULTI", - "IFF_ALTPHYS", - "IFF_AUTOMEDIA", - "IFF_BROADCAST", - "IFF_CANTCHANGE", - "IFF_CANTCONFIG", - "IFF_DEBUG", - "IFF_DRV_OACTIVE", - "IFF_DRV_RUNNING", - "IFF_DYING", - "IFF_DYNAMIC", - "IFF_LINK0", - "IFF_LINK1", - "IFF_LINK2", - "IFF_LOOPBACK", - "IFF_MASTER", - "IFF_MONITOR", - "IFF_MULTICAST", - "IFF_NOARP", - "IFF_NOTRAILERS", - "IFF_NO_PI", - "IFF_OACTIVE", - "IFF_ONE_QUEUE", - "IFF_POINTOPOINT", - "IFF_POINTTOPOINT", - "IFF_PORTSEL", - "IFF_PPROMISC", - "IFF_PROMISC", - "IFF_RENAMING", - "IFF_RUNNING", - "IFF_SIMPLEX", - "IFF_SLAVE", - "IFF_SMART", - "IFF_STATICARP", - "IFF_TAP", - "IFF_TUN", - "IFF_TUN_EXCL", - "IFF_UP", - "IFF_VNET_HDR", - "IFLA_ADDRESS", - "IFLA_BROADCAST", - "IFLA_COST", - "IFLA_IFALIAS", - "IFLA_IFNAME", - "IFLA_LINK", - "IFLA_LINKINFO", - "IFLA_LINKMODE", - "IFLA_MAP", - "IFLA_MASTER", - "IFLA_MAX", - "IFLA_MTU", - "IFLA_NET_NS_PID", - "IFLA_OPERSTATE", - "IFLA_PRIORITY", - "IFLA_PROTINFO", - "IFLA_QDISC", - "IFLA_STATS", - "IFLA_TXQLEN", - "IFLA_UNSPEC", - "IFLA_WEIGHT", - "IFLA_WIRELESS", - "IFNAMSIZ", - "IFT_1822", - "IFT_A12MPPSWITCH", - "IFT_AAL2", - "IFT_AAL5", - "IFT_ADSL", - "IFT_AFLANE8023", - "IFT_AFLANE8025", - "IFT_ARAP", - "IFT_ARCNET", - "IFT_ARCNETPLUS", - "IFT_ASYNC", - "IFT_ATM", - "IFT_ATMDXI", - "IFT_ATMFUNI", - "IFT_ATMIMA", - "IFT_ATMLOGICAL", - "IFT_ATMRADIO", - "IFT_ATMSUBINTERFACE", - "IFT_ATMVCIENDPT", - "IFT_ATMVIRTUAL", - "IFT_BGPPOLICYACCOUNTING", - "IFT_BLUETOOTH", - "IFT_BRIDGE", - "IFT_BSC", - "IFT_CARP", - "IFT_CCTEMUL", - "IFT_CELLULAR", - "IFT_CEPT", - "IFT_CES", - "IFT_CHANNEL", - "IFT_CNR", - "IFT_COFFEE", - "IFT_COMPOSITELINK", - "IFT_DCN", - "IFT_DIGITALPOWERLINE", - "IFT_DIGITALWRAPPEROVERHEADCHANNEL", - "IFT_DLSW", - "IFT_DOCSCABLEDOWNSTREAM", - "IFT_DOCSCABLEMACLAYER", - "IFT_DOCSCABLEUPSTREAM", - "IFT_DOCSCABLEUPSTREAMCHANNEL", - "IFT_DS0", - "IFT_DS0BUNDLE", - "IFT_DS1FDL", - "IFT_DS3", - "IFT_DTM", - "IFT_DUMMY", - "IFT_DVBASILN", - "IFT_DVBASIOUT", - "IFT_DVBRCCDOWNSTREAM", - "IFT_DVBRCCMACLAYER", - "IFT_DVBRCCUPSTREAM", - "IFT_ECONET", - "IFT_ENC", - "IFT_EON", - "IFT_EPLRS", - "IFT_ESCON", - "IFT_ETHER", - "IFT_FAITH", - "IFT_FAST", - "IFT_FASTETHER", - "IFT_FASTETHERFX", - "IFT_FDDI", - "IFT_FIBRECHANNEL", - "IFT_FRAMERELAYINTERCONNECT", - "IFT_FRAMERELAYMPI", - "IFT_FRDLCIENDPT", - "IFT_FRELAY", - "IFT_FRELAYDCE", - "IFT_FRF16MFRBUNDLE", - "IFT_FRFORWARD", - "IFT_G703AT2MB", - "IFT_G703AT64K", - "IFT_GIF", - "IFT_GIGABITETHERNET", - "IFT_GR303IDT", - "IFT_GR303RDT", - "IFT_H323GATEKEEPER", - "IFT_H323PROXY", - "IFT_HDH1822", - "IFT_HDLC", - "IFT_HDSL2", - "IFT_HIPERLAN2", - "IFT_HIPPI", - "IFT_HIPPIINTERFACE", - "IFT_HOSTPAD", - "IFT_HSSI", - "IFT_HY", - "IFT_IBM370PARCHAN", - "IFT_IDSL", - "IFT_IEEE1394", - "IFT_IEEE80211", - "IFT_IEEE80212", - "IFT_IEEE8023ADLAG", - "IFT_IFGSN", - "IFT_IMT", - "IFT_INFINIBAND", - "IFT_INTERLEAVE", - "IFT_IP", - "IFT_IPFORWARD", - "IFT_IPOVERATM", - "IFT_IPOVERCDLC", - "IFT_IPOVERCLAW", - "IFT_IPSWITCH", - "IFT_IPXIP", - "IFT_ISDN", - "IFT_ISDNBASIC", - "IFT_ISDNPRIMARY", - "IFT_ISDNS", - "IFT_ISDNU", - "IFT_ISO88022LLC", - "IFT_ISO88023", - "IFT_ISO88024", - "IFT_ISO88025", - "IFT_ISO88025CRFPINT", - "IFT_ISO88025DTR", - "IFT_ISO88025FIBER", - "IFT_ISO88026", - "IFT_ISUP", - "IFT_L2VLAN", - "IFT_L3IPVLAN", - "IFT_L3IPXVLAN", - "IFT_LAPB", - "IFT_LAPD", - "IFT_LAPF", - "IFT_LINEGROUP", - "IFT_LOCALTALK", - "IFT_LOOP", - "IFT_MEDIAMAILOVERIP", - "IFT_MFSIGLINK", - "IFT_MIOX25", - "IFT_MODEM", - "IFT_MPC", - "IFT_MPLS", - "IFT_MPLSTUNNEL", - "IFT_MSDSL", - "IFT_MVL", - "IFT_MYRINET", - "IFT_NFAS", - "IFT_NSIP", - "IFT_OPTICALCHANNEL", - "IFT_OPTICALTRANSPORT", - "IFT_OTHER", - "IFT_P10", - "IFT_P80", - "IFT_PARA", - "IFT_PDP", - "IFT_PFLOG", - "IFT_PFLOW", - "IFT_PFSYNC", - "IFT_PLC", - "IFT_PON155", - "IFT_PON622", - "IFT_POS", - "IFT_PPP", - "IFT_PPPMULTILINKBUNDLE", - "IFT_PROPATM", - "IFT_PROPBWAP2MP", - "IFT_PROPCNLS", - "IFT_PROPDOCSWIRELESSDOWNSTREAM", - "IFT_PROPDOCSWIRELESSMACLAYER", - "IFT_PROPDOCSWIRELESSUPSTREAM", - "IFT_PROPMUX", - "IFT_PROPVIRTUAL", - "IFT_PROPWIRELESSP2P", - "IFT_PTPSERIAL", - "IFT_PVC", - "IFT_Q2931", - "IFT_QLLC", - "IFT_RADIOMAC", - "IFT_RADSL", - "IFT_REACHDSL", - "IFT_RFC1483", - "IFT_RS232", - "IFT_RSRB", - "IFT_SDLC", - "IFT_SDSL", - "IFT_SHDSL", - "IFT_SIP", - "IFT_SIPSIG", - "IFT_SIPTG", - "IFT_SLIP", - "IFT_SMDSDXI", - "IFT_SMDSICIP", - "IFT_SONET", - "IFT_SONETOVERHEADCHANNEL", - "IFT_SONETPATH", - "IFT_SONETVT", - "IFT_SRP", - "IFT_SS7SIGLINK", - "IFT_STACKTOSTACK", - "IFT_STARLAN", - "IFT_STF", - "IFT_T1", - "IFT_TDLC", - "IFT_TELINK", - "IFT_TERMPAD", - "IFT_TR008", - "IFT_TRANSPHDLC", - "IFT_TUNNEL", - "IFT_ULTRA", - "IFT_USB", - "IFT_V11", - "IFT_V35", - "IFT_V36", - "IFT_V37", - "IFT_VDSL", - "IFT_VIRTUALIPADDRESS", - "IFT_VIRTUALTG", - "IFT_VOICEDID", - "IFT_VOICEEM", - "IFT_VOICEEMFGD", - "IFT_VOICEENCAP", - "IFT_VOICEFGDEANA", - "IFT_VOICEFXO", - "IFT_VOICEFXS", - "IFT_VOICEOVERATM", - "IFT_VOICEOVERCABLE", - "IFT_VOICEOVERFRAMERELAY", - "IFT_VOICEOVERIP", - "IFT_X213", - "IFT_X25", - "IFT_X25DDN", - "IFT_X25HUNTGROUP", - "IFT_X25MLP", - "IFT_X25PLE", - "IFT_XETHER", - "IGNBRK", - "IGNCR", - "IGNORE", - "IGNPAR", - "IMAXBEL", - "INFINITE", - "INLCR", - "INPCK", - "INVALID_FILE_ATTRIBUTES", - "IN_ACCESS", - "IN_ALL_EVENTS", - "IN_ATTRIB", - "IN_CLASSA_HOST", - "IN_CLASSA_MAX", - "IN_CLASSA_NET", - "IN_CLASSA_NSHIFT", - "IN_CLASSB_HOST", - "IN_CLASSB_MAX", - "IN_CLASSB_NET", - "IN_CLASSB_NSHIFT", - "IN_CLASSC_HOST", - "IN_CLASSC_NET", - "IN_CLASSC_NSHIFT", - "IN_CLASSD_HOST", - "IN_CLASSD_NET", - "IN_CLASSD_NSHIFT", - "IN_CLOEXEC", - "IN_CLOSE", - "IN_CLOSE_NOWRITE", - "IN_CLOSE_WRITE", - "IN_CREATE", - "IN_DELETE", - "IN_DELETE_SELF", - "IN_DONT_FOLLOW", - "IN_EXCL_UNLINK", - "IN_IGNORED", - "IN_ISDIR", - "IN_LINKLOCALNETNUM", - "IN_LOOPBACKNET", - "IN_MASK_ADD", - "IN_MODIFY", - "IN_MOVE", - "IN_MOVED_FROM", - "IN_MOVED_TO", - "IN_MOVE_SELF", - "IN_NONBLOCK", - "IN_ONESHOT", - "IN_ONLYDIR", - "IN_OPEN", - "IN_Q_OVERFLOW", - "IN_RFC3021_HOST", - "IN_RFC3021_MASK", - "IN_RFC3021_NET", - "IN_RFC3021_NSHIFT", - "IN_UNMOUNT", - "IOC_IN", - "IOC_INOUT", - "IOC_OUT", - "IOC_VENDOR", - "IOC_WS2", - "IO_REPARSE_TAG_SYMLINK", - "IPMreq", - "IPMreqn", - "IPPROTO_3PC", - "IPPROTO_ADFS", - "IPPROTO_AH", - "IPPROTO_AHIP", - "IPPROTO_APES", - "IPPROTO_ARGUS", - "IPPROTO_AX25", - "IPPROTO_BHA", - "IPPROTO_BLT", - "IPPROTO_BRSATMON", - "IPPROTO_CARP", - "IPPROTO_CFTP", - "IPPROTO_CHAOS", - "IPPROTO_CMTP", - "IPPROTO_COMP", - "IPPROTO_CPHB", - "IPPROTO_CPNX", - "IPPROTO_DCCP", - "IPPROTO_DDP", - "IPPROTO_DGP", - "IPPROTO_DIVERT", - "IPPROTO_DIVERT_INIT", - "IPPROTO_DIVERT_RESP", - "IPPROTO_DONE", - "IPPROTO_DSTOPTS", - "IPPROTO_EGP", - "IPPROTO_EMCON", - "IPPROTO_ENCAP", - "IPPROTO_EON", - "IPPROTO_ESP", - "IPPROTO_ETHERIP", - "IPPROTO_FRAGMENT", - "IPPROTO_GGP", - "IPPROTO_GMTP", - "IPPROTO_GRE", - "IPPROTO_HELLO", - "IPPROTO_HMP", - "IPPROTO_HOPOPTS", - "IPPROTO_ICMP", - "IPPROTO_ICMPV6", - "IPPROTO_IDP", - "IPPROTO_IDPR", - "IPPROTO_IDRP", - "IPPROTO_IGMP", - "IPPROTO_IGP", - "IPPROTO_IGRP", - "IPPROTO_IL", - "IPPROTO_INLSP", - "IPPROTO_INP", - "IPPROTO_IP", - "IPPROTO_IPCOMP", - "IPPROTO_IPCV", - "IPPROTO_IPEIP", - "IPPROTO_IPIP", - "IPPROTO_IPPC", - "IPPROTO_IPV4", - "IPPROTO_IPV6", - "IPPROTO_IPV6_ICMP", - "IPPROTO_IRTP", - "IPPROTO_KRYPTOLAN", - "IPPROTO_LARP", - "IPPROTO_LEAF1", - "IPPROTO_LEAF2", - "IPPROTO_MAX", - "IPPROTO_MAXID", - "IPPROTO_MEAS", - "IPPROTO_MH", - "IPPROTO_MHRP", - "IPPROTO_MICP", - "IPPROTO_MOBILE", - "IPPROTO_MPLS", - "IPPROTO_MTP", - "IPPROTO_MUX", - "IPPROTO_ND", - "IPPROTO_NHRP", - "IPPROTO_NONE", - "IPPROTO_NSP", - "IPPROTO_NVPII", - "IPPROTO_OLD_DIVERT", - "IPPROTO_OSPFIGP", - "IPPROTO_PFSYNC", - "IPPROTO_PGM", - "IPPROTO_PIGP", - "IPPROTO_PIM", - "IPPROTO_PRM", - "IPPROTO_PUP", - "IPPROTO_PVP", - "IPPROTO_RAW", - "IPPROTO_RCCMON", - "IPPROTO_RDP", - "IPPROTO_ROUTING", - "IPPROTO_RSVP", - "IPPROTO_RVD", - "IPPROTO_SATEXPAK", - "IPPROTO_SATMON", - "IPPROTO_SCCSP", - "IPPROTO_SCTP", - "IPPROTO_SDRP", - "IPPROTO_SEND", - "IPPROTO_SEP", - "IPPROTO_SKIP", - "IPPROTO_SPACER", - "IPPROTO_SRPC", - "IPPROTO_ST", - "IPPROTO_SVMTP", - "IPPROTO_SWIPE", - "IPPROTO_TCF", - "IPPROTO_TCP", - "IPPROTO_TLSP", - "IPPROTO_TP", - "IPPROTO_TPXX", - "IPPROTO_TRUNK1", - "IPPROTO_TRUNK2", - "IPPROTO_TTP", - "IPPROTO_UDP", - "IPPROTO_UDPLITE", - "IPPROTO_VINES", - "IPPROTO_VISA", - "IPPROTO_VMTP", - "IPPROTO_VRRP", - "IPPROTO_WBEXPAK", - "IPPROTO_WBMON", - "IPPROTO_WSN", - "IPPROTO_XNET", - "IPPROTO_XTP", - "IPV6_2292DSTOPTS", - "IPV6_2292HOPLIMIT", - "IPV6_2292HOPOPTS", - "IPV6_2292NEXTHOP", - "IPV6_2292PKTINFO", - "IPV6_2292PKTOPTIONS", - "IPV6_2292RTHDR", - "IPV6_ADDRFORM", - "IPV6_ADD_MEMBERSHIP", - "IPV6_AUTHHDR", - "IPV6_AUTH_LEVEL", - "IPV6_AUTOFLOWLABEL", - "IPV6_BINDANY", - "IPV6_BINDV6ONLY", - "IPV6_BOUND_IF", - "IPV6_CHECKSUM", - "IPV6_DEFAULT_MULTICAST_HOPS", - "IPV6_DEFAULT_MULTICAST_LOOP", - "IPV6_DEFHLIM", - "IPV6_DONTFRAG", - "IPV6_DROP_MEMBERSHIP", - "IPV6_DSTOPTS", - "IPV6_ESP_NETWORK_LEVEL", - "IPV6_ESP_TRANS_LEVEL", - "IPV6_FAITH", - "IPV6_FLOWINFO_MASK", - "IPV6_FLOWLABEL_MASK", - "IPV6_FRAGTTL", - "IPV6_FW_ADD", - "IPV6_FW_DEL", - "IPV6_FW_FLUSH", - "IPV6_FW_GET", - "IPV6_FW_ZERO", - "IPV6_HLIMDEC", - "IPV6_HOPLIMIT", - "IPV6_HOPOPTS", - "IPV6_IPCOMP_LEVEL", - "IPV6_IPSEC_POLICY", - "IPV6_JOIN_ANYCAST", - "IPV6_JOIN_GROUP", - "IPV6_LEAVE_ANYCAST", - "IPV6_LEAVE_GROUP", - "IPV6_MAXHLIM", - "IPV6_MAXOPTHDR", - "IPV6_MAXPACKET", - "IPV6_MAX_GROUP_SRC_FILTER", - "IPV6_MAX_MEMBERSHIPS", - "IPV6_MAX_SOCK_SRC_FILTER", - "IPV6_MIN_MEMBERSHIPS", - "IPV6_MMTU", - "IPV6_MSFILTER", - "IPV6_MTU", - "IPV6_MTU_DISCOVER", - "IPV6_MULTICAST_HOPS", - "IPV6_MULTICAST_IF", - "IPV6_MULTICAST_LOOP", - "IPV6_NEXTHOP", - "IPV6_OPTIONS", - "IPV6_PATHMTU", - "IPV6_PIPEX", - "IPV6_PKTINFO", - "IPV6_PMTUDISC_DO", - "IPV6_PMTUDISC_DONT", - "IPV6_PMTUDISC_PROBE", - "IPV6_PMTUDISC_WANT", - "IPV6_PORTRANGE", - "IPV6_PORTRANGE_DEFAULT", - "IPV6_PORTRANGE_HIGH", - "IPV6_PORTRANGE_LOW", - "IPV6_PREFER_TEMPADDR", - "IPV6_RECVDSTOPTS", - "IPV6_RECVDSTPORT", - "IPV6_RECVERR", - "IPV6_RECVHOPLIMIT", - "IPV6_RECVHOPOPTS", - "IPV6_RECVPATHMTU", - "IPV6_RECVPKTINFO", - "IPV6_RECVRTHDR", - "IPV6_RECVTCLASS", - "IPV6_ROUTER_ALERT", - "IPV6_RTABLE", - "IPV6_RTHDR", - "IPV6_RTHDRDSTOPTS", - "IPV6_RTHDR_LOOSE", - "IPV6_RTHDR_STRICT", - "IPV6_RTHDR_TYPE_0", - "IPV6_RXDSTOPTS", - "IPV6_RXHOPOPTS", - "IPV6_SOCKOPT_RESERVED1", - "IPV6_TCLASS", - "IPV6_UNICAST_HOPS", - "IPV6_USE_MIN_MTU", - "IPV6_V6ONLY", - "IPV6_VERSION", - "IPV6_VERSION_MASK", - "IPV6_XFRM_POLICY", - "IP_ADD_MEMBERSHIP", - "IP_ADD_SOURCE_MEMBERSHIP", - "IP_AUTH_LEVEL", - "IP_BINDANY", - "IP_BLOCK_SOURCE", - "IP_BOUND_IF", - "IP_DEFAULT_MULTICAST_LOOP", - "IP_DEFAULT_MULTICAST_TTL", - "IP_DF", - "IP_DIVERTFL", - "IP_DONTFRAG", - "IP_DROP_MEMBERSHIP", - "IP_DROP_SOURCE_MEMBERSHIP", - "IP_DUMMYNET3", - "IP_DUMMYNET_CONFIGURE", - "IP_DUMMYNET_DEL", - "IP_DUMMYNET_FLUSH", - "IP_DUMMYNET_GET", - "IP_EF", - "IP_ERRORMTU", - "IP_ESP_NETWORK_LEVEL", - "IP_ESP_TRANS_LEVEL", - "IP_FAITH", - "IP_FREEBIND", - "IP_FW3", - "IP_FW_ADD", - "IP_FW_DEL", - "IP_FW_FLUSH", - "IP_FW_GET", - "IP_FW_NAT_CFG", - "IP_FW_NAT_DEL", - "IP_FW_NAT_GET_CONFIG", - "IP_FW_NAT_GET_LOG", - "IP_FW_RESETLOG", - "IP_FW_TABLE_ADD", - "IP_FW_TABLE_DEL", - "IP_FW_TABLE_FLUSH", - "IP_FW_TABLE_GETSIZE", - "IP_FW_TABLE_LIST", - "IP_FW_ZERO", - "IP_HDRINCL", - "IP_IPCOMP_LEVEL", - "IP_IPSECFLOWINFO", - "IP_IPSEC_LOCAL_AUTH", - "IP_IPSEC_LOCAL_CRED", - "IP_IPSEC_LOCAL_ID", - "IP_IPSEC_POLICY", - "IP_IPSEC_REMOTE_AUTH", - "IP_IPSEC_REMOTE_CRED", - "IP_IPSEC_REMOTE_ID", - "IP_MAXPACKET", - "IP_MAX_GROUP_SRC_FILTER", - "IP_MAX_MEMBERSHIPS", - "IP_MAX_SOCK_MUTE_FILTER", - "IP_MAX_SOCK_SRC_FILTER", - "IP_MAX_SOURCE_FILTER", - "IP_MF", - "IP_MINFRAGSIZE", - "IP_MINTTL", - "IP_MIN_MEMBERSHIPS", - "IP_MSFILTER", - "IP_MSS", - "IP_MTU", - "IP_MTU_DISCOVER", - "IP_MULTICAST_IF", - "IP_MULTICAST_IFINDEX", - "IP_MULTICAST_LOOP", - "IP_MULTICAST_TTL", - "IP_MULTICAST_VIF", - "IP_NAT__XXX", - "IP_OFFMASK", - "IP_OLD_FW_ADD", - "IP_OLD_FW_DEL", - "IP_OLD_FW_FLUSH", - "IP_OLD_FW_GET", - "IP_OLD_FW_RESETLOG", - "IP_OLD_FW_ZERO", - "IP_ONESBCAST", - "IP_OPTIONS", - "IP_ORIGDSTADDR", - "IP_PASSSEC", - "IP_PIPEX", - "IP_PKTINFO", - "IP_PKTOPTIONS", - "IP_PMTUDISC", - "IP_PMTUDISC_DO", - "IP_PMTUDISC_DONT", - "IP_PMTUDISC_PROBE", - "IP_PMTUDISC_WANT", - "IP_PORTRANGE", - "IP_PORTRANGE_DEFAULT", - "IP_PORTRANGE_HIGH", - "IP_PORTRANGE_LOW", - "IP_RECVDSTADDR", - "IP_RECVDSTPORT", - "IP_RECVERR", - "IP_RECVIF", - "IP_RECVOPTS", - "IP_RECVORIGDSTADDR", - "IP_RECVPKTINFO", - "IP_RECVRETOPTS", - "IP_RECVRTABLE", - "IP_RECVTOS", - "IP_RECVTTL", - "IP_RETOPTS", - "IP_RF", - "IP_ROUTER_ALERT", - "IP_RSVP_OFF", - "IP_RSVP_ON", - "IP_RSVP_VIF_OFF", - "IP_RSVP_VIF_ON", - "IP_RTABLE", - "IP_SENDSRCADDR", - "IP_STRIPHDR", - "IP_TOS", - "IP_TRAFFIC_MGT_BACKGROUND", - "IP_TRANSPARENT", - "IP_TTL", - "IP_UNBLOCK_SOURCE", - "IP_XFRM_POLICY", - "IPv6MTUInfo", - "IPv6Mreq", - "ISIG", - "ISTRIP", - "IUCLC", - "IUTF8", - "IXANY", - "IXOFF", - "IXON", - "IfAddrmsg", - "IfAnnounceMsghdr", - "IfData", - "IfInfomsg", - "IfMsghdr", - "IfaMsghdr", - "IfmaMsghdr", - "IfmaMsghdr2", - "ImplementsGetwd", - "Inet4Pktinfo", - "Inet6Pktinfo", - "InotifyAddWatch", - "InotifyEvent", - "InotifyInit", - "InotifyInit1", - "InotifyRmWatch", - "InterfaceAddrMessage", - "InterfaceAnnounceMessage", - "InterfaceInfo", - "InterfaceMessage", - "InterfaceMulticastAddrMessage", - "InvalidHandle", - "Ioperm", - "Iopl", - "Iovec", - "IpAdapterInfo", - "IpAddrString", - "IpAddressString", - "IpMaskString", - "Issetugid", - "KEY_ALL_ACCESS", - "KEY_CREATE_LINK", - "KEY_CREATE_SUB_KEY", - "KEY_ENUMERATE_SUB_KEYS", - "KEY_EXECUTE", - "KEY_NOTIFY", - "KEY_QUERY_VALUE", - "KEY_READ", - "KEY_SET_VALUE", - "KEY_WOW64_32KEY", - "KEY_WOW64_64KEY", - "KEY_WRITE", - "Kevent", - "Kevent_t", - "Kill", - "Klogctl", - "Kqueue", - "LANG_ENGLISH", - "LAYERED_PROTOCOL", - "LCNT_OVERLOAD_FLUSH", - "LINUX_REBOOT_CMD_CAD_OFF", - "LINUX_REBOOT_CMD_CAD_ON", - "LINUX_REBOOT_CMD_HALT", - "LINUX_REBOOT_CMD_KEXEC", - "LINUX_REBOOT_CMD_POWER_OFF", - "LINUX_REBOOT_CMD_RESTART", - "LINUX_REBOOT_CMD_RESTART2", - "LINUX_REBOOT_CMD_SW_SUSPEND", - "LINUX_REBOOT_MAGIC1", - "LINUX_REBOOT_MAGIC2", - "LOCK_EX", - "LOCK_NB", - "LOCK_SH", - "LOCK_UN", - "LazyDLL", - "LazyProc", - "Lchown", - "Linger", - "Link", - "Listen", - "Listxattr", - "LoadCancelIoEx", - "LoadConnectEx", - "LoadCreateSymbolicLink", - "LoadDLL", - "LoadGetAddrInfo", - "LoadLibrary", - "LoadSetFileCompletionNotificationModes", - "LocalFree", - "Log2phys_t", - "LookupAccountName", - "LookupAccountSid", - "LookupSID", - "LsfJump", - "LsfSocket", - "LsfStmt", - "Lstat", - "MADV_AUTOSYNC", - "MADV_CAN_REUSE", - "MADV_CORE", - "MADV_DOFORK", - "MADV_DONTFORK", - "MADV_DONTNEED", - "MADV_FREE", - "MADV_FREE_REUSABLE", - "MADV_FREE_REUSE", - "MADV_HUGEPAGE", - "MADV_HWPOISON", - "MADV_MERGEABLE", - "MADV_NOCORE", - "MADV_NOHUGEPAGE", - "MADV_NORMAL", - "MADV_NOSYNC", - "MADV_PROTECT", - "MADV_RANDOM", - "MADV_REMOVE", - "MADV_SEQUENTIAL", - "MADV_SPACEAVAIL", - "MADV_UNMERGEABLE", - "MADV_WILLNEED", - "MADV_ZERO_WIRED_PAGES", - "MAP_32BIT", - "MAP_ALIGNED_SUPER", - "MAP_ALIGNMENT_16MB", - "MAP_ALIGNMENT_1TB", - "MAP_ALIGNMENT_256TB", - "MAP_ALIGNMENT_4GB", - "MAP_ALIGNMENT_64KB", - "MAP_ALIGNMENT_64PB", - "MAP_ALIGNMENT_MASK", - "MAP_ALIGNMENT_SHIFT", - "MAP_ANON", - "MAP_ANONYMOUS", - "MAP_COPY", - "MAP_DENYWRITE", - "MAP_EXECUTABLE", - "MAP_FILE", - "MAP_FIXED", - "MAP_FLAGMASK", - "MAP_GROWSDOWN", - "MAP_HASSEMAPHORE", - "MAP_HUGETLB", - "MAP_INHERIT", - "MAP_INHERIT_COPY", - "MAP_INHERIT_DEFAULT", - "MAP_INHERIT_DONATE_COPY", - "MAP_INHERIT_NONE", - "MAP_INHERIT_SHARE", - "MAP_JIT", - "MAP_LOCKED", - "MAP_NOCACHE", - "MAP_NOCORE", - "MAP_NOEXTEND", - "MAP_NONBLOCK", - "MAP_NORESERVE", - "MAP_NOSYNC", - "MAP_POPULATE", - "MAP_PREFAULT_READ", - "MAP_PRIVATE", - "MAP_RENAME", - "MAP_RESERVED0080", - "MAP_RESERVED0100", - "MAP_SHARED", - "MAP_STACK", - "MAP_TRYFIXED", - "MAP_TYPE", - "MAP_WIRED", - "MAXIMUM_REPARSE_DATA_BUFFER_SIZE", - "MAXLEN_IFDESCR", - "MAXLEN_PHYSADDR", - "MAX_ADAPTER_ADDRESS_LENGTH", - "MAX_ADAPTER_DESCRIPTION_LENGTH", - "MAX_ADAPTER_NAME_LENGTH", - "MAX_COMPUTERNAME_LENGTH", - "MAX_INTERFACE_NAME_LEN", - "MAX_LONG_PATH", - "MAX_PATH", - "MAX_PROTOCOL_CHAIN", - "MCL_CURRENT", - "MCL_FUTURE", - "MNT_DETACH", - "MNT_EXPIRE", - "MNT_FORCE", - "MSG_BCAST", - "MSG_CMSG_CLOEXEC", - "MSG_COMPAT", - "MSG_CONFIRM", - "MSG_CONTROLMBUF", - "MSG_CTRUNC", - "MSG_DONTROUTE", - "MSG_DONTWAIT", - "MSG_EOF", - "MSG_EOR", - "MSG_ERRQUEUE", - "MSG_FASTOPEN", - "MSG_FIN", - "MSG_FLUSH", - "MSG_HAVEMORE", - "MSG_HOLD", - "MSG_IOVUSRSPACE", - "MSG_LENUSRSPACE", - "MSG_MCAST", - "MSG_MORE", - "MSG_NAMEMBUF", - "MSG_NBIO", - "MSG_NEEDSA", - "MSG_NOSIGNAL", - "MSG_NOTIFICATION", - "MSG_OOB", - "MSG_PEEK", - "MSG_PROXY", - "MSG_RCVMORE", - "MSG_RST", - "MSG_SEND", - "MSG_SYN", - "MSG_TRUNC", - "MSG_TRYHARD", - "MSG_USERFLAGS", - "MSG_WAITALL", - "MSG_WAITFORONE", - "MSG_WAITSTREAM", - "MS_ACTIVE", - "MS_ASYNC", - "MS_BIND", - "MS_DEACTIVATE", - "MS_DIRSYNC", - "MS_INVALIDATE", - "MS_I_VERSION", - "MS_KERNMOUNT", - "MS_KILLPAGES", - "MS_MANDLOCK", - "MS_MGC_MSK", - "MS_MGC_VAL", - "MS_MOVE", - "MS_NOATIME", - "MS_NODEV", - "MS_NODIRATIME", - "MS_NOEXEC", - "MS_NOSUID", - "MS_NOUSER", - "MS_POSIXACL", - "MS_PRIVATE", - "MS_RDONLY", - "MS_REC", - "MS_RELATIME", - "MS_REMOUNT", - "MS_RMT_MASK", - "MS_SHARED", - "MS_SILENT", - "MS_SLAVE", - "MS_STRICTATIME", - "MS_SYNC", - "MS_SYNCHRONOUS", - "MS_UNBINDABLE", - "Madvise", - "MapViewOfFile", - "MaxTokenInfoClass", - "Mclpool", - "MibIfRow", - "Mkdir", - "Mkdirat", - "Mkfifo", - "Mknod", - "Mknodat", - "Mlock", - "Mlockall", - "Mmap", - "Mount", - "MoveFile", - "Mprotect", - "Msghdr", - "Munlock", - "Munlockall", - "Munmap", - "MustLoadDLL", - "NAME_MAX", - "NETLINK_ADD_MEMBERSHIP", - "NETLINK_AUDIT", - "NETLINK_BROADCAST_ERROR", - "NETLINK_CONNECTOR", - "NETLINK_DNRTMSG", - "NETLINK_DROP_MEMBERSHIP", - "NETLINK_ECRYPTFS", - "NETLINK_FIB_LOOKUP", - "NETLINK_FIREWALL", - "NETLINK_GENERIC", - "NETLINK_INET_DIAG", - "NETLINK_IP6_FW", - "NETLINK_ISCSI", - "NETLINK_KOBJECT_UEVENT", - "NETLINK_NETFILTER", - "NETLINK_NFLOG", - "NETLINK_NO_ENOBUFS", - "NETLINK_PKTINFO", - "NETLINK_RDMA", - "NETLINK_ROUTE", - "NETLINK_SCSITRANSPORT", - "NETLINK_SELINUX", - "NETLINK_UNUSED", - "NETLINK_USERSOCK", - "NETLINK_XFRM", - "NET_RT_DUMP", - "NET_RT_DUMP2", - "NET_RT_FLAGS", - "NET_RT_IFLIST", - "NET_RT_IFLIST2", - "NET_RT_IFLISTL", - "NET_RT_IFMALIST", - "NET_RT_MAXID", - "NET_RT_OIFLIST", - "NET_RT_OOIFLIST", - "NET_RT_STAT", - "NET_RT_STATS", - "NET_RT_TABLE", - "NET_RT_TRASH", - "NLA_ALIGNTO", - "NLA_F_NESTED", - "NLA_F_NET_BYTEORDER", - "NLA_HDRLEN", - "NLMSG_ALIGNTO", - "NLMSG_DONE", - "NLMSG_ERROR", - "NLMSG_HDRLEN", - "NLMSG_MIN_TYPE", - "NLMSG_NOOP", - "NLMSG_OVERRUN", - "NLM_F_ACK", - "NLM_F_APPEND", - "NLM_F_ATOMIC", - "NLM_F_CREATE", - "NLM_F_DUMP", - "NLM_F_ECHO", - "NLM_F_EXCL", - "NLM_F_MATCH", - "NLM_F_MULTI", - "NLM_F_REPLACE", - "NLM_F_REQUEST", - "NLM_F_ROOT", - "NOFLSH", - "NOTE_ABSOLUTE", - "NOTE_ATTRIB", - "NOTE_BACKGROUND", - "NOTE_CHILD", - "NOTE_CRITICAL", - "NOTE_DELETE", - "NOTE_EOF", - "NOTE_EXEC", - "NOTE_EXIT", - "NOTE_EXITSTATUS", - "NOTE_EXIT_CSERROR", - "NOTE_EXIT_DECRYPTFAIL", - "NOTE_EXIT_DETAIL", - "NOTE_EXIT_DETAIL_MASK", - "NOTE_EXIT_MEMORY", - "NOTE_EXIT_REPARENTED", - "NOTE_EXTEND", - "NOTE_FFAND", - "NOTE_FFCOPY", - "NOTE_FFCTRLMASK", - "NOTE_FFLAGSMASK", - "NOTE_FFNOP", - "NOTE_FFOR", - "NOTE_FORK", - "NOTE_LEEWAY", - "NOTE_LINK", - "NOTE_LOWAT", - "NOTE_NONE", - "NOTE_NSECONDS", - "NOTE_PCTRLMASK", - "NOTE_PDATAMASK", - "NOTE_REAP", - "NOTE_RENAME", - "NOTE_RESOURCEEND", - "NOTE_REVOKE", - "NOTE_SECONDS", - "NOTE_SIGNAL", - "NOTE_TRACK", - "NOTE_TRACKERR", - "NOTE_TRIGGER", - "NOTE_TRUNCATE", - "NOTE_USECONDS", - "NOTE_VM_ERROR", - "NOTE_VM_PRESSURE", - "NOTE_VM_PRESSURE_SUDDEN_TERMINATE", - "NOTE_VM_PRESSURE_TERMINATE", - "NOTE_WRITE", - "NameCanonical", - "NameCanonicalEx", - "NameDisplay", - "NameDnsDomain", - "NameFullyQualifiedDN", - "NameSamCompatible", - "NameServicePrincipal", - "NameUniqueId", - "NameUnknown", - "NameUserPrincipal", - "Nanosleep", - "NetApiBufferFree", - "NetGetJoinInformation", - "NetSetupDomainName", - "NetSetupUnjoined", - "NetSetupUnknownStatus", - "NetSetupWorkgroupName", - "NetUserGetInfo", - "NetlinkMessage", - "NetlinkRIB", - "NetlinkRouteAttr", - "NetlinkRouteRequest", - "NewCallback", - "NewCallbackCDecl", - "NewLazyDLL", - "NlAttr", - "NlMsgerr", - "NlMsghdr", - "NsecToFiletime", - "NsecToTimespec", - "NsecToTimeval", - "Ntohs", - "OCRNL", - "OFDEL", - "OFILL", - "OFIOGETBMAP", - "OID_PKIX_KP_SERVER_AUTH", - "OID_SERVER_GATED_CRYPTO", - "OID_SGC_NETSCAPE", - "OLCUC", - "ONLCR", - "ONLRET", - "ONOCR", - "ONOEOT", - "OPEN_ALWAYS", - "OPEN_EXISTING", - "OPOST", - "O_ACCMODE", - "O_ALERT", - "O_ALT_IO", - "O_APPEND", - "O_ASYNC", - "O_CLOEXEC", - "O_CREAT", - "O_DIRECT", - "O_DIRECTORY", - "O_DP_GETRAWENCRYPTED", - "O_DSYNC", - "O_EVTONLY", - "O_EXCL", - "O_EXEC", - "O_EXLOCK", - "O_FSYNC", - "O_LARGEFILE", - "O_NDELAY", - "O_NOATIME", - "O_NOCTTY", - "O_NOFOLLOW", - "O_NONBLOCK", - "O_NOSIGPIPE", - "O_POPUP", - "O_RDONLY", - "O_RDWR", - "O_RSYNC", - "O_SHLOCK", - "O_SYMLINK", - "O_SYNC", - "O_TRUNC", - "O_TTY_INIT", - "O_WRONLY", - "Open", - "OpenCurrentProcessToken", - "OpenProcess", - "OpenProcessToken", - "Openat", - "Overlapped", - "PACKET_ADD_MEMBERSHIP", - "PACKET_BROADCAST", - "PACKET_DROP_MEMBERSHIP", - "PACKET_FASTROUTE", - "PACKET_HOST", - "PACKET_LOOPBACK", - "PACKET_MR_ALLMULTI", - "PACKET_MR_MULTICAST", - "PACKET_MR_PROMISC", - "PACKET_MULTICAST", - "PACKET_OTHERHOST", - "PACKET_OUTGOING", - "PACKET_RECV_OUTPUT", - "PACKET_RX_RING", - "PACKET_STATISTICS", - "PAGE_EXECUTE_READ", - "PAGE_EXECUTE_READWRITE", - "PAGE_EXECUTE_WRITECOPY", - "PAGE_READONLY", - "PAGE_READWRITE", - "PAGE_WRITECOPY", - "PARENB", - "PARMRK", - "PARODD", - "PENDIN", - "PFL_HIDDEN", - "PFL_MATCHES_PROTOCOL_ZERO", - "PFL_MULTIPLE_PROTO_ENTRIES", - "PFL_NETWORKDIRECT_PROVIDER", - "PFL_RECOMMENDED_PROTO_ENTRY", - "PF_FLUSH", - "PKCS_7_ASN_ENCODING", - "PMC5_PIPELINE_FLUSH", - "PRIO_PGRP", - "PRIO_PROCESS", - "PRIO_USER", - "PRI_IOFLUSH", - "PROCESS_QUERY_INFORMATION", - "PROCESS_TERMINATE", - "PROT_EXEC", - "PROT_GROWSDOWN", - "PROT_GROWSUP", - "PROT_NONE", - "PROT_READ", - "PROT_WRITE", - "PROV_DH_SCHANNEL", - "PROV_DSS", - "PROV_DSS_DH", - "PROV_EC_ECDSA_FULL", - "PROV_EC_ECDSA_SIG", - "PROV_EC_ECNRA_FULL", - "PROV_EC_ECNRA_SIG", - "PROV_FORTEZZA", - "PROV_INTEL_SEC", - "PROV_MS_EXCHANGE", - "PROV_REPLACE_OWF", - "PROV_RNG", - "PROV_RSA_AES", - "PROV_RSA_FULL", - "PROV_RSA_SCHANNEL", - "PROV_RSA_SIG", - "PROV_SPYRUS_LYNKS", - "PROV_SSL", - "PR_CAPBSET_DROP", - "PR_CAPBSET_READ", - "PR_CLEAR_SECCOMP_FILTER", - "PR_ENDIAN_BIG", - "PR_ENDIAN_LITTLE", - "PR_ENDIAN_PPC_LITTLE", - "PR_FPEMU_NOPRINT", - "PR_FPEMU_SIGFPE", - "PR_FP_EXC_ASYNC", - "PR_FP_EXC_DISABLED", - "PR_FP_EXC_DIV", - "PR_FP_EXC_INV", - "PR_FP_EXC_NONRECOV", - "PR_FP_EXC_OVF", - "PR_FP_EXC_PRECISE", - "PR_FP_EXC_RES", - "PR_FP_EXC_SW_ENABLE", - "PR_FP_EXC_UND", - "PR_GET_DUMPABLE", - "PR_GET_ENDIAN", - "PR_GET_FPEMU", - "PR_GET_FPEXC", - "PR_GET_KEEPCAPS", - "PR_GET_NAME", - "PR_GET_PDEATHSIG", - "PR_GET_SECCOMP", - "PR_GET_SECCOMP_FILTER", - "PR_GET_SECUREBITS", - "PR_GET_TIMERSLACK", - "PR_GET_TIMING", - "PR_GET_TSC", - "PR_GET_UNALIGN", - "PR_MCE_KILL", - "PR_MCE_KILL_CLEAR", - "PR_MCE_KILL_DEFAULT", - "PR_MCE_KILL_EARLY", - "PR_MCE_KILL_GET", - "PR_MCE_KILL_LATE", - "PR_MCE_KILL_SET", - "PR_SECCOMP_FILTER_EVENT", - "PR_SECCOMP_FILTER_SYSCALL", - "PR_SET_DUMPABLE", - "PR_SET_ENDIAN", - "PR_SET_FPEMU", - "PR_SET_FPEXC", - "PR_SET_KEEPCAPS", - "PR_SET_NAME", - "PR_SET_PDEATHSIG", - "PR_SET_PTRACER", - "PR_SET_SECCOMP", - "PR_SET_SECCOMP_FILTER", - "PR_SET_SECUREBITS", - "PR_SET_TIMERSLACK", - "PR_SET_TIMING", - "PR_SET_TSC", - "PR_SET_UNALIGN", - "PR_TASK_PERF_EVENTS_DISABLE", - "PR_TASK_PERF_EVENTS_ENABLE", - "PR_TIMING_STATISTICAL", - "PR_TIMING_TIMESTAMP", - "PR_TSC_ENABLE", - "PR_TSC_SIGSEGV", - "PR_UNALIGN_NOPRINT", - "PR_UNALIGN_SIGBUS", - "PTRACE_ARCH_PRCTL", - "PTRACE_ATTACH", - "PTRACE_CONT", - "PTRACE_DETACH", - "PTRACE_EVENT_CLONE", - "PTRACE_EVENT_EXEC", - "PTRACE_EVENT_EXIT", - "PTRACE_EVENT_FORK", - "PTRACE_EVENT_VFORK", - "PTRACE_EVENT_VFORK_DONE", - "PTRACE_GETCRUNCHREGS", - "PTRACE_GETEVENTMSG", - "PTRACE_GETFPREGS", - "PTRACE_GETFPXREGS", - "PTRACE_GETHBPREGS", - "PTRACE_GETREGS", - "PTRACE_GETREGSET", - "PTRACE_GETSIGINFO", - "PTRACE_GETVFPREGS", - "PTRACE_GETWMMXREGS", - "PTRACE_GET_THREAD_AREA", - "PTRACE_KILL", - "PTRACE_OLDSETOPTIONS", - "PTRACE_O_MASK", - "PTRACE_O_TRACECLONE", - "PTRACE_O_TRACEEXEC", - "PTRACE_O_TRACEEXIT", - "PTRACE_O_TRACEFORK", - "PTRACE_O_TRACESYSGOOD", - "PTRACE_O_TRACEVFORK", - "PTRACE_O_TRACEVFORKDONE", - "PTRACE_PEEKDATA", - "PTRACE_PEEKTEXT", - "PTRACE_PEEKUSR", - "PTRACE_POKEDATA", - "PTRACE_POKETEXT", - "PTRACE_POKEUSR", - "PTRACE_SETCRUNCHREGS", - "PTRACE_SETFPREGS", - "PTRACE_SETFPXREGS", - "PTRACE_SETHBPREGS", - "PTRACE_SETOPTIONS", - "PTRACE_SETREGS", - "PTRACE_SETREGSET", - "PTRACE_SETSIGINFO", - "PTRACE_SETVFPREGS", - "PTRACE_SETWMMXREGS", - "PTRACE_SET_SYSCALL", - "PTRACE_SET_THREAD_AREA", - "PTRACE_SINGLEBLOCK", - "PTRACE_SINGLESTEP", - "PTRACE_SYSCALL", - "PTRACE_SYSEMU", - "PTRACE_SYSEMU_SINGLESTEP", - "PTRACE_TRACEME", - "PT_ATTACH", - "PT_ATTACHEXC", - "PT_CONTINUE", - "PT_DATA_ADDR", - "PT_DENY_ATTACH", - "PT_DETACH", - "PT_FIRSTMACH", - "PT_FORCEQUOTA", - "PT_KILL", - "PT_MASK", - "PT_READ_D", - "PT_READ_I", - "PT_READ_U", - "PT_SIGEXC", - "PT_STEP", - "PT_TEXT_ADDR", - "PT_TEXT_END_ADDR", - "PT_THUPDATE", - "PT_TRACE_ME", - "PT_WRITE_D", - "PT_WRITE_I", - "PT_WRITE_U", - "ParseDirent", - "ParseNetlinkMessage", - "ParseNetlinkRouteAttr", - "ParseRoutingMessage", - "ParseRoutingSockaddr", - "ParseSocketControlMessage", - "ParseUnixCredentials", - "ParseUnixRights", - "PathMax", - "Pathconf", - "Pause", - "Pipe", - "Pipe2", - "PivotRoot", - "Pointer", - "PostQueuedCompletionStatus", - "Pread", - "Proc", - "ProcAttr", - "Process32First", - "Process32Next", - "ProcessEntry32", - "ProcessInformation", - "Protoent", - "PtraceAttach", - "PtraceCont", - "PtraceDetach", - "PtraceGetEventMsg", - "PtraceGetRegs", - "PtracePeekData", - "PtracePeekText", - "PtracePokeData", - "PtracePokeText", - "PtraceRegs", - "PtraceSetOptions", - "PtraceSetRegs", - "PtraceSingleStep", - "PtraceSyscall", - "Pwrite", - "REG_BINARY", - "REG_DWORD", - "REG_DWORD_BIG_ENDIAN", - "REG_DWORD_LITTLE_ENDIAN", - "REG_EXPAND_SZ", - "REG_FULL_RESOURCE_DESCRIPTOR", - "REG_LINK", - "REG_MULTI_SZ", - "REG_NONE", - "REG_QWORD", - "REG_QWORD_LITTLE_ENDIAN", - "REG_RESOURCE_LIST", - "REG_RESOURCE_REQUIREMENTS_LIST", - "REG_SZ", - "RLIMIT_AS", - "RLIMIT_CORE", - "RLIMIT_CPU", - "RLIMIT_CPU_USAGE_MONITOR", - "RLIMIT_DATA", - "RLIMIT_FSIZE", - "RLIMIT_NOFILE", - "RLIMIT_STACK", - "RLIM_INFINITY", - "RTAX_ADVMSS", - "RTAX_AUTHOR", - "RTAX_BRD", - "RTAX_CWND", - "RTAX_DST", - "RTAX_FEATURES", - "RTAX_FEATURE_ALLFRAG", - "RTAX_FEATURE_ECN", - "RTAX_FEATURE_SACK", - "RTAX_FEATURE_TIMESTAMP", - "RTAX_GATEWAY", - "RTAX_GENMASK", - "RTAX_HOPLIMIT", - "RTAX_IFA", - "RTAX_IFP", - "RTAX_INITCWND", - "RTAX_INITRWND", - "RTAX_LABEL", - "RTAX_LOCK", - "RTAX_MAX", - "RTAX_MTU", - "RTAX_NETMASK", - "RTAX_REORDERING", - "RTAX_RTO_MIN", - "RTAX_RTT", - "RTAX_RTTVAR", - "RTAX_SRC", - "RTAX_SRCMASK", - "RTAX_SSTHRESH", - "RTAX_TAG", - "RTAX_UNSPEC", - "RTAX_WINDOW", - "RTA_ALIGNTO", - "RTA_AUTHOR", - "RTA_BRD", - "RTA_CACHEINFO", - "RTA_DST", - "RTA_FLOW", - "RTA_GATEWAY", - "RTA_GENMASK", - "RTA_IFA", - "RTA_IFP", - "RTA_IIF", - "RTA_LABEL", - "RTA_MAX", - "RTA_METRICS", - "RTA_MULTIPATH", - "RTA_NETMASK", - "RTA_OIF", - "RTA_PREFSRC", - "RTA_PRIORITY", - "RTA_SRC", - "RTA_SRCMASK", - "RTA_TABLE", - "RTA_TAG", - "RTA_UNSPEC", - "RTCF_DIRECTSRC", - "RTCF_DOREDIRECT", - "RTCF_LOG", - "RTCF_MASQ", - "RTCF_NAT", - "RTCF_VALVE", - "RTF_ADDRCLASSMASK", - "RTF_ADDRCONF", - "RTF_ALLONLINK", - "RTF_ANNOUNCE", - "RTF_BLACKHOLE", - "RTF_BROADCAST", - "RTF_CACHE", - "RTF_CLONED", - "RTF_CLONING", - "RTF_CONDEMNED", - "RTF_DEFAULT", - "RTF_DELCLONE", - "RTF_DONE", - "RTF_DYNAMIC", - "RTF_FLOW", - "RTF_FMASK", - "RTF_GATEWAY", - "RTF_GWFLAG_COMPAT", - "RTF_HOST", - "RTF_IFREF", - "RTF_IFSCOPE", - "RTF_INTERFACE", - "RTF_IRTT", - "RTF_LINKRT", - "RTF_LLDATA", - "RTF_LLINFO", - "RTF_LOCAL", - "RTF_MASK", - "RTF_MODIFIED", - "RTF_MPATH", - "RTF_MPLS", - "RTF_MSS", - "RTF_MTU", - "RTF_MULTICAST", - "RTF_NAT", - "RTF_NOFORWARD", - "RTF_NONEXTHOP", - "RTF_NOPMTUDISC", - "RTF_PERMANENT_ARP", - "RTF_PINNED", - "RTF_POLICY", - "RTF_PRCLONING", - "RTF_PROTO1", - "RTF_PROTO2", - "RTF_PROTO3", - "RTF_PROXY", - "RTF_REINSTATE", - "RTF_REJECT", - "RTF_RNH_LOCKED", - "RTF_ROUTER", - "RTF_SOURCE", - "RTF_SRC", - "RTF_STATIC", - "RTF_STICKY", - "RTF_THROW", - "RTF_TUNNEL", - "RTF_UP", - "RTF_USETRAILERS", - "RTF_WASCLONED", - "RTF_WINDOW", - "RTF_XRESOLVE", - "RTM_ADD", - "RTM_BASE", - "RTM_CHANGE", - "RTM_CHGADDR", - "RTM_DELACTION", - "RTM_DELADDR", - "RTM_DELADDRLABEL", - "RTM_DELETE", - "RTM_DELLINK", - "RTM_DELMADDR", - "RTM_DELNEIGH", - "RTM_DELQDISC", - "RTM_DELROUTE", - "RTM_DELRULE", - "RTM_DELTCLASS", - "RTM_DELTFILTER", - "RTM_DESYNC", - "RTM_F_CLONED", - "RTM_F_EQUALIZE", - "RTM_F_NOTIFY", - "RTM_F_PREFIX", - "RTM_GET", - "RTM_GET2", - "RTM_GETACTION", - "RTM_GETADDR", - "RTM_GETADDRLABEL", - "RTM_GETANYCAST", - "RTM_GETDCB", - "RTM_GETLINK", - "RTM_GETMULTICAST", - "RTM_GETNEIGH", - "RTM_GETNEIGHTBL", - "RTM_GETQDISC", - "RTM_GETROUTE", - "RTM_GETRULE", - "RTM_GETTCLASS", - "RTM_GETTFILTER", - "RTM_IEEE80211", - "RTM_IFANNOUNCE", - "RTM_IFINFO", - "RTM_IFINFO2", - "RTM_LLINFO_UPD", - "RTM_LOCK", - "RTM_LOSING", - "RTM_MAX", - "RTM_MAXSIZE", - "RTM_MISS", - "RTM_NEWACTION", - "RTM_NEWADDR", - "RTM_NEWADDRLABEL", - "RTM_NEWLINK", - "RTM_NEWMADDR", - "RTM_NEWMADDR2", - "RTM_NEWNDUSEROPT", - "RTM_NEWNEIGH", - "RTM_NEWNEIGHTBL", - "RTM_NEWPREFIX", - "RTM_NEWQDISC", - "RTM_NEWROUTE", - "RTM_NEWRULE", - "RTM_NEWTCLASS", - "RTM_NEWTFILTER", - "RTM_NR_FAMILIES", - "RTM_NR_MSGTYPES", - "RTM_OIFINFO", - "RTM_OLDADD", - "RTM_OLDDEL", - "RTM_OOIFINFO", - "RTM_REDIRECT", - "RTM_RESOLVE", - "RTM_RTTUNIT", - "RTM_SETDCB", - "RTM_SETGATE", - "RTM_SETLINK", - "RTM_SETNEIGHTBL", - "RTM_VERSION", - "RTNH_ALIGNTO", - "RTNH_F_DEAD", - "RTNH_F_ONLINK", - "RTNH_F_PERVASIVE", - "RTNLGRP_IPV4_IFADDR", - "RTNLGRP_IPV4_MROUTE", - "RTNLGRP_IPV4_ROUTE", - "RTNLGRP_IPV4_RULE", - "RTNLGRP_IPV6_IFADDR", - "RTNLGRP_IPV6_IFINFO", - "RTNLGRP_IPV6_MROUTE", - "RTNLGRP_IPV6_PREFIX", - "RTNLGRP_IPV6_ROUTE", - "RTNLGRP_IPV6_RULE", - "RTNLGRP_LINK", - "RTNLGRP_ND_USEROPT", - "RTNLGRP_NEIGH", - "RTNLGRP_NONE", - "RTNLGRP_NOTIFY", - "RTNLGRP_TC", - "RTN_ANYCAST", - "RTN_BLACKHOLE", - "RTN_BROADCAST", - "RTN_LOCAL", - "RTN_MAX", - "RTN_MULTICAST", - "RTN_NAT", - "RTN_PROHIBIT", - "RTN_THROW", - "RTN_UNICAST", - "RTN_UNREACHABLE", - "RTN_UNSPEC", - "RTN_XRESOLVE", - "RTPROT_BIRD", - "RTPROT_BOOT", - "RTPROT_DHCP", - "RTPROT_DNROUTED", - "RTPROT_GATED", - "RTPROT_KERNEL", - "RTPROT_MRT", - "RTPROT_NTK", - "RTPROT_RA", - "RTPROT_REDIRECT", - "RTPROT_STATIC", - "RTPROT_UNSPEC", - "RTPROT_XORP", - "RTPROT_ZEBRA", - "RTV_EXPIRE", - "RTV_HOPCOUNT", - "RTV_MTU", - "RTV_RPIPE", - "RTV_RTT", - "RTV_RTTVAR", - "RTV_SPIPE", - "RTV_SSTHRESH", - "RTV_WEIGHT", - "RT_CACHING_CONTEXT", - "RT_CLASS_DEFAULT", - "RT_CLASS_LOCAL", - "RT_CLASS_MAIN", - "RT_CLASS_MAX", - "RT_CLASS_UNSPEC", - "RT_DEFAULT_FIB", - "RT_NORTREF", - "RT_SCOPE_HOST", - "RT_SCOPE_LINK", - "RT_SCOPE_NOWHERE", - "RT_SCOPE_SITE", - "RT_SCOPE_UNIVERSE", - "RT_TABLEID_MAX", - "RT_TABLE_COMPAT", - "RT_TABLE_DEFAULT", - "RT_TABLE_LOCAL", - "RT_TABLE_MAIN", - "RT_TABLE_MAX", - "RT_TABLE_UNSPEC", - "RUSAGE_CHILDREN", - "RUSAGE_SELF", - "RUSAGE_THREAD", - "Radvisory_t", - "RawConn", - "RawSockaddr", - "RawSockaddrAny", - "RawSockaddrDatalink", - "RawSockaddrInet4", - "RawSockaddrInet6", - "RawSockaddrLinklayer", - "RawSockaddrNetlink", - "RawSockaddrUnix", - "RawSyscall", - "RawSyscall6", - "Read", - "ReadConsole", - "ReadDirectoryChanges", - "ReadDirent", - "ReadFile", - "Readlink", - "Reboot", - "Recvfrom", - "Recvmsg", - "RegCloseKey", - "RegEnumKeyEx", - "RegOpenKeyEx", - "RegQueryInfoKey", - "RegQueryValueEx", - "RemoveDirectory", - "Removexattr", - "Rename", - "Renameat", - "Revoke", - "Rlimit", - "Rmdir", - "RouteMessage", - "RouteRIB", - "RoutingMessage", - "RtAttr", - "RtGenmsg", - "RtMetrics", - "RtMsg", - "RtMsghdr", - "RtNexthop", - "Rusage", - "SCM_BINTIME", - "SCM_CREDENTIALS", - "SCM_CREDS", - "SCM_RIGHTS", - "SCM_TIMESTAMP", - "SCM_TIMESTAMPING", - "SCM_TIMESTAMPNS", - "SCM_TIMESTAMP_MONOTONIC", - "SHUT_RD", - "SHUT_RDWR", - "SHUT_WR", - "SID", - "SIDAndAttributes", - "SIGABRT", - "SIGALRM", - "SIGBUS", - "SIGCHLD", - "SIGCLD", - "SIGCONT", - "SIGEMT", - "SIGFPE", - "SIGHUP", - "SIGILL", - "SIGINFO", - "SIGINT", - "SIGIO", - "SIGIOT", - "SIGKILL", - "SIGLIBRT", - "SIGLWP", - "SIGPIPE", - "SIGPOLL", - "SIGPROF", - "SIGPWR", - "SIGQUIT", - "SIGSEGV", - "SIGSTKFLT", - "SIGSTOP", - "SIGSYS", - "SIGTERM", - "SIGTHR", - "SIGTRAP", - "SIGTSTP", - "SIGTTIN", - "SIGTTOU", - "SIGUNUSED", - "SIGURG", - "SIGUSR1", - "SIGUSR2", - "SIGVTALRM", - "SIGWINCH", - "SIGXCPU", - "SIGXFSZ", - "SIOCADDDLCI", - "SIOCADDMULTI", - "SIOCADDRT", - "SIOCAIFADDR", - "SIOCAIFGROUP", - "SIOCALIFADDR", - "SIOCARPIPLL", - "SIOCATMARK", - "SIOCAUTOADDR", - "SIOCAUTONETMASK", - "SIOCBRDGADD", - "SIOCBRDGADDS", - "SIOCBRDGARL", - "SIOCBRDGDADDR", - "SIOCBRDGDEL", - "SIOCBRDGDELS", - "SIOCBRDGFLUSH", - "SIOCBRDGFRL", - "SIOCBRDGGCACHE", - "SIOCBRDGGFD", - "SIOCBRDGGHT", - "SIOCBRDGGIFFLGS", - "SIOCBRDGGMA", - "SIOCBRDGGPARAM", - "SIOCBRDGGPRI", - "SIOCBRDGGRL", - "SIOCBRDGGSIFS", - "SIOCBRDGGTO", - "SIOCBRDGIFS", - "SIOCBRDGRTS", - "SIOCBRDGSADDR", - "SIOCBRDGSCACHE", - "SIOCBRDGSFD", - "SIOCBRDGSHT", - "SIOCBRDGSIFCOST", - "SIOCBRDGSIFFLGS", - "SIOCBRDGSIFPRIO", - "SIOCBRDGSMA", - "SIOCBRDGSPRI", - "SIOCBRDGSPROTO", - "SIOCBRDGSTO", - "SIOCBRDGSTXHC", - "SIOCDARP", - "SIOCDELDLCI", - "SIOCDELMULTI", - "SIOCDELRT", - "SIOCDEVPRIVATE", - "SIOCDIFADDR", - "SIOCDIFGROUP", - "SIOCDIFPHYADDR", - "SIOCDLIFADDR", - "SIOCDRARP", - "SIOCGARP", - "SIOCGDRVSPEC", - "SIOCGETKALIVE", - "SIOCGETLABEL", - "SIOCGETPFLOW", - "SIOCGETPFSYNC", - "SIOCGETSGCNT", - "SIOCGETVIFCNT", - "SIOCGETVLAN", - "SIOCGHIWAT", - "SIOCGIFADDR", - "SIOCGIFADDRPREF", - "SIOCGIFALIAS", - "SIOCGIFALTMTU", - "SIOCGIFASYNCMAP", - "SIOCGIFBOND", - "SIOCGIFBR", - "SIOCGIFBRDADDR", - "SIOCGIFCAP", - "SIOCGIFCONF", - "SIOCGIFCOUNT", - "SIOCGIFDATA", - "SIOCGIFDESCR", - "SIOCGIFDEVMTU", - "SIOCGIFDLT", - "SIOCGIFDSTADDR", - "SIOCGIFENCAP", - "SIOCGIFFIB", - "SIOCGIFFLAGS", - "SIOCGIFGATTR", - "SIOCGIFGENERIC", - "SIOCGIFGMEMB", - "SIOCGIFGROUP", - "SIOCGIFHARDMTU", - "SIOCGIFHWADDR", - "SIOCGIFINDEX", - "SIOCGIFKPI", - "SIOCGIFMAC", - "SIOCGIFMAP", - "SIOCGIFMEDIA", - "SIOCGIFMEM", - "SIOCGIFMETRIC", - "SIOCGIFMTU", - "SIOCGIFNAME", - "SIOCGIFNETMASK", - "SIOCGIFPDSTADDR", - "SIOCGIFPFLAGS", - "SIOCGIFPHYS", - "SIOCGIFPRIORITY", - "SIOCGIFPSRCADDR", - "SIOCGIFRDOMAIN", - "SIOCGIFRTLABEL", - "SIOCGIFSLAVE", - "SIOCGIFSTATUS", - "SIOCGIFTIMESLOT", - "SIOCGIFTXQLEN", - "SIOCGIFVLAN", - "SIOCGIFWAKEFLAGS", - "SIOCGIFXFLAGS", - "SIOCGLIFADDR", - "SIOCGLIFPHYADDR", - "SIOCGLIFPHYRTABLE", - "SIOCGLIFPHYTTL", - "SIOCGLINKSTR", - "SIOCGLOWAT", - "SIOCGPGRP", - "SIOCGPRIVATE_0", - "SIOCGPRIVATE_1", - "SIOCGRARP", - "SIOCGSPPPPARAMS", - "SIOCGSTAMP", - "SIOCGSTAMPNS", - "SIOCGVH", - "SIOCGVNETID", - "SIOCIFCREATE", - "SIOCIFCREATE2", - "SIOCIFDESTROY", - "SIOCIFGCLONERS", - "SIOCINITIFADDR", - "SIOCPROTOPRIVATE", - "SIOCRSLVMULTI", - "SIOCRTMSG", - "SIOCSARP", - "SIOCSDRVSPEC", - "SIOCSETKALIVE", - "SIOCSETLABEL", - "SIOCSETPFLOW", - "SIOCSETPFSYNC", - "SIOCSETVLAN", - "SIOCSHIWAT", - "SIOCSIFADDR", - "SIOCSIFADDRPREF", - "SIOCSIFALTMTU", - "SIOCSIFASYNCMAP", - "SIOCSIFBOND", - "SIOCSIFBR", - "SIOCSIFBRDADDR", - "SIOCSIFCAP", - "SIOCSIFDESCR", - "SIOCSIFDSTADDR", - "SIOCSIFENCAP", - "SIOCSIFFIB", - "SIOCSIFFLAGS", - "SIOCSIFGATTR", - "SIOCSIFGENERIC", - "SIOCSIFHWADDR", - "SIOCSIFHWBROADCAST", - "SIOCSIFKPI", - "SIOCSIFLINK", - "SIOCSIFLLADDR", - "SIOCSIFMAC", - "SIOCSIFMAP", - "SIOCSIFMEDIA", - "SIOCSIFMEM", - "SIOCSIFMETRIC", - "SIOCSIFMTU", - "SIOCSIFNAME", - "SIOCSIFNETMASK", - "SIOCSIFPFLAGS", - "SIOCSIFPHYADDR", - "SIOCSIFPHYS", - "SIOCSIFPRIORITY", - "SIOCSIFRDOMAIN", - "SIOCSIFRTLABEL", - "SIOCSIFRVNET", - "SIOCSIFSLAVE", - "SIOCSIFTIMESLOT", - "SIOCSIFTXQLEN", - "SIOCSIFVLAN", - "SIOCSIFVNET", - "SIOCSIFXFLAGS", - "SIOCSLIFPHYADDR", - "SIOCSLIFPHYRTABLE", - "SIOCSLIFPHYTTL", - "SIOCSLINKSTR", - "SIOCSLOWAT", - "SIOCSPGRP", - "SIOCSRARP", - "SIOCSSPPPPARAMS", - "SIOCSVH", - "SIOCSVNETID", - "SIOCZIFDATA", - "SIO_GET_EXTENSION_FUNCTION_POINTER", - "SIO_GET_INTERFACE_LIST", - "SIO_KEEPALIVE_VALS", - "SIO_UDP_CONNRESET", - "SOCK_CLOEXEC", - "SOCK_DCCP", - "SOCK_DGRAM", - "SOCK_FLAGS_MASK", - "SOCK_MAXADDRLEN", - "SOCK_NONBLOCK", - "SOCK_NOSIGPIPE", - "SOCK_PACKET", - "SOCK_RAW", - "SOCK_RDM", - "SOCK_SEQPACKET", - "SOCK_STREAM", - "SOL_AAL", - "SOL_ATM", - "SOL_DECNET", - "SOL_ICMPV6", - "SOL_IP", - "SOL_IPV6", - "SOL_IRDA", - "SOL_PACKET", - "SOL_RAW", - "SOL_SOCKET", - "SOL_TCP", - "SOL_X25", - "SOMAXCONN", - "SO_ACCEPTCONN", - "SO_ACCEPTFILTER", - "SO_ATTACH_FILTER", - "SO_BINDANY", - "SO_BINDTODEVICE", - "SO_BINTIME", - "SO_BROADCAST", - "SO_BSDCOMPAT", - "SO_DEBUG", - "SO_DETACH_FILTER", - "SO_DOMAIN", - "SO_DONTROUTE", - "SO_DONTTRUNC", - "SO_ERROR", - "SO_KEEPALIVE", - "SO_LABEL", - "SO_LINGER", - "SO_LINGER_SEC", - "SO_LISTENINCQLEN", - "SO_LISTENQLEN", - "SO_LISTENQLIMIT", - "SO_MARK", - "SO_NETPROC", - "SO_NKE", - "SO_NOADDRERR", - "SO_NOHEADER", - "SO_NOSIGPIPE", - "SO_NOTIFYCONFLICT", - "SO_NO_CHECK", - "SO_NO_DDP", - "SO_NO_OFFLOAD", - "SO_NP_EXTENSIONS", - "SO_NREAD", - "SO_NUMRCVPKT", - "SO_NWRITE", - "SO_OOBINLINE", - "SO_OVERFLOWED", - "SO_PASSCRED", - "SO_PASSSEC", - "SO_PEERCRED", - "SO_PEERLABEL", - "SO_PEERNAME", - "SO_PEERSEC", - "SO_PRIORITY", - "SO_PROTOCOL", - "SO_PROTOTYPE", - "SO_RANDOMPORT", - "SO_RCVBUF", - "SO_RCVBUFFORCE", - "SO_RCVLOWAT", - "SO_RCVTIMEO", - "SO_RESTRICTIONS", - "SO_RESTRICT_DENYIN", - "SO_RESTRICT_DENYOUT", - "SO_RESTRICT_DENYSET", - "SO_REUSEADDR", - "SO_REUSEPORT", - "SO_REUSESHAREUID", - "SO_RTABLE", - "SO_RXQ_OVFL", - "SO_SECURITY_AUTHENTICATION", - "SO_SECURITY_ENCRYPTION_NETWORK", - "SO_SECURITY_ENCRYPTION_TRANSPORT", - "SO_SETFIB", - "SO_SNDBUF", - "SO_SNDBUFFORCE", - "SO_SNDLOWAT", - "SO_SNDTIMEO", - "SO_SPLICE", - "SO_TIMESTAMP", - "SO_TIMESTAMPING", - "SO_TIMESTAMPNS", - "SO_TIMESTAMP_MONOTONIC", - "SO_TYPE", - "SO_UPCALLCLOSEWAIT", - "SO_UPDATE_ACCEPT_CONTEXT", - "SO_UPDATE_CONNECT_CONTEXT", - "SO_USELOOPBACK", - "SO_USER_COOKIE", - "SO_VENDOR", - "SO_WANTMORE", - "SO_WANTOOBFLAG", - "SSLExtraCertChainPolicyPara", - "STANDARD_RIGHTS_ALL", - "STANDARD_RIGHTS_EXECUTE", - "STANDARD_RIGHTS_READ", - "STANDARD_RIGHTS_REQUIRED", - "STANDARD_RIGHTS_WRITE", - "STARTF_USESHOWWINDOW", - "STARTF_USESTDHANDLES", - "STD_ERROR_HANDLE", - "STD_INPUT_HANDLE", - "STD_OUTPUT_HANDLE", - "SUBLANG_ENGLISH_US", - "SW_FORCEMINIMIZE", - "SW_HIDE", - "SW_MAXIMIZE", - "SW_MINIMIZE", - "SW_NORMAL", - "SW_RESTORE", - "SW_SHOW", - "SW_SHOWDEFAULT", - "SW_SHOWMAXIMIZED", - "SW_SHOWMINIMIZED", - "SW_SHOWMINNOACTIVE", - "SW_SHOWNA", - "SW_SHOWNOACTIVATE", - "SW_SHOWNORMAL", - "SYMBOLIC_LINK_FLAG_DIRECTORY", - "SYNCHRONIZE", - "SYSCTL_VERSION", - "SYSCTL_VERS_0", - "SYSCTL_VERS_1", - "SYSCTL_VERS_MASK", - "SYS_ABORT2", - "SYS_ACCEPT", - "SYS_ACCEPT4", - "SYS_ACCEPT_NOCANCEL", - "SYS_ACCESS", - "SYS_ACCESS_EXTENDED", - "SYS_ACCT", - "SYS_ADD_KEY", - "SYS_ADD_PROFIL", - "SYS_ADJFREQ", - "SYS_ADJTIME", - "SYS_ADJTIMEX", - "SYS_AFS_SYSCALL", - "SYS_AIO_CANCEL", - "SYS_AIO_ERROR", - "SYS_AIO_FSYNC", - "SYS_AIO_MLOCK", - "SYS_AIO_READ", - "SYS_AIO_RETURN", - "SYS_AIO_SUSPEND", - "SYS_AIO_SUSPEND_NOCANCEL", - "SYS_AIO_WAITCOMPLETE", - "SYS_AIO_WRITE", - "SYS_ALARM", - "SYS_ARCH_PRCTL", - "SYS_ARM_FADVISE64_64", - "SYS_ARM_SYNC_FILE_RANGE", - "SYS_ATGETMSG", - "SYS_ATPGETREQ", - "SYS_ATPGETRSP", - "SYS_ATPSNDREQ", - "SYS_ATPSNDRSP", - "SYS_ATPUTMSG", - "SYS_ATSOCKET", - "SYS_AUDIT", - "SYS_AUDITCTL", - "SYS_AUDITON", - "SYS_AUDIT_SESSION_JOIN", - "SYS_AUDIT_SESSION_PORT", - "SYS_AUDIT_SESSION_SELF", - "SYS_BDFLUSH", - "SYS_BIND", - "SYS_BINDAT", - "SYS_BREAK", - "SYS_BRK", - "SYS_BSDTHREAD_CREATE", - "SYS_BSDTHREAD_REGISTER", - "SYS_BSDTHREAD_TERMINATE", - "SYS_CAPGET", - "SYS_CAPSET", - "SYS_CAP_ENTER", - "SYS_CAP_FCNTLS_GET", - "SYS_CAP_FCNTLS_LIMIT", - "SYS_CAP_GETMODE", - "SYS_CAP_GETRIGHTS", - "SYS_CAP_IOCTLS_GET", - "SYS_CAP_IOCTLS_LIMIT", - "SYS_CAP_NEW", - "SYS_CAP_RIGHTS_GET", - "SYS_CAP_RIGHTS_LIMIT", - "SYS_CHDIR", - "SYS_CHFLAGS", - "SYS_CHFLAGSAT", - "SYS_CHMOD", - "SYS_CHMOD_EXTENDED", - "SYS_CHOWN", - "SYS_CHOWN32", - "SYS_CHROOT", - "SYS_CHUD", - "SYS_CLOCK_ADJTIME", - "SYS_CLOCK_GETCPUCLOCKID2", - "SYS_CLOCK_GETRES", - "SYS_CLOCK_GETTIME", - "SYS_CLOCK_NANOSLEEP", - "SYS_CLOCK_SETTIME", - "SYS_CLONE", - "SYS_CLOSE", - "SYS_CLOSEFROM", - "SYS_CLOSE_NOCANCEL", - "SYS_CONNECT", - "SYS_CONNECTAT", - "SYS_CONNECT_NOCANCEL", - "SYS_COPYFILE", - "SYS_CPUSET", - "SYS_CPUSET_GETAFFINITY", - "SYS_CPUSET_GETID", - "SYS_CPUSET_SETAFFINITY", - "SYS_CPUSET_SETID", - "SYS_CREAT", - "SYS_CREATE_MODULE", - "SYS_CSOPS", - "SYS_CSOPS_AUDITTOKEN", - "SYS_DELETE", - "SYS_DELETE_MODULE", - "SYS_DUP", - "SYS_DUP2", - "SYS_DUP3", - "SYS_EACCESS", - "SYS_EPOLL_CREATE", - "SYS_EPOLL_CREATE1", - "SYS_EPOLL_CTL", - "SYS_EPOLL_CTL_OLD", - "SYS_EPOLL_PWAIT", - "SYS_EPOLL_WAIT", - "SYS_EPOLL_WAIT_OLD", - "SYS_EVENTFD", - "SYS_EVENTFD2", - "SYS_EXCHANGEDATA", - "SYS_EXECVE", - "SYS_EXIT", - "SYS_EXIT_GROUP", - "SYS_EXTATTRCTL", - "SYS_EXTATTR_DELETE_FD", - "SYS_EXTATTR_DELETE_FILE", - "SYS_EXTATTR_DELETE_LINK", - "SYS_EXTATTR_GET_FD", - "SYS_EXTATTR_GET_FILE", - "SYS_EXTATTR_GET_LINK", - "SYS_EXTATTR_LIST_FD", - "SYS_EXTATTR_LIST_FILE", - "SYS_EXTATTR_LIST_LINK", - "SYS_EXTATTR_SET_FD", - "SYS_EXTATTR_SET_FILE", - "SYS_EXTATTR_SET_LINK", - "SYS_FACCESSAT", - "SYS_FADVISE64", - "SYS_FADVISE64_64", - "SYS_FALLOCATE", - "SYS_FANOTIFY_INIT", - "SYS_FANOTIFY_MARK", - "SYS_FCHDIR", - "SYS_FCHFLAGS", - "SYS_FCHMOD", - "SYS_FCHMODAT", - "SYS_FCHMOD_EXTENDED", - "SYS_FCHOWN", - "SYS_FCHOWN32", - "SYS_FCHOWNAT", - "SYS_FCHROOT", - "SYS_FCNTL", - "SYS_FCNTL64", - "SYS_FCNTL_NOCANCEL", - "SYS_FDATASYNC", - "SYS_FEXECVE", - "SYS_FFCLOCK_GETCOUNTER", - "SYS_FFCLOCK_GETESTIMATE", - "SYS_FFCLOCK_SETESTIMATE", - "SYS_FFSCTL", - "SYS_FGETATTRLIST", - "SYS_FGETXATTR", - "SYS_FHOPEN", - "SYS_FHSTAT", - "SYS_FHSTATFS", - "SYS_FILEPORT_MAKEFD", - "SYS_FILEPORT_MAKEPORT", - "SYS_FKTRACE", - "SYS_FLISTXATTR", - "SYS_FLOCK", - "SYS_FORK", - "SYS_FPATHCONF", - "SYS_FREEBSD6_FTRUNCATE", - "SYS_FREEBSD6_LSEEK", - "SYS_FREEBSD6_MMAP", - "SYS_FREEBSD6_PREAD", - "SYS_FREEBSD6_PWRITE", - "SYS_FREEBSD6_TRUNCATE", - "SYS_FREMOVEXATTR", - "SYS_FSCTL", - "SYS_FSETATTRLIST", - "SYS_FSETXATTR", - "SYS_FSGETPATH", - "SYS_FSTAT", - "SYS_FSTAT64", - "SYS_FSTAT64_EXTENDED", - "SYS_FSTATAT", - "SYS_FSTATAT64", - "SYS_FSTATFS", - "SYS_FSTATFS64", - "SYS_FSTATV", - "SYS_FSTATVFS1", - "SYS_FSTAT_EXTENDED", - "SYS_FSYNC", - "SYS_FSYNC_NOCANCEL", - "SYS_FSYNC_RANGE", - "SYS_FTIME", - "SYS_FTRUNCATE", - "SYS_FTRUNCATE64", - "SYS_FUTEX", - "SYS_FUTIMENS", - "SYS_FUTIMES", - "SYS_FUTIMESAT", - "SYS_GETATTRLIST", - "SYS_GETAUDIT", - "SYS_GETAUDIT_ADDR", - "SYS_GETAUID", - "SYS_GETCONTEXT", - "SYS_GETCPU", - "SYS_GETCWD", - "SYS_GETDENTS", - "SYS_GETDENTS64", - "SYS_GETDIRENTRIES", - "SYS_GETDIRENTRIES64", - "SYS_GETDIRENTRIESATTR", - "SYS_GETDTABLECOUNT", - "SYS_GETDTABLESIZE", - "SYS_GETEGID", - "SYS_GETEGID32", - "SYS_GETEUID", - "SYS_GETEUID32", - "SYS_GETFH", - "SYS_GETFSSTAT", - "SYS_GETFSSTAT64", - "SYS_GETGID", - "SYS_GETGID32", - "SYS_GETGROUPS", - "SYS_GETGROUPS32", - "SYS_GETHOSTUUID", - "SYS_GETITIMER", - "SYS_GETLCID", - "SYS_GETLOGIN", - "SYS_GETLOGINCLASS", - "SYS_GETPEERNAME", - "SYS_GETPGID", - "SYS_GETPGRP", - "SYS_GETPID", - "SYS_GETPMSG", - "SYS_GETPPID", - "SYS_GETPRIORITY", - "SYS_GETRESGID", - "SYS_GETRESGID32", - "SYS_GETRESUID", - "SYS_GETRESUID32", - "SYS_GETRLIMIT", - "SYS_GETRTABLE", - "SYS_GETRUSAGE", - "SYS_GETSGROUPS", - "SYS_GETSID", - "SYS_GETSOCKNAME", - "SYS_GETSOCKOPT", - "SYS_GETTHRID", - "SYS_GETTID", - "SYS_GETTIMEOFDAY", - "SYS_GETUID", - "SYS_GETUID32", - "SYS_GETVFSSTAT", - "SYS_GETWGROUPS", - "SYS_GETXATTR", - "SYS_GET_KERNEL_SYMS", - "SYS_GET_MEMPOLICY", - "SYS_GET_ROBUST_LIST", - "SYS_GET_THREAD_AREA", - "SYS_GSSD_SYSCALL", - "SYS_GTTY", - "SYS_IDENTITYSVC", - "SYS_IDLE", - "SYS_INITGROUPS", - "SYS_INIT_MODULE", - "SYS_INOTIFY_ADD_WATCH", - "SYS_INOTIFY_INIT", - "SYS_INOTIFY_INIT1", - "SYS_INOTIFY_RM_WATCH", - "SYS_IOCTL", - "SYS_IOPERM", - "SYS_IOPL", - "SYS_IOPOLICYSYS", - "SYS_IOPRIO_GET", - "SYS_IOPRIO_SET", - "SYS_IO_CANCEL", - "SYS_IO_DESTROY", - "SYS_IO_GETEVENTS", - "SYS_IO_SETUP", - "SYS_IO_SUBMIT", - "SYS_IPC", - "SYS_ISSETUGID", - "SYS_JAIL", - "SYS_JAIL_ATTACH", - "SYS_JAIL_GET", - "SYS_JAIL_REMOVE", - "SYS_JAIL_SET", - "SYS_KAS_INFO", - "SYS_KDEBUG_TRACE", - "SYS_KENV", - "SYS_KEVENT", - "SYS_KEVENT64", - "SYS_KEXEC_LOAD", - "SYS_KEYCTL", - "SYS_KILL", - "SYS_KLDFIND", - "SYS_KLDFIRSTMOD", - "SYS_KLDLOAD", - "SYS_KLDNEXT", - "SYS_KLDSTAT", - "SYS_KLDSYM", - "SYS_KLDUNLOAD", - "SYS_KLDUNLOADF", - "SYS_KMQ_NOTIFY", - "SYS_KMQ_OPEN", - "SYS_KMQ_SETATTR", - "SYS_KMQ_TIMEDRECEIVE", - "SYS_KMQ_TIMEDSEND", - "SYS_KMQ_UNLINK", - "SYS_KQUEUE", - "SYS_KQUEUE1", - "SYS_KSEM_CLOSE", - "SYS_KSEM_DESTROY", - "SYS_KSEM_GETVALUE", - "SYS_KSEM_INIT", - "SYS_KSEM_OPEN", - "SYS_KSEM_POST", - "SYS_KSEM_TIMEDWAIT", - "SYS_KSEM_TRYWAIT", - "SYS_KSEM_UNLINK", - "SYS_KSEM_WAIT", - "SYS_KTIMER_CREATE", - "SYS_KTIMER_DELETE", - "SYS_KTIMER_GETOVERRUN", - "SYS_KTIMER_GETTIME", - "SYS_KTIMER_SETTIME", - "SYS_KTRACE", - "SYS_LCHFLAGS", - "SYS_LCHMOD", - "SYS_LCHOWN", - "SYS_LCHOWN32", - "SYS_LEDGER", - "SYS_LGETFH", - "SYS_LGETXATTR", - "SYS_LINK", - "SYS_LINKAT", - "SYS_LIO_LISTIO", - "SYS_LISTEN", - "SYS_LISTXATTR", - "SYS_LLISTXATTR", - "SYS_LOCK", - "SYS_LOOKUP_DCOOKIE", - "SYS_LPATHCONF", - "SYS_LREMOVEXATTR", - "SYS_LSEEK", - "SYS_LSETXATTR", - "SYS_LSTAT", - "SYS_LSTAT64", - "SYS_LSTAT64_EXTENDED", - "SYS_LSTATV", - "SYS_LSTAT_EXTENDED", - "SYS_LUTIMES", - "SYS_MAC_SYSCALL", - "SYS_MADVISE", - "SYS_MADVISE1", - "SYS_MAXSYSCALL", - "SYS_MBIND", - "SYS_MIGRATE_PAGES", - "SYS_MINCORE", - "SYS_MINHERIT", - "SYS_MKCOMPLEX", - "SYS_MKDIR", - "SYS_MKDIRAT", - "SYS_MKDIR_EXTENDED", - "SYS_MKFIFO", - "SYS_MKFIFOAT", - "SYS_MKFIFO_EXTENDED", - "SYS_MKNOD", - "SYS_MKNODAT", - "SYS_MLOCK", - "SYS_MLOCKALL", - "SYS_MMAP", - "SYS_MMAP2", - "SYS_MODCTL", - "SYS_MODFIND", - "SYS_MODFNEXT", - "SYS_MODIFY_LDT", - "SYS_MODNEXT", - "SYS_MODSTAT", - "SYS_MODWATCH", - "SYS_MOUNT", - "SYS_MOVE_PAGES", - "SYS_MPROTECT", - "SYS_MPX", - "SYS_MQUERY", - "SYS_MQ_GETSETATTR", - "SYS_MQ_NOTIFY", - "SYS_MQ_OPEN", - "SYS_MQ_TIMEDRECEIVE", - "SYS_MQ_TIMEDSEND", - "SYS_MQ_UNLINK", - "SYS_MREMAP", - "SYS_MSGCTL", - "SYS_MSGGET", - "SYS_MSGRCV", - "SYS_MSGRCV_NOCANCEL", - "SYS_MSGSND", - "SYS_MSGSND_NOCANCEL", - "SYS_MSGSYS", - "SYS_MSYNC", - "SYS_MSYNC_NOCANCEL", - "SYS_MUNLOCK", - "SYS_MUNLOCKALL", - "SYS_MUNMAP", - "SYS_NAME_TO_HANDLE_AT", - "SYS_NANOSLEEP", - "SYS_NEWFSTATAT", - "SYS_NFSCLNT", - "SYS_NFSSERVCTL", - "SYS_NFSSVC", - "SYS_NFSTAT", - "SYS_NICE", - "SYS_NLM_SYSCALL", - "SYS_NLSTAT", - "SYS_NMOUNT", - "SYS_NSTAT", - "SYS_NTP_ADJTIME", - "SYS_NTP_GETTIME", - "SYS_NUMA_GETAFFINITY", - "SYS_NUMA_SETAFFINITY", - "SYS_OABI_SYSCALL_BASE", - "SYS_OBREAK", - "SYS_OLDFSTAT", - "SYS_OLDLSTAT", - "SYS_OLDOLDUNAME", - "SYS_OLDSTAT", - "SYS_OLDUNAME", - "SYS_OPEN", - "SYS_OPENAT", - "SYS_OPENBSD_POLL", - "SYS_OPEN_BY_HANDLE_AT", - "SYS_OPEN_DPROTECTED_NP", - "SYS_OPEN_EXTENDED", - "SYS_OPEN_NOCANCEL", - "SYS_OVADVISE", - "SYS_PACCEPT", - "SYS_PATHCONF", - "SYS_PAUSE", - "SYS_PCICONFIG_IOBASE", - "SYS_PCICONFIG_READ", - "SYS_PCICONFIG_WRITE", - "SYS_PDFORK", - "SYS_PDGETPID", - "SYS_PDKILL", - "SYS_PERF_EVENT_OPEN", - "SYS_PERSONALITY", - "SYS_PID_HIBERNATE", - "SYS_PID_RESUME", - "SYS_PID_SHUTDOWN_SOCKETS", - "SYS_PID_SUSPEND", - "SYS_PIPE", - "SYS_PIPE2", - "SYS_PIVOT_ROOT", - "SYS_PMC_CONTROL", - "SYS_PMC_GET_INFO", - "SYS_POLL", - "SYS_POLLTS", - "SYS_POLL_NOCANCEL", - "SYS_POSIX_FADVISE", - "SYS_POSIX_FALLOCATE", - "SYS_POSIX_OPENPT", - "SYS_POSIX_SPAWN", - "SYS_PPOLL", - "SYS_PRCTL", - "SYS_PREAD", - "SYS_PREAD64", - "SYS_PREADV", - "SYS_PREAD_NOCANCEL", - "SYS_PRLIMIT64", - "SYS_PROCCTL", - "SYS_PROCESS_POLICY", - "SYS_PROCESS_VM_READV", - "SYS_PROCESS_VM_WRITEV", - "SYS_PROC_INFO", - "SYS_PROF", - "SYS_PROFIL", - "SYS_PSELECT", - "SYS_PSELECT6", - "SYS_PSET_ASSIGN", - "SYS_PSET_CREATE", - "SYS_PSET_DESTROY", - "SYS_PSYNCH_CVBROAD", - "SYS_PSYNCH_CVCLRPREPOST", - "SYS_PSYNCH_CVSIGNAL", - "SYS_PSYNCH_CVWAIT", - "SYS_PSYNCH_MUTEXDROP", - "SYS_PSYNCH_MUTEXWAIT", - "SYS_PSYNCH_RW_DOWNGRADE", - "SYS_PSYNCH_RW_LONGRDLOCK", - "SYS_PSYNCH_RW_RDLOCK", - "SYS_PSYNCH_RW_UNLOCK", - "SYS_PSYNCH_RW_UNLOCK2", - "SYS_PSYNCH_RW_UPGRADE", - "SYS_PSYNCH_RW_WRLOCK", - "SYS_PSYNCH_RW_YIELDWRLOCK", - "SYS_PTRACE", - "SYS_PUTPMSG", - "SYS_PWRITE", - "SYS_PWRITE64", - "SYS_PWRITEV", - "SYS_PWRITE_NOCANCEL", - "SYS_QUERY_MODULE", - "SYS_QUOTACTL", - "SYS_RASCTL", - "SYS_RCTL_ADD_RULE", - "SYS_RCTL_GET_LIMITS", - "SYS_RCTL_GET_RACCT", - "SYS_RCTL_GET_RULES", - "SYS_RCTL_REMOVE_RULE", - "SYS_READ", - "SYS_READAHEAD", - "SYS_READDIR", - "SYS_READLINK", - "SYS_READLINKAT", - "SYS_READV", - "SYS_READV_NOCANCEL", - "SYS_READ_NOCANCEL", - "SYS_REBOOT", - "SYS_RECV", - "SYS_RECVFROM", - "SYS_RECVFROM_NOCANCEL", - "SYS_RECVMMSG", - "SYS_RECVMSG", - "SYS_RECVMSG_NOCANCEL", - "SYS_REMAP_FILE_PAGES", - "SYS_REMOVEXATTR", - "SYS_RENAME", - "SYS_RENAMEAT", - "SYS_REQUEST_KEY", - "SYS_RESTART_SYSCALL", - "SYS_REVOKE", - "SYS_RFORK", - "SYS_RMDIR", - "SYS_RTPRIO", - "SYS_RTPRIO_THREAD", - "SYS_RT_SIGACTION", - "SYS_RT_SIGPENDING", - "SYS_RT_SIGPROCMASK", - "SYS_RT_SIGQUEUEINFO", - "SYS_RT_SIGRETURN", - "SYS_RT_SIGSUSPEND", - "SYS_RT_SIGTIMEDWAIT", - "SYS_RT_TGSIGQUEUEINFO", - "SYS_SBRK", - "SYS_SCHED_GETAFFINITY", - "SYS_SCHED_GETPARAM", - "SYS_SCHED_GETSCHEDULER", - "SYS_SCHED_GET_PRIORITY_MAX", - "SYS_SCHED_GET_PRIORITY_MIN", - "SYS_SCHED_RR_GET_INTERVAL", - "SYS_SCHED_SETAFFINITY", - "SYS_SCHED_SETPARAM", - "SYS_SCHED_SETSCHEDULER", - "SYS_SCHED_YIELD", - "SYS_SCTP_GENERIC_RECVMSG", - "SYS_SCTP_GENERIC_SENDMSG", - "SYS_SCTP_GENERIC_SENDMSG_IOV", - "SYS_SCTP_PEELOFF", - "SYS_SEARCHFS", - "SYS_SECURITY", - "SYS_SELECT", - "SYS_SELECT_NOCANCEL", - "SYS_SEMCONFIG", - "SYS_SEMCTL", - "SYS_SEMGET", - "SYS_SEMOP", - "SYS_SEMSYS", - "SYS_SEMTIMEDOP", - "SYS_SEM_CLOSE", - "SYS_SEM_DESTROY", - "SYS_SEM_GETVALUE", - "SYS_SEM_INIT", - "SYS_SEM_OPEN", - "SYS_SEM_POST", - "SYS_SEM_TRYWAIT", - "SYS_SEM_UNLINK", - "SYS_SEM_WAIT", - "SYS_SEM_WAIT_NOCANCEL", - "SYS_SEND", - "SYS_SENDFILE", - "SYS_SENDFILE64", - "SYS_SENDMMSG", - "SYS_SENDMSG", - "SYS_SENDMSG_NOCANCEL", - "SYS_SENDTO", - "SYS_SENDTO_NOCANCEL", - "SYS_SETATTRLIST", - "SYS_SETAUDIT", - "SYS_SETAUDIT_ADDR", - "SYS_SETAUID", - "SYS_SETCONTEXT", - "SYS_SETDOMAINNAME", - "SYS_SETEGID", - "SYS_SETEUID", - "SYS_SETFIB", - "SYS_SETFSGID", - "SYS_SETFSGID32", - "SYS_SETFSUID", - "SYS_SETFSUID32", - "SYS_SETGID", - "SYS_SETGID32", - "SYS_SETGROUPS", - "SYS_SETGROUPS32", - "SYS_SETHOSTNAME", - "SYS_SETITIMER", - "SYS_SETLCID", - "SYS_SETLOGIN", - "SYS_SETLOGINCLASS", - "SYS_SETNS", - "SYS_SETPGID", - "SYS_SETPRIORITY", - "SYS_SETPRIVEXEC", - "SYS_SETREGID", - "SYS_SETREGID32", - "SYS_SETRESGID", - "SYS_SETRESGID32", - "SYS_SETRESUID", - "SYS_SETRESUID32", - "SYS_SETREUID", - "SYS_SETREUID32", - "SYS_SETRLIMIT", - "SYS_SETRTABLE", - "SYS_SETSGROUPS", - "SYS_SETSID", - "SYS_SETSOCKOPT", - "SYS_SETTID", - "SYS_SETTID_WITH_PID", - "SYS_SETTIMEOFDAY", - "SYS_SETUID", - "SYS_SETUID32", - "SYS_SETWGROUPS", - "SYS_SETXATTR", - "SYS_SET_MEMPOLICY", - "SYS_SET_ROBUST_LIST", - "SYS_SET_THREAD_AREA", - "SYS_SET_TID_ADDRESS", - "SYS_SGETMASK", - "SYS_SHARED_REGION_CHECK_NP", - "SYS_SHARED_REGION_MAP_AND_SLIDE_NP", - "SYS_SHMAT", - "SYS_SHMCTL", - "SYS_SHMDT", - "SYS_SHMGET", - "SYS_SHMSYS", - "SYS_SHM_OPEN", - "SYS_SHM_UNLINK", - "SYS_SHUTDOWN", - "SYS_SIGACTION", - "SYS_SIGALTSTACK", - "SYS_SIGNAL", - "SYS_SIGNALFD", - "SYS_SIGNALFD4", - "SYS_SIGPENDING", - "SYS_SIGPROCMASK", - "SYS_SIGQUEUE", - "SYS_SIGQUEUEINFO", - "SYS_SIGRETURN", - "SYS_SIGSUSPEND", - "SYS_SIGSUSPEND_NOCANCEL", - "SYS_SIGTIMEDWAIT", - "SYS_SIGWAIT", - "SYS_SIGWAITINFO", - "SYS_SOCKET", - "SYS_SOCKETCALL", - "SYS_SOCKETPAIR", - "SYS_SPLICE", - "SYS_SSETMASK", - "SYS_SSTK", - "SYS_STACK_SNAPSHOT", - "SYS_STAT", - "SYS_STAT64", - "SYS_STAT64_EXTENDED", - "SYS_STATFS", - "SYS_STATFS64", - "SYS_STATV", - "SYS_STATVFS1", - "SYS_STAT_EXTENDED", - "SYS_STIME", - "SYS_STTY", - "SYS_SWAPCONTEXT", - "SYS_SWAPCTL", - "SYS_SWAPOFF", - "SYS_SWAPON", - "SYS_SYMLINK", - "SYS_SYMLINKAT", - "SYS_SYNC", - "SYS_SYNCFS", - "SYS_SYNC_FILE_RANGE", - "SYS_SYSARCH", - "SYS_SYSCALL", - "SYS_SYSCALL_BASE", - "SYS_SYSFS", - "SYS_SYSINFO", - "SYS_SYSLOG", - "SYS_TEE", - "SYS_TGKILL", - "SYS_THREAD_SELFID", - "SYS_THR_CREATE", - "SYS_THR_EXIT", - "SYS_THR_KILL", - "SYS_THR_KILL2", - "SYS_THR_NEW", - "SYS_THR_SELF", - "SYS_THR_SET_NAME", - "SYS_THR_SUSPEND", - "SYS_THR_WAKE", - "SYS_TIME", - "SYS_TIMERFD_CREATE", - "SYS_TIMERFD_GETTIME", - "SYS_TIMERFD_SETTIME", - "SYS_TIMER_CREATE", - "SYS_TIMER_DELETE", - "SYS_TIMER_GETOVERRUN", - "SYS_TIMER_GETTIME", - "SYS_TIMER_SETTIME", - "SYS_TIMES", - "SYS_TKILL", - "SYS_TRUNCATE", - "SYS_TRUNCATE64", - "SYS_TUXCALL", - "SYS_UGETRLIMIT", - "SYS_ULIMIT", - "SYS_UMASK", - "SYS_UMASK_EXTENDED", - "SYS_UMOUNT", - "SYS_UMOUNT2", - "SYS_UNAME", - "SYS_UNDELETE", - "SYS_UNLINK", - "SYS_UNLINKAT", - "SYS_UNMOUNT", - "SYS_UNSHARE", - "SYS_USELIB", - "SYS_USTAT", - "SYS_UTIME", - "SYS_UTIMENSAT", - "SYS_UTIMES", - "SYS_UTRACE", - "SYS_UUIDGEN", - "SYS_VADVISE", - "SYS_VFORK", - "SYS_VHANGUP", - "SYS_VM86", - "SYS_VM86OLD", - "SYS_VMSPLICE", - "SYS_VM_PRESSURE_MONITOR", - "SYS_VSERVER", - "SYS_WAIT4", - "SYS_WAIT4_NOCANCEL", - "SYS_WAIT6", - "SYS_WAITEVENT", - "SYS_WAITID", - "SYS_WAITID_NOCANCEL", - "SYS_WAITPID", - "SYS_WATCHEVENT", - "SYS_WORKQ_KERNRETURN", - "SYS_WORKQ_OPEN", - "SYS_WRITE", - "SYS_WRITEV", - "SYS_WRITEV_NOCANCEL", - "SYS_WRITE_NOCANCEL", - "SYS_YIELD", - "SYS__LLSEEK", - "SYS__LWP_CONTINUE", - "SYS__LWP_CREATE", - "SYS__LWP_CTL", - "SYS__LWP_DETACH", - "SYS__LWP_EXIT", - "SYS__LWP_GETNAME", - "SYS__LWP_GETPRIVATE", - "SYS__LWP_KILL", - "SYS__LWP_PARK", - "SYS__LWP_SELF", - "SYS__LWP_SETNAME", - "SYS__LWP_SETPRIVATE", - "SYS__LWP_SUSPEND", - "SYS__LWP_UNPARK", - "SYS__LWP_UNPARK_ALL", - "SYS__LWP_WAIT", - "SYS__LWP_WAKEUP", - "SYS__NEWSELECT", - "SYS__PSET_BIND", - "SYS__SCHED_GETAFFINITY", - "SYS__SCHED_GETPARAM", - "SYS__SCHED_SETAFFINITY", - "SYS__SCHED_SETPARAM", - "SYS__SYSCTL", - "SYS__UMTX_LOCK", - "SYS__UMTX_OP", - "SYS__UMTX_UNLOCK", - "SYS___ACL_ACLCHECK_FD", - "SYS___ACL_ACLCHECK_FILE", - "SYS___ACL_ACLCHECK_LINK", - "SYS___ACL_DELETE_FD", - "SYS___ACL_DELETE_FILE", - "SYS___ACL_DELETE_LINK", - "SYS___ACL_GET_FD", - "SYS___ACL_GET_FILE", - "SYS___ACL_GET_LINK", - "SYS___ACL_SET_FD", - "SYS___ACL_SET_FILE", - "SYS___ACL_SET_LINK", - "SYS___CAP_RIGHTS_GET", - "SYS___CLONE", - "SYS___DISABLE_THREADSIGNAL", - "SYS___GETCWD", - "SYS___GETLOGIN", - "SYS___GET_TCB", - "SYS___MAC_EXECVE", - "SYS___MAC_GETFSSTAT", - "SYS___MAC_GET_FD", - "SYS___MAC_GET_FILE", - "SYS___MAC_GET_LCID", - "SYS___MAC_GET_LCTX", - "SYS___MAC_GET_LINK", - "SYS___MAC_GET_MOUNT", - "SYS___MAC_GET_PID", - "SYS___MAC_GET_PROC", - "SYS___MAC_MOUNT", - "SYS___MAC_SET_FD", - "SYS___MAC_SET_FILE", - "SYS___MAC_SET_LCTX", - "SYS___MAC_SET_LINK", - "SYS___MAC_SET_PROC", - "SYS___MAC_SYSCALL", - "SYS___OLD_SEMWAIT_SIGNAL", - "SYS___OLD_SEMWAIT_SIGNAL_NOCANCEL", - "SYS___POSIX_CHOWN", - "SYS___POSIX_FCHOWN", - "SYS___POSIX_LCHOWN", - "SYS___POSIX_RENAME", - "SYS___PTHREAD_CANCELED", - "SYS___PTHREAD_CHDIR", - "SYS___PTHREAD_FCHDIR", - "SYS___PTHREAD_KILL", - "SYS___PTHREAD_MARKCANCEL", - "SYS___PTHREAD_SIGMASK", - "SYS___QUOTACTL", - "SYS___SEMCTL", - "SYS___SEMWAIT_SIGNAL", - "SYS___SEMWAIT_SIGNAL_NOCANCEL", - "SYS___SETLOGIN", - "SYS___SETUGID", - "SYS___SET_TCB", - "SYS___SIGACTION_SIGTRAMP", - "SYS___SIGTIMEDWAIT", - "SYS___SIGWAIT", - "SYS___SIGWAIT_NOCANCEL", - "SYS___SYSCTL", - "SYS___TFORK", - "SYS___THREXIT", - "SYS___THRSIGDIVERT", - "SYS___THRSLEEP", - "SYS___THRWAKEUP", - "S_ARCH1", - "S_ARCH2", - "S_BLKSIZE", - "S_IEXEC", - "S_IFBLK", - "S_IFCHR", - "S_IFDIR", - "S_IFIFO", - "S_IFLNK", - "S_IFMT", - "S_IFREG", - "S_IFSOCK", - "S_IFWHT", - "S_IREAD", - "S_IRGRP", - "S_IROTH", - "S_IRUSR", - "S_IRWXG", - "S_IRWXO", - "S_IRWXU", - "S_ISGID", - "S_ISTXT", - "S_ISUID", - "S_ISVTX", - "S_IWGRP", - "S_IWOTH", - "S_IWRITE", - "S_IWUSR", - "S_IXGRP", - "S_IXOTH", - "S_IXUSR", - "S_LOGIN_SET", - "SecurityAttributes", - "Seek", - "Select", - "Sendfile", - "Sendmsg", - "SendmsgN", - "Sendto", - "Servent", - "SetBpf", - "SetBpfBuflen", - "SetBpfDatalink", - "SetBpfHeadercmpl", - "SetBpfImmediate", - "SetBpfInterface", - "SetBpfPromisc", - "SetBpfTimeout", - "SetCurrentDirectory", - "SetEndOfFile", - "SetEnvironmentVariable", - "SetFileAttributes", - "SetFileCompletionNotificationModes", - "SetFilePointer", - "SetFileTime", - "SetHandleInformation", - "SetKevent", - "SetLsfPromisc", - "SetNonblock", - "Setdomainname", - "Setegid", - "Setenv", - "Seteuid", - "Setfsgid", - "Setfsuid", - "Setgid", - "Setgroups", - "Sethostname", - "Setlogin", - "Setpgid", - "Setpriority", - "Setprivexec", - "Setregid", - "Setresgid", - "Setresuid", - "Setreuid", - "Setrlimit", - "Setsid", - "Setsockopt", - "SetsockoptByte", - "SetsockoptICMPv6Filter", - "SetsockoptIPMreq", - "SetsockoptIPMreqn", - "SetsockoptIPv6Mreq", - "SetsockoptInet4Addr", - "SetsockoptInt", - "SetsockoptLinger", - "SetsockoptString", - "SetsockoptTimeval", - "Settimeofday", - "Setuid", - "Setxattr", - "Shutdown", - "SidTypeAlias", - "SidTypeComputer", - "SidTypeDeletedAccount", - "SidTypeDomain", - "SidTypeGroup", - "SidTypeInvalid", - "SidTypeLabel", - "SidTypeUnknown", - "SidTypeUser", - "SidTypeWellKnownGroup", - "Signal", - "SizeofBpfHdr", - "SizeofBpfInsn", - "SizeofBpfProgram", - "SizeofBpfStat", - "SizeofBpfVersion", - "SizeofBpfZbuf", - "SizeofBpfZbufHeader", - "SizeofCmsghdr", - "SizeofICMPv6Filter", - "SizeofIPMreq", - "SizeofIPMreqn", - "SizeofIPv6MTUInfo", - "SizeofIPv6Mreq", - "SizeofIfAddrmsg", - "SizeofIfAnnounceMsghdr", - "SizeofIfData", - "SizeofIfInfomsg", - "SizeofIfMsghdr", - "SizeofIfaMsghdr", - "SizeofIfmaMsghdr", - "SizeofIfmaMsghdr2", - "SizeofInet4Pktinfo", - "SizeofInet6Pktinfo", - "SizeofInotifyEvent", - "SizeofLinger", - "SizeofMsghdr", - "SizeofNlAttr", - "SizeofNlMsgerr", - "SizeofNlMsghdr", - "SizeofRtAttr", - "SizeofRtGenmsg", - "SizeofRtMetrics", - "SizeofRtMsg", - "SizeofRtMsghdr", - "SizeofRtNexthop", - "SizeofSockFilter", - "SizeofSockFprog", - "SizeofSockaddrAny", - "SizeofSockaddrDatalink", - "SizeofSockaddrInet4", - "SizeofSockaddrInet6", - "SizeofSockaddrLinklayer", - "SizeofSockaddrNetlink", - "SizeofSockaddrUnix", - "SizeofTCPInfo", - "SizeofUcred", - "SlicePtrFromStrings", - "SockFilter", - "SockFprog", - "Sockaddr", - "SockaddrDatalink", - "SockaddrGen", - "SockaddrInet4", - "SockaddrInet6", - "SockaddrLinklayer", - "SockaddrNetlink", - "SockaddrUnix", - "Socket", - "SocketControlMessage", - "SocketDisableIPv6", - "Socketpair", - "Splice", - "StartProcess", - "StartupInfo", - "Stat", - "Stat_t", - "Statfs", - "Statfs_t", - "Stderr", - "Stdin", - "Stdout", - "StringBytePtr", - "StringByteSlice", - "StringSlicePtr", - "StringToSid", - "StringToUTF16", - "StringToUTF16Ptr", - "Symlink", - "Sync", - "SyncFileRange", - "SysProcAttr", - "SysProcIDMap", - "Syscall", - "Syscall12", - "Syscall15", - "Syscall18", - "Syscall6", - "Syscall9", - "SyscallN", - "Sysctl", - "SysctlUint32", - "Sysctlnode", - "Sysinfo", - "Sysinfo_t", - "Systemtime", - "TCGETS", - "TCIFLUSH", - "TCIOFLUSH", - "TCOFLUSH", - "TCPInfo", - "TCPKeepalive", - "TCP_CA_NAME_MAX", - "TCP_CONGCTL", - "TCP_CONGESTION", - "TCP_CONNECTIONTIMEOUT", - "TCP_CORK", - "TCP_DEFER_ACCEPT", - "TCP_ENABLE_ECN", - "TCP_INFO", - "TCP_KEEPALIVE", - "TCP_KEEPCNT", - "TCP_KEEPIDLE", - "TCP_KEEPINIT", - "TCP_KEEPINTVL", - "TCP_LINGER2", - "TCP_MAXBURST", - "TCP_MAXHLEN", - "TCP_MAXOLEN", - "TCP_MAXSEG", - "TCP_MAXWIN", - "TCP_MAX_SACK", - "TCP_MAX_WINSHIFT", - "TCP_MD5SIG", - "TCP_MD5SIG_MAXKEYLEN", - "TCP_MINMSS", - "TCP_MINMSSOVERLOAD", - "TCP_MSS", - "TCP_NODELAY", - "TCP_NOOPT", - "TCP_NOPUSH", - "TCP_NOTSENT_LOWAT", - "TCP_NSTATES", - "TCP_QUICKACK", - "TCP_RXT_CONNDROPTIME", - "TCP_RXT_FINDROP", - "TCP_SACK_ENABLE", - "TCP_SENDMOREACKS", - "TCP_SYNCNT", - "TCP_VENDOR", - "TCP_WINDOW_CLAMP", - "TCSAFLUSH", - "TCSETS", - "TF_DISCONNECT", - "TF_REUSE_SOCKET", - "TF_USE_DEFAULT_WORKER", - "TF_USE_KERNEL_APC", - "TF_USE_SYSTEM_THREAD", - "TF_WRITE_BEHIND", - "TH32CS_INHERIT", - "TH32CS_SNAPALL", - "TH32CS_SNAPHEAPLIST", - "TH32CS_SNAPMODULE", - "TH32CS_SNAPMODULE32", - "TH32CS_SNAPPROCESS", - "TH32CS_SNAPTHREAD", - "TIME_ZONE_ID_DAYLIGHT", - "TIME_ZONE_ID_STANDARD", - "TIME_ZONE_ID_UNKNOWN", - "TIOCCBRK", - "TIOCCDTR", - "TIOCCONS", - "TIOCDCDTIMESTAMP", - "TIOCDRAIN", - "TIOCDSIMICROCODE", - "TIOCEXCL", - "TIOCEXT", - "TIOCFLAG_CDTRCTS", - "TIOCFLAG_CLOCAL", - "TIOCFLAG_CRTSCTS", - "TIOCFLAG_MDMBUF", - "TIOCFLAG_PPS", - "TIOCFLAG_SOFTCAR", - "TIOCFLUSH", - "TIOCGDEV", - "TIOCGDRAINWAIT", - "TIOCGETA", - "TIOCGETD", - "TIOCGFLAGS", - "TIOCGICOUNT", - "TIOCGLCKTRMIOS", - "TIOCGLINED", - "TIOCGPGRP", - "TIOCGPTN", - "TIOCGQSIZE", - "TIOCGRANTPT", - "TIOCGRS485", - "TIOCGSERIAL", - "TIOCGSID", - "TIOCGSIZE", - "TIOCGSOFTCAR", - "TIOCGTSTAMP", - "TIOCGWINSZ", - "TIOCINQ", - "TIOCIXOFF", - "TIOCIXON", - "TIOCLINUX", - "TIOCMBIC", - "TIOCMBIS", - "TIOCMGDTRWAIT", - "TIOCMGET", - "TIOCMIWAIT", - "TIOCMODG", - "TIOCMODS", - "TIOCMSDTRWAIT", - "TIOCMSET", - "TIOCM_CAR", - "TIOCM_CD", - "TIOCM_CTS", - "TIOCM_DCD", - "TIOCM_DSR", - "TIOCM_DTR", - "TIOCM_LE", - "TIOCM_RI", - "TIOCM_RNG", - "TIOCM_RTS", - "TIOCM_SR", - "TIOCM_ST", - "TIOCNOTTY", - "TIOCNXCL", - "TIOCOUTQ", - "TIOCPKT", - "TIOCPKT_DATA", - "TIOCPKT_DOSTOP", - "TIOCPKT_FLUSHREAD", - "TIOCPKT_FLUSHWRITE", - "TIOCPKT_IOCTL", - "TIOCPKT_NOSTOP", - "TIOCPKT_START", - "TIOCPKT_STOP", - "TIOCPTMASTER", - "TIOCPTMGET", - "TIOCPTSNAME", - "TIOCPTYGNAME", - "TIOCPTYGRANT", - "TIOCPTYUNLK", - "TIOCRCVFRAME", - "TIOCREMOTE", - "TIOCSBRK", - "TIOCSCONS", - "TIOCSCTTY", - "TIOCSDRAINWAIT", - "TIOCSDTR", - "TIOCSERCONFIG", - "TIOCSERGETLSR", - "TIOCSERGETMULTI", - "TIOCSERGSTRUCT", - "TIOCSERGWILD", - "TIOCSERSETMULTI", - "TIOCSERSWILD", - "TIOCSER_TEMT", - "TIOCSETA", - "TIOCSETAF", - "TIOCSETAW", - "TIOCSETD", - "TIOCSFLAGS", - "TIOCSIG", - "TIOCSLCKTRMIOS", - "TIOCSLINED", - "TIOCSPGRP", - "TIOCSPTLCK", - "TIOCSQSIZE", - "TIOCSRS485", - "TIOCSSERIAL", - "TIOCSSIZE", - "TIOCSSOFTCAR", - "TIOCSTART", - "TIOCSTAT", - "TIOCSTI", - "TIOCSTOP", - "TIOCSTSTAMP", - "TIOCSWINSZ", - "TIOCTIMESTAMP", - "TIOCUCNTL", - "TIOCVHANGUP", - "TIOCXMTFRAME", - "TOKEN_ADJUST_DEFAULT", - "TOKEN_ADJUST_GROUPS", - "TOKEN_ADJUST_PRIVILEGES", - "TOKEN_ADJUST_SESSIONID", - "TOKEN_ALL_ACCESS", - "TOKEN_ASSIGN_PRIMARY", - "TOKEN_DUPLICATE", - "TOKEN_EXECUTE", - "TOKEN_IMPERSONATE", - "TOKEN_QUERY", - "TOKEN_QUERY_SOURCE", - "TOKEN_READ", - "TOKEN_WRITE", - "TOSTOP", - "TRUNCATE_EXISTING", - "TUNATTACHFILTER", - "TUNDETACHFILTER", - "TUNGETFEATURES", - "TUNGETIFF", - "TUNGETSNDBUF", - "TUNGETVNETHDRSZ", - "TUNSETDEBUG", - "TUNSETGROUP", - "TUNSETIFF", - "TUNSETLINK", - "TUNSETNOCSUM", - "TUNSETOFFLOAD", - "TUNSETOWNER", - "TUNSETPERSIST", - "TUNSETSNDBUF", - "TUNSETTXFILTER", - "TUNSETVNETHDRSZ", - "Tee", - "TerminateProcess", - "Termios", - "Tgkill", - "Time", - "Time_t", - "Times", - "Timespec", - "TimespecToNsec", - "Timeval", - "Timeval32", - "TimevalToNsec", - "Timex", - "Timezoneinformation", - "Tms", - "Token", - "TokenAccessInformation", - "TokenAuditPolicy", - "TokenDefaultDacl", - "TokenElevation", - "TokenElevationType", - "TokenGroups", - "TokenGroupsAndPrivileges", - "TokenHasRestrictions", - "TokenImpersonationLevel", - "TokenIntegrityLevel", - "TokenLinkedToken", - "TokenLogonSid", - "TokenMandatoryPolicy", - "TokenOrigin", - "TokenOwner", - "TokenPrimaryGroup", - "TokenPrivileges", - "TokenRestrictedSids", - "TokenSandBoxInert", - "TokenSessionId", - "TokenSessionReference", - "TokenSource", - "TokenStatistics", - "TokenType", - "TokenUIAccess", - "TokenUser", - "TokenVirtualizationAllowed", - "TokenVirtualizationEnabled", - "Tokenprimarygroup", - "Tokenuser", - "TranslateAccountName", - "TranslateName", - "TransmitFile", - "TransmitFileBuffers", - "Truncate", - "UNIX_PATH_MAX", - "USAGE_MATCH_TYPE_AND", - "USAGE_MATCH_TYPE_OR", - "UTF16FromString", - "UTF16PtrFromString", - "UTF16ToString", - "Ucred", - "Umask", - "Uname", - "Undelete", - "UnixCredentials", - "UnixRights", - "Unlink", - "Unlinkat", - "UnmapViewOfFile", - "Unmount", - "Unsetenv", - "Unshare", - "UserInfo10", - "Ustat", - "Ustat_t", - "Utimbuf", - "Utime", - "Utimes", - "UtimesNano", - "Utsname", - "VDISCARD", - "VDSUSP", - "VEOF", - "VEOL", - "VEOL2", - "VERASE", - "VERASE2", - "VINTR", - "VKILL", - "VLNEXT", - "VMIN", - "VQUIT", - "VREPRINT", - "VSTART", - "VSTATUS", - "VSTOP", - "VSUSP", - "VSWTC", - "VT0", - "VT1", - "VTDLY", - "VTIME", - "VWERASE", - "VirtualLock", - "VirtualUnlock", - "WAIT_ABANDONED", - "WAIT_FAILED", - "WAIT_OBJECT_0", - "WAIT_TIMEOUT", - "WALL", - "WALLSIG", - "WALTSIG", - "WCLONE", - "WCONTINUED", - "WCOREFLAG", - "WEXITED", - "WLINUXCLONE", - "WNOHANG", - "WNOTHREAD", - "WNOWAIT", - "WNOZOMBIE", - "WOPTSCHECKED", - "WORDSIZE", - "WSABuf", - "WSACleanup", - "WSADESCRIPTION_LEN", - "WSAData", - "WSAEACCES", - "WSAECONNABORTED", - "WSAECONNRESET", - "WSAEnumProtocols", - "WSAID_CONNECTEX", - "WSAIoctl", - "WSAPROTOCOL_LEN", - "WSAProtocolChain", - "WSAProtocolInfo", - "WSARecv", - "WSARecvFrom", - "WSASYS_STATUS_LEN", - "WSASend", - "WSASendTo", - "WSASendto", - "WSAStartup", - "WSTOPPED", - "WTRAPPED", - "WUNTRACED", - "Wait4", - "WaitForSingleObject", - "WaitStatus", - "Win32FileAttributeData", - "Win32finddata", - "Write", - "WriteConsole", - "WriteFile", - "X509_ASN_ENCODING", - "XCASE", - "XP1_CONNECTIONLESS", - "XP1_CONNECT_DATA", - "XP1_DISCONNECT_DATA", - "XP1_EXPEDITED_DATA", - "XP1_GRACEFUL_CLOSE", - "XP1_GUARANTEED_DELIVERY", - "XP1_GUARANTEED_ORDER", - "XP1_IFS_HANDLES", - "XP1_MESSAGE_ORIENTED", - "XP1_MULTIPOINT_CONTROL_PLANE", - "XP1_MULTIPOINT_DATA_PLANE", - "XP1_PARTIAL_MESSAGE", - "XP1_PSEUDO_STREAM", - "XP1_QOS_SUPPORTED", - "XP1_SAN_SUPPORT_SDP", - "XP1_SUPPORT_BROADCAST", - "XP1_SUPPORT_MULTIPOINT", - "XP1_UNI_RECV", - "XP1_UNI_SEND", - }, - "syscall/js": { - "CopyBytesToGo", - "CopyBytesToJS", - "Error", - "Func", - "FuncOf", - "Global", - "Null", - "Type", - "TypeBoolean", - "TypeFunction", - "TypeNull", - "TypeNumber", - "TypeObject", - "TypeString", - "TypeSymbol", - "TypeUndefined", - "Undefined", - "Value", - "ValueError", - "ValueOf", - }, - "testing": { - "AllocsPerRun", - "B", - "Benchmark", - "BenchmarkResult", - "Cover", - "CoverBlock", - "CoverMode", - "Coverage", - "F", - "Init", - "InternalBenchmark", - "InternalExample", - "InternalFuzzTarget", - "InternalTest", - "M", - "Main", - "MainStart", - "PB", - "RegisterCover", - "RunBenchmarks", - "RunExamples", - "RunTests", - "Short", - "T", - "TB", - "Testing", - "Verbose", - }, - "testing/fstest": { - "MapFS", - "MapFile", - "TestFS", - }, - "testing/iotest": { - "DataErrReader", - "ErrReader", - "ErrTimeout", - "HalfReader", - "NewReadLogger", - "NewWriteLogger", - "OneByteReader", - "TestReader", - "TimeoutReader", - "TruncateWriter", - }, - "testing/quick": { - "Check", - "CheckEqual", - "CheckEqualError", - "CheckError", - "Config", - "Generator", - "SetupError", - "Value", - }, - "testing/slogtest": { - "TestHandler", - }, - "text/scanner": { - "Char", - "Comment", - "EOF", - "Float", - "GoTokens", - "GoWhitespace", - "Ident", - "Int", - "Position", - "RawString", - "ScanChars", - "ScanComments", - "ScanFloats", - "ScanIdents", - "ScanInts", - "ScanRawStrings", - "ScanStrings", - "Scanner", - "SkipComments", - "String", - "TokenString", - }, - "text/tabwriter": { - "AlignRight", - "Debug", - "DiscardEmptyColumns", - "Escape", - "FilterHTML", - "NewWriter", - "StripEscape", - "TabIndent", - "Writer", - }, - "text/template": { - "ExecError", - "FuncMap", - "HTMLEscape", - "HTMLEscapeString", - "HTMLEscaper", - "IsTrue", - "JSEscape", - "JSEscapeString", - "JSEscaper", - "Must", - "New", - "ParseFS", - "ParseFiles", - "ParseGlob", - "Template", - "URLQueryEscaper", - }, - "text/template/parse": { - "ActionNode", - "BoolNode", - "BranchNode", - "BreakNode", - "ChainNode", - "CommandNode", - "CommentNode", - "ContinueNode", - "DotNode", - "FieldNode", - "IdentifierNode", - "IfNode", - "IsEmptyTree", - "ListNode", - "Mode", - "New", - "NewIdentifier", - "NilNode", - "Node", - "NodeAction", - "NodeBool", - "NodeBreak", - "NodeChain", - "NodeCommand", - "NodeComment", - "NodeContinue", - "NodeDot", - "NodeField", - "NodeIdentifier", - "NodeIf", - "NodeList", - "NodeNil", - "NodeNumber", - "NodePipe", - "NodeRange", - "NodeString", - "NodeTemplate", - "NodeText", - "NodeType", - "NodeVariable", - "NodeWith", - "NumberNode", - "Parse", - "ParseComments", - "PipeNode", - "Pos", - "RangeNode", - "SkipFuncCheck", - "StringNode", - "TemplateNode", - "TextNode", - "Tree", - "VariableNode", - "WithNode", - }, - "time": { - "ANSIC", - "After", - "AfterFunc", - "April", - "August", - "Date", - "DateOnly", - "DateTime", - "December", - "Duration", - "February", - "FixedZone", - "Friday", - "Hour", - "January", - "July", - "June", - "Kitchen", - "Layout", - "LoadLocation", - "LoadLocationFromTZData", - "Local", - "Location", - "March", - "May", - "Microsecond", - "Millisecond", - "Minute", - "Monday", - "Month", - "Nanosecond", - "NewTicker", - "NewTimer", - "November", - "Now", - "October", - "Parse", - "ParseDuration", - "ParseError", - "ParseInLocation", - "RFC1123", - "RFC1123Z", - "RFC3339", - "RFC3339Nano", - "RFC822", - "RFC822Z", - "RFC850", - "RubyDate", - "Saturday", - "Second", - "September", - "Since", - "Sleep", - "Stamp", - "StampMicro", - "StampMilli", - "StampNano", - "Sunday", - "Thursday", - "Tick", - "Ticker", - "Time", - "TimeOnly", - "Timer", - "Tuesday", - "UTC", - "Unix", - "UnixDate", - "UnixMicro", - "UnixMilli", - "Until", - "Wednesday", - "Weekday", - }, - "unicode": { - "ASCII_Hex_Digit", - "Adlam", - "Ahom", - "Anatolian_Hieroglyphs", - "Arabic", - "Armenian", - "Avestan", - "AzeriCase", - "Balinese", - "Bamum", - "Bassa_Vah", - "Batak", - "Bengali", - "Bhaiksuki", - "Bidi_Control", - "Bopomofo", - "Brahmi", - "Braille", - "Buginese", - "Buhid", - "C", - "Canadian_Aboriginal", - "Carian", - "CaseRange", - "CaseRanges", - "Categories", - "Caucasian_Albanian", - "Cc", - "Cf", - "Chakma", - "Cham", - "Cherokee", - "Chorasmian", - "Co", - "Common", - "Coptic", - "Cs", - "Cuneiform", - "Cypriot", - "Cypro_Minoan", - "Cyrillic", - "Dash", - "Deprecated", - "Deseret", - "Devanagari", - "Diacritic", - "Digit", - "Dives_Akuru", - "Dogra", - "Duployan", - "Egyptian_Hieroglyphs", - "Elbasan", - "Elymaic", - "Ethiopic", - "Extender", - "FoldCategory", - "FoldScript", - "Georgian", - "Glagolitic", - "Gothic", - "Grantha", - "GraphicRanges", - "Greek", - "Gujarati", - "Gunjala_Gondi", - "Gurmukhi", - "Han", - "Hangul", - "Hanifi_Rohingya", - "Hanunoo", - "Hatran", - "Hebrew", - "Hex_Digit", - "Hiragana", - "Hyphen", - "IDS_Binary_Operator", - "IDS_Trinary_Operator", - "Ideographic", - "Imperial_Aramaic", - "In", - "Inherited", - "Inscriptional_Pahlavi", - "Inscriptional_Parthian", - "Is", - "IsControl", - "IsDigit", - "IsGraphic", - "IsLetter", - "IsLower", - "IsMark", - "IsNumber", - "IsOneOf", - "IsPrint", - "IsPunct", - "IsSpace", - "IsSymbol", - "IsTitle", - "IsUpper", - "Javanese", - "Join_Control", - "Kaithi", - "Kannada", - "Katakana", - "Kawi", - "Kayah_Li", - "Kharoshthi", - "Khitan_Small_Script", - "Khmer", - "Khojki", - "Khudawadi", - "L", - "Lao", - "Latin", - "Lepcha", - "Letter", - "Limbu", - "Linear_A", - "Linear_B", - "Lisu", - "Ll", - "Lm", - "Lo", - "Logical_Order_Exception", - "Lower", - "LowerCase", - "Lt", - "Lu", - "Lycian", - "Lydian", - "M", - "Mahajani", - "Makasar", - "Malayalam", - "Mandaic", - "Manichaean", - "Marchen", - "Mark", - "Masaram_Gondi", - "MaxASCII", - "MaxCase", - "MaxLatin1", - "MaxRune", - "Mc", - "Me", - "Medefaidrin", - "Meetei_Mayek", - "Mende_Kikakui", - "Meroitic_Cursive", - "Meroitic_Hieroglyphs", - "Miao", - "Mn", - "Modi", - "Mongolian", - "Mro", - "Multani", - "Myanmar", - "N", - "Nabataean", - "Nag_Mundari", - "Nandinagari", - "Nd", - "New_Tai_Lue", - "Newa", - "Nko", - "Nl", - "No", - "Noncharacter_Code_Point", - "Number", - "Nushu", - "Nyiakeng_Puachue_Hmong", - "Ogham", - "Ol_Chiki", - "Old_Hungarian", - "Old_Italic", - "Old_North_Arabian", - "Old_Permic", - "Old_Persian", - "Old_Sogdian", - "Old_South_Arabian", - "Old_Turkic", - "Old_Uyghur", - "Oriya", - "Osage", - "Osmanya", - "Other", - "Other_Alphabetic", - "Other_Default_Ignorable_Code_Point", - "Other_Grapheme_Extend", - "Other_ID_Continue", - "Other_ID_Start", - "Other_Lowercase", - "Other_Math", - "Other_Uppercase", - "P", - "Pahawh_Hmong", - "Palmyrene", - "Pattern_Syntax", - "Pattern_White_Space", - "Pau_Cin_Hau", - "Pc", - "Pd", - "Pe", - "Pf", - "Phags_Pa", - "Phoenician", - "Pi", - "Po", - "Prepended_Concatenation_Mark", - "PrintRanges", - "Properties", - "Ps", - "Psalter_Pahlavi", - "Punct", - "Quotation_Mark", - "Radical", - "Range16", - "Range32", - "RangeTable", - "Regional_Indicator", - "Rejang", - "ReplacementChar", - "Runic", - "S", - "STerm", - "Samaritan", - "Saurashtra", - "Sc", - "Scripts", - "Sentence_Terminal", - "Sharada", - "Shavian", - "Siddham", - "SignWriting", - "SimpleFold", - "Sinhala", - "Sk", - "Sm", - "So", - "Soft_Dotted", - "Sogdian", - "Sora_Sompeng", - "Soyombo", - "Space", - "SpecialCase", - "Sundanese", - "Syloti_Nagri", - "Symbol", - "Syriac", - "Tagalog", - "Tagbanwa", - "Tai_Le", - "Tai_Tham", - "Tai_Viet", - "Takri", - "Tamil", - "Tangsa", - "Tangut", - "Telugu", - "Terminal_Punctuation", - "Thaana", - "Thai", - "Tibetan", - "Tifinagh", - "Tirhuta", - "Title", - "TitleCase", - "To", - "ToLower", - "ToTitle", - "ToUpper", - "Toto", - "TurkishCase", - "Ugaritic", - "Unified_Ideograph", - "Upper", - "UpperCase", - "UpperLower", - "Vai", - "Variation_Selector", - "Version", - "Vithkuqi", - "Wancho", - "Warang_Citi", - "White_Space", - "Yezidi", - "Yi", - "Z", - "Zanabazar_Square", - "Zl", - "Zp", - "Zs", - }, - "unicode/utf16": { - "AppendRune", - "Decode", - "DecodeRune", - "Encode", - "EncodeRune", - "IsSurrogate", - }, - "unicode/utf8": { - "AppendRune", - "DecodeLastRune", - "DecodeLastRuneInString", - "DecodeRune", - "DecodeRuneInString", - "EncodeRune", - "FullRune", - "FullRuneInString", - "MaxRune", - "RuneCount", - "RuneCountInString", - "RuneError", - "RuneLen", - "RuneSelf", - "RuneStart", - "UTFMax", - "Valid", - "ValidRune", - "ValidString", - }, - "unsafe": { - "Add", - "Alignof", - "Offsetof", - "Pointer", - "Sizeof", - "Slice", - "SliceData", - "String", - "StringData", - }, -} diff --git a/vendor/golang.org/x/tools/internal/packagesinternal/packages.go b/vendor/golang.org/x/tools/internal/packagesinternal/packages.go new file mode 100644 index 0000000..44719de --- /dev/null +++ b/vendor/golang.org/x/tools/internal/packagesinternal/packages.go @@ -0,0 +1,22 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package packagesinternal exposes internal-only fields from go/packages. +package packagesinternal + +var GetForTest = func(p interface{}) string { return "" } +var GetDepsErrors = func(p interface{}) []*PackageError { return nil } + +type PackageError struct { + ImportStack []string // shortest path from package named on command line to this one + Pos string // position of error (if present, file:line:col) + Err string // the error itself +} + +var TypecheckCgo int +var DepsErrors int // must be set as a LoadMode to call GetDepsErrors +var ForTest int // must be set as a LoadMode to call GetForTest + +var SetModFlag = func(config interface{}, value string) {} +var SetModFile = func(config interface{}, value string) {} diff --git a/vendor/golang.org/x/tools/internal/pkgbits/codes.go b/vendor/golang.org/x/tools/internal/pkgbits/codes.go new file mode 100644 index 0000000..f0cabde --- /dev/null +++ b/vendor/golang.org/x/tools/internal/pkgbits/codes.go @@ -0,0 +1,77 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package pkgbits + +// A Code is an enum value that can be encoded into bitstreams. +// +// Code types are preferable for enum types, because they allow +// Decoder to detect desyncs. +type Code interface { + // Marker returns the SyncMarker for the Code's dynamic type. + Marker() SyncMarker + + // Value returns the Code's ordinal value. + Value() int +} + +// A CodeVal distinguishes among go/constant.Value encodings. +type CodeVal int + +func (c CodeVal) Marker() SyncMarker { return SyncVal } +func (c CodeVal) Value() int { return int(c) } + +// Note: These values are public and cannot be changed without +// updating the go/types importers. + +const ( + ValBool CodeVal = iota + ValString + ValInt64 + ValBigInt + ValBigRat + ValBigFloat +) + +// A CodeType distinguishes among go/types.Type encodings. +type CodeType int + +func (c CodeType) Marker() SyncMarker { return SyncType } +func (c CodeType) Value() int { return int(c) } + +// Note: These values are public and cannot be changed without +// updating the go/types importers. + +const ( + TypeBasic CodeType = iota + TypeNamed + TypePointer + TypeSlice + TypeArray + TypeChan + TypeMap + TypeSignature + TypeStruct + TypeInterface + TypeUnion + TypeTypeParam +) + +// A CodeObj distinguishes among go/types.Object encodings. +type CodeObj int + +func (c CodeObj) Marker() SyncMarker { return SyncCodeObj } +func (c CodeObj) Value() int { return int(c) } + +// Note: These values are public and cannot be changed without +// updating the go/types importers. + +const ( + ObjAlias CodeObj = iota + ObjConst + ObjType + ObjFunc + ObjVar + ObjStub +) diff --git a/vendor/golang.org/x/tools/internal/pkgbits/decoder.go b/vendor/golang.org/x/tools/internal/pkgbits/decoder.go new file mode 100644 index 0000000..f6cb37c --- /dev/null +++ b/vendor/golang.org/x/tools/internal/pkgbits/decoder.go @@ -0,0 +1,519 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package pkgbits + +import ( + "encoding/binary" + "errors" + "fmt" + "go/constant" + "go/token" + "io" + "math/big" + "os" + "runtime" + "strings" +) + +// A PkgDecoder provides methods for decoding a package's Unified IR +// export data. +type PkgDecoder struct { + // version is the file format version. + version Version + + // sync indicates whether the file uses sync markers. + sync bool + + // pkgPath is the package path for the package to be decoded. + // + // TODO(mdempsky): Remove; unneeded since CL 391014. + pkgPath string + + // elemData is the full data payload of the encoded package. + // Elements are densely and contiguously packed together. + // + // The last 8 bytes of elemData are the package fingerprint. + elemData string + + // elemEnds stores the byte-offset end positions of element + // bitstreams within elemData. + // + // For example, element I's bitstream data starts at elemEnds[I-1] + // (or 0, if I==0) and ends at elemEnds[I]. + // + // Note: elemEnds is indexed by absolute indices, not + // section-relative indices. + elemEnds []uint32 + + // elemEndsEnds stores the index-offset end positions of relocation + // sections within elemEnds. + // + // For example, section K's end positions start at elemEndsEnds[K-1] + // (or 0, if K==0) and end at elemEndsEnds[K]. + elemEndsEnds [numRelocs]uint32 + + scratchRelocEnt []RelocEnt +} + +// PkgPath returns the package path for the package +// +// TODO(mdempsky): Remove; unneeded since CL 391014. +func (pr *PkgDecoder) PkgPath() string { return pr.pkgPath } + +// SyncMarkers reports whether pr uses sync markers. +func (pr *PkgDecoder) SyncMarkers() bool { return pr.sync } + +// NewPkgDecoder returns a PkgDecoder initialized to read the Unified +// IR export data from input. pkgPath is the package path for the +// compilation unit that produced the export data. +func NewPkgDecoder(pkgPath, input string) PkgDecoder { + pr := PkgDecoder{ + pkgPath: pkgPath, + } + + // TODO(mdempsky): Implement direct indexing of input string to + // avoid copying the position information. + + r := strings.NewReader(input) + + var ver uint32 + assert(binary.Read(r, binary.LittleEndian, &ver) == nil) + pr.version = Version(ver) + + if pr.version >= numVersions { + panic(fmt.Errorf("cannot decode %q, export data version %d is greater than maximum supported version %d", pkgPath, pr.version, numVersions-1)) + } + + if pr.version.Has(Flags) { + var flags uint32 + assert(binary.Read(r, binary.LittleEndian, &flags) == nil) + pr.sync = flags&flagSyncMarkers != 0 + } + + assert(binary.Read(r, binary.LittleEndian, pr.elemEndsEnds[:]) == nil) + + pr.elemEnds = make([]uint32, pr.elemEndsEnds[len(pr.elemEndsEnds)-1]) + assert(binary.Read(r, binary.LittleEndian, pr.elemEnds[:]) == nil) + + pos, err := r.Seek(0, io.SeekCurrent) + assert(err == nil) + + pr.elemData = input[pos:] + + const fingerprintSize = 8 + assert(len(pr.elemData)-fingerprintSize == int(pr.elemEnds[len(pr.elemEnds)-1])) + + return pr +} + +// NumElems returns the number of elements in section k. +func (pr *PkgDecoder) NumElems(k RelocKind) int { + count := int(pr.elemEndsEnds[k]) + if k > 0 { + count -= int(pr.elemEndsEnds[k-1]) + } + return count +} + +// TotalElems returns the total number of elements across all sections. +func (pr *PkgDecoder) TotalElems() int { + return len(pr.elemEnds) +} + +// Fingerprint returns the package fingerprint. +func (pr *PkgDecoder) Fingerprint() [8]byte { + var fp [8]byte + copy(fp[:], pr.elemData[len(pr.elemData)-8:]) + return fp +} + +// AbsIdx returns the absolute index for the given (section, index) +// pair. +func (pr *PkgDecoder) AbsIdx(k RelocKind, idx Index) int { + absIdx := int(idx) + if k > 0 { + absIdx += int(pr.elemEndsEnds[k-1]) + } + if absIdx >= int(pr.elemEndsEnds[k]) { + panicf("%v:%v is out of bounds; %v", k, idx, pr.elemEndsEnds) + } + return absIdx +} + +// DataIdx returns the raw element bitstream for the given (section, +// index) pair. +func (pr *PkgDecoder) DataIdx(k RelocKind, idx Index) string { + absIdx := pr.AbsIdx(k, idx) + + var start uint32 + if absIdx > 0 { + start = pr.elemEnds[absIdx-1] + } + end := pr.elemEnds[absIdx] + + return pr.elemData[start:end] +} + +// StringIdx returns the string value for the given string index. +func (pr *PkgDecoder) StringIdx(idx Index) string { + return pr.DataIdx(RelocString, idx) +} + +// NewDecoder returns a Decoder for the given (section, index) pair, +// and decodes the given SyncMarker from the element bitstream. +func (pr *PkgDecoder) NewDecoder(k RelocKind, idx Index, marker SyncMarker) Decoder { + r := pr.NewDecoderRaw(k, idx) + r.Sync(marker) + return r +} + +// TempDecoder returns a Decoder for the given (section, index) pair, +// and decodes the given SyncMarker from the element bitstream. +// If possible the Decoder should be RetireDecoder'd when it is no longer +// needed, this will avoid heap allocations. +func (pr *PkgDecoder) TempDecoder(k RelocKind, idx Index, marker SyncMarker) Decoder { + r := pr.TempDecoderRaw(k, idx) + r.Sync(marker) + return r +} + +func (pr *PkgDecoder) RetireDecoder(d *Decoder) { + pr.scratchRelocEnt = d.Relocs + d.Relocs = nil +} + +// NewDecoderRaw returns a Decoder for the given (section, index) pair. +// +// Most callers should use NewDecoder instead. +func (pr *PkgDecoder) NewDecoderRaw(k RelocKind, idx Index) Decoder { + r := Decoder{ + common: pr, + k: k, + Idx: idx, + } + + r.Data.Reset(pr.DataIdx(k, idx)) + r.Sync(SyncRelocs) + r.Relocs = make([]RelocEnt, r.Len()) + for i := range r.Relocs { + r.Sync(SyncReloc) + r.Relocs[i] = RelocEnt{RelocKind(r.Len()), Index(r.Len())} + } + + return r +} + +func (pr *PkgDecoder) TempDecoderRaw(k RelocKind, idx Index) Decoder { + r := Decoder{ + common: pr, + k: k, + Idx: idx, + } + + r.Data.Reset(pr.DataIdx(k, idx)) + r.Sync(SyncRelocs) + l := r.Len() + if cap(pr.scratchRelocEnt) >= l { + r.Relocs = pr.scratchRelocEnt[:l] + pr.scratchRelocEnt = nil + } else { + r.Relocs = make([]RelocEnt, l) + } + for i := range r.Relocs { + r.Sync(SyncReloc) + r.Relocs[i] = RelocEnt{RelocKind(r.Len()), Index(r.Len())} + } + + return r +} + +// A Decoder provides methods for decoding an individual element's +// bitstream data. +type Decoder struct { + common *PkgDecoder + + Relocs []RelocEnt + Data strings.Reader + + k RelocKind + Idx Index +} + +func (r *Decoder) checkErr(err error) { + if err != nil { + panicf("unexpected decoding error: %w", err) + } +} + +func (r *Decoder) rawUvarint() uint64 { + x, err := readUvarint(&r.Data) + r.checkErr(err) + return x +} + +// readUvarint is a type-specialized copy of encoding/binary.ReadUvarint. +// This avoids the interface conversion and thus has better escape properties, +// which flows up the stack. +func readUvarint(r *strings.Reader) (uint64, error) { + var x uint64 + var s uint + for i := 0; i < binary.MaxVarintLen64; i++ { + b, err := r.ReadByte() + if err != nil { + if i > 0 && err == io.EOF { + err = io.ErrUnexpectedEOF + } + return x, err + } + if b < 0x80 { + if i == binary.MaxVarintLen64-1 && b > 1 { + return x, overflow + } + return x | uint64(b)<> 1) + if ux&1 != 0 { + x = ^x + } + return x +} + +func (r *Decoder) rawReloc(k RelocKind, idx int) Index { + e := r.Relocs[idx] + assert(e.Kind == k) + return e.Idx +} + +// Sync decodes a sync marker from the element bitstream and asserts +// that it matches the expected marker. +// +// If r.common.sync is false, then Sync is a no-op. +func (r *Decoder) Sync(mWant SyncMarker) { + if !r.common.sync { + return + } + + pos, _ := r.Data.Seek(0, io.SeekCurrent) + mHave := SyncMarker(r.rawUvarint()) + writerPCs := make([]int, r.rawUvarint()) + for i := range writerPCs { + writerPCs[i] = int(r.rawUvarint()) + } + + if mHave == mWant { + return + } + + // There's some tension here between printing: + // + // (1) full file paths that tools can recognize (e.g., so emacs + // hyperlinks the "file:line" text for easy navigation), or + // + // (2) short file paths that are easier for humans to read (e.g., by + // omitting redundant or irrelevant details, so it's easier to + // focus on the useful bits that remain). + // + // The current formatting favors the former, as it seems more + // helpful in practice. But perhaps the formatting could be improved + // to better address both concerns. For example, use relative file + // paths if they would be shorter, or rewrite file paths to contain + // "$GOROOT" (like objabi.AbsFile does) if tools can be taught how + // to reliably expand that again. + + fmt.Printf("export data desync: package %q, section %v, index %v, offset %v\n", r.common.pkgPath, r.k, r.Idx, pos) + + fmt.Printf("\nfound %v, written at:\n", mHave) + if len(writerPCs) == 0 { + fmt.Printf("\t[stack trace unavailable; recompile package %q with -d=syncframes]\n", r.common.pkgPath) + } + for _, pc := range writerPCs { + fmt.Printf("\t%s\n", r.common.StringIdx(r.rawReloc(RelocString, pc))) + } + + fmt.Printf("\nexpected %v, reading at:\n", mWant) + var readerPCs [32]uintptr // TODO(mdempsky): Dynamically size? + n := runtime.Callers(2, readerPCs[:]) + for _, pc := range fmtFrames(readerPCs[:n]...) { + fmt.Printf("\t%s\n", pc) + } + + // We already printed a stack trace for the reader, so now we can + // simply exit. Printing a second one with panic or base.Fatalf + // would just be noise. + os.Exit(1) +} + +// Bool decodes and returns a bool value from the element bitstream. +func (r *Decoder) Bool() bool { + r.Sync(SyncBool) + x, err := r.Data.ReadByte() + r.checkErr(err) + assert(x < 2) + return x != 0 +} + +// Int64 decodes and returns an int64 value from the element bitstream. +func (r *Decoder) Int64() int64 { + r.Sync(SyncInt64) + return r.rawVarint() +} + +// Uint64 decodes and returns a uint64 value from the element bitstream. +func (r *Decoder) Uint64() uint64 { + r.Sync(SyncUint64) + return r.rawUvarint() +} + +// Len decodes and returns a non-negative int value from the element bitstream. +func (r *Decoder) Len() int { x := r.Uint64(); v := int(x); assert(uint64(v) == x); return v } + +// Int decodes and returns an int value from the element bitstream. +func (r *Decoder) Int() int { x := r.Int64(); v := int(x); assert(int64(v) == x); return v } + +// Uint decodes and returns a uint value from the element bitstream. +func (r *Decoder) Uint() uint { x := r.Uint64(); v := uint(x); assert(uint64(v) == x); return v } + +// Code decodes a Code value from the element bitstream and returns +// its ordinal value. It's the caller's responsibility to convert the +// result to an appropriate Code type. +// +// TODO(mdempsky): Ideally this method would have signature "Code[T +// Code] T" instead, but we don't allow generic methods and the +// compiler can't depend on generics yet anyway. +func (r *Decoder) Code(mark SyncMarker) int { + r.Sync(mark) + return r.Len() +} + +// Reloc decodes a relocation of expected section k from the element +// bitstream and returns an index to the referenced element. +func (r *Decoder) Reloc(k RelocKind) Index { + r.Sync(SyncUseReloc) + return r.rawReloc(k, r.Len()) +} + +// String decodes and returns a string value from the element +// bitstream. +func (r *Decoder) String() string { + r.Sync(SyncString) + return r.common.StringIdx(r.Reloc(RelocString)) +} + +// Strings decodes and returns a variable-length slice of strings from +// the element bitstream. +func (r *Decoder) Strings() []string { + res := make([]string, r.Len()) + for i := range res { + res[i] = r.String() + } + return res +} + +// Value decodes and returns a constant.Value from the element +// bitstream. +func (r *Decoder) Value() constant.Value { + r.Sync(SyncValue) + isComplex := r.Bool() + val := r.scalar() + if isComplex { + val = constant.BinaryOp(val, token.ADD, constant.MakeImag(r.scalar())) + } + return val +} + +func (r *Decoder) scalar() constant.Value { + switch tag := CodeVal(r.Code(SyncVal)); tag { + default: + panic(fmt.Errorf("unexpected scalar tag: %v", tag)) + + case ValBool: + return constant.MakeBool(r.Bool()) + case ValString: + return constant.MakeString(r.String()) + case ValInt64: + return constant.MakeInt64(r.Int64()) + case ValBigInt: + return constant.Make(r.bigInt()) + case ValBigRat: + num := r.bigInt() + denom := r.bigInt() + return constant.Make(new(big.Rat).SetFrac(num, denom)) + case ValBigFloat: + return constant.Make(r.bigFloat()) + } +} + +func (r *Decoder) bigInt() *big.Int { + v := new(big.Int).SetBytes([]byte(r.String())) + if r.Bool() { + v.Neg(v) + } + return v +} + +func (r *Decoder) bigFloat() *big.Float { + v := new(big.Float).SetPrec(512) + assert(v.UnmarshalText([]byte(r.String())) == nil) + return v +} + +// @@@ Helpers + +// TODO(mdempsky): These should probably be removed. I think they're a +// smell that the export data format is not yet quite right. + +// PeekPkgPath returns the package path for the specified package +// index. +func (pr *PkgDecoder) PeekPkgPath(idx Index) string { + var path string + { + r := pr.TempDecoder(RelocPkg, idx, SyncPkgDef) + path = r.String() + pr.RetireDecoder(&r) + } + if path == "" { + path = pr.pkgPath + } + return path +} + +// PeekObj returns the package path, object name, and CodeObj for the +// specified object index. +func (pr *PkgDecoder) PeekObj(idx Index) (string, string, CodeObj) { + var ridx Index + var name string + var rcode int + { + r := pr.TempDecoder(RelocName, idx, SyncObject1) + r.Sync(SyncSym) + r.Sync(SyncPkg) + ridx = r.Reloc(RelocPkg) + name = r.String() + rcode = r.Code(SyncCodeObj) + pr.RetireDecoder(&r) + } + + path := pr.PeekPkgPath(ridx) + assert(name != "") + + tag := CodeObj(rcode) + + return path, name, tag +} + +// Version reports the version of the bitstream. +func (w *Decoder) Version() Version { return w.common.version } diff --git a/vendor/golang.org/x/tools/internal/pkgbits/doc.go b/vendor/golang.org/x/tools/internal/pkgbits/doc.go new file mode 100644 index 0000000..c8a2796 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/pkgbits/doc.go @@ -0,0 +1,32 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package pkgbits implements low-level coding abstractions for +// Unified IR's export data format. +// +// At a low-level, a package is a collection of bitstream elements. +// Each element has a "kind" and a dense, non-negative index. +// Elements can be randomly accessed given their kind and index. +// +// Individual elements are sequences of variable-length values (e.g., +// integers, booleans, strings, go/constant values, cross-references +// to other elements). Package pkgbits provides APIs for encoding and +// decoding these low-level values, but the details of mapping +// higher-level Go constructs into elements is left to higher-level +// abstractions. +// +// Elements may cross-reference each other with "relocations." For +// example, an element representing a pointer type has a relocation +// referring to the element type. +// +// Go constructs may be composed as a constellation of multiple +// elements. For example, a declared function may have one element to +// describe the object (e.g., its name, type, position), and a +// separate element to describe its function body. This allows readers +// some flexibility in efficiently seeking or re-reading data (e.g., +// inlining requires re-reading the function body for each inlined +// call, without needing to re-read the object-level details). +// +// This is a copy of internal/pkgbits in the Go implementation. +package pkgbits diff --git a/vendor/golang.org/x/tools/internal/pkgbits/encoder.go b/vendor/golang.org/x/tools/internal/pkgbits/encoder.go new file mode 100644 index 0000000..c17a123 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/pkgbits/encoder.go @@ -0,0 +1,392 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package pkgbits + +import ( + "bytes" + "crypto/md5" + "encoding/binary" + "go/constant" + "io" + "math/big" + "runtime" + "strings" +) + +// A PkgEncoder provides methods for encoding a package's Unified IR +// export data. +type PkgEncoder struct { + // version of the bitstream. + version Version + + // elems holds the bitstream for previously encoded elements. + elems [numRelocs][]string + + // stringsIdx maps previously encoded strings to their index within + // the RelocString section, to allow deduplication. That is, + // elems[RelocString][stringsIdx[s]] == s (if present). + stringsIdx map[string]Index + + // syncFrames is the number of frames to write at each sync + // marker. A negative value means sync markers are omitted. + syncFrames int +} + +// SyncMarkers reports whether pw uses sync markers. +func (pw *PkgEncoder) SyncMarkers() bool { return pw.syncFrames >= 0 } + +// NewPkgEncoder returns an initialized PkgEncoder. +// +// syncFrames is the number of caller frames that should be serialized +// at Sync points. Serializing additional frames results in larger +// export data files, but can help diagnosing desync errors in +// higher-level Unified IR reader/writer code. If syncFrames is +// negative, then sync markers are omitted entirely. +func NewPkgEncoder(version Version, syncFrames int) PkgEncoder { + return PkgEncoder{ + version: version, + stringsIdx: make(map[string]Index), + syncFrames: syncFrames, + } +} + +// DumpTo writes the package's encoded data to out0 and returns the +// package fingerprint. +func (pw *PkgEncoder) DumpTo(out0 io.Writer) (fingerprint [8]byte) { + h := md5.New() + out := io.MultiWriter(out0, h) + + writeUint32 := func(x uint32) { + assert(binary.Write(out, binary.LittleEndian, x) == nil) + } + + writeUint32(uint32(pw.version)) + + if pw.version.Has(Flags) { + var flags uint32 + if pw.SyncMarkers() { + flags |= flagSyncMarkers + } + writeUint32(flags) + } + + // Write elemEndsEnds. + var sum uint32 + for _, elems := range &pw.elems { + sum += uint32(len(elems)) + writeUint32(sum) + } + + // Write elemEnds. + sum = 0 + for _, elems := range &pw.elems { + for _, elem := range elems { + sum += uint32(len(elem)) + writeUint32(sum) + } + } + + // Write elemData. + for _, elems := range &pw.elems { + for _, elem := range elems { + _, err := io.WriteString(out, elem) + assert(err == nil) + } + } + + // Write fingerprint. + copy(fingerprint[:], h.Sum(nil)) + _, err := out0.Write(fingerprint[:]) + assert(err == nil) + + return +} + +// StringIdx adds a string value to the strings section, if not +// already present, and returns its index. +func (pw *PkgEncoder) StringIdx(s string) Index { + if idx, ok := pw.stringsIdx[s]; ok { + assert(pw.elems[RelocString][idx] == s) + return idx + } + + idx := Index(len(pw.elems[RelocString])) + pw.elems[RelocString] = append(pw.elems[RelocString], s) + pw.stringsIdx[s] = idx + return idx +} + +// NewEncoder returns an Encoder for a new element within the given +// section, and encodes the given SyncMarker as the start of the +// element bitstream. +func (pw *PkgEncoder) NewEncoder(k RelocKind, marker SyncMarker) Encoder { + e := pw.NewEncoderRaw(k) + e.Sync(marker) + return e +} + +// NewEncoderRaw returns an Encoder for a new element within the given +// section. +// +// Most callers should use NewEncoder instead. +func (pw *PkgEncoder) NewEncoderRaw(k RelocKind) Encoder { + idx := Index(len(pw.elems[k])) + pw.elems[k] = append(pw.elems[k], "") // placeholder + + return Encoder{ + p: pw, + k: k, + Idx: idx, + } +} + +// An Encoder provides methods for encoding an individual element's +// bitstream data. +type Encoder struct { + p *PkgEncoder + + Relocs []RelocEnt + RelocMap map[RelocEnt]uint32 + Data bytes.Buffer // accumulated element bitstream data + + encodingRelocHeader bool + + k RelocKind + Idx Index // index within relocation section +} + +// Flush finalizes the element's bitstream and returns its Index. +func (w *Encoder) Flush() Index { + var sb strings.Builder + + // Backup the data so we write the relocations at the front. + var tmp bytes.Buffer + io.Copy(&tmp, &w.Data) + + // TODO(mdempsky): Consider writing these out separately so they're + // easier to strip, along with function bodies, so that we can prune + // down to just the data that's relevant to go/types. + if w.encodingRelocHeader { + panic("encodingRelocHeader already true; recursive flush?") + } + w.encodingRelocHeader = true + w.Sync(SyncRelocs) + w.Len(len(w.Relocs)) + for _, rEnt := range w.Relocs { + w.Sync(SyncReloc) + w.Len(int(rEnt.Kind)) + w.Len(int(rEnt.Idx)) + } + + io.Copy(&sb, &w.Data) + io.Copy(&sb, &tmp) + w.p.elems[w.k][w.Idx] = sb.String() + + return w.Idx +} + +func (w *Encoder) checkErr(err error) { + if err != nil { + panicf("unexpected encoding error: %v", err) + } +} + +func (w *Encoder) rawUvarint(x uint64) { + var buf [binary.MaxVarintLen64]byte + n := binary.PutUvarint(buf[:], x) + _, err := w.Data.Write(buf[:n]) + w.checkErr(err) +} + +func (w *Encoder) rawVarint(x int64) { + // Zig-zag encode. + ux := uint64(x) << 1 + if x < 0 { + ux = ^ux + } + + w.rawUvarint(ux) +} + +func (w *Encoder) rawReloc(r RelocKind, idx Index) int { + e := RelocEnt{r, idx} + if w.RelocMap != nil { + if i, ok := w.RelocMap[e]; ok { + return int(i) + } + } else { + w.RelocMap = make(map[RelocEnt]uint32) + } + + i := len(w.Relocs) + w.RelocMap[e] = uint32(i) + w.Relocs = append(w.Relocs, e) + return i +} + +func (w *Encoder) Sync(m SyncMarker) { + if !w.p.SyncMarkers() { + return + } + + // Writing out stack frame string references requires working + // relocations, but writing out the relocations themselves involves + // sync markers. To prevent infinite recursion, we simply trim the + // stack frame for sync markers within the relocation header. + var frames []string + if !w.encodingRelocHeader && w.p.syncFrames > 0 { + pcs := make([]uintptr, w.p.syncFrames) + n := runtime.Callers(2, pcs) + frames = fmtFrames(pcs[:n]...) + } + + // TODO(mdempsky): Save space by writing out stack frames as a + // linked list so we can share common stack frames. + w.rawUvarint(uint64(m)) + w.rawUvarint(uint64(len(frames))) + for _, frame := range frames { + w.rawUvarint(uint64(w.rawReloc(RelocString, w.p.StringIdx(frame)))) + } +} + +// Bool encodes and writes a bool value into the element bitstream, +// and then returns the bool value. +// +// For simple, 2-alternative encodings, the idiomatic way to call Bool +// is something like: +// +// if w.Bool(x != 0) { +// // alternative #1 +// } else { +// // alternative #2 +// } +// +// For multi-alternative encodings, use Code instead. +func (w *Encoder) Bool(b bool) bool { + w.Sync(SyncBool) + var x byte + if b { + x = 1 + } + err := w.Data.WriteByte(x) + w.checkErr(err) + return b +} + +// Int64 encodes and writes an int64 value into the element bitstream. +func (w *Encoder) Int64(x int64) { + w.Sync(SyncInt64) + w.rawVarint(x) +} + +// Uint64 encodes and writes a uint64 value into the element bitstream. +func (w *Encoder) Uint64(x uint64) { + w.Sync(SyncUint64) + w.rawUvarint(x) +} + +// Len encodes and writes a non-negative int value into the element bitstream. +func (w *Encoder) Len(x int) { assert(x >= 0); w.Uint64(uint64(x)) } + +// Int encodes and writes an int value into the element bitstream. +func (w *Encoder) Int(x int) { w.Int64(int64(x)) } + +// Uint encodes and writes a uint value into the element bitstream. +func (w *Encoder) Uint(x uint) { w.Uint64(uint64(x)) } + +// Reloc encodes and writes a relocation for the given (section, +// index) pair into the element bitstream. +// +// Note: Only the index is formally written into the element +// bitstream, so bitstream decoders must know from context which +// section an encoded relocation refers to. +func (w *Encoder) Reloc(r RelocKind, idx Index) { + w.Sync(SyncUseReloc) + w.Len(w.rawReloc(r, idx)) +} + +// Code encodes and writes a Code value into the element bitstream. +func (w *Encoder) Code(c Code) { + w.Sync(c.Marker()) + w.Len(c.Value()) +} + +// String encodes and writes a string value into the element +// bitstream. +// +// Internally, strings are deduplicated by adding them to the strings +// section (if not already present), and then writing a relocation +// into the element bitstream. +func (w *Encoder) String(s string) { + w.StringRef(w.p.StringIdx(s)) +} + +// StringRef writes a reference to the given index, which must be a +// previously encoded string value. +func (w *Encoder) StringRef(idx Index) { + w.Sync(SyncString) + w.Reloc(RelocString, idx) +} + +// Strings encodes and writes a variable-length slice of strings into +// the element bitstream. +func (w *Encoder) Strings(ss []string) { + w.Len(len(ss)) + for _, s := range ss { + w.String(s) + } +} + +// Value encodes and writes a constant.Value into the element +// bitstream. +func (w *Encoder) Value(val constant.Value) { + w.Sync(SyncValue) + if w.Bool(val.Kind() == constant.Complex) { + w.scalar(constant.Real(val)) + w.scalar(constant.Imag(val)) + } else { + w.scalar(val) + } +} + +func (w *Encoder) scalar(val constant.Value) { + switch v := constant.Val(val).(type) { + default: + panicf("unhandled %v (%v)", val, val.Kind()) + case bool: + w.Code(ValBool) + w.Bool(v) + case string: + w.Code(ValString) + w.String(v) + case int64: + w.Code(ValInt64) + w.Int64(v) + case *big.Int: + w.Code(ValBigInt) + w.bigInt(v) + case *big.Rat: + w.Code(ValBigRat) + w.bigInt(v.Num()) + w.bigInt(v.Denom()) + case *big.Float: + w.Code(ValBigFloat) + w.bigFloat(v) + } +} + +func (w *Encoder) bigInt(v *big.Int) { + b := v.Bytes() + w.String(string(b)) // TODO: More efficient encoding. + w.Bool(v.Sign() < 0) +} + +func (w *Encoder) bigFloat(v *big.Float) { + b := v.Append(nil, 'p', -1) + w.String(string(b)) // TODO: More efficient encoding. +} + +// Version reports the version of the bitstream. +func (w *Encoder) Version() Version { return w.p.version } diff --git a/vendor/golang.org/x/tools/internal/pkgbits/flags.go b/vendor/golang.org/x/tools/internal/pkgbits/flags.go new file mode 100644 index 0000000..6542227 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/pkgbits/flags.go @@ -0,0 +1,9 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package pkgbits + +const ( + flagSyncMarkers = 1 << iota // file format contains sync markers +) diff --git a/vendor/golang.org/x/tools/internal/pkgbits/reloc.go b/vendor/golang.org/x/tools/internal/pkgbits/reloc.go new file mode 100644 index 0000000..fcdfb97 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/pkgbits/reloc.go @@ -0,0 +1,42 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package pkgbits + +// A RelocKind indicates a particular section within a unified IR export. +type RelocKind int32 + +// An Index represents a bitstream element index within a particular +// section. +type Index int32 + +// A relocEnt (relocation entry) is an entry in an element's local +// reference table. +// +// TODO(mdempsky): Rename this too. +type RelocEnt struct { + Kind RelocKind + Idx Index +} + +// Reserved indices within the meta relocation section. +const ( + PublicRootIdx Index = 0 + PrivateRootIdx Index = 1 +) + +const ( + RelocString RelocKind = iota + RelocMeta + RelocPosBase + RelocPkg + RelocName + RelocType + RelocObj + RelocObjExt + RelocObjDict + RelocBody + + numRelocs = iota +) diff --git a/vendor/golang.org/x/tools/internal/pkgbits/support.go b/vendor/golang.org/x/tools/internal/pkgbits/support.go new file mode 100644 index 0000000..50534a2 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/pkgbits/support.go @@ -0,0 +1,17 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package pkgbits + +import "fmt" + +func assert(b bool) { + if !b { + panic("assertion failed") + } +} + +func panicf(format string, args ...any) { + panic(fmt.Errorf(format, args...)) +} diff --git a/vendor/golang.org/x/tools/internal/pkgbits/sync.go b/vendor/golang.org/x/tools/internal/pkgbits/sync.go new file mode 100644 index 0000000..1520b73 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/pkgbits/sync.go @@ -0,0 +1,136 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package pkgbits + +import ( + "fmt" + "runtime" + "strings" +) + +// fmtFrames formats a backtrace for reporting reader/writer desyncs. +func fmtFrames(pcs ...uintptr) []string { + res := make([]string, 0, len(pcs)) + walkFrames(pcs, func(file string, line int, name string, offset uintptr) { + // Trim package from function name. It's just redundant noise. + name = strings.TrimPrefix(name, "cmd/compile/internal/noder.") + + res = append(res, fmt.Sprintf("%s:%v: %s +0x%v", file, line, name, offset)) + }) + return res +} + +type frameVisitor func(file string, line int, name string, offset uintptr) + +// walkFrames calls visit for each call frame represented by pcs. +// +// pcs should be a slice of PCs, as returned by runtime.Callers. +func walkFrames(pcs []uintptr, visit frameVisitor) { + if len(pcs) == 0 { + return + } + + frames := runtime.CallersFrames(pcs) + for { + frame, more := frames.Next() + visit(frame.File, frame.Line, frame.Function, frame.PC-frame.Entry) + if !more { + return + } + } +} + +// SyncMarker is an enum type that represents markers that may be +// written to export data to ensure the reader and writer stay +// synchronized. +type SyncMarker int + +//go:generate stringer -type=SyncMarker -trimprefix=Sync + +const ( + _ SyncMarker = iota + + // Public markers (known to go/types importers). + + // Low-level coding markers. + SyncEOF + SyncBool + SyncInt64 + SyncUint64 + SyncString + SyncValue + SyncVal + SyncRelocs + SyncReloc + SyncUseReloc + + // Higher-level object and type markers. + SyncPublic + SyncPos + SyncPosBase + SyncObject + SyncObject1 + SyncPkg + SyncPkgDef + SyncMethod + SyncType + SyncTypeIdx + SyncTypeParamNames + SyncSignature + SyncParams + SyncParam + SyncCodeObj + SyncSym + SyncLocalIdent + SyncSelector + + // Private markers (only known to cmd/compile). + SyncPrivate + + SyncFuncExt + SyncVarExt + SyncTypeExt + SyncPragma + + SyncExprList + SyncExprs + SyncExpr + SyncExprType + SyncAssign + SyncOp + SyncFuncLit + SyncCompLit + + SyncDecl + SyncFuncBody + SyncOpenScope + SyncCloseScope + SyncCloseAnotherScope + SyncDeclNames + SyncDeclName + + SyncStmts + SyncBlockStmt + SyncIfStmt + SyncForStmt + SyncSwitchStmt + SyncRangeStmt + SyncCaseClause + SyncCommClause + SyncSelectStmt + SyncDecls + SyncLabeledStmt + SyncUseObjLocal + SyncAddLocal + SyncLinkname + SyncStmt1 + SyncStmtsEnd + SyncLabel + SyncOptLabel + + SyncMultiExpr + SyncRType + SyncConvRTTI +) diff --git a/vendor/golang.org/x/tools/internal/pkgbits/syncmarker_string.go b/vendor/golang.org/x/tools/internal/pkgbits/syncmarker_string.go new file mode 100644 index 0000000..582ad56 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/pkgbits/syncmarker_string.go @@ -0,0 +1,92 @@ +// Code generated by "stringer -type=SyncMarker -trimprefix=Sync"; DO NOT EDIT. + +package pkgbits + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[SyncEOF-1] + _ = x[SyncBool-2] + _ = x[SyncInt64-3] + _ = x[SyncUint64-4] + _ = x[SyncString-5] + _ = x[SyncValue-6] + _ = x[SyncVal-7] + _ = x[SyncRelocs-8] + _ = x[SyncReloc-9] + _ = x[SyncUseReloc-10] + _ = x[SyncPublic-11] + _ = x[SyncPos-12] + _ = x[SyncPosBase-13] + _ = x[SyncObject-14] + _ = x[SyncObject1-15] + _ = x[SyncPkg-16] + _ = x[SyncPkgDef-17] + _ = x[SyncMethod-18] + _ = x[SyncType-19] + _ = x[SyncTypeIdx-20] + _ = x[SyncTypeParamNames-21] + _ = x[SyncSignature-22] + _ = x[SyncParams-23] + _ = x[SyncParam-24] + _ = x[SyncCodeObj-25] + _ = x[SyncSym-26] + _ = x[SyncLocalIdent-27] + _ = x[SyncSelector-28] + _ = x[SyncPrivate-29] + _ = x[SyncFuncExt-30] + _ = x[SyncVarExt-31] + _ = x[SyncTypeExt-32] + _ = x[SyncPragma-33] + _ = x[SyncExprList-34] + _ = x[SyncExprs-35] + _ = x[SyncExpr-36] + _ = x[SyncExprType-37] + _ = x[SyncAssign-38] + _ = x[SyncOp-39] + _ = x[SyncFuncLit-40] + _ = x[SyncCompLit-41] + _ = x[SyncDecl-42] + _ = x[SyncFuncBody-43] + _ = x[SyncOpenScope-44] + _ = x[SyncCloseScope-45] + _ = x[SyncCloseAnotherScope-46] + _ = x[SyncDeclNames-47] + _ = x[SyncDeclName-48] + _ = x[SyncStmts-49] + _ = x[SyncBlockStmt-50] + _ = x[SyncIfStmt-51] + _ = x[SyncForStmt-52] + _ = x[SyncSwitchStmt-53] + _ = x[SyncRangeStmt-54] + _ = x[SyncCaseClause-55] + _ = x[SyncCommClause-56] + _ = x[SyncSelectStmt-57] + _ = x[SyncDecls-58] + _ = x[SyncLabeledStmt-59] + _ = x[SyncUseObjLocal-60] + _ = x[SyncAddLocal-61] + _ = x[SyncLinkname-62] + _ = x[SyncStmt1-63] + _ = x[SyncStmtsEnd-64] + _ = x[SyncLabel-65] + _ = x[SyncOptLabel-66] + _ = x[SyncMultiExpr-67] + _ = x[SyncRType-68] + _ = x[SyncConvRTTI-69] +} + +const _SyncMarker_name = "EOFBoolInt64Uint64StringValueValRelocsRelocUseRelocPublicPosPosBaseObjectObject1PkgPkgDefMethodTypeTypeIdxTypeParamNamesSignatureParamsParamCodeObjSymLocalIdentSelectorPrivateFuncExtVarExtTypeExtPragmaExprListExprsExprExprTypeAssignOpFuncLitCompLitDeclFuncBodyOpenScopeCloseScopeCloseAnotherScopeDeclNamesDeclNameStmtsBlockStmtIfStmtForStmtSwitchStmtRangeStmtCaseClauseCommClauseSelectStmtDeclsLabeledStmtUseObjLocalAddLocalLinknameStmt1StmtsEndLabelOptLabelMultiExprRTypeConvRTTI" + +var _SyncMarker_index = [...]uint16{0, 3, 7, 12, 18, 24, 29, 32, 38, 43, 51, 57, 60, 67, 73, 80, 83, 89, 95, 99, 106, 120, 129, 135, 140, 147, 150, 160, 168, 175, 182, 188, 195, 201, 209, 214, 218, 226, 232, 234, 241, 248, 252, 260, 269, 279, 296, 305, 313, 318, 327, 333, 340, 350, 359, 369, 379, 389, 394, 405, 416, 424, 432, 437, 445, 450, 458, 467, 472, 480} + +func (i SyncMarker) String() string { + i -= 1 + if i < 0 || i >= SyncMarker(len(_SyncMarker_index)-1) { + return "SyncMarker(" + strconv.FormatInt(int64(i+1), 10) + ")" + } + return _SyncMarker_name[_SyncMarker_index[i]:_SyncMarker_index[i+1]] +} diff --git a/vendor/golang.org/x/tools/internal/pkgbits/version.go b/vendor/golang.org/x/tools/internal/pkgbits/version.go new file mode 100644 index 0000000..53af9df --- /dev/null +++ b/vendor/golang.org/x/tools/internal/pkgbits/version.go @@ -0,0 +1,85 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package pkgbits + +// Version indicates a version of a unified IR bitstream. +// Each Version indicates the addition, removal, or change of +// new data in the bitstream. +// +// These are serialized to disk and the interpretation remains fixed. +type Version uint32 + +const ( + // V0: initial prototype. + // + // All data that is not assigned a Field is in version V0 + // and has not been deprecated. + V0 Version = iota + + // V1: adds the Flags uint32 word + V1 + + // V2: removes unused legacy fields and supports type parameters for aliases. + // - remove the legacy "has init" bool from the public root + // - remove obj's "derived func instance" bool + // - add a TypeParamNames field to ObjAlias + // - remove derived info "needed" bool + V2 + + numVersions = iota +) + +// Field denotes a unit of data in the serialized unified IR bitstream. +// It is conceptually a like field in a structure. +// +// We only really need Fields when the data may or may not be present +// in a stream based on the Version of the bitstream. +// +// Unlike much of pkgbits, Fields are not serialized and +// can change values as needed. +type Field int + +const ( + // Flags in a uint32 in the header of a bitstream + // that is used to indicate whether optional features are enabled. + Flags Field = iota + + // Deprecated: HasInit was a bool indicating whether a package + // has any init functions. + HasInit + + // Deprecated: DerivedFuncInstance was a bool indicating + // whether an object was a function instance. + DerivedFuncInstance + + // ObjAlias has a list of TypeParamNames. + AliasTypeParamNames + + // Deprecated: DerivedInfoNeeded was a bool indicating + // whether a type was a derived type. + DerivedInfoNeeded + + numFields = iota +) + +// introduced is the version a field was added. +var introduced = [numFields]Version{ + Flags: V1, + AliasTypeParamNames: V2, +} + +// removed is the version a field was removed in or 0 for fields +// that have not yet been deprecated. +// (So removed[f]-1 is the last version it is included in.) +var removed = [numFields]Version{ + HasInit: V2, + DerivedFuncInstance: V2, + DerivedInfoNeeded: V2, +} + +// Has reports whether field f is present in a bitstream at version v. +func (v Version) Has(f Field) bool { + return introduced[f] <= v && (v < removed[f] || removed[f] == V0) +} diff --git a/vendor/golang.org/x/tools/internal/stdlib/manifest.go b/vendor/golang.org/x/tools/internal/stdlib/manifest.go new file mode 100644 index 0000000..cdaac9a --- /dev/null +++ b/vendor/golang.org/x/tools/internal/stdlib/manifest.go @@ -0,0 +1,17431 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Code generated by generate.go. DO NOT EDIT. + +package stdlib + +var PackageSymbols = map[string][]Symbol{ + "archive/tar": { + {"(*Header).FileInfo", Method, 1}, + {"(*Reader).Next", Method, 0}, + {"(*Reader).Read", Method, 0}, + {"(*Writer).AddFS", Method, 22}, + {"(*Writer).Close", Method, 0}, + {"(*Writer).Flush", Method, 0}, + {"(*Writer).Write", Method, 0}, + {"(*Writer).WriteHeader", Method, 0}, + {"(Format).String", Method, 10}, + {"ErrFieldTooLong", Var, 0}, + {"ErrHeader", Var, 0}, + {"ErrInsecurePath", Var, 20}, + {"ErrWriteAfterClose", Var, 0}, + {"ErrWriteTooLong", Var, 0}, + {"FileInfoHeader", Func, 1}, + {"FileInfoNames", Type, 23}, + {"Format", Type, 10}, + {"FormatGNU", Const, 10}, + {"FormatPAX", Const, 10}, + {"FormatUSTAR", Const, 10}, + {"FormatUnknown", Const, 10}, + {"Header", Type, 0}, + {"Header.AccessTime", Field, 0}, + {"Header.ChangeTime", Field, 0}, + {"Header.Devmajor", Field, 0}, + {"Header.Devminor", Field, 0}, + {"Header.Format", Field, 10}, + {"Header.Gid", Field, 0}, + {"Header.Gname", Field, 0}, + {"Header.Linkname", Field, 0}, + {"Header.ModTime", Field, 0}, + {"Header.Mode", Field, 0}, + {"Header.Name", Field, 0}, + {"Header.PAXRecords", Field, 10}, + {"Header.Size", Field, 0}, + {"Header.Typeflag", Field, 0}, + {"Header.Uid", Field, 0}, + {"Header.Uname", Field, 0}, + {"Header.Xattrs", Field, 3}, + {"NewReader", Func, 0}, + {"NewWriter", Func, 0}, + {"Reader", Type, 0}, + {"TypeBlock", Const, 0}, + {"TypeChar", Const, 0}, + {"TypeCont", Const, 0}, + {"TypeDir", Const, 0}, + {"TypeFifo", Const, 0}, + {"TypeGNULongLink", Const, 1}, + {"TypeGNULongName", Const, 1}, + {"TypeGNUSparse", Const, 3}, + {"TypeLink", Const, 0}, + {"TypeReg", Const, 0}, + {"TypeRegA", Const, 0}, + {"TypeSymlink", Const, 0}, + {"TypeXGlobalHeader", Const, 0}, + {"TypeXHeader", Const, 0}, + {"Writer", Type, 0}, + }, + "archive/zip": { + {"(*File).DataOffset", Method, 2}, + {"(*File).FileInfo", Method, 0}, + {"(*File).ModTime", Method, 0}, + {"(*File).Mode", Method, 0}, + {"(*File).Open", Method, 0}, + {"(*File).OpenRaw", Method, 17}, + {"(*File).SetModTime", Method, 0}, + {"(*File).SetMode", Method, 0}, + {"(*FileHeader).FileInfo", Method, 0}, + {"(*FileHeader).ModTime", Method, 0}, + {"(*FileHeader).Mode", Method, 0}, + {"(*FileHeader).SetModTime", Method, 0}, + {"(*FileHeader).SetMode", Method, 0}, + {"(*ReadCloser).Close", Method, 0}, + {"(*ReadCloser).Open", Method, 16}, + {"(*ReadCloser).RegisterDecompressor", Method, 6}, + {"(*Reader).Open", Method, 16}, + {"(*Reader).RegisterDecompressor", Method, 6}, + {"(*Writer).AddFS", Method, 22}, + {"(*Writer).Close", Method, 0}, + {"(*Writer).Copy", Method, 17}, + {"(*Writer).Create", Method, 0}, + {"(*Writer).CreateHeader", Method, 0}, + {"(*Writer).CreateRaw", Method, 17}, + {"(*Writer).Flush", Method, 4}, + {"(*Writer).RegisterCompressor", Method, 6}, + {"(*Writer).SetComment", Method, 10}, + {"(*Writer).SetOffset", Method, 5}, + {"Compressor", Type, 2}, + {"Decompressor", Type, 2}, + {"Deflate", Const, 0}, + {"ErrAlgorithm", Var, 0}, + {"ErrChecksum", Var, 0}, + {"ErrFormat", Var, 0}, + {"ErrInsecurePath", Var, 20}, + {"File", Type, 0}, + {"File.FileHeader", Field, 0}, + {"FileHeader", Type, 0}, + {"FileHeader.CRC32", Field, 0}, + {"FileHeader.Comment", Field, 0}, + {"FileHeader.CompressedSize", Field, 0}, + {"FileHeader.CompressedSize64", Field, 1}, + {"FileHeader.CreatorVersion", Field, 0}, + {"FileHeader.ExternalAttrs", Field, 0}, + {"FileHeader.Extra", Field, 0}, + {"FileHeader.Flags", Field, 0}, + {"FileHeader.Method", Field, 0}, + {"FileHeader.Modified", Field, 10}, + {"FileHeader.ModifiedDate", Field, 0}, + {"FileHeader.ModifiedTime", Field, 0}, + {"FileHeader.Name", Field, 0}, + {"FileHeader.NonUTF8", Field, 10}, + {"FileHeader.ReaderVersion", Field, 0}, + {"FileHeader.UncompressedSize", Field, 0}, + {"FileHeader.UncompressedSize64", Field, 1}, + {"FileInfoHeader", Func, 0}, + {"NewReader", Func, 0}, + {"NewWriter", Func, 0}, + {"OpenReader", Func, 0}, + {"ReadCloser", Type, 0}, + {"ReadCloser.Reader", Field, 0}, + {"Reader", Type, 0}, + {"Reader.Comment", Field, 0}, + {"Reader.File", Field, 0}, + {"RegisterCompressor", Func, 2}, + {"RegisterDecompressor", Func, 2}, + {"Store", Const, 0}, + {"Writer", Type, 0}, + }, + "bufio": { + {"(*Reader).Buffered", Method, 0}, + {"(*Reader).Discard", Method, 5}, + {"(*Reader).Peek", Method, 0}, + {"(*Reader).Read", Method, 0}, + {"(*Reader).ReadByte", Method, 0}, + {"(*Reader).ReadBytes", Method, 0}, + {"(*Reader).ReadLine", Method, 0}, + {"(*Reader).ReadRune", Method, 0}, + {"(*Reader).ReadSlice", Method, 0}, + {"(*Reader).ReadString", Method, 0}, + {"(*Reader).Reset", Method, 2}, + {"(*Reader).Size", Method, 10}, + {"(*Reader).UnreadByte", Method, 0}, + {"(*Reader).UnreadRune", Method, 0}, + {"(*Reader).WriteTo", Method, 1}, + {"(*Scanner).Buffer", Method, 6}, + {"(*Scanner).Bytes", Method, 1}, + {"(*Scanner).Err", Method, 1}, + {"(*Scanner).Scan", Method, 1}, + {"(*Scanner).Split", Method, 1}, + {"(*Scanner).Text", Method, 1}, + {"(*Writer).Available", Method, 0}, + {"(*Writer).AvailableBuffer", Method, 18}, + {"(*Writer).Buffered", Method, 0}, + {"(*Writer).Flush", Method, 0}, + {"(*Writer).ReadFrom", Method, 1}, + {"(*Writer).Reset", Method, 2}, + {"(*Writer).Size", Method, 10}, + {"(*Writer).Write", Method, 0}, + {"(*Writer).WriteByte", Method, 0}, + {"(*Writer).WriteRune", Method, 0}, + {"(*Writer).WriteString", Method, 0}, + {"(ReadWriter).Available", Method, 0}, + {"(ReadWriter).AvailableBuffer", Method, 18}, + {"(ReadWriter).Discard", Method, 5}, + {"(ReadWriter).Flush", Method, 0}, + {"(ReadWriter).Peek", Method, 0}, + {"(ReadWriter).Read", Method, 0}, + {"(ReadWriter).ReadByte", Method, 0}, + {"(ReadWriter).ReadBytes", Method, 0}, + {"(ReadWriter).ReadFrom", Method, 1}, + {"(ReadWriter).ReadLine", Method, 0}, + {"(ReadWriter).ReadRune", Method, 0}, + {"(ReadWriter).ReadSlice", Method, 0}, + {"(ReadWriter).ReadString", Method, 0}, + {"(ReadWriter).UnreadByte", Method, 0}, + {"(ReadWriter).UnreadRune", Method, 0}, + {"(ReadWriter).Write", Method, 0}, + {"(ReadWriter).WriteByte", Method, 0}, + {"(ReadWriter).WriteRune", Method, 0}, + {"(ReadWriter).WriteString", Method, 0}, + {"(ReadWriter).WriteTo", Method, 1}, + {"ErrAdvanceTooFar", Var, 1}, + {"ErrBadReadCount", Var, 15}, + {"ErrBufferFull", Var, 0}, + {"ErrFinalToken", Var, 6}, + {"ErrInvalidUnreadByte", Var, 0}, + {"ErrInvalidUnreadRune", Var, 0}, + {"ErrNegativeAdvance", Var, 1}, + {"ErrNegativeCount", Var, 0}, + {"ErrTooLong", Var, 1}, + {"MaxScanTokenSize", Const, 1}, + {"NewReadWriter", Func, 0}, + {"NewReader", Func, 0}, + {"NewReaderSize", Func, 0}, + {"NewScanner", Func, 1}, + {"NewWriter", Func, 0}, + {"NewWriterSize", Func, 0}, + {"ReadWriter", Type, 0}, + {"ReadWriter.Reader", Field, 0}, + {"ReadWriter.Writer", Field, 0}, + {"Reader", Type, 0}, + {"ScanBytes", Func, 1}, + {"ScanLines", Func, 1}, + {"ScanRunes", Func, 1}, + {"ScanWords", Func, 1}, + {"Scanner", Type, 1}, + {"SplitFunc", Type, 1}, + {"Writer", Type, 0}, + }, + "bytes": { + {"(*Buffer).Available", Method, 21}, + {"(*Buffer).AvailableBuffer", Method, 21}, + {"(*Buffer).Bytes", Method, 0}, + {"(*Buffer).Cap", Method, 5}, + {"(*Buffer).Grow", Method, 1}, + {"(*Buffer).Len", Method, 0}, + {"(*Buffer).Next", Method, 0}, + {"(*Buffer).Read", Method, 0}, + {"(*Buffer).ReadByte", Method, 0}, + {"(*Buffer).ReadBytes", Method, 0}, + {"(*Buffer).ReadFrom", Method, 0}, + {"(*Buffer).ReadRune", Method, 0}, + {"(*Buffer).ReadString", Method, 0}, + {"(*Buffer).Reset", Method, 0}, + {"(*Buffer).String", Method, 0}, + {"(*Buffer).Truncate", Method, 0}, + {"(*Buffer).UnreadByte", Method, 0}, + {"(*Buffer).UnreadRune", Method, 0}, + {"(*Buffer).Write", Method, 0}, + {"(*Buffer).WriteByte", Method, 0}, + {"(*Buffer).WriteRune", Method, 0}, + {"(*Buffer).WriteString", Method, 0}, + {"(*Buffer).WriteTo", Method, 0}, + {"(*Reader).Len", Method, 0}, + {"(*Reader).Read", Method, 0}, + {"(*Reader).ReadAt", Method, 0}, + {"(*Reader).ReadByte", Method, 0}, + {"(*Reader).ReadRune", Method, 0}, + {"(*Reader).Reset", Method, 7}, + {"(*Reader).Seek", Method, 0}, + {"(*Reader).Size", Method, 5}, + {"(*Reader).UnreadByte", Method, 0}, + {"(*Reader).UnreadRune", Method, 0}, + {"(*Reader).WriteTo", Method, 1}, + {"Buffer", Type, 0}, + {"Clone", Func, 20}, + {"Compare", Func, 0}, + {"Contains", Func, 0}, + {"ContainsAny", Func, 7}, + {"ContainsFunc", Func, 21}, + {"ContainsRune", Func, 7}, + {"Count", Func, 0}, + {"Cut", Func, 18}, + {"CutPrefix", Func, 20}, + {"CutSuffix", Func, 20}, + {"Equal", Func, 0}, + {"EqualFold", Func, 0}, + {"ErrTooLarge", Var, 0}, + {"Fields", Func, 0}, + {"FieldsFunc", Func, 0}, + {"HasPrefix", Func, 0}, + {"HasSuffix", Func, 0}, + {"Index", Func, 0}, + {"IndexAny", Func, 0}, + {"IndexByte", Func, 0}, + {"IndexFunc", Func, 0}, + {"IndexRune", Func, 0}, + {"Join", Func, 0}, + {"LastIndex", Func, 0}, + {"LastIndexAny", Func, 0}, + {"LastIndexByte", Func, 5}, + {"LastIndexFunc", Func, 0}, + {"Map", Func, 0}, + {"MinRead", Const, 0}, + {"NewBuffer", Func, 0}, + {"NewBufferString", Func, 0}, + {"NewReader", Func, 0}, + {"Reader", Type, 0}, + {"Repeat", Func, 0}, + {"Replace", Func, 0}, + {"ReplaceAll", Func, 12}, + {"Runes", Func, 0}, + {"Split", Func, 0}, + {"SplitAfter", Func, 0}, + {"SplitAfterN", Func, 0}, + {"SplitN", Func, 0}, + {"Title", Func, 0}, + {"ToLower", Func, 0}, + {"ToLowerSpecial", Func, 0}, + {"ToTitle", Func, 0}, + {"ToTitleSpecial", Func, 0}, + {"ToUpper", Func, 0}, + {"ToUpperSpecial", Func, 0}, + {"ToValidUTF8", Func, 13}, + {"Trim", Func, 0}, + {"TrimFunc", Func, 0}, + {"TrimLeft", Func, 0}, + {"TrimLeftFunc", Func, 0}, + {"TrimPrefix", Func, 1}, + {"TrimRight", Func, 0}, + {"TrimRightFunc", Func, 0}, + {"TrimSpace", Func, 0}, + {"TrimSuffix", Func, 1}, + }, + "cmp": { + {"Compare", Func, 21}, + {"Less", Func, 21}, + {"Or", Func, 22}, + {"Ordered", Type, 21}, + }, + "compress/bzip2": { + {"(StructuralError).Error", Method, 0}, + {"NewReader", Func, 0}, + {"StructuralError", Type, 0}, + }, + "compress/flate": { + {"(*ReadError).Error", Method, 0}, + {"(*WriteError).Error", Method, 0}, + {"(*Writer).Close", Method, 0}, + {"(*Writer).Flush", Method, 0}, + {"(*Writer).Reset", Method, 2}, + {"(*Writer).Write", Method, 0}, + {"(CorruptInputError).Error", Method, 0}, + {"(InternalError).Error", Method, 0}, + {"BestCompression", Const, 0}, + {"BestSpeed", Const, 0}, + {"CorruptInputError", Type, 0}, + {"DefaultCompression", Const, 0}, + {"HuffmanOnly", Const, 7}, + {"InternalError", Type, 0}, + {"NewReader", Func, 0}, + {"NewReaderDict", Func, 0}, + {"NewWriter", Func, 0}, + {"NewWriterDict", Func, 0}, + {"NoCompression", Const, 0}, + {"ReadError", Type, 0}, + {"ReadError.Err", Field, 0}, + {"ReadError.Offset", Field, 0}, + {"Reader", Type, 0}, + {"Resetter", Type, 4}, + {"WriteError", Type, 0}, + {"WriteError.Err", Field, 0}, + {"WriteError.Offset", Field, 0}, + {"Writer", Type, 0}, + }, + "compress/gzip": { + {"(*Reader).Close", Method, 0}, + {"(*Reader).Multistream", Method, 4}, + {"(*Reader).Read", Method, 0}, + {"(*Reader).Reset", Method, 3}, + {"(*Writer).Close", Method, 0}, + {"(*Writer).Flush", Method, 1}, + {"(*Writer).Reset", Method, 2}, + {"(*Writer).Write", Method, 0}, + {"BestCompression", Const, 0}, + {"BestSpeed", Const, 0}, + {"DefaultCompression", Const, 0}, + {"ErrChecksum", Var, 0}, + {"ErrHeader", Var, 0}, + {"Header", Type, 0}, + {"Header.Comment", Field, 0}, + {"Header.Extra", Field, 0}, + {"Header.ModTime", Field, 0}, + {"Header.Name", Field, 0}, + {"Header.OS", Field, 0}, + {"HuffmanOnly", Const, 8}, + {"NewReader", Func, 0}, + {"NewWriter", Func, 0}, + {"NewWriterLevel", Func, 0}, + {"NoCompression", Const, 0}, + {"Reader", Type, 0}, + {"Reader.Header", Field, 0}, + {"Writer", Type, 0}, + {"Writer.Header", Field, 0}, + }, + "compress/lzw": { + {"(*Reader).Close", Method, 17}, + {"(*Reader).Read", Method, 17}, + {"(*Reader).Reset", Method, 17}, + {"(*Writer).Close", Method, 17}, + {"(*Writer).Reset", Method, 17}, + {"(*Writer).Write", Method, 17}, + {"LSB", Const, 0}, + {"MSB", Const, 0}, + {"NewReader", Func, 0}, + {"NewWriter", Func, 0}, + {"Order", Type, 0}, + {"Reader", Type, 17}, + {"Writer", Type, 17}, + }, + "compress/zlib": { + {"(*Writer).Close", Method, 0}, + {"(*Writer).Flush", Method, 0}, + {"(*Writer).Reset", Method, 2}, + {"(*Writer).Write", Method, 0}, + {"BestCompression", Const, 0}, + {"BestSpeed", Const, 0}, + {"DefaultCompression", Const, 0}, + {"ErrChecksum", Var, 0}, + {"ErrDictionary", Var, 0}, + {"ErrHeader", Var, 0}, + {"HuffmanOnly", Const, 8}, + {"NewReader", Func, 0}, + {"NewReaderDict", Func, 0}, + {"NewWriter", Func, 0}, + {"NewWriterLevel", Func, 0}, + {"NewWriterLevelDict", Func, 0}, + {"NoCompression", Const, 0}, + {"Resetter", Type, 4}, + {"Writer", Type, 0}, + }, + "container/heap": { + {"Fix", Func, 2}, + {"Init", Func, 0}, + {"Interface", Type, 0}, + {"Pop", Func, 0}, + {"Push", Func, 0}, + {"Remove", Func, 0}, + }, + "container/list": { + {"(*Element).Next", Method, 0}, + {"(*Element).Prev", Method, 0}, + {"(*List).Back", Method, 0}, + {"(*List).Front", Method, 0}, + {"(*List).Init", Method, 0}, + {"(*List).InsertAfter", Method, 0}, + {"(*List).InsertBefore", Method, 0}, + {"(*List).Len", Method, 0}, + {"(*List).MoveAfter", Method, 2}, + {"(*List).MoveBefore", Method, 2}, + {"(*List).MoveToBack", Method, 0}, + {"(*List).MoveToFront", Method, 0}, + {"(*List).PushBack", Method, 0}, + {"(*List).PushBackList", Method, 0}, + {"(*List).PushFront", Method, 0}, + {"(*List).PushFrontList", Method, 0}, + {"(*List).Remove", Method, 0}, + {"Element", Type, 0}, + {"Element.Value", Field, 0}, + {"List", Type, 0}, + {"New", Func, 0}, + }, + "container/ring": { + {"(*Ring).Do", Method, 0}, + {"(*Ring).Len", Method, 0}, + {"(*Ring).Link", Method, 0}, + {"(*Ring).Move", Method, 0}, + {"(*Ring).Next", Method, 0}, + {"(*Ring).Prev", Method, 0}, + {"(*Ring).Unlink", Method, 0}, + {"New", Func, 0}, + {"Ring", Type, 0}, + {"Ring.Value", Field, 0}, + }, + "context": { + {"AfterFunc", Func, 21}, + {"Background", Func, 7}, + {"CancelCauseFunc", Type, 20}, + {"CancelFunc", Type, 7}, + {"Canceled", Var, 7}, + {"Cause", Func, 20}, + {"Context", Type, 7}, + {"DeadlineExceeded", Var, 7}, + {"TODO", Func, 7}, + {"WithCancel", Func, 7}, + {"WithCancelCause", Func, 20}, + {"WithDeadline", Func, 7}, + {"WithDeadlineCause", Func, 21}, + {"WithTimeout", Func, 7}, + {"WithTimeoutCause", Func, 21}, + {"WithValue", Func, 7}, + {"WithoutCancel", Func, 21}, + }, + "crypto": { + {"(Hash).Available", Method, 0}, + {"(Hash).HashFunc", Method, 4}, + {"(Hash).New", Method, 0}, + {"(Hash).Size", Method, 0}, + {"(Hash).String", Method, 15}, + {"BLAKE2b_256", Const, 9}, + {"BLAKE2b_384", Const, 9}, + {"BLAKE2b_512", Const, 9}, + {"BLAKE2s_256", Const, 9}, + {"Decrypter", Type, 5}, + {"DecrypterOpts", Type, 5}, + {"Hash", Type, 0}, + {"MD4", Const, 0}, + {"MD5", Const, 0}, + {"MD5SHA1", Const, 0}, + {"PrivateKey", Type, 0}, + {"PublicKey", Type, 2}, + {"RIPEMD160", Const, 0}, + {"RegisterHash", Func, 0}, + {"SHA1", Const, 0}, + {"SHA224", Const, 0}, + {"SHA256", Const, 0}, + {"SHA384", Const, 0}, + {"SHA3_224", Const, 4}, + {"SHA3_256", Const, 4}, + {"SHA3_384", Const, 4}, + {"SHA3_512", Const, 4}, + {"SHA512", Const, 0}, + {"SHA512_224", Const, 5}, + {"SHA512_256", Const, 5}, + {"Signer", Type, 4}, + {"SignerOpts", Type, 4}, + }, + "crypto/aes": { + {"(KeySizeError).Error", Method, 0}, + {"BlockSize", Const, 0}, + {"KeySizeError", Type, 0}, + {"NewCipher", Func, 0}, + }, + "crypto/cipher": { + {"(StreamReader).Read", Method, 0}, + {"(StreamWriter).Close", Method, 0}, + {"(StreamWriter).Write", Method, 0}, + {"AEAD", Type, 2}, + {"Block", Type, 0}, + {"BlockMode", Type, 0}, + {"NewCBCDecrypter", Func, 0}, + {"NewCBCEncrypter", Func, 0}, + {"NewCFBDecrypter", Func, 0}, + {"NewCFBEncrypter", Func, 0}, + {"NewCTR", Func, 0}, + {"NewGCM", Func, 2}, + {"NewGCMWithNonceSize", Func, 5}, + {"NewGCMWithTagSize", Func, 11}, + {"NewOFB", Func, 0}, + {"Stream", Type, 0}, + {"StreamReader", Type, 0}, + {"StreamReader.R", Field, 0}, + {"StreamReader.S", Field, 0}, + {"StreamWriter", Type, 0}, + {"StreamWriter.Err", Field, 0}, + {"StreamWriter.S", Field, 0}, + {"StreamWriter.W", Field, 0}, + }, + "crypto/des": { + {"(KeySizeError).Error", Method, 0}, + {"BlockSize", Const, 0}, + {"KeySizeError", Type, 0}, + {"NewCipher", Func, 0}, + {"NewTripleDESCipher", Func, 0}, + }, + "crypto/dsa": { + {"ErrInvalidPublicKey", Var, 0}, + {"GenerateKey", Func, 0}, + {"GenerateParameters", Func, 0}, + {"L1024N160", Const, 0}, + {"L2048N224", Const, 0}, + {"L2048N256", Const, 0}, + {"L3072N256", Const, 0}, + {"ParameterSizes", Type, 0}, + {"Parameters", Type, 0}, + {"Parameters.G", Field, 0}, + {"Parameters.P", Field, 0}, + {"Parameters.Q", Field, 0}, + {"PrivateKey", Type, 0}, + {"PrivateKey.PublicKey", Field, 0}, + {"PrivateKey.X", Field, 0}, + {"PublicKey", Type, 0}, + {"PublicKey.Parameters", Field, 0}, + {"PublicKey.Y", Field, 0}, + {"Sign", Func, 0}, + {"Verify", Func, 0}, + }, + "crypto/ecdh": { + {"(*PrivateKey).Bytes", Method, 20}, + {"(*PrivateKey).Curve", Method, 20}, + {"(*PrivateKey).ECDH", Method, 20}, + {"(*PrivateKey).Equal", Method, 20}, + {"(*PrivateKey).Public", Method, 20}, + {"(*PrivateKey).PublicKey", Method, 20}, + {"(*PublicKey).Bytes", Method, 20}, + {"(*PublicKey).Curve", Method, 20}, + {"(*PublicKey).Equal", Method, 20}, + {"Curve", Type, 20}, + {"P256", Func, 20}, + {"P384", Func, 20}, + {"P521", Func, 20}, + {"PrivateKey", Type, 20}, + {"PublicKey", Type, 20}, + {"X25519", Func, 20}, + }, + "crypto/ecdsa": { + {"(*PrivateKey).ECDH", Method, 20}, + {"(*PrivateKey).Equal", Method, 15}, + {"(*PrivateKey).Public", Method, 4}, + {"(*PrivateKey).Sign", Method, 4}, + {"(*PublicKey).ECDH", Method, 20}, + {"(*PublicKey).Equal", Method, 15}, + {"(PrivateKey).Add", Method, 0}, + {"(PrivateKey).Double", Method, 0}, + {"(PrivateKey).IsOnCurve", Method, 0}, + {"(PrivateKey).Params", Method, 0}, + {"(PrivateKey).ScalarBaseMult", Method, 0}, + {"(PrivateKey).ScalarMult", Method, 0}, + {"(PublicKey).Add", Method, 0}, + {"(PublicKey).Double", Method, 0}, + {"(PublicKey).IsOnCurve", Method, 0}, + {"(PublicKey).Params", Method, 0}, + {"(PublicKey).ScalarBaseMult", Method, 0}, + {"(PublicKey).ScalarMult", Method, 0}, + {"GenerateKey", Func, 0}, + {"PrivateKey", Type, 0}, + {"PrivateKey.D", Field, 0}, + {"PrivateKey.PublicKey", Field, 0}, + {"PublicKey", Type, 0}, + {"PublicKey.Curve", Field, 0}, + {"PublicKey.X", Field, 0}, + {"PublicKey.Y", Field, 0}, + {"Sign", Func, 0}, + {"SignASN1", Func, 15}, + {"Verify", Func, 0}, + {"VerifyASN1", Func, 15}, + }, + "crypto/ed25519": { + {"(*Options).HashFunc", Method, 20}, + {"(PrivateKey).Equal", Method, 15}, + {"(PrivateKey).Public", Method, 13}, + {"(PrivateKey).Seed", Method, 13}, + {"(PrivateKey).Sign", Method, 13}, + {"(PublicKey).Equal", Method, 15}, + {"GenerateKey", Func, 13}, + {"NewKeyFromSeed", Func, 13}, + {"Options", Type, 20}, + {"Options.Context", Field, 20}, + {"Options.Hash", Field, 20}, + {"PrivateKey", Type, 13}, + {"PrivateKeySize", Const, 13}, + {"PublicKey", Type, 13}, + {"PublicKeySize", Const, 13}, + {"SeedSize", Const, 13}, + {"Sign", Func, 13}, + {"SignatureSize", Const, 13}, + {"Verify", Func, 13}, + {"VerifyWithOptions", Func, 20}, + }, + "crypto/elliptic": { + {"(*CurveParams).Add", Method, 0}, + {"(*CurveParams).Double", Method, 0}, + {"(*CurveParams).IsOnCurve", Method, 0}, + {"(*CurveParams).Params", Method, 0}, + {"(*CurveParams).ScalarBaseMult", Method, 0}, + {"(*CurveParams).ScalarMult", Method, 0}, + {"Curve", Type, 0}, + {"CurveParams", Type, 0}, + {"CurveParams.B", Field, 0}, + {"CurveParams.BitSize", Field, 0}, + {"CurveParams.Gx", Field, 0}, + {"CurveParams.Gy", Field, 0}, + {"CurveParams.N", Field, 0}, + {"CurveParams.Name", Field, 5}, + {"CurveParams.P", Field, 0}, + {"GenerateKey", Func, 0}, + {"Marshal", Func, 0}, + {"MarshalCompressed", Func, 15}, + {"P224", Func, 0}, + {"P256", Func, 0}, + {"P384", Func, 0}, + {"P521", Func, 0}, + {"Unmarshal", Func, 0}, + {"UnmarshalCompressed", Func, 15}, + }, + "crypto/hmac": { + {"Equal", Func, 1}, + {"New", Func, 0}, + }, + "crypto/md5": { + {"BlockSize", Const, 0}, + {"New", Func, 0}, + {"Size", Const, 0}, + {"Sum", Func, 2}, + }, + "crypto/rand": { + {"Int", Func, 0}, + {"Prime", Func, 0}, + {"Read", Func, 0}, + {"Reader", Var, 0}, + }, + "crypto/rc4": { + {"(*Cipher).Reset", Method, 0}, + {"(*Cipher).XORKeyStream", Method, 0}, + {"(KeySizeError).Error", Method, 0}, + {"Cipher", Type, 0}, + {"KeySizeError", Type, 0}, + {"NewCipher", Func, 0}, + }, + "crypto/rsa": { + {"(*PSSOptions).HashFunc", Method, 4}, + {"(*PrivateKey).Decrypt", Method, 5}, + {"(*PrivateKey).Equal", Method, 15}, + {"(*PrivateKey).Precompute", Method, 0}, + {"(*PrivateKey).Public", Method, 4}, + {"(*PrivateKey).Sign", Method, 4}, + {"(*PrivateKey).Size", Method, 11}, + {"(*PrivateKey).Validate", Method, 0}, + {"(*PublicKey).Equal", Method, 15}, + {"(*PublicKey).Size", Method, 11}, + {"CRTValue", Type, 0}, + {"CRTValue.Coeff", Field, 0}, + {"CRTValue.Exp", Field, 0}, + {"CRTValue.R", Field, 0}, + {"DecryptOAEP", Func, 0}, + {"DecryptPKCS1v15", Func, 0}, + {"DecryptPKCS1v15SessionKey", Func, 0}, + {"EncryptOAEP", Func, 0}, + {"EncryptPKCS1v15", Func, 0}, + {"ErrDecryption", Var, 0}, + {"ErrMessageTooLong", Var, 0}, + {"ErrVerification", Var, 0}, + {"GenerateKey", Func, 0}, + {"GenerateMultiPrimeKey", Func, 0}, + {"OAEPOptions", Type, 5}, + {"OAEPOptions.Hash", Field, 5}, + {"OAEPOptions.Label", Field, 5}, + {"OAEPOptions.MGFHash", Field, 20}, + {"PKCS1v15DecryptOptions", Type, 5}, + {"PKCS1v15DecryptOptions.SessionKeyLen", Field, 5}, + {"PSSOptions", Type, 2}, + {"PSSOptions.Hash", Field, 4}, + {"PSSOptions.SaltLength", Field, 2}, + {"PSSSaltLengthAuto", Const, 2}, + {"PSSSaltLengthEqualsHash", Const, 2}, + {"PrecomputedValues", Type, 0}, + {"PrecomputedValues.CRTValues", Field, 0}, + {"PrecomputedValues.Dp", Field, 0}, + {"PrecomputedValues.Dq", Field, 0}, + {"PrecomputedValues.Qinv", Field, 0}, + {"PrivateKey", Type, 0}, + {"PrivateKey.D", Field, 0}, + {"PrivateKey.Precomputed", Field, 0}, + {"PrivateKey.Primes", Field, 0}, + {"PrivateKey.PublicKey", Field, 0}, + {"PublicKey", Type, 0}, + {"PublicKey.E", Field, 0}, + {"PublicKey.N", Field, 0}, + {"SignPKCS1v15", Func, 0}, + {"SignPSS", Func, 2}, + {"VerifyPKCS1v15", Func, 0}, + {"VerifyPSS", Func, 2}, + }, + "crypto/sha1": { + {"BlockSize", Const, 0}, + {"New", Func, 0}, + {"Size", Const, 0}, + {"Sum", Func, 2}, + }, + "crypto/sha256": { + {"BlockSize", Const, 0}, + {"New", Func, 0}, + {"New224", Func, 0}, + {"Size", Const, 0}, + {"Size224", Const, 0}, + {"Sum224", Func, 2}, + {"Sum256", Func, 2}, + }, + "crypto/sha512": { + {"BlockSize", Const, 0}, + {"New", Func, 0}, + {"New384", Func, 0}, + {"New512_224", Func, 5}, + {"New512_256", Func, 5}, + {"Size", Const, 0}, + {"Size224", Const, 5}, + {"Size256", Const, 5}, + {"Size384", Const, 0}, + {"Sum384", Func, 2}, + {"Sum512", Func, 2}, + {"Sum512_224", Func, 5}, + {"Sum512_256", Func, 5}, + }, + "crypto/subtle": { + {"ConstantTimeByteEq", Func, 0}, + {"ConstantTimeCompare", Func, 0}, + {"ConstantTimeCopy", Func, 0}, + {"ConstantTimeEq", Func, 0}, + {"ConstantTimeLessOrEq", Func, 2}, + {"ConstantTimeSelect", Func, 0}, + {"XORBytes", Func, 20}, + }, + "crypto/tls": { + {"(*CertificateRequestInfo).Context", Method, 17}, + {"(*CertificateRequestInfo).SupportsCertificate", Method, 14}, + {"(*CertificateVerificationError).Error", Method, 20}, + {"(*CertificateVerificationError).Unwrap", Method, 20}, + {"(*ClientHelloInfo).Context", Method, 17}, + {"(*ClientHelloInfo).SupportsCertificate", Method, 14}, + {"(*ClientSessionState).ResumptionState", Method, 21}, + {"(*Config).BuildNameToCertificate", Method, 0}, + {"(*Config).Clone", Method, 8}, + {"(*Config).DecryptTicket", Method, 21}, + {"(*Config).EncryptTicket", Method, 21}, + {"(*Config).SetSessionTicketKeys", Method, 5}, + {"(*Conn).Close", Method, 0}, + {"(*Conn).CloseWrite", Method, 8}, + {"(*Conn).ConnectionState", Method, 0}, + {"(*Conn).Handshake", Method, 0}, + {"(*Conn).HandshakeContext", Method, 17}, + {"(*Conn).LocalAddr", Method, 0}, + {"(*Conn).NetConn", Method, 18}, + {"(*Conn).OCSPResponse", Method, 0}, + {"(*Conn).Read", Method, 0}, + {"(*Conn).RemoteAddr", Method, 0}, + {"(*Conn).SetDeadline", Method, 0}, + {"(*Conn).SetReadDeadline", Method, 0}, + {"(*Conn).SetWriteDeadline", Method, 0}, + {"(*Conn).VerifyHostname", Method, 0}, + {"(*Conn).Write", Method, 0}, + {"(*ConnectionState).ExportKeyingMaterial", Method, 11}, + {"(*Dialer).Dial", Method, 15}, + {"(*Dialer).DialContext", Method, 15}, + {"(*ECHRejectionError).Error", Method, 23}, + {"(*QUICConn).Close", Method, 21}, + {"(*QUICConn).ConnectionState", Method, 21}, + {"(*QUICConn).HandleData", Method, 21}, + {"(*QUICConn).NextEvent", Method, 21}, + {"(*QUICConn).SendSessionTicket", Method, 21}, + {"(*QUICConn).SetTransportParameters", Method, 21}, + {"(*QUICConn).Start", Method, 21}, + {"(*QUICConn).StoreSession", Method, 23}, + {"(*SessionState).Bytes", Method, 21}, + {"(AlertError).Error", Method, 21}, + {"(ClientAuthType).String", Method, 15}, + {"(CurveID).String", Method, 15}, + {"(QUICEncryptionLevel).String", Method, 21}, + {"(RecordHeaderError).Error", Method, 6}, + {"(SignatureScheme).String", Method, 15}, + {"AlertError", Type, 21}, + {"Certificate", Type, 0}, + {"Certificate.Certificate", Field, 0}, + {"Certificate.Leaf", Field, 0}, + {"Certificate.OCSPStaple", Field, 0}, + {"Certificate.PrivateKey", Field, 0}, + {"Certificate.SignedCertificateTimestamps", Field, 5}, + {"Certificate.SupportedSignatureAlgorithms", Field, 14}, + {"CertificateRequestInfo", Type, 8}, + {"CertificateRequestInfo.AcceptableCAs", Field, 8}, + {"CertificateRequestInfo.SignatureSchemes", Field, 8}, + {"CertificateRequestInfo.Version", Field, 14}, + {"CertificateVerificationError", Type, 20}, + {"CertificateVerificationError.Err", Field, 20}, + {"CertificateVerificationError.UnverifiedCertificates", Field, 20}, + {"CipherSuite", Type, 14}, + {"CipherSuite.ID", Field, 14}, + {"CipherSuite.Insecure", Field, 14}, + {"CipherSuite.Name", Field, 14}, + {"CipherSuite.SupportedVersions", Field, 14}, + {"CipherSuiteName", Func, 14}, + {"CipherSuites", Func, 14}, + {"Client", Func, 0}, + {"ClientAuthType", Type, 0}, + {"ClientHelloInfo", Type, 4}, + {"ClientHelloInfo.CipherSuites", Field, 4}, + {"ClientHelloInfo.Conn", Field, 8}, + {"ClientHelloInfo.ServerName", Field, 4}, + {"ClientHelloInfo.SignatureSchemes", Field, 8}, + {"ClientHelloInfo.SupportedCurves", Field, 4}, + {"ClientHelloInfo.SupportedPoints", Field, 4}, + {"ClientHelloInfo.SupportedProtos", Field, 8}, + {"ClientHelloInfo.SupportedVersions", Field, 8}, + {"ClientSessionCache", Type, 3}, + {"ClientSessionState", Type, 3}, + {"Config", Type, 0}, + {"Config.Certificates", Field, 0}, + {"Config.CipherSuites", Field, 0}, + {"Config.ClientAuth", Field, 0}, + {"Config.ClientCAs", Field, 0}, + {"Config.ClientSessionCache", Field, 3}, + {"Config.CurvePreferences", Field, 3}, + {"Config.DynamicRecordSizingDisabled", Field, 7}, + {"Config.EncryptedClientHelloConfigList", Field, 23}, + {"Config.EncryptedClientHelloRejectionVerify", Field, 23}, + {"Config.GetCertificate", Field, 4}, + {"Config.GetClientCertificate", Field, 8}, + {"Config.GetConfigForClient", Field, 8}, + {"Config.InsecureSkipVerify", Field, 0}, + {"Config.KeyLogWriter", Field, 8}, + {"Config.MaxVersion", Field, 2}, + {"Config.MinVersion", Field, 2}, + {"Config.NameToCertificate", Field, 0}, + {"Config.NextProtos", Field, 0}, + {"Config.PreferServerCipherSuites", Field, 1}, + {"Config.Rand", Field, 0}, + {"Config.Renegotiation", Field, 7}, + {"Config.RootCAs", Field, 0}, + {"Config.ServerName", Field, 0}, + {"Config.SessionTicketKey", Field, 1}, + {"Config.SessionTicketsDisabled", Field, 1}, + {"Config.Time", Field, 0}, + {"Config.UnwrapSession", Field, 21}, + {"Config.VerifyConnection", Field, 15}, + {"Config.VerifyPeerCertificate", Field, 8}, + {"Config.WrapSession", Field, 21}, + {"Conn", Type, 0}, + {"ConnectionState", Type, 0}, + {"ConnectionState.CipherSuite", Field, 0}, + {"ConnectionState.DidResume", Field, 1}, + {"ConnectionState.ECHAccepted", Field, 23}, + {"ConnectionState.HandshakeComplete", Field, 0}, + {"ConnectionState.NegotiatedProtocol", Field, 0}, + {"ConnectionState.NegotiatedProtocolIsMutual", Field, 0}, + {"ConnectionState.OCSPResponse", Field, 5}, + {"ConnectionState.PeerCertificates", Field, 0}, + {"ConnectionState.ServerName", Field, 0}, + {"ConnectionState.SignedCertificateTimestamps", Field, 5}, + {"ConnectionState.TLSUnique", Field, 4}, + {"ConnectionState.VerifiedChains", Field, 0}, + {"ConnectionState.Version", Field, 3}, + {"CurveID", Type, 3}, + {"CurveP256", Const, 3}, + {"CurveP384", Const, 3}, + {"CurveP521", Const, 3}, + {"Dial", Func, 0}, + {"DialWithDialer", Func, 3}, + {"Dialer", Type, 15}, + {"Dialer.Config", Field, 15}, + {"Dialer.NetDialer", Field, 15}, + {"ECDSAWithP256AndSHA256", Const, 8}, + {"ECDSAWithP384AndSHA384", Const, 8}, + {"ECDSAWithP521AndSHA512", Const, 8}, + {"ECDSAWithSHA1", Const, 10}, + {"ECHRejectionError", Type, 23}, + {"ECHRejectionError.RetryConfigList", Field, 23}, + {"Ed25519", Const, 13}, + {"InsecureCipherSuites", Func, 14}, + {"Listen", Func, 0}, + {"LoadX509KeyPair", Func, 0}, + {"NewLRUClientSessionCache", Func, 3}, + {"NewListener", Func, 0}, + {"NewResumptionState", Func, 21}, + {"NoClientCert", Const, 0}, + {"PKCS1WithSHA1", Const, 8}, + {"PKCS1WithSHA256", Const, 8}, + {"PKCS1WithSHA384", Const, 8}, + {"PKCS1WithSHA512", Const, 8}, + {"PSSWithSHA256", Const, 8}, + {"PSSWithSHA384", Const, 8}, + {"PSSWithSHA512", Const, 8}, + {"ParseSessionState", Func, 21}, + {"QUICClient", Func, 21}, + {"QUICConfig", Type, 21}, + {"QUICConfig.EnableSessionEvents", Field, 23}, + {"QUICConfig.TLSConfig", Field, 21}, + {"QUICConn", Type, 21}, + {"QUICEncryptionLevel", Type, 21}, + {"QUICEncryptionLevelApplication", Const, 21}, + {"QUICEncryptionLevelEarly", Const, 21}, + {"QUICEncryptionLevelHandshake", Const, 21}, + {"QUICEncryptionLevelInitial", Const, 21}, + {"QUICEvent", Type, 21}, + {"QUICEvent.Data", Field, 21}, + {"QUICEvent.Kind", Field, 21}, + {"QUICEvent.Level", Field, 21}, + {"QUICEvent.SessionState", Field, 23}, + {"QUICEvent.Suite", Field, 21}, + {"QUICEventKind", Type, 21}, + {"QUICHandshakeDone", Const, 21}, + {"QUICNoEvent", Const, 21}, + {"QUICRejectedEarlyData", Const, 21}, + {"QUICResumeSession", Const, 23}, + {"QUICServer", Func, 21}, + {"QUICSessionTicketOptions", Type, 21}, + {"QUICSessionTicketOptions.EarlyData", Field, 21}, + {"QUICSessionTicketOptions.Extra", Field, 23}, + {"QUICSetReadSecret", Const, 21}, + {"QUICSetWriteSecret", Const, 21}, + {"QUICStoreSession", Const, 23}, + {"QUICTransportParameters", Const, 21}, + {"QUICTransportParametersRequired", Const, 21}, + {"QUICWriteData", Const, 21}, + {"RecordHeaderError", Type, 6}, + {"RecordHeaderError.Conn", Field, 12}, + {"RecordHeaderError.Msg", Field, 6}, + {"RecordHeaderError.RecordHeader", Field, 6}, + {"RenegotiateFreelyAsClient", Const, 7}, + {"RenegotiateNever", Const, 7}, + {"RenegotiateOnceAsClient", Const, 7}, + {"RenegotiationSupport", Type, 7}, + {"RequestClientCert", Const, 0}, + {"RequireAndVerifyClientCert", Const, 0}, + {"RequireAnyClientCert", Const, 0}, + {"Server", Func, 0}, + {"SessionState", Type, 21}, + {"SessionState.EarlyData", Field, 21}, + {"SessionState.Extra", Field, 21}, + {"SignatureScheme", Type, 8}, + {"TLS_AES_128_GCM_SHA256", Const, 12}, + {"TLS_AES_256_GCM_SHA384", Const, 12}, + {"TLS_CHACHA20_POLY1305_SHA256", Const, 12}, + {"TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA", Const, 2}, + {"TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256", Const, 8}, + {"TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256", Const, 2}, + {"TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA", Const, 2}, + {"TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384", Const, 5}, + {"TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305", Const, 8}, + {"TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256", Const, 14}, + {"TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", Const, 2}, + {"TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", Const, 0}, + {"TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", Const, 0}, + {"TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256", Const, 8}, + {"TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256", Const, 2}, + {"TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", Const, 1}, + {"TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384", Const, 5}, + {"TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305", Const, 8}, + {"TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256", Const, 14}, + {"TLS_ECDHE_RSA_WITH_RC4_128_SHA", Const, 0}, + {"TLS_FALLBACK_SCSV", Const, 4}, + {"TLS_RSA_WITH_3DES_EDE_CBC_SHA", Const, 0}, + {"TLS_RSA_WITH_AES_128_CBC_SHA", Const, 0}, + {"TLS_RSA_WITH_AES_128_CBC_SHA256", Const, 8}, + {"TLS_RSA_WITH_AES_128_GCM_SHA256", Const, 6}, + {"TLS_RSA_WITH_AES_256_CBC_SHA", Const, 1}, + {"TLS_RSA_WITH_AES_256_GCM_SHA384", Const, 6}, + {"TLS_RSA_WITH_RC4_128_SHA", Const, 0}, + {"VerifyClientCertIfGiven", Const, 0}, + {"VersionName", Func, 21}, + {"VersionSSL30", Const, 2}, + {"VersionTLS10", Const, 2}, + {"VersionTLS11", Const, 2}, + {"VersionTLS12", Const, 2}, + {"VersionTLS13", Const, 12}, + {"X25519", Const, 8}, + {"X509KeyPair", Func, 0}, + }, + "crypto/x509": { + {"(*CertPool).AddCert", Method, 0}, + {"(*CertPool).AddCertWithConstraint", Method, 22}, + {"(*CertPool).AppendCertsFromPEM", Method, 0}, + {"(*CertPool).Clone", Method, 19}, + {"(*CertPool).Equal", Method, 19}, + {"(*CertPool).Subjects", Method, 0}, + {"(*Certificate).CheckCRLSignature", Method, 0}, + {"(*Certificate).CheckSignature", Method, 0}, + {"(*Certificate).CheckSignatureFrom", Method, 0}, + {"(*Certificate).CreateCRL", Method, 0}, + {"(*Certificate).Equal", Method, 0}, + {"(*Certificate).Verify", Method, 0}, + {"(*Certificate).VerifyHostname", Method, 0}, + {"(*CertificateRequest).CheckSignature", Method, 5}, + {"(*OID).UnmarshalBinary", Method, 23}, + {"(*OID).UnmarshalText", Method, 23}, + {"(*RevocationList).CheckSignatureFrom", Method, 19}, + {"(CertificateInvalidError).Error", Method, 0}, + {"(ConstraintViolationError).Error", Method, 0}, + {"(HostnameError).Error", Method, 0}, + {"(InsecureAlgorithmError).Error", Method, 6}, + {"(OID).Equal", Method, 22}, + {"(OID).EqualASN1OID", Method, 22}, + {"(OID).MarshalBinary", Method, 23}, + {"(OID).MarshalText", Method, 23}, + {"(OID).String", Method, 22}, + {"(PublicKeyAlgorithm).String", Method, 10}, + {"(SignatureAlgorithm).String", Method, 6}, + {"(SystemRootsError).Error", Method, 1}, + {"(SystemRootsError).Unwrap", Method, 16}, + {"(UnhandledCriticalExtension).Error", Method, 0}, + {"(UnknownAuthorityError).Error", Method, 0}, + {"CANotAuthorizedForExtKeyUsage", Const, 10}, + {"CANotAuthorizedForThisName", Const, 0}, + {"CertPool", Type, 0}, + {"Certificate", Type, 0}, + {"Certificate.AuthorityKeyId", Field, 0}, + {"Certificate.BasicConstraintsValid", Field, 0}, + {"Certificate.CRLDistributionPoints", Field, 2}, + {"Certificate.DNSNames", Field, 0}, + {"Certificate.EmailAddresses", Field, 0}, + {"Certificate.ExcludedDNSDomains", Field, 9}, + {"Certificate.ExcludedEmailAddresses", Field, 10}, + {"Certificate.ExcludedIPRanges", Field, 10}, + {"Certificate.ExcludedURIDomains", Field, 10}, + {"Certificate.ExtKeyUsage", Field, 0}, + {"Certificate.Extensions", Field, 2}, + {"Certificate.ExtraExtensions", Field, 2}, + {"Certificate.IPAddresses", Field, 1}, + {"Certificate.IsCA", Field, 0}, + {"Certificate.Issuer", Field, 0}, + {"Certificate.IssuingCertificateURL", Field, 2}, + {"Certificate.KeyUsage", Field, 0}, + {"Certificate.MaxPathLen", Field, 0}, + {"Certificate.MaxPathLenZero", Field, 4}, + {"Certificate.NotAfter", Field, 0}, + {"Certificate.NotBefore", Field, 0}, + {"Certificate.OCSPServer", Field, 2}, + {"Certificate.PermittedDNSDomains", Field, 0}, + {"Certificate.PermittedDNSDomainsCritical", Field, 0}, + {"Certificate.PermittedEmailAddresses", Field, 10}, + {"Certificate.PermittedIPRanges", Field, 10}, + {"Certificate.PermittedURIDomains", Field, 10}, + {"Certificate.Policies", Field, 22}, + {"Certificate.PolicyIdentifiers", Field, 0}, + {"Certificate.PublicKey", Field, 0}, + {"Certificate.PublicKeyAlgorithm", Field, 0}, + {"Certificate.Raw", Field, 0}, + {"Certificate.RawIssuer", Field, 0}, + {"Certificate.RawSubject", Field, 0}, + {"Certificate.RawSubjectPublicKeyInfo", Field, 0}, + {"Certificate.RawTBSCertificate", Field, 0}, + {"Certificate.SerialNumber", Field, 0}, + {"Certificate.Signature", Field, 0}, + {"Certificate.SignatureAlgorithm", Field, 0}, + {"Certificate.Subject", Field, 0}, + {"Certificate.SubjectKeyId", Field, 0}, + {"Certificate.URIs", Field, 10}, + {"Certificate.UnhandledCriticalExtensions", Field, 5}, + {"Certificate.UnknownExtKeyUsage", Field, 0}, + {"Certificate.Version", Field, 0}, + {"CertificateInvalidError", Type, 0}, + {"CertificateInvalidError.Cert", Field, 0}, + {"CertificateInvalidError.Detail", Field, 10}, + {"CertificateInvalidError.Reason", Field, 0}, + {"CertificateRequest", Type, 3}, + {"CertificateRequest.Attributes", Field, 3}, + {"CertificateRequest.DNSNames", Field, 3}, + {"CertificateRequest.EmailAddresses", Field, 3}, + {"CertificateRequest.Extensions", Field, 3}, + {"CertificateRequest.ExtraExtensions", Field, 3}, + {"CertificateRequest.IPAddresses", Field, 3}, + {"CertificateRequest.PublicKey", Field, 3}, + {"CertificateRequest.PublicKeyAlgorithm", Field, 3}, + {"CertificateRequest.Raw", Field, 3}, + {"CertificateRequest.RawSubject", Field, 3}, + {"CertificateRequest.RawSubjectPublicKeyInfo", Field, 3}, + {"CertificateRequest.RawTBSCertificateRequest", Field, 3}, + {"CertificateRequest.Signature", Field, 3}, + {"CertificateRequest.SignatureAlgorithm", Field, 3}, + {"CertificateRequest.Subject", Field, 3}, + {"CertificateRequest.URIs", Field, 10}, + {"CertificateRequest.Version", Field, 3}, + {"ConstraintViolationError", Type, 0}, + {"CreateCertificate", Func, 0}, + {"CreateCertificateRequest", Func, 3}, + {"CreateRevocationList", Func, 15}, + {"DSA", Const, 0}, + {"DSAWithSHA1", Const, 0}, + {"DSAWithSHA256", Const, 0}, + {"DecryptPEMBlock", Func, 1}, + {"ECDSA", Const, 1}, + {"ECDSAWithSHA1", Const, 1}, + {"ECDSAWithSHA256", Const, 1}, + {"ECDSAWithSHA384", Const, 1}, + {"ECDSAWithSHA512", Const, 1}, + {"Ed25519", Const, 13}, + {"EncryptPEMBlock", Func, 1}, + {"ErrUnsupportedAlgorithm", Var, 0}, + {"Expired", Const, 0}, + {"ExtKeyUsage", Type, 0}, + {"ExtKeyUsageAny", Const, 0}, + {"ExtKeyUsageClientAuth", Const, 0}, + {"ExtKeyUsageCodeSigning", Const, 0}, + {"ExtKeyUsageEmailProtection", Const, 0}, + {"ExtKeyUsageIPSECEndSystem", Const, 1}, + {"ExtKeyUsageIPSECTunnel", Const, 1}, + {"ExtKeyUsageIPSECUser", Const, 1}, + {"ExtKeyUsageMicrosoftCommercialCodeSigning", Const, 10}, + {"ExtKeyUsageMicrosoftKernelCodeSigning", Const, 10}, + {"ExtKeyUsageMicrosoftServerGatedCrypto", Const, 1}, + {"ExtKeyUsageNetscapeServerGatedCrypto", Const, 1}, + {"ExtKeyUsageOCSPSigning", Const, 0}, + {"ExtKeyUsageServerAuth", Const, 0}, + {"ExtKeyUsageTimeStamping", Const, 0}, + {"HostnameError", Type, 0}, + {"HostnameError.Certificate", Field, 0}, + {"HostnameError.Host", Field, 0}, + {"IncompatibleUsage", Const, 1}, + {"IncorrectPasswordError", Var, 1}, + {"InsecureAlgorithmError", Type, 6}, + {"InvalidReason", Type, 0}, + {"IsEncryptedPEMBlock", Func, 1}, + {"KeyUsage", Type, 0}, + {"KeyUsageCRLSign", Const, 0}, + {"KeyUsageCertSign", Const, 0}, + {"KeyUsageContentCommitment", Const, 0}, + {"KeyUsageDataEncipherment", Const, 0}, + {"KeyUsageDecipherOnly", Const, 0}, + {"KeyUsageDigitalSignature", Const, 0}, + {"KeyUsageEncipherOnly", Const, 0}, + {"KeyUsageKeyAgreement", Const, 0}, + {"KeyUsageKeyEncipherment", Const, 0}, + {"MD2WithRSA", Const, 0}, + {"MD5WithRSA", Const, 0}, + {"MarshalECPrivateKey", Func, 2}, + {"MarshalPKCS1PrivateKey", Func, 0}, + {"MarshalPKCS1PublicKey", Func, 10}, + {"MarshalPKCS8PrivateKey", Func, 10}, + {"MarshalPKIXPublicKey", Func, 0}, + {"NameConstraintsWithoutSANs", Const, 10}, + {"NameMismatch", Const, 8}, + {"NewCertPool", Func, 0}, + {"NotAuthorizedToSign", Const, 0}, + {"OID", Type, 22}, + {"OIDFromInts", Func, 22}, + {"PEMCipher", Type, 1}, + {"PEMCipher3DES", Const, 1}, + {"PEMCipherAES128", Const, 1}, + {"PEMCipherAES192", Const, 1}, + {"PEMCipherAES256", Const, 1}, + {"PEMCipherDES", Const, 1}, + {"ParseCRL", Func, 0}, + {"ParseCertificate", Func, 0}, + {"ParseCertificateRequest", Func, 3}, + {"ParseCertificates", Func, 0}, + {"ParseDERCRL", Func, 0}, + {"ParseECPrivateKey", Func, 1}, + {"ParseOID", Func, 23}, + {"ParsePKCS1PrivateKey", Func, 0}, + {"ParsePKCS1PublicKey", Func, 10}, + {"ParsePKCS8PrivateKey", Func, 0}, + {"ParsePKIXPublicKey", Func, 0}, + {"ParseRevocationList", Func, 19}, + {"PublicKeyAlgorithm", Type, 0}, + {"PureEd25519", Const, 13}, + {"RSA", Const, 0}, + {"RevocationList", Type, 15}, + {"RevocationList.AuthorityKeyId", Field, 19}, + {"RevocationList.Extensions", Field, 19}, + {"RevocationList.ExtraExtensions", Field, 15}, + {"RevocationList.Issuer", Field, 19}, + {"RevocationList.NextUpdate", Field, 15}, + {"RevocationList.Number", Field, 15}, + {"RevocationList.Raw", Field, 19}, + {"RevocationList.RawIssuer", Field, 19}, + {"RevocationList.RawTBSRevocationList", Field, 19}, + {"RevocationList.RevokedCertificateEntries", Field, 21}, + {"RevocationList.RevokedCertificates", Field, 15}, + {"RevocationList.Signature", Field, 19}, + {"RevocationList.SignatureAlgorithm", Field, 15}, + {"RevocationList.ThisUpdate", Field, 15}, + {"RevocationListEntry", Type, 21}, + {"RevocationListEntry.Extensions", Field, 21}, + {"RevocationListEntry.ExtraExtensions", Field, 21}, + {"RevocationListEntry.Raw", Field, 21}, + {"RevocationListEntry.ReasonCode", Field, 21}, + {"RevocationListEntry.RevocationTime", Field, 21}, + {"RevocationListEntry.SerialNumber", Field, 21}, + {"SHA1WithRSA", Const, 0}, + {"SHA256WithRSA", Const, 0}, + {"SHA256WithRSAPSS", Const, 8}, + {"SHA384WithRSA", Const, 0}, + {"SHA384WithRSAPSS", Const, 8}, + {"SHA512WithRSA", Const, 0}, + {"SHA512WithRSAPSS", Const, 8}, + {"SetFallbackRoots", Func, 20}, + {"SignatureAlgorithm", Type, 0}, + {"SystemCertPool", Func, 7}, + {"SystemRootsError", Type, 1}, + {"SystemRootsError.Err", Field, 7}, + {"TooManyConstraints", Const, 10}, + {"TooManyIntermediates", Const, 0}, + {"UnconstrainedName", Const, 10}, + {"UnhandledCriticalExtension", Type, 0}, + {"UnknownAuthorityError", Type, 0}, + {"UnknownAuthorityError.Cert", Field, 8}, + {"UnknownPublicKeyAlgorithm", Const, 0}, + {"UnknownSignatureAlgorithm", Const, 0}, + {"VerifyOptions", Type, 0}, + {"VerifyOptions.CurrentTime", Field, 0}, + {"VerifyOptions.DNSName", Field, 0}, + {"VerifyOptions.Intermediates", Field, 0}, + {"VerifyOptions.KeyUsages", Field, 1}, + {"VerifyOptions.MaxConstraintComparisions", Field, 10}, + {"VerifyOptions.Roots", Field, 0}, + }, + "crypto/x509/pkix": { + {"(*CertificateList).HasExpired", Method, 0}, + {"(*Name).FillFromRDNSequence", Method, 0}, + {"(Name).String", Method, 10}, + {"(Name).ToRDNSequence", Method, 0}, + {"(RDNSequence).String", Method, 10}, + {"AlgorithmIdentifier", Type, 0}, + {"AlgorithmIdentifier.Algorithm", Field, 0}, + {"AlgorithmIdentifier.Parameters", Field, 0}, + {"AttributeTypeAndValue", Type, 0}, + {"AttributeTypeAndValue.Type", Field, 0}, + {"AttributeTypeAndValue.Value", Field, 0}, + {"AttributeTypeAndValueSET", Type, 3}, + {"AttributeTypeAndValueSET.Type", Field, 3}, + {"AttributeTypeAndValueSET.Value", Field, 3}, + {"CertificateList", Type, 0}, + {"CertificateList.SignatureAlgorithm", Field, 0}, + {"CertificateList.SignatureValue", Field, 0}, + {"CertificateList.TBSCertList", Field, 0}, + {"Extension", Type, 0}, + {"Extension.Critical", Field, 0}, + {"Extension.Id", Field, 0}, + {"Extension.Value", Field, 0}, + {"Name", Type, 0}, + {"Name.CommonName", Field, 0}, + {"Name.Country", Field, 0}, + {"Name.ExtraNames", Field, 5}, + {"Name.Locality", Field, 0}, + {"Name.Names", Field, 0}, + {"Name.Organization", Field, 0}, + {"Name.OrganizationalUnit", Field, 0}, + {"Name.PostalCode", Field, 0}, + {"Name.Province", Field, 0}, + {"Name.SerialNumber", Field, 0}, + {"Name.StreetAddress", Field, 0}, + {"RDNSequence", Type, 0}, + {"RelativeDistinguishedNameSET", Type, 0}, + {"RevokedCertificate", Type, 0}, + {"RevokedCertificate.Extensions", Field, 0}, + {"RevokedCertificate.RevocationTime", Field, 0}, + {"RevokedCertificate.SerialNumber", Field, 0}, + {"TBSCertificateList", Type, 0}, + {"TBSCertificateList.Extensions", Field, 0}, + {"TBSCertificateList.Issuer", Field, 0}, + {"TBSCertificateList.NextUpdate", Field, 0}, + {"TBSCertificateList.Raw", Field, 0}, + {"TBSCertificateList.RevokedCertificates", Field, 0}, + {"TBSCertificateList.Signature", Field, 0}, + {"TBSCertificateList.ThisUpdate", Field, 0}, + {"TBSCertificateList.Version", Field, 0}, + }, + "database/sql": { + {"(*ColumnType).DatabaseTypeName", Method, 8}, + {"(*ColumnType).DecimalSize", Method, 8}, + {"(*ColumnType).Length", Method, 8}, + {"(*ColumnType).Name", Method, 8}, + {"(*ColumnType).Nullable", Method, 8}, + {"(*ColumnType).ScanType", Method, 8}, + {"(*Conn).BeginTx", Method, 9}, + {"(*Conn).Close", Method, 9}, + {"(*Conn).ExecContext", Method, 9}, + {"(*Conn).PingContext", Method, 9}, + {"(*Conn).PrepareContext", Method, 9}, + {"(*Conn).QueryContext", Method, 9}, + {"(*Conn).QueryRowContext", Method, 9}, + {"(*Conn).Raw", Method, 13}, + {"(*DB).Begin", Method, 0}, + {"(*DB).BeginTx", Method, 8}, + {"(*DB).Close", Method, 0}, + {"(*DB).Conn", Method, 9}, + {"(*DB).Driver", Method, 0}, + {"(*DB).Exec", Method, 0}, + {"(*DB).ExecContext", Method, 8}, + {"(*DB).Ping", Method, 1}, + {"(*DB).PingContext", Method, 8}, + {"(*DB).Prepare", Method, 0}, + {"(*DB).PrepareContext", Method, 8}, + {"(*DB).Query", Method, 0}, + {"(*DB).QueryContext", Method, 8}, + {"(*DB).QueryRow", Method, 0}, + {"(*DB).QueryRowContext", Method, 8}, + {"(*DB).SetConnMaxIdleTime", Method, 15}, + {"(*DB).SetConnMaxLifetime", Method, 6}, + {"(*DB).SetMaxIdleConns", Method, 1}, + {"(*DB).SetMaxOpenConns", Method, 2}, + {"(*DB).Stats", Method, 5}, + {"(*Null).Scan", Method, 22}, + {"(*NullBool).Scan", Method, 0}, + {"(*NullByte).Scan", Method, 17}, + {"(*NullFloat64).Scan", Method, 0}, + {"(*NullInt16).Scan", Method, 17}, + {"(*NullInt32).Scan", Method, 13}, + {"(*NullInt64).Scan", Method, 0}, + {"(*NullString).Scan", Method, 0}, + {"(*NullTime).Scan", Method, 13}, + {"(*Row).Err", Method, 15}, + {"(*Row).Scan", Method, 0}, + {"(*Rows).Close", Method, 0}, + {"(*Rows).ColumnTypes", Method, 8}, + {"(*Rows).Columns", Method, 0}, + {"(*Rows).Err", Method, 0}, + {"(*Rows).Next", Method, 0}, + {"(*Rows).NextResultSet", Method, 8}, + {"(*Rows).Scan", Method, 0}, + {"(*Stmt).Close", Method, 0}, + {"(*Stmt).Exec", Method, 0}, + {"(*Stmt).ExecContext", Method, 8}, + {"(*Stmt).Query", Method, 0}, + {"(*Stmt).QueryContext", Method, 8}, + {"(*Stmt).QueryRow", Method, 0}, + {"(*Stmt).QueryRowContext", Method, 8}, + {"(*Tx).Commit", Method, 0}, + {"(*Tx).Exec", Method, 0}, + {"(*Tx).ExecContext", Method, 8}, + {"(*Tx).Prepare", Method, 0}, + {"(*Tx).PrepareContext", Method, 8}, + {"(*Tx).Query", Method, 0}, + {"(*Tx).QueryContext", Method, 8}, + {"(*Tx).QueryRow", Method, 0}, + {"(*Tx).QueryRowContext", Method, 8}, + {"(*Tx).Rollback", Method, 0}, + {"(*Tx).Stmt", Method, 0}, + {"(*Tx).StmtContext", Method, 8}, + {"(IsolationLevel).String", Method, 11}, + {"(Null).Value", Method, 22}, + {"(NullBool).Value", Method, 0}, + {"(NullByte).Value", Method, 17}, + {"(NullFloat64).Value", Method, 0}, + {"(NullInt16).Value", Method, 17}, + {"(NullInt32).Value", Method, 13}, + {"(NullInt64).Value", Method, 0}, + {"(NullString).Value", Method, 0}, + {"(NullTime).Value", Method, 13}, + {"ColumnType", Type, 8}, + {"Conn", Type, 9}, + {"DB", Type, 0}, + {"DBStats", Type, 5}, + {"DBStats.Idle", Field, 11}, + {"DBStats.InUse", Field, 11}, + {"DBStats.MaxIdleClosed", Field, 11}, + {"DBStats.MaxIdleTimeClosed", Field, 15}, + {"DBStats.MaxLifetimeClosed", Field, 11}, + {"DBStats.MaxOpenConnections", Field, 11}, + {"DBStats.OpenConnections", Field, 5}, + {"DBStats.WaitCount", Field, 11}, + {"DBStats.WaitDuration", Field, 11}, + {"Drivers", Func, 4}, + {"ErrConnDone", Var, 9}, + {"ErrNoRows", Var, 0}, + {"ErrTxDone", Var, 0}, + {"IsolationLevel", Type, 8}, + {"LevelDefault", Const, 8}, + {"LevelLinearizable", Const, 8}, + {"LevelReadCommitted", Const, 8}, + {"LevelReadUncommitted", Const, 8}, + {"LevelRepeatableRead", Const, 8}, + {"LevelSerializable", Const, 8}, + {"LevelSnapshot", Const, 8}, + {"LevelWriteCommitted", Const, 8}, + {"Named", Func, 8}, + {"NamedArg", Type, 8}, + {"NamedArg.Name", Field, 8}, + {"NamedArg.Value", Field, 8}, + {"Null", Type, 22}, + {"Null.V", Field, 22}, + {"Null.Valid", Field, 22}, + {"NullBool", Type, 0}, + {"NullBool.Bool", Field, 0}, + {"NullBool.Valid", Field, 0}, + {"NullByte", Type, 17}, + {"NullByte.Byte", Field, 17}, + {"NullByte.Valid", Field, 17}, + {"NullFloat64", Type, 0}, + {"NullFloat64.Float64", Field, 0}, + {"NullFloat64.Valid", Field, 0}, + {"NullInt16", Type, 17}, + {"NullInt16.Int16", Field, 17}, + {"NullInt16.Valid", Field, 17}, + {"NullInt32", Type, 13}, + {"NullInt32.Int32", Field, 13}, + {"NullInt32.Valid", Field, 13}, + {"NullInt64", Type, 0}, + {"NullInt64.Int64", Field, 0}, + {"NullInt64.Valid", Field, 0}, + {"NullString", Type, 0}, + {"NullString.String", Field, 0}, + {"NullString.Valid", Field, 0}, + {"NullTime", Type, 13}, + {"NullTime.Time", Field, 13}, + {"NullTime.Valid", Field, 13}, + {"Open", Func, 0}, + {"OpenDB", Func, 10}, + {"Out", Type, 9}, + {"Out.Dest", Field, 9}, + {"Out.In", Field, 9}, + {"RawBytes", Type, 0}, + {"Register", Func, 0}, + {"Result", Type, 0}, + {"Row", Type, 0}, + {"Rows", Type, 0}, + {"Scanner", Type, 0}, + {"Stmt", Type, 0}, + {"Tx", Type, 0}, + {"TxOptions", Type, 8}, + {"TxOptions.Isolation", Field, 8}, + {"TxOptions.ReadOnly", Field, 8}, + }, + "database/sql/driver": { + {"(NotNull).ConvertValue", Method, 0}, + {"(Null).ConvertValue", Method, 0}, + {"(RowsAffected).LastInsertId", Method, 0}, + {"(RowsAffected).RowsAffected", Method, 0}, + {"Bool", Var, 0}, + {"ColumnConverter", Type, 0}, + {"Conn", Type, 0}, + {"ConnBeginTx", Type, 8}, + {"ConnPrepareContext", Type, 8}, + {"Connector", Type, 10}, + {"DefaultParameterConverter", Var, 0}, + {"Driver", Type, 0}, + {"DriverContext", Type, 10}, + {"ErrBadConn", Var, 0}, + {"ErrRemoveArgument", Var, 9}, + {"ErrSkip", Var, 0}, + {"Execer", Type, 0}, + {"ExecerContext", Type, 8}, + {"Int32", Var, 0}, + {"IsScanValue", Func, 0}, + {"IsValue", Func, 0}, + {"IsolationLevel", Type, 8}, + {"NamedValue", Type, 8}, + {"NamedValue.Name", Field, 8}, + {"NamedValue.Ordinal", Field, 8}, + {"NamedValue.Value", Field, 8}, + {"NamedValueChecker", Type, 9}, + {"NotNull", Type, 0}, + {"NotNull.Converter", Field, 0}, + {"Null", Type, 0}, + {"Null.Converter", Field, 0}, + {"Pinger", Type, 8}, + {"Queryer", Type, 1}, + {"QueryerContext", Type, 8}, + {"Result", Type, 0}, + {"ResultNoRows", Var, 0}, + {"Rows", Type, 0}, + {"RowsAffected", Type, 0}, + {"RowsColumnTypeDatabaseTypeName", Type, 8}, + {"RowsColumnTypeLength", Type, 8}, + {"RowsColumnTypeNullable", Type, 8}, + {"RowsColumnTypePrecisionScale", Type, 8}, + {"RowsColumnTypeScanType", Type, 8}, + {"RowsNextResultSet", Type, 8}, + {"SessionResetter", Type, 10}, + {"Stmt", Type, 0}, + {"StmtExecContext", Type, 8}, + {"StmtQueryContext", Type, 8}, + {"String", Var, 0}, + {"Tx", Type, 0}, + {"TxOptions", Type, 8}, + {"TxOptions.Isolation", Field, 8}, + {"TxOptions.ReadOnly", Field, 8}, + {"Validator", Type, 15}, + {"Value", Type, 0}, + {"ValueConverter", Type, 0}, + {"Valuer", Type, 0}, + }, + "debug/buildinfo": { + {"BuildInfo", Type, 18}, + {"Read", Func, 18}, + {"ReadFile", Func, 18}, + }, + "debug/dwarf": { + {"(*AddrType).Basic", Method, 0}, + {"(*AddrType).Common", Method, 0}, + {"(*AddrType).Size", Method, 0}, + {"(*AddrType).String", Method, 0}, + {"(*ArrayType).Common", Method, 0}, + {"(*ArrayType).Size", Method, 0}, + {"(*ArrayType).String", Method, 0}, + {"(*BasicType).Basic", Method, 0}, + {"(*BasicType).Common", Method, 0}, + {"(*BasicType).Size", Method, 0}, + {"(*BasicType).String", Method, 0}, + {"(*BoolType).Basic", Method, 0}, + {"(*BoolType).Common", Method, 0}, + {"(*BoolType).Size", Method, 0}, + {"(*BoolType).String", Method, 0}, + {"(*CharType).Basic", Method, 0}, + {"(*CharType).Common", Method, 0}, + {"(*CharType).Size", Method, 0}, + {"(*CharType).String", Method, 0}, + {"(*CommonType).Common", Method, 0}, + {"(*CommonType).Size", Method, 0}, + {"(*ComplexType).Basic", Method, 0}, + {"(*ComplexType).Common", Method, 0}, + {"(*ComplexType).Size", Method, 0}, + {"(*ComplexType).String", Method, 0}, + {"(*Data).AddSection", Method, 14}, + {"(*Data).AddTypes", Method, 3}, + {"(*Data).LineReader", Method, 5}, + {"(*Data).Ranges", Method, 7}, + {"(*Data).Reader", Method, 0}, + {"(*Data).Type", Method, 0}, + {"(*DotDotDotType).Common", Method, 0}, + {"(*DotDotDotType).Size", Method, 0}, + {"(*DotDotDotType).String", Method, 0}, + {"(*Entry).AttrField", Method, 5}, + {"(*Entry).Val", Method, 0}, + {"(*EnumType).Common", Method, 0}, + {"(*EnumType).Size", Method, 0}, + {"(*EnumType).String", Method, 0}, + {"(*FloatType).Basic", Method, 0}, + {"(*FloatType).Common", Method, 0}, + {"(*FloatType).Size", Method, 0}, + {"(*FloatType).String", Method, 0}, + {"(*FuncType).Common", Method, 0}, + {"(*FuncType).Size", Method, 0}, + {"(*FuncType).String", Method, 0}, + {"(*IntType).Basic", Method, 0}, + {"(*IntType).Common", Method, 0}, + {"(*IntType).Size", Method, 0}, + {"(*IntType).String", Method, 0}, + {"(*LineReader).Files", Method, 14}, + {"(*LineReader).Next", Method, 5}, + {"(*LineReader).Reset", Method, 5}, + {"(*LineReader).Seek", Method, 5}, + {"(*LineReader).SeekPC", Method, 5}, + {"(*LineReader).Tell", Method, 5}, + {"(*PtrType).Common", Method, 0}, + {"(*PtrType).Size", Method, 0}, + {"(*PtrType).String", Method, 0}, + {"(*QualType).Common", Method, 0}, + {"(*QualType).Size", Method, 0}, + {"(*QualType).String", Method, 0}, + {"(*Reader).AddressSize", Method, 5}, + {"(*Reader).ByteOrder", Method, 14}, + {"(*Reader).Next", Method, 0}, + {"(*Reader).Seek", Method, 0}, + {"(*Reader).SeekPC", Method, 7}, + {"(*Reader).SkipChildren", Method, 0}, + {"(*StructType).Common", Method, 0}, + {"(*StructType).Defn", Method, 0}, + {"(*StructType).Size", Method, 0}, + {"(*StructType).String", Method, 0}, + {"(*TypedefType).Common", Method, 0}, + {"(*TypedefType).Size", Method, 0}, + {"(*TypedefType).String", Method, 0}, + {"(*UcharType).Basic", Method, 0}, + {"(*UcharType).Common", Method, 0}, + {"(*UcharType).Size", Method, 0}, + {"(*UcharType).String", Method, 0}, + {"(*UintType).Basic", Method, 0}, + {"(*UintType).Common", Method, 0}, + {"(*UintType).Size", Method, 0}, + {"(*UintType).String", Method, 0}, + {"(*UnspecifiedType).Basic", Method, 4}, + {"(*UnspecifiedType).Common", Method, 4}, + {"(*UnspecifiedType).Size", Method, 4}, + {"(*UnspecifiedType).String", Method, 4}, + {"(*UnsupportedType).Common", Method, 13}, + {"(*UnsupportedType).Size", Method, 13}, + {"(*UnsupportedType).String", Method, 13}, + {"(*VoidType).Common", Method, 0}, + {"(*VoidType).Size", Method, 0}, + {"(*VoidType).String", Method, 0}, + {"(Attr).GoString", Method, 0}, + {"(Attr).String", Method, 0}, + {"(Class).GoString", Method, 5}, + {"(Class).String", Method, 5}, + {"(DecodeError).Error", Method, 0}, + {"(Tag).GoString", Method, 0}, + {"(Tag).String", Method, 0}, + {"AddrType", Type, 0}, + {"AddrType.BasicType", Field, 0}, + {"ArrayType", Type, 0}, + {"ArrayType.CommonType", Field, 0}, + {"ArrayType.Count", Field, 0}, + {"ArrayType.StrideBitSize", Field, 0}, + {"ArrayType.Type", Field, 0}, + {"Attr", Type, 0}, + {"AttrAbstractOrigin", Const, 0}, + {"AttrAccessibility", Const, 0}, + {"AttrAddrBase", Const, 14}, + {"AttrAddrClass", Const, 0}, + {"AttrAlignment", Const, 14}, + {"AttrAllocated", Const, 0}, + {"AttrArtificial", Const, 0}, + {"AttrAssociated", Const, 0}, + {"AttrBaseTypes", Const, 0}, + {"AttrBinaryScale", Const, 14}, + {"AttrBitOffset", Const, 0}, + {"AttrBitSize", Const, 0}, + {"AttrByteSize", Const, 0}, + {"AttrCallAllCalls", Const, 14}, + {"AttrCallAllSourceCalls", Const, 14}, + {"AttrCallAllTailCalls", Const, 14}, + {"AttrCallColumn", Const, 0}, + {"AttrCallDataLocation", Const, 14}, + {"AttrCallDataValue", Const, 14}, + {"AttrCallFile", Const, 0}, + {"AttrCallLine", Const, 0}, + {"AttrCallOrigin", Const, 14}, + {"AttrCallPC", Const, 14}, + {"AttrCallParameter", Const, 14}, + {"AttrCallReturnPC", Const, 14}, + {"AttrCallTailCall", Const, 14}, + {"AttrCallTarget", Const, 14}, + {"AttrCallTargetClobbered", Const, 14}, + {"AttrCallValue", Const, 14}, + {"AttrCalling", Const, 0}, + {"AttrCommonRef", Const, 0}, + {"AttrCompDir", Const, 0}, + {"AttrConstExpr", Const, 14}, + {"AttrConstValue", Const, 0}, + {"AttrContainingType", Const, 0}, + {"AttrCount", Const, 0}, + {"AttrDataBitOffset", Const, 14}, + {"AttrDataLocation", Const, 0}, + {"AttrDataMemberLoc", Const, 0}, + {"AttrDecimalScale", Const, 14}, + {"AttrDecimalSign", Const, 14}, + {"AttrDeclColumn", Const, 0}, + {"AttrDeclFile", Const, 0}, + {"AttrDeclLine", Const, 0}, + {"AttrDeclaration", Const, 0}, + {"AttrDefaultValue", Const, 0}, + {"AttrDefaulted", Const, 14}, + {"AttrDeleted", Const, 14}, + {"AttrDescription", Const, 0}, + {"AttrDigitCount", Const, 14}, + {"AttrDiscr", Const, 0}, + {"AttrDiscrList", Const, 0}, + {"AttrDiscrValue", Const, 0}, + {"AttrDwoName", Const, 14}, + {"AttrElemental", Const, 14}, + {"AttrEncoding", Const, 0}, + {"AttrEndianity", Const, 14}, + {"AttrEntrypc", Const, 0}, + {"AttrEnumClass", Const, 14}, + {"AttrExplicit", Const, 14}, + {"AttrExportSymbols", Const, 14}, + {"AttrExtension", Const, 0}, + {"AttrExternal", Const, 0}, + {"AttrFrameBase", Const, 0}, + {"AttrFriend", Const, 0}, + {"AttrHighpc", Const, 0}, + {"AttrIdentifierCase", Const, 0}, + {"AttrImport", Const, 0}, + {"AttrInline", Const, 0}, + {"AttrIsOptional", Const, 0}, + {"AttrLanguage", Const, 0}, + {"AttrLinkageName", Const, 14}, + {"AttrLocation", Const, 0}, + {"AttrLoclistsBase", Const, 14}, + {"AttrLowerBound", Const, 0}, + {"AttrLowpc", Const, 0}, + {"AttrMacroInfo", Const, 0}, + {"AttrMacros", Const, 14}, + {"AttrMainSubprogram", Const, 14}, + {"AttrMutable", Const, 14}, + {"AttrName", Const, 0}, + {"AttrNamelistItem", Const, 0}, + {"AttrNoreturn", Const, 14}, + {"AttrObjectPointer", Const, 14}, + {"AttrOrdering", Const, 0}, + {"AttrPictureString", Const, 14}, + {"AttrPriority", Const, 0}, + {"AttrProducer", Const, 0}, + {"AttrPrototyped", Const, 0}, + {"AttrPure", Const, 14}, + {"AttrRanges", Const, 0}, + {"AttrRank", Const, 14}, + {"AttrRecursive", Const, 14}, + {"AttrReference", Const, 14}, + {"AttrReturnAddr", Const, 0}, + {"AttrRnglistsBase", Const, 14}, + {"AttrRvalueReference", Const, 14}, + {"AttrSegment", Const, 0}, + {"AttrSibling", Const, 0}, + {"AttrSignature", Const, 14}, + {"AttrSmall", Const, 14}, + {"AttrSpecification", Const, 0}, + {"AttrStartScope", Const, 0}, + {"AttrStaticLink", Const, 0}, + {"AttrStmtList", Const, 0}, + {"AttrStrOffsetsBase", Const, 14}, + {"AttrStride", Const, 0}, + {"AttrStrideSize", Const, 0}, + {"AttrStringLength", Const, 0}, + {"AttrStringLengthBitSize", Const, 14}, + {"AttrStringLengthByteSize", Const, 14}, + {"AttrThreadsScaled", Const, 14}, + {"AttrTrampoline", Const, 0}, + {"AttrType", Const, 0}, + {"AttrUpperBound", Const, 0}, + {"AttrUseLocation", Const, 0}, + {"AttrUseUTF8", Const, 0}, + {"AttrVarParam", Const, 0}, + {"AttrVirtuality", Const, 0}, + {"AttrVisibility", Const, 0}, + {"AttrVtableElemLoc", Const, 0}, + {"BasicType", Type, 0}, + {"BasicType.BitOffset", Field, 0}, + {"BasicType.BitSize", Field, 0}, + {"BasicType.CommonType", Field, 0}, + {"BasicType.DataBitOffset", Field, 18}, + {"BoolType", Type, 0}, + {"BoolType.BasicType", Field, 0}, + {"CharType", Type, 0}, + {"CharType.BasicType", Field, 0}, + {"Class", Type, 5}, + {"ClassAddrPtr", Const, 14}, + {"ClassAddress", Const, 5}, + {"ClassBlock", Const, 5}, + {"ClassConstant", Const, 5}, + {"ClassExprLoc", Const, 5}, + {"ClassFlag", Const, 5}, + {"ClassLinePtr", Const, 5}, + {"ClassLocList", Const, 14}, + {"ClassLocListPtr", Const, 5}, + {"ClassMacPtr", Const, 5}, + {"ClassRangeListPtr", Const, 5}, + {"ClassReference", Const, 5}, + {"ClassReferenceAlt", Const, 5}, + {"ClassReferenceSig", Const, 5}, + {"ClassRngList", Const, 14}, + {"ClassRngListsPtr", Const, 14}, + {"ClassStrOffsetsPtr", Const, 14}, + {"ClassString", Const, 5}, + {"ClassStringAlt", Const, 5}, + {"ClassUnknown", Const, 6}, + {"CommonType", Type, 0}, + {"CommonType.ByteSize", Field, 0}, + {"CommonType.Name", Field, 0}, + {"ComplexType", Type, 0}, + {"ComplexType.BasicType", Field, 0}, + {"Data", Type, 0}, + {"DecodeError", Type, 0}, + {"DecodeError.Err", Field, 0}, + {"DecodeError.Name", Field, 0}, + {"DecodeError.Offset", Field, 0}, + {"DotDotDotType", Type, 0}, + {"DotDotDotType.CommonType", Field, 0}, + {"Entry", Type, 0}, + {"Entry.Children", Field, 0}, + {"Entry.Field", Field, 0}, + {"Entry.Offset", Field, 0}, + {"Entry.Tag", Field, 0}, + {"EnumType", Type, 0}, + {"EnumType.CommonType", Field, 0}, + {"EnumType.EnumName", Field, 0}, + {"EnumType.Val", Field, 0}, + {"EnumValue", Type, 0}, + {"EnumValue.Name", Field, 0}, + {"EnumValue.Val", Field, 0}, + {"ErrUnknownPC", Var, 5}, + {"Field", Type, 0}, + {"Field.Attr", Field, 0}, + {"Field.Class", Field, 5}, + {"Field.Val", Field, 0}, + {"FloatType", Type, 0}, + {"FloatType.BasicType", Field, 0}, + {"FuncType", Type, 0}, + {"FuncType.CommonType", Field, 0}, + {"FuncType.ParamType", Field, 0}, + {"FuncType.ReturnType", Field, 0}, + {"IntType", Type, 0}, + {"IntType.BasicType", Field, 0}, + {"LineEntry", Type, 5}, + {"LineEntry.Address", Field, 5}, + {"LineEntry.BasicBlock", Field, 5}, + {"LineEntry.Column", Field, 5}, + {"LineEntry.Discriminator", Field, 5}, + {"LineEntry.EndSequence", Field, 5}, + {"LineEntry.EpilogueBegin", Field, 5}, + {"LineEntry.File", Field, 5}, + {"LineEntry.ISA", Field, 5}, + {"LineEntry.IsStmt", Field, 5}, + {"LineEntry.Line", Field, 5}, + {"LineEntry.OpIndex", Field, 5}, + {"LineEntry.PrologueEnd", Field, 5}, + {"LineFile", Type, 5}, + {"LineFile.Length", Field, 5}, + {"LineFile.Mtime", Field, 5}, + {"LineFile.Name", Field, 5}, + {"LineReader", Type, 5}, + {"LineReaderPos", Type, 5}, + {"New", Func, 0}, + {"Offset", Type, 0}, + {"PtrType", Type, 0}, + {"PtrType.CommonType", Field, 0}, + {"PtrType.Type", Field, 0}, + {"QualType", Type, 0}, + {"QualType.CommonType", Field, 0}, + {"QualType.Qual", Field, 0}, + {"QualType.Type", Field, 0}, + {"Reader", Type, 0}, + {"StructField", Type, 0}, + {"StructField.BitOffset", Field, 0}, + {"StructField.BitSize", Field, 0}, + {"StructField.ByteOffset", Field, 0}, + {"StructField.ByteSize", Field, 0}, + {"StructField.DataBitOffset", Field, 18}, + {"StructField.Name", Field, 0}, + {"StructField.Type", Field, 0}, + {"StructType", Type, 0}, + {"StructType.CommonType", Field, 0}, + {"StructType.Field", Field, 0}, + {"StructType.Incomplete", Field, 0}, + {"StructType.Kind", Field, 0}, + {"StructType.StructName", Field, 0}, + {"Tag", Type, 0}, + {"TagAccessDeclaration", Const, 0}, + {"TagArrayType", Const, 0}, + {"TagAtomicType", Const, 14}, + {"TagBaseType", Const, 0}, + {"TagCallSite", Const, 14}, + {"TagCallSiteParameter", Const, 14}, + {"TagCatchDwarfBlock", Const, 0}, + {"TagClassType", Const, 0}, + {"TagCoarrayType", Const, 14}, + {"TagCommonDwarfBlock", Const, 0}, + {"TagCommonInclusion", Const, 0}, + {"TagCompileUnit", Const, 0}, + {"TagCondition", Const, 3}, + {"TagConstType", Const, 0}, + {"TagConstant", Const, 0}, + {"TagDwarfProcedure", Const, 0}, + {"TagDynamicType", Const, 14}, + {"TagEntryPoint", Const, 0}, + {"TagEnumerationType", Const, 0}, + {"TagEnumerator", Const, 0}, + {"TagFileType", Const, 0}, + {"TagFormalParameter", Const, 0}, + {"TagFriend", Const, 0}, + {"TagGenericSubrange", Const, 14}, + {"TagImmutableType", Const, 14}, + {"TagImportedDeclaration", Const, 0}, + {"TagImportedModule", Const, 0}, + {"TagImportedUnit", Const, 0}, + {"TagInheritance", Const, 0}, + {"TagInlinedSubroutine", Const, 0}, + {"TagInterfaceType", Const, 0}, + {"TagLabel", Const, 0}, + {"TagLexDwarfBlock", Const, 0}, + {"TagMember", Const, 0}, + {"TagModule", Const, 0}, + {"TagMutableType", Const, 0}, + {"TagNamelist", Const, 0}, + {"TagNamelistItem", Const, 0}, + {"TagNamespace", Const, 0}, + {"TagPackedType", Const, 0}, + {"TagPartialUnit", Const, 0}, + {"TagPointerType", Const, 0}, + {"TagPtrToMemberType", Const, 0}, + {"TagReferenceType", Const, 0}, + {"TagRestrictType", Const, 0}, + {"TagRvalueReferenceType", Const, 3}, + {"TagSetType", Const, 0}, + {"TagSharedType", Const, 3}, + {"TagSkeletonUnit", Const, 14}, + {"TagStringType", Const, 0}, + {"TagStructType", Const, 0}, + {"TagSubprogram", Const, 0}, + {"TagSubrangeType", Const, 0}, + {"TagSubroutineType", Const, 0}, + {"TagTemplateAlias", Const, 3}, + {"TagTemplateTypeParameter", Const, 0}, + {"TagTemplateValueParameter", Const, 0}, + {"TagThrownType", Const, 0}, + {"TagTryDwarfBlock", Const, 0}, + {"TagTypeUnit", Const, 3}, + {"TagTypedef", Const, 0}, + {"TagUnionType", Const, 0}, + {"TagUnspecifiedParameters", Const, 0}, + {"TagUnspecifiedType", Const, 0}, + {"TagVariable", Const, 0}, + {"TagVariant", Const, 0}, + {"TagVariantPart", Const, 0}, + {"TagVolatileType", Const, 0}, + {"TagWithStmt", Const, 0}, + {"Type", Type, 0}, + {"TypedefType", Type, 0}, + {"TypedefType.CommonType", Field, 0}, + {"TypedefType.Type", Field, 0}, + {"UcharType", Type, 0}, + {"UcharType.BasicType", Field, 0}, + {"UintType", Type, 0}, + {"UintType.BasicType", Field, 0}, + {"UnspecifiedType", Type, 4}, + {"UnspecifiedType.BasicType", Field, 4}, + {"UnsupportedType", Type, 13}, + {"UnsupportedType.CommonType", Field, 13}, + {"UnsupportedType.Tag", Field, 13}, + {"VoidType", Type, 0}, + {"VoidType.CommonType", Field, 0}, + }, + "debug/elf": { + {"(*File).Close", Method, 0}, + {"(*File).DWARF", Method, 0}, + {"(*File).DynString", Method, 1}, + {"(*File).DynValue", Method, 21}, + {"(*File).DynamicSymbols", Method, 4}, + {"(*File).ImportedLibraries", Method, 0}, + {"(*File).ImportedSymbols", Method, 0}, + {"(*File).Section", Method, 0}, + {"(*File).SectionByType", Method, 0}, + {"(*File).Symbols", Method, 0}, + {"(*FormatError).Error", Method, 0}, + {"(*Prog).Open", Method, 0}, + {"(*Section).Data", Method, 0}, + {"(*Section).Open", Method, 0}, + {"(Class).GoString", Method, 0}, + {"(Class).String", Method, 0}, + {"(CompressionType).GoString", Method, 6}, + {"(CompressionType).String", Method, 6}, + {"(Data).GoString", Method, 0}, + {"(Data).String", Method, 0}, + {"(DynFlag).GoString", Method, 0}, + {"(DynFlag).String", Method, 0}, + {"(DynFlag1).GoString", Method, 21}, + {"(DynFlag1).String", Method, 21}, + {"(DynTag).GoString", Method, 0}, + {"(DynTag).String", Method, 0}, + {"(Machine).GoString", Method, 0}, + {"(Machine).String", Method, 0}, + {"(NType).GoString", Method, 0}, + {"(NType).String", Method, 0}, + {"(OSABI).GoString", Method, 0}, + {"(OSABI).String", Method, 0}, + {"(Prog).ReadAt", Method, 0}, + {"(ProgFlag).GoString", Method, 0}, + {"(ProgFlag).String", Method, 0}, + {"(ProgType).GoString", Method, 0}, + {"(ProgType).String", Method, 0}, + {"(R_386).GoString", Method, 0}, + {"(R_386).String", Method, 0}, + {"(R_390).GoString", Method, 7}, + {"(R_390).String", Method, 7}, + {"(R_AARCH64).GoString", Method, 4}, + {"(R_AARCH64).String", Method, 4}, + {"(R_ALPHA).GoString", Method, 0}, + {"(R_ALPHA).String", Method, 0}, + {"(R_ARM).GoString", Method, 0}, + {"(R_ARM).String", Method, 0}, + {"(R_LARCH).GoString", Method, 19}, + {"(R_LARCH).String", Method, 19}, + {"(R_MIPS).GoString", Method, 6}, + {"(R_MIPS).String", Method, 6}, + {"(R_PPC).GoString", Method, 0}, + {"(R_PPC).String", Method, 0}, + {"(R_PPC64).GoString", Method, 5}, + {"(R_PPC64).String", Method, 5}, + {"(R_RISCV).GoString", Method, 11}, + {"(R_RISCV).String", Method, 11}, + {"(R_SPARC).GoString", Method, 0}, + {"(R_SPARC).String", Method, 0}, + {"(R_X86_64).GoString", Method, 0}, + {"(R_X86_64).String", Method, 0}, + {"(Section).ReadAt", Method, 0}, + {"(SectionFlag).GoString", Method, 0}, + {"(SectionFlag).String", Method, 0}, + {"(SectionIndex).GoString", Method, 0}, + {"(SectionIndex).String", Method, 0}, + {"(SectionType).GoString", Method, 0}, + {"(SectionType).String", Method, 0}, + {"(SymBind).GoString", Method, 0}, + {"(SymBind).String", Method, 0}, + {"(SymType).GoString", Method, 0}, + {"(SymType).String", Method, 0}, + {"(SymVis).GoString", Method, 0}, + {"(SymVis).String", Method, 0}, + {"(Type).GoString", Method, 0}, + {"(Type).String", Method, 0}, + {"(Version).GoString", Method, 0}, + {"(Version).String", Method, 0}, + {"ARM_MAGIC_TRAMP_NUMBER", Const, 0}, + {"COMPRESS_HIOS", Const, 6}, + {"COMPRESS_HIPROC", Const, 6}, + {"COMPRESS_LOOS", Const, 6}, + {"COMPRESS_LOPROC", Const, 6}, + {"COMPRESS_ZLIB", Const, 6}, + {"COMPRESS_ZSTD", Const, 21}, + {"Chdr32", Type, 6}, + {"Chdr32.Addralign", Field, 6}, + {"Chdr32.Size", Field, 6}, + {"Chdr32.Type", Field, 6}, + {"Chdr64", Type, 6}, + {"Chdr64.Addralign", Field, 6}, + {"Chdr64.Size", Field, 6}, + {"Chdr64.Type", Field, 6}, + {"Class", Type, 0}, + {"CompressionType", Type, 6}, + {"DF_1_CONFALT", Const, 21}, + {"DF_1_DIRECT", Const, 21}, + {"DF_1_DISPRELDNE", Const, 21}, + {"DF_1_DISPRELPND", Const, 21}, + {"DF_1_EDITED", Const, 21}, + {"DF_1_ENDFILTEE", Const, 21}, + {"DF_1_GLOBAL", Const, 21}, + {"DF_1_GLOBAUDIT", Const, 21}, + {"DF_1_GROUP", Const, 21}, + {"DF_1_IGNMULDEF", Const, 21}, + {"DF_1_INITFIRST", Const, 21}, + {"DF_1_INTERPOSE", Const, 21}, + {"DF_1_KMOD", Const, 21}, + {"DF_1_LOADFLTR", Const, 21}, + {"DF_1_NOCOMMON", Const, 21}, + {"DF_1_NODEFLIB", Const, 21}, + {"DF_1_NODELETE", Const, 21}, + {"DF_1_NODIRECT", Const, 21}, + {"DF_1_NODUMP", Const, 21}, + {"DF_1_NOHDR", Const, 21}, + {"DF_1_NOKSYMS", Const, 21}, + {"DF_1_NOOPEN", Const, 21}, + {"DF_1_NORELOC", Const, 21}, + {"DF_1_NOW", Const, 21}, + {"DF_1_ORIGIN", Const, 21}, + {"DF_1_PIE", Const, 21}, + {"DF_1_SINGLETON", Const, 21}, + {"DF_1_STUB", Const, 21}, + {"DF_1_SYMINTPOSE", Const, 21}, + {"DF_1_TRANS", Const, 21}, + {"DF_1_WEAKFILTER", Const, 21}, + {"DF_BIND_NOW", Const, 0}, + {"DF_ORIGIN", Const, 0}, + {"DF_STATIC_TLS", Const, 0}, + {"DF_SYMBOLIC", Const, 0}, + {"DF_TEXTREL", Const, 0}, + {"DT_ADDRRNGHI", Const, 16}, + {"DT_ADDRRNGLO", Const, 16}, + {"DT_AUDIT", Const, 16}, + {"DT_AUXILIARY", Const, 16}, + {"DT_BIND_NOW", Const, 0}, + {"DT_CHECKSUM", Const, 16}, + {"DT_CONFIG", Const, 16}, + {"DT_DEBUG", Const, 0}, + {"DT_DEPAUDIT", Const, 16}, + {"DT_ENCODING", Const, 0}, + {"DT_FEATURE", Const, 16}, + {"DT_FILTER", Const, 16}, + {"DT_FINI", Const, 0}, + {"DT_FINI_ARRAY", Const, 0}, + {"DT_FINI_ARRAYSZ", Const, 0}, + {"DT_FLAGS", Const, 0}, + {"DT_FLAGS_1", Const, 16}, + {"DT_GNU_CONFLICT", Const, 16}, + {"DT_GNU_CONFLICTSZ", Const, 16}, + {"DT_GNU_HASH", Const, 16}, + {"DT_GNU_LIBLIST", Const, 16}, + {"DT_GNU_LIBLISTSZ", Const, 16}, + {"DT_GNU_PRELINKED", Const, 16}, + {"DT_HASH", Const, 0}, + {"DT_HIOS", Const, 0}, + {"DT_HIPROC", Const, 0}, + {"DT_INIT", Const, 0}, + {"DT_INIT_ARRAY", Const, 0}, + {"DT_INIT_ARRAYSZ", Const, 0}, + {"DT_JMPREL", Const, 0}, + {"DT_LOOS", Const, 0}, + {"DT_LOPROC", Const, 0}, + {"DT_MIPS_AUX_DYNAMIC", Const, 16}, + {"DT_MIPS_BASE_ADDRESS", Const, 16}, + {"DT_MIPS_COMPACT_SIZE", Const, 16}, + {"DT_MIPS_CONFLICT", Const, 16}, + {"DT_MIPS_CONFLICTNO", Const, 16}, + {"DT_MIPS_CXX_FLAGS", Const, 16}, + {"DT_MIPS_DELTA_CLASS", Const, 16}, + {"DT_MIPS_DELTA_CLASSSYM", Const, 16}, + {"DT_MIPS_DELTA_CLASSSYM_NO", Const, 16}, + {"DT_MIPS_DELTA_CLASS_NO", Const, 16}, + {"DT_MIPS_DELTA_INSTANCE", Const, 16}, + {"DT_MIPS_DELTA_INSTANCE_NO", Const, 16}, + {"DT_MIPS_DELTA_RELOC", Const, 16}, + {"DT_MIPS_DELTA_RELOC_NO", Const, 16}, + {"DT_MIPS_DELTA_SYM", Const, 16}, + {"DT_MIPS_DELTA_SYM_NO", Const, 16}, + {"DT_MIPS_DYNSTR_ALIGN", Const, 16}, + {"DT_MIPS_FLAGS", Const, 16}, + {"DT_MIPS_GOTSYM", Const, 16}, + {"DT_MIPS_GP_VALUE", Const, 16}, + {"DT_MIPS_HIDDEN_GOTIDX", Const, 16}, + {"DT_MIPS_HIPAGENO", Const, 16}, + {"DT_MIPS_ICHECKSUM", Const, 16}, + {"DT_MIPS_INTERFACE", Const, 16}, + {"DT_MIPS_INTERFACE_SIZE", Const, 16}, + {"DT_MIPS_IVERSION", Const, 16}, + {"DT_MIPS_LIBLIST", Const, 16}, + {"DT_MIPS_LIBLISTNO", Const, 16}, + {"DT_MIPS_LOCALPAGE_GOTIDX", Const, 16}, + {"DT_MIPS_LOCAL_GOTIDX", Const, 16}, + {"DT_MIPS_LOCAL_GOTNO", Const, 16}, + {"DT_MIPS_MSYM", Const, 16}, + {"DT_MIPS_OPTIONS", Const, 16}, + {"DT_MIPS_PERF_SUFFIX", Const, 16}, + {"DT_MIPS_PIXIE_INIT", Const, 16}, + {"DT_MIPS_PLTGOT", Const, 16}, + {"DT_MIPS_PROTECTED_GOTIDX", Const, 16}, + {"DT_MIPS_RLD_MAP", Const, 16}, + {"DT_MIPS_RLD_MAP_REL", Const, 16}, + {"DT_MIPS_RLD_TEXT_RESOLVE_ADDR", Const, 16}, + {"DT_MIPS_RLD_VERSION", Const, 16}, + {"DT_MIPS_RWPLT", Const, 16}, + {"DT_MIPS_SYMBOL_LIB", Const, 16}, + {"DT_MIPS_SYMTABNO", Const, 16}, + {"DT_MIPS_TIME_STAMP", Const, 16}, + {"DT_MIPS_UNREFEXTNO", Const, 16}, + {"DT_MOVEENT", Const, 16}, + {"DT_MOVESZ", Const, 16}, + {"DT_MOVETAB", Const, 16}, + {"DT_NEEDED", Const, 0}, + {"DT_NULL", Const, 0}, + {"DT_PLTGOT", Const, 0}, + {"DT_PLTPAD", Const, 16}, + {"DT_PLTPADSZ", Const, 16}, + {"DT_PLTREL", Const, 0}, + {"DT_PLTRELSZ", Const, 0}, + {"DT_POSFLAG_1", Const, 16}, + {"DT_PPC64_GLINK", Const, 16}, + {"DT_PPC64_OPD", Const, 16}, + {"DT_PPC64_OPDSZ", Const, 16}, + {"DT_PPC64_OPT", Const, 16}, + {"DT_PPC_GOT", Const, 16}, + {"DT_PPC_OPT", Const, 16}, + {"DT_PREINIT_ARRAY", Const, 0}, + {"DT_PREINIT_ARRAYSZ", Const, 0}, + {"DT_REL", Const, 0}, + {"DT_RELA", Const, 0}, + {"DT_RELACOUNT", Const, 16}, + {"DT_RELAENT", Const, 0}, + {"DT_RELASZ", Const, 0}, + {"DT_RELCOUNT", Const, 16}, + {"DT_RELENT", Const, 0}, + {"DT_RELSZ", Const, 0}, + {"DT_RPATH", Const, 0}, + {"DT_RUNPATH", Const, 0}, + {"DT_SONAME", Const, 0}, + {"DT_SPARC_REGISTER", Const, 16}, + {"DT_STRSZ", Const, 0}, + {"DT_STRTAB", Const, 0}, + {"DT_SYMBOLIC", Const, 0}, + {"DT_SYMENT", Const, 0}, + {"DT_SYMINENT", Const, 16}, + {"DT_SYMINFO", Const, 16}, + {"DT_SYMINSZ", Const, 16}, + {"DT_SYMTAB", Const, 0}, + {"DT_SYMTAB_SHNDX", Const, 16}, + {"DT_TEXTREL", Const, 0}, + {"DT_TLSDESC_GOT", Const, 16}, + {"DT_TLSDESC_PLT", Const, 16}, + {"DT_USED", Const, 16}, + {"DT_VALRNGHI", Const, 16}, + {"DT_VALRNGLO", Const, 16}, + {"DT_VERDEF", Const, 16}, + {"DT_VERDEFNUM", Const, 16}, + {"DT_VERNEED", Const, 0}, + {"DT_VERNEEDNUM", Const, 0}, + {"DT_VERSYM", Const, 0}, + {"Data", Type, 0}, + {"Dyn32", Type, 0}, + {"Dyn32.Tag", Field, 0}, + {"Dyn32.Val", Field, 0}, + {"Dyn64", Type, 0}, + {"Dyn64.Tag", Field, 0}, + {"Dyn64.Val", Field, 0}, + {"DynFlag", Type, 0}, + {"DynFlag1", Type, 21}, + {"DynTag", Type, 0}, + {"EI_ABIVERSION", Const, 0}, + {"EI_CLASS", Const, 0}, + {"EI_DATA", Const, 0}, + {"EI_NIDENT", Const, 0}, + {"EI_OSABI", Const, 0}, + {"EI_PAD", Const, 0}, + {"EI_VERSION", Const, 0}, + {"ELFCLASS32", Const, 0}, + {"ELFCLASS64", Const, 0}, + {"ELFCLASSNONE", Const, 0}, + {"ELFDATA2LSB", Const, 0}, + {"ELFDATA2MSB", Const, 0}, + {"ELFDATANONE", Const, 0}, + {"ELFMAG", Const, 0}, + {"ELFOSABI_86OPEN", Const, 0}, + {"ELFOSABI_AIX", Const, 0}, + {"ELFOSABI_ARM", Const, 0}, + {"ELFOSABI_AROS", Const, 11}, + {"ELFOSABI_CLOUDABI", Const, 11}, + {"ELFOSABI_FENIXOS", Const, 11}, + {"ELFOSABI_FREEBSD", Const, 0}, + {"ELFOSABI_HPUX", Const, 0}, + {"ELFOSABI_HURD", Const, 0}, + {"ELFOSABI_IRIX", Const, 0}, + {"ELFOSABI_LINUX", Const, 0}, + {"ELFOSABI_MODESTO", Const, 0}, + {"ELFOSABI_NETBSD", Const, 0}, + {"ELFOSABI_NONE", Const, 0}, + {"ELFOSABI_NSK", Const, 0}, + {"ELFOSABI_OPENBSD", Const, 0}, + {"ELFOSABI_OPENVMS", Const, 0}, + {"ELFOSABI_SOLARIS", Const, 0}, + {"ELFOSABI_STANDALONE", Const, 0}, + {"ELFOSABI_TRU64", Const, 0}, + {"EM_386", Const, 0}, + {"EM_486", Const, 0}, + {"EM_56800EX", Const, 11}, + {"EM_68HC05", Const, 11}, + {"EM_68HC08", Const, 11}, + {"EM_68HC11", Const, 11}, + {"EM_68HC12", Const, 0}, + {"EM_68HC16", Const, 11}, + {"EM_68K", Const, 0}, + {"EM_78KOR", Const, 11}, + {"EM_8051", Const, 11}, + {"EM_860", Const, 0}, + {"EM_88K", Const, 0}, + {"EM_960", Const, 0}, + {"EM_AARCH64", Const, 4}, + {"EM_ALPHA", Const, 0}, + {"EM_ALPHA_STD", Const, 0}, + {"EM_ALTERA_NIOS2", Const, 11}, + {"EM_AMDGPU", Const, 11}, + {"EM_ARC", Const, 0}, + {"EM_ARCA", Const, 11}, + {"EM_ARC_COMPACT", Const, 11}, + {"EM_ARC_COMPACT2", Const, 11}, + {"EM_ARM", Const, 0}, + {"EM_AVR", Const, 11}, + {"EM_AVR32", Const, 11}, + {"EM_BA1", Const, 11}, + {"EM_BA2", Const, 11}, + {"EM_BLACKFIN", Const, 11}, + {"EM_BPF", Const, 11}, + {"EM_C166", Const, 11}, + {"EM_CDP", Const, 11}, + {"EM_CE", Const, 11}, + {"EM_CLOUDSHIELD", Const, 11}, + {"EM_COGE", Const, 11}, + {"EM_COLDFIRE", Const, 0}, + {"EM_COOL", Const, 11}, + {"EM_COREA_1ST", Const, 11}, + {"EM_COREA_2ND", Const, 11}, + {"EM_CR", Const, 11}, + {"EM_CR16", Const, 11}, + {"EM_CRAYNV2", Const, 11}, + {"EM_CRIS", Const, 11}, + {"EM_CRX", Const, 11}, + {"EM_CSR_KALIMBA", Const, 11}, + {"EM_CUDA", Const, 11}, + {"EM_CYPRESS_M8C", Const, 11}, + {"EM_D10V", Const, 11}, + {"EM_D30V", Const, 11}, + {"EM_DSP24", Const, 11}, + {"EM_DSPIC30F", Const, 11}, + {"EM_DXP", Const, 11}, + {"EM_ECOG1", Const, 11}, + {"EM_ECOG16", Const, 11}, + {"EM_ECOG1X", Const, 11}, + {"EM_ECOG2", Const, 11}, + {"EM_ETPU", Const, 11}, + {"EM_EXCESS", Const, 11}, + {"EM_F2MC16", Const, 11}, + {"EM_FIREPATH", Const, 11}, + {"EM_FR20", Const, 0}, + {"EM_FR30", Const, 11}, + {"EM_FT32", Const, 11}, + {"EM_FX66", Const, 11}, + {"EM_H8S", Const, 0}, + {"EM_H8_300", Const, 0}, + {"EM_H8_300H", Const, 0}, + {"EM_H8_500", Const, 0}, + {"EM_HUANY", Const, 11}, + {"EM_IA_64", Const, 0}, + {"EM_INTEL205", Const, 11}, + {"EM_INTEL206", Const, 11}, + {"EM_INTEL207", Const, 11}, + {"EM_INTEL208", Const, 11}, + {"EM_INTEL209", Const, 11}, + {"EM_IP2K", Const, 11}, + {"EM_JAVELIN", Const, 11}, + {"EM_K10M", Const, 11}, + {"EM_KM32", Const, 11}, + {"EM_KMX16", Const, 11}, + {"EM_KMX32", Const, 11}, + {"EM_KMX8", Const, 11}, + {"EM_KVARC", Const, 11}, + {"EM_L10M", Const, 11}, + {"EM_LANAI", Const, 11}, + {"EM_LATTICEMICO32", Const, 11}, + {"EM_LOONGARCH", Const, 19}, + {"EM_M16C", Const, 11}, + {"EM_M32", Const, 0}, + {"EM_M32C", Const, 11}, + {"EM_M32R", Const, 11}, + {"EM_MANIK", Const, 11}, + {"EM_MAX", Const, 11}, + {"EM_MAXQ30", Const, 11}, + {"EM_MCHP_PIC", Const, 11}, + {"EM_MCST_ELBRUS", Const, 11}, + {"EM_ME16", Const, 0}, + {"EM_METAG", Const, 11}, + {"EM_MICROBLAZE", Const, 11}, + {"EM_MIPS", Const, 0}, + {"EM_MIPS_RS3_LE", Const, 0}, + {"EM_MIPS_RS4_BE", Const, 0}, + {"EM_MIPS_X", Const, 0}, + {"EM_MMA", Const, 0}, + {"EM_MMDSP_PLUS", Const, 11}, + {"EM_MMIX", Const, 11}, + {"EM_MN10200", Const, 11}, + {"EM_MN10300", Const, 11}, + {"EM_MOXIE", Const, 11}, + {"EM_MSP430", Const, 11}, + {"EM_NCPU", Const, 0}, + {"EM_NDR1", Const, 0}, + {"EM_NDS32", Const, 11}, + {"EM_NONE", Const, 0}, + {"EM_NORC", Const, 11}, + {"EM_NS32K", Const, 11}, + {"EM_OPEN8", Const, 11}, + {"EM_OPENRISC", Const, 11}, + {"EM_PARISC", Const, 0}, + {"EM_PCP", Const, 0}, + {"EM_PDP10", Const, 11}, + {"EM_PDP11", Const, 11}, + {"EM_PDSP", Const, 11}, + {"EM_PJ", Const, 11}, + {"EM_PPC", Const, 0}, + {"EM_PPC64", Const, 0}, + {"EM_PRISM", Const, 11}, + {"EM_QDSP6", Const, 11}, + {"EM_R32C", Const, 11}, + {"EM_RCE", Const, 0}, + {"EM_RH32", Const, 0}, + {"EM_RISCV", Const, 11}, + {"EM_RL78", Const, 11}, + {"EM_RS08", Const, 11}, + {"EM_RX", Const, 11}, + {"EM_S370", Const, 0}, + {"EM_S390", Const, 0}, + {"EM_SCORE7", Const, 11}, + {"EM_SEP", Const, 11}, + {"EM_SE_C17", Const, 11}, + {"EM_SE_C33", Const, 11}, + {"EM_SH", Const, 0}, + {"EM_SHARC", Const, 11}, + {"EM_SLE9X", Const, 11}, + {"EM_SNP1K", Const, 11}, + {"EM_SPARC", Const, 0}, + {"EM_SPARC32PLUS", Const, 0}, + {"EM_SPARCV9", Const, 0}, + {"EM_ST100", Const, 0}, + {"EM_ST19", Const, 11}, + {"EM_ST200", Const, 11}, + {"EM_ST7", Const, 11}, + {"EM_ST9PLUS", Const, 11}, + {"EM_STARCORE", Const, 0}, + {"EM_STM8", Const, 11}, + {"EM_STXP7X", Const, 11}, + {"EM_SVX", Const, 11}, + {"EM_TILE64", Const, 11}, + {"EM_TILEGX", Const, 11}, + {"EM_TILEPRO", Const, 11}, + {"EM_TINYJ", Const, 0}, + {"EM_TI_ARP32", Const, 11}, + {"EM_TI_C2000", Const, 11}, + {"EM_TI_C5500", Const, 11}, + {"EM_TI_C6000", Const, 11}, + {"EM_TI_PRU", Const, 11}, + {"EM_TMM_GPP", Const, 11}, + {"EM_TPC", Const, 11}, + {"EM_TRICORE", Const, 0}, + {"EM_TRIMEDIA", Const, 11}, + {"EM_TSK3000", Const, 11}, + {"EM_UNICORE", Const, 11}, + {"EM_V800", Const, 0}, + {"EM_V850", Const, 11}, + {"EM_VAX", Const, 11}, + {"EM_VIDEOCORE", Const, 11}, + {"EM_VIDEOCORE3", Const, 11}, + {"EM_VIDEOCORE5", Const, 11}, + {"EM_VISIUM", Const, 11}, + {"EM_VPP500", Const, 0}, + {"EM_X86_64", Const, 0}, + {"EM_XCORE", Const, 11}, + {"EM_XGATE", Const, 11}, + {"EM_XIMO16", Const, 11}, + {"EM_XTENSA", Const, 11}, + {"EM_Z80", Const, 11}, + {"EM_ZSP", Const, 11}, + {"ET_CORE", Const, 0}, + {"ET_DYN", Const, 0}, + {"ET_EXEC", Const, 0}, + {"ET_HIOS", Const, 0}, + {"ET_HIPROC", Const, 0}, + {"ET_LOOS", Const, 0}, + {"ET_LOPROC", Const, 0}, + {"ET_NONE", Const, 0}, + {"ET_REL", Const, 0}, + {"EV_CURRENT", Const, 0}, + {"EV_NONE", Const, 0}, + {"ErrNoSymbols", Var, 4}, + {"File", Type, 0}, + {"File.FileHeader", Field, 0}, + {"File.Progs", Field, 0}, + {"File.Sections", Field, 0}, + {"FileHeader", Type, 0}, + {"FileHeader.ABIVersion", Field, 0}, + {"FileHeader.ByteOrder", Field, 0}, + {"FileHeader.Class", Field, 0}, + {"FileHeader.Data", Field, 0}, + {"FileHeader.Entry", Field, 1}, + {"FileHeader.Machine", Field, 0}, + {"FileHeader.OSABI", Field, 0}, + {"FileHeader.Type", Field, 0}, + {"FileHeader.Version", Field, 0}, + {"FormatError", Type, 0}, + {"Header32", Type, 0}, + {"Header32.Ehsize", Field, 0}, + {"Header32.Entry", Field, 0}, + {"Header32.Flags", Field, 0}, + {"Header32.Ident", Field, 0}, + {"Header32.Machine", Field, 0}, + {"Header32.Phentsize", Field, 0}, + {"Header32.Phnum", Field, 0}, + {"Header32.Phoff", Field, 0}, + {"Header32.Shentsize", Field, 0}, + {"Header32.Shnum", Field, 0}, + {"Header32.Shoff", Field, 0}, + {"Header32.Shstrndx", Field, 0}, + {"Header32.Type", Field, 0}, + {"Header32.Version", Field, 0}, + {"Header64", Type, 0}, + {"Header64.Ehsize", Field, 0}, + {"Header64.Entry", Field, 0}, + {"Header64.Flags", Field, 0}, + {"Header64.Ident", Field, 0}, + {"Header64.Machine", Field, 0}, + {"Header64.Phentsize", Field, 0}, + {"Header64.Phnum", Field, 0}, + {"Header64.Phoff", Field, 0}, + {"Header64.Shentsize", Field, 0}, + {"Header64.Shnum", Field, 0}, + {"Header64.Shoff", Field, 0}, + {"Header64.Shstrndx", Field, 0}, + {"Header64.Type", Field, 0}, + {"Header64.Version", Field, 0}, + {"ImportedSymbol", Type, 0}, + {"ImportedSymbol.Library", Field, 0}, + {"ImportedSymbol.Name", Field, 0}, + {"ImportedSymbol.Version", Field, 0}, + {"Machine", Type, 0}, + {"NT_FPREGSET", Const, 0}, + {"NT_PRPSINFO", Const, 0}, + {"NT_PRSTATUS", Const, 0}, + {"NType", Type, 0}, + {"NewFile", Func, 0}, + {"OSABI", Type, 0}, + {"Open", Func, 0}, + {"PF_MASKOS", Const, 0}, + {"PF_MASKPROC", Const, 0}, + {"PF_R", Const, 0}, + {"PF_W", Const, 0}, + {"PF_X", Const, 0}, + {"PT_AARCH64_ARCHEXT", Const, 16}, + {"PT_AARCH64_UNWIND", Const, 16}, + {"PT_ARM_ARCHEXT", Const, 16}, + {"PT_ARM_EXIDX", Const, 16}, + {"PT_DYNAMIC", Const, 0}, + {"PT_GNU_EH_FRAME", Const, 16}, + {"PT_GNU_MBIND_HI", Const, 16}, + {"PT_GNU_MBIND_LO", Const, 16}, + {"PT_GNU_PROPERTY", Const, 16}, + {"PT_GNU_RELRO", Const, 16}, + {"PT_GNU_STACK", Const, 16}, + {"PT_HIOS", Const, 0}, + {"PT_HIPROC", Const, 0}, + {"PT_INTERP", Const, 0}, + {"PT_LOAD", Const, 0}, + {"PT_LOOS", Const, 0}, + {"PT_LOPROC", Const, 0}, + {"PT_MIPS_ABIFLAGS", Const, 16}, + {"PT_MIPS_OPTIONS", Const, 16}, + {"PT_MIPS_REGINFO", Const, 16}, + {"PT_MIPS_RTPROC", Const, 16}, + {"PT_NOTE", Const, 0}, + {"PT_NULL", Const, 0}, + {"PT_OPENBSD_BOOTDATA", Const, 16}, + {"PT_OPENBSD_NOBTCFI", Const, 23}, + {"PT_OPENBSD_RANDOMIZE", Const, 16}, + {"PT_OPENBSD_WXNEEDED", Const, 16}, + {"PT_PAX_FLAGS", Const, 16}, + {"PT_PHDR", Const, 0}, + {"PT_S390_PGSTE", Const, 16}, + {"PT_SHLIB", Const, 0}, + {"PT_SUNWSTACK", Const, 16}, + {"PT_SUNW_EH_FRAME", Const, 16}, + {"PT_TLS", Const, 0}, + {"Prog", Type, 0}, + {"Prog.ProgHeader", Field, 0}, + {"Prog.ReaderAt", Field, 0}, + {"Prog32", Type, 0}, + {"Prog32.Align", Field, 0}, + {"Prog32.Filesz", Field, 0}, + {"Prog32.Flags", Field, 0}, + {"Prog32.Memsz", Field, 0}, + {"Prog32.Off", Field, 0}, + {"Prog32.Paddr", Field, 0}, + {"Prog32.Type", Field, 0}, + {"Prog32.Vaddr", Field, 0}, + {"Prog64", Type, 0}, + {"Prog64.Align", Field, 0}, + {"Prog64.Filesz", Field, 0}, + {"Prog64.Flags", Field, 0}, + {"Prog64.Memsz", Field, 0}, + {"Prog64.Off", Field, 0}, + {"Prog64.Paddr", Field, 0}, + {"Prog64.Type", Field, 0}, + {"Prog64.Vaddr", Field, 0}, + {"ProgFlag", Type, 0}, + {"ProgHeader", Type, 0}, + {"ProgHeader.Align", Field, 0}, + {"ProgHeader.Filesz", Field, 0}, + {"ProgHeader.Flags", Field, 0}, + {"ProgHeader.Memsz", Field, 0}, + {"ProgHeader.Off", Field, 0}, + {"ProgHeader.Paddr", Field, 0}, + {"ProgHeader.Type", Field, 0}, + {"ProgHeader.Vaddr", Field, 0}, + {"ProgType", Type, 0}, + {"R_386", Type, 0}, + {"R_386_16", Const, 10}, + {"R_386_32", Const, 0}, + {"R_386_32PLT", Const, 10}, + {"R_386_8", Const, 10}, + {"R_386_COPY", Const, 0}, + {"R_386_GLOB_DAT", Const, 0}, + {"R_386_GOT32", Const, 0}, + {"R_386_GOT32X", Const, 10}, + {"R_386_GOTOFF", Const, 0}, + {"R_386_GOTPC", Const, 0}, + {"R_386_IRELATIVE", Const, 10}, + {"R_386_JMP_SLOT", Const, 0}, + {"R_386_NONE", Const, 0}, + {"R_386_PC16", Const, 10}, + {"R_386_PC32", Const, 0}, + {"R_386_PC8", Const, 10}, + {"R_386_PLT32", Const, 0}, + {"R_386_RELATIVE", Const, 0}, + {"R_386_SIZE32", Const, 10}, + {"R_386_TLS_DESC", Const, 10}, + {"R_386_TLS_DESC_CALL", Const, 10}, + {"R_386_TLS_DTPMOD32", Const, 0}, + {"R_386_TLS_DTPOFF32", Const, 0}, + {"R_386_TLS_GD", Const, 0}, + {"R_386_TLS_GD_32", Const, 0}, + {"R_386_TLS_GD_CALL", Const, 0}, + {"R_386_TLS_GD_POP", Const, 0}, + {"R_386_TLS_GD_PUSH", Const, 0}, + {"R_386_TLS_GOTDESC", Const, 10}, + {"R_386_TLS_GOTIE", Const, 0}, + {"R_386_TLS_IE", Const, 0}, + {"R_386_TLS_IE_32", Const, 0}, + {"R_386_TLS_LDM", Const, 0}, + {"R_386_TLS_LDM_32", Const, 0}, + {"R_386_TLS_LDM_CALL", Const, 0}, + {"R_386_TLS_LDM_POP", Const, 0}, + {"R_386_TLS_LDM_PUSH", Const, 0}, + {"R_386_TLS_LDO_32", Const, 0}, + {"R_386_TLS_LE", Const, 0}, + {"R_386_TLS_LE_32", Const, 0}, + {"R_386_TLS_TPOFF", Const, 0}, + {"R_386_TLS_TPOFF32", Const, 0}, + {"R_390", Type, 7}, + {"R_390_12", Const, 7}, + {"R_390_16", Const, 7}, + {"R_390_20", Const, 7}, + {"R_390_32", Const, 7}, + {"R_390_64", Const, 7}, + {"R_390_8", Const, 7}, + {"R_390_COPY", Const, 7}, + {"R_390_GLOB_DAT", Const, 7}, + {"R_390_GOT12", Const, 7}, + {"R_390_GOT16", Const, 7}, + {"R_390_GOT20", Const, 7}, + {"R_390_GOT32", Const, 7}, + {"R_390_GOT64", Const, 7}, + {"R_390_GOTENT", Const, 7}, + {"R_390_GOTOFF", Const, 7}, + {"R_390_GOTOFF16", Const, 7}, + {"R_390_GOTOFF64", Const, 7}, + {"R_390_GOTPC", Const, 7}, + {"R_390_GOTPCDBL", Const, 7}, + {"R_390_GOTPLT12", Const, 7}, + {"R_390_GOTPLT16", Const, 7}, + {"R_390_GOTPLT20", Const, 7}, + {"R_390_GOTPLT32", Const, 7}, + {"R_390_GOTPLT64", Const, 7}, + {"R_390_GOTPLTENT", Const, 7}, + {"R_390_GOTPLTOFF16", Const, 7}, + {"R_390_GOTPLTOFF32", Const, 7}, + {"R_390_GOTPLTOFF64", Const, 7}, + {"R_390_JMP_SLOT", Const, 7}, + {"R_390_NONE", Const, 7}, + {"R_390_PC16", Const, 7}, + {"R_390_PC16DBL", Const, 7}, + {"R_390_PC32", Const, 7}, + {"R_390_PC32DBL", Const, 7}, + {"R_390_PC64", Const, 7}, + {"R_390_PLT16DBL", Const, 7}, + {"R_390_PLT32", Const, 7}, + {"R_390_PLT32DBL", Const, 7}, + {"R_390_PLT64", Const, 7}, + {"R_390_RELATIVE", Const, 7}, + {"R_390_TLS_DTPMOD", Const, 7}, + {"R_390_TLS_DTPOFF", Const, 7}, + {"R_390_TLS_GD32", Const, 7}, + {"R_390_TLS_GD64", Const, 7}, + {"R_390_TLS_GDCALL", Const, 7}, + {"R_390_TLS_GOTIE12", Const, 7}, + {"R_390_TLS_GOTIE20", Const, 7}, + {"R_390_TLS_GOTIE32", Const, 7}, + {"R_390_TLS_GOTIE64", Const, 7}, + {"R_390_TLS_IE32", Const, 7}, + {"R_390_TLS_IE64", Const, 7}, + {"R_390_TLS_IEENT", Const, 7}, + {"R_390_TLS_LDCALL", Const, 7}, + {"R_390_TLS_LDM32", Const, 7}, + {"R_390_TLS_LDM64", Const, 7}, + {"R_390_TLS_LDO32", Const, 7}, + {"R_390_TLS_LDO64", Const, 7}, + {"R_390_TLS_LE32", Const, 7}, + {"R_390_TLS_LE64", Const, 7}, + {"R_390_TLS_LOAD", Const, 7}, + {"R_390_TLS_TPOFF", Const, 7}, + {"R_AARCH64", Type, 4}, + {"R_AARCH64_ABS16", Const, 4}, + {"R_AARCH64_ABS32", Const, 4}, + {"R_AARCH64_ABS64", Const, 4}, + {"R_AARCH64_ADD_ABS_LO12_NC", Const, 4}, + {"R_AARCH64_ADR_GOT_PAGE", Const, 4}, + {"R_AARCH64_ADR_PREL_LO21", Const, 4}, + {"R_AARCH64_ADR_PREL_PG_HI21", Const, 4}, + {"R_AARCH64_ADR_PREL_PG_HI21_NC", Const, 4}, + {"R_AARCH64_CALL26", Const, 4}, + {"R_AARCH64_CONDBR19", Const, 4}, + {"R_AARCH64_COPY", Const, 4}, + {"R_AARCH64_GLOB_DAT", Const, 4}, + {"R_AARCH64_GOT_LD_PREL19", Const, 4}, + {"R_AARCH64_IRELATIVE", Const, 4}, + {"R_AARCH64_JUMP26", Const, 4}, + {"R_AARCH64_JUMP_SLOT", Const, 4}, + {"R_AARCH64_LD64_GOTOFF_LO15", Const, 10}, + {"R_AARCH64_LD64_GOTPAGE_LO15", Const, 10}, + {"R_AARCH64_LD64_GOT_LO12_NC", Const, 4}, + {"R_AARCH64_LDST128_ABS_LO12_NC", Const, 4}, + {"R_AARCH64_LDST16_ABS_LO12_NC", Const, 4}, + {"R_AARCH64_LDST32_ABS_LO12_NC", Const, 4}, + {"R_AARCH64_LDST64_ABS_LO12_NC", Const, 4}, + {"R_AARCH64_LDST8_ABS_LO12_NC", Const, 4}, + {"R_AARCH64_LD_PREL_LO19", Const, 4}, + {"R_AARCH64_MOVW_SABS_G0", Const, 4}, + {"R_AARCH64_MOVW_SABS_G1", Const, 4}, + {"R_AARCH64_MOVW_SABS_G2", Const, 4}, + {"R_AARCH64_MOVW_UABS_G0", Const, 4}, + {"R_AARCH64_MOVW_UABS_G0_NC", Const, 4}, + {"R_AARCH64_MOVW_UABS_G1", Const, 4}, + {"R_AARCH64_MOVW_UABS_G1_NC", Const, 4}, + {"R_AARCH64_MOVW_UABS_G2", Const, 4}, + {"R_AARCH64_MOVW_UABS_G2_NC", Const, 4}, + {"R_AARCH64_MOVW_UABS_G3", Const, 4}, + {"R_AARCH64_NONE", Const, 4}, + {"R_AARCH64_NULL", Const, 4}, + {"R_AARCH64_P32_ABS16", Const, 4}, + {"R_AARCH64_P32_ABS32", Const, 4}, + {"R_AARCH64_P32_ADD_ABS_LO12_NC", Const, 4}, + {"R_AARCH64_P32_ADR_GOT_PAGE", Const, 4}, + {"R_AARCH64_P32_ADR_PREL_LO21", Const, 4}, + {"R_AARCH64_P32_ADR_PREL_PG_HI21", Const, 4}, + {"R_AARCH64_P32_CALL26", Const, 4}, + {"R_AARCH64_P32_CONDBR19", Const, 4}, + {"R_AARCH64_P32_COPY", Const, 4}, + {"R_AARCH64_P32_GLOB_DAT", Const, 4}, + {"R_AARCH64_P32_GOT_LD_PREL19", Const, 4}, + {"R_AARCH64_P32_IRELATIVE", Const, 4}, + {"R_AARCH64_P32_JUMP26", Const, 4}, + {"R_AARCH64_P32_JUMP_SLOT", Const, 4}, + {"R_AARCH64_P32_LD32_GOT_LO12_NC", Const, 4}, + {"R_AARCH64_P32_LDST128_ABS_LO12_NC", Const, 4}, + {"R_AARCH64_P32_LDST16_ABS_LO12_NC", Const, 4}, + {"R_AARCH64_P32_LDST32_ABS_LO12_NC", Const, 4}, + {"R_AARCH64_P32_LDST64_ABS_LO12_NC", Const, 4}, + {"R_AARCH64_P32_LDST8_ABS_LO12_NC", Const, 4}, + {"R_AARCH64_P32_LD_PREL_LO19", Const, 4}, + {"R_AARCH64_P32_MOVW_SABS_G0", Const, 4}, + {"R_AARCH64_P32_MOVW_UABS_G0", Const, 4}, + {"R_AARCH64_P32_MOVW_UABS_G0_NC", Const, 4}, + {"R_AARCH64_P32_MOVW_UABS_G1", Const, 4}, + {"R_AARCH64_P32_PREL16", Const, 4}, + {"R_AARCH64_P32_PREL32", Const, 4}, + {"R_AARCH64_P32_RELATIVE", Const, 4}, + {"R_AARCH64_P32_TLSDESC", Const, 4}, + {"R_AARCH64_P32_TLSDESC_ADD_LO12_NC", Const, 4}, + {"R_AARCH64_P32_TLSDESC_ADR_PAGE21", Const, 4}, + {"R_AARCH64_P32_TLSDESC_ADR_PREL21", Const, 4}, + {"R_AARCH64_P32_TLSDESC_CALL", Const, 4}, + {"R_AARCH64_P32_TLSDESC_LD32_LO12_NC", Const, 4}, + {"R_AARCH64_P32_TLSDESC_LD_PREL19", Const, 4}, + {"R_AARCH64_P32_TLSGD_ADD_LO12_NC", Const, 4}, + {"R_AARCH64_P32_TLSGD_ADR_PAGE21", Const, 4}, + {"R_AARCH64_P32_TLSIE_ADR_GOTTPREL_PAGE21", Const, 4}, + {"R_AARCH64_P32_TLSIE_LD32_GOTTPREL_LO12_NC", Const, 4}, + {"R_AARCH64_P32_TLSIE_LD_GOTTPREL_PREL19", Const, 4}, + {"R_AARCH64_P32_TLSLE_ADD_TPREL_HI12", Const, 4}, + {"R_AARCH64_P32_TLSLE_ADD_TPREL_LO12", Const, 4}, + {"R_AARCH64_P32_TLSLE_ADD_TPREL_LO12_NC", Const, 4}, + {"R_AARCH64_P32_TLSLE_MOVW_TPREL_G0", Const, 4}, + {"R_AARCH64_P32_TLSLE_MOVW_TPREL_G0_NC", Const, 4}, + {"R_AARCH64_P32_TLSLE_MOVW_TPREL_G1", Const, 4}, + {"R_AARCH64_P32_TLS_DTPMOD", Const, 4}, + {"R_AARCH64_P32_TLS_DTPREL", Const, 4}, + {"R_AARCH64_P32_TLS_TPREL", Const, 4}, + {"R_AARCH64_P32_TSTBR14", Const, 4}, + {"R_AARCH64_PREL16", Const, 4}, + {"R_AARCH64_PREL32", Const, 4}, + {"R_AARCH64_PREL64", Const, 4}, + {"R_AARCH64_RELATIVE", Const, 4}, + {"R_AARCH64_TLSDESC", Const, 4}, + {"R_AARCH64_TLSDESC_ADD", Const, 4}, + {"R_AARCH64_TLSDESC_ADD_LO12_NC", Const, 4}, + {"R_AARCH64_TLSDESC_ADR_PAGE21", Const, 4}, + {"R_AARCH64_TLSDESC_ADR_PREL21", Const, 4}, + {"R_AARCH64_TLSDESC_CALL", Const, 4}, + {"R_AARCH64_TLSDESC_LD64_LO12_NC", Const, 4}, + {"R_AARCH64_TLSDESC_LDR", Const, 4}, + {"R_AARCH64_TLSDESC_LD_PREL19", Const, 4}, + {"R_AARCH64_TLSDESC_OFF_G0_NC", Const, 4}, + {"R_AARCH64_TLSDESC_OFF_G1", Const, 4}, + {"R_AARCH64_TLSGD_ADD_LO12_NC", Const, 4}, + {"R_AARCH64_TLSGD_ADR_PAGE21", Const, 4}, + {"R_AARCH64_TLSGD_ADR_PREL21", Const, 10}, + {"R_AARCH64_TLSGD_MOVW_G0_NC", Const, 10}, + {"R_AARCH64_TLSGD_MOVW_G1", Const, 10}, + {"R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21", Const, 4}, + {"R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC", Const, 4}, + {"R_AARCH64_TLSIE_LD_GOTTPREL_PREL19", Const, 4}, + {"R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC", Const, 4}, + {"R_AARCH64_TLSIE_MOVW_GOTTPREL_G1", Const, 4}, + {"R_AARCH64_TLSLD_ADR_PAGE21", Const, 10}, + {"R_AARCH64_TLSLD_ADR_PREL21", Const, 10}, + {"R_AARCH64_TLSLD_LDST128_DTPREL_LO12", Const, 10}, + {"R_AARCH64_TLSLD_LDST128_DTPREL_LO12_NC", Const, 10}, + {"R_AARCH64_TLSLE_ADD_TPREL_HI12", Const, 4}, + {"R_AARCH64_TLSLE_ADD_TPREL_LO12", Const, 4}, + {"R_AARCH64_TLSLE_ADD_TPREL_LO12_NC", Const, 4}, + {"R_AARCH64_TLSLE_LDST128_TPREL_LO12", Const, 10}, + {"R_AARCH64_TLSLE_LDST128_TPREL_LO12_NC", Const, 10}, + {"R_AARCH64_TLSLE_MOVW_TPREL_G0", Const, 4}, + {"R_AARCH64_TLSLE_MOVW_TPREL_G0_NC", Const, 4}, + {"R_AARCH64_TLSLE_MOVW_TPREL_G1", Const, 4}, + {"R_AARCH64_TLSLE_MOVW_TPREL_G1_NC", Const, 4}, + {"R_AARCH64_TLSLE_MOVW_TPREL_G2", Const, 4}, + {"R_AARCH64_TLS_DTPMOD64", Const, 4}, + {"R_AARCH64_TLS_DTPREL64", Const, 4}, + {"R_AARCH64_TLS_TPREL64", Const, 4}, + {"R_AARCH64_TSTBR14", Const, 4}, + {"R_ALPHA", Type, 0}, + {"R_ALPHA_BRADDR", Const, 0}, + {"R_ALPHA_COPY", Const, 0}, + {"R_ALPHA_GLOB_DAT", Const, 0}, + {"R_ALPHA_GPDISP", Const, 0}, + {"R_ALPHA_GPREL32", Const, 0}, + {"R_ALPHA_GPRELHIGH", Const, 0}, + {"R_ALPHA_GPRELLOW", Const, 0}, + {"R_ALPHA_GPVALUE", Const, 0}, + {"R_ALPHA_HINT", Const, 0}, + {"R_ALPHA_IMMED_BR_HI32", Const, 0}, + {"R_ALPHA_IMMED_GP_16", Const, 0}, + {"R_ALPHA_IMMED_GP_HI32", Const, 0}, + {"R_ALPHA_IMMED_LO32", Const, 0}, + {"R_ALPHA_IMMED_SCN_HI32", Const, 0}, + {"R_ALPHA_JMP_SLOT", Const, 0}, + {"R_ALPHA_LITERAL", Const, 0}, + {"R_ALPHA_LITUSE", Const, 0}, + {"R_ALPHA_NONE", Const, 0}, + {"R_ALPHA_OP_PRSHIFT", Const, 0}, + {"R_ALPHA_OP_PSUB", Const, 0}, + {"R_ALPHA_OP_PUSH", Const, 0}, + {"R_ALPHA_OP_STORE", Const, 0}, + {"R_ALPHA_REFLONG", Const, 0}, + {"R_ALPHA_REFQUAD", Const, 0}, + {"R_ALPHA_RELATIVE", Const, 0}, + {"R_ALPHA_SREL16", Const, 0}, + {"R_ALPHA_SREL32", Const, 0}, + {"R_ALPHA_SREL64", Const, 0}, + {"R_ARM", Type, 0}, + {"R_ARM_ABS12", Const, 0}, + {"R_ARM_ABS16", Const, 0}, + {"R_ARM_ABS32", Const, 0}, + {"R_ARM_ABS32_NOI", Const, 10}, + {"R_ARM_ABS8", Const, 0}, + {"R_ARM_ALU_PCREL_15_8", Const, 10}, + {"R_ARM_ALU_PCREL_23_15", Const, 10}, + {"R_ARM_ALU_PCREL_7_0", Const, 10}, + {"R_ARM_ALU_PC_G0", Const, 10}, + {"R_ARM_ALU_PC_G0_NC", Const, 10}, + {"R_ARM_ALU_PC_G1", Const, 10}, + {"R_ARM_ALU_PC_G1_NC", Const, 10}, + {"R_ARM_ALU_PC_G2", Const, 10}, + {"R_ARM_ALU_SBREL_19_12_NC", Const, 10}, + {"R_ARM_ALU_SBREL_27_20_CK", Const, 10}, + {"R_ARM_ALU_SB_G0", Const, 10}, + {"R_ARM_ALU_SB_G0_NC", Const, 10}, + {"R_ARM_ALU_SB_G1", Const, 10}, + {"R_ARM_ALU_SB_G1_NC", Const, 10}, + {"R_ARM_ALU_SB_G2", Const, 10}, + {"R_ARM_AMP_VCALL9", Const, 0}, + {"R_ARM_BASE_ABS", Const, 10}, + {"R_ARM_CALL", Const, 10}, + {"R_ARM_COPY", Const, 0}, + {"R_ARM_GLOB_DAT", Const, 0}, + {"R_ARM_GNU_VTENTRY", Const, 0}, + {"R_ARM_GNU_VTINHERIT", Const, 0}, + {"R_ARM_GOT32", Const, 0}, + {"R_ARM_GOTOFF", Const, 0}, + {"R_ARM_GOTOFF12", Const, 10}, + {"R_ARM_GOTPC", Const, 0}, + {"R_ARM_GOTRELAX", Const, 10}, + {"R_ARM_GOT_ABS", Const, 10}, + {"R_ARM_GOT_BREL12", Const, 10}, + {"R_ARM_GOT_PREL", Const, 10}, + {"R_ARM_IRELATIVE", Const, 10}, + {"R_ARM_JUMP24", Const, 10}, + {"R_ARM_JUMP_SLOT", Const, 0}, + {"R_ARM_LDC_PC_G0", Const, 10}, + {"R_ARM_LDC_PC_G1", Const, 10}, + {"R_ARM_LDC_PC_G2", Const, 10}, + {"R_ARM_LDC_SB_G0", Const, 10}, + {"R_ARM_LDC_SB_G1", Const, 10}, + {"R_ARM_LDC_SB_G2", Const, 10}, + {"R_ARM_LDRS_PC_G0", Const, 10}, + {"R_ARM_LDRS_PC_G1", Const, 10}, + {"R_ARM_LDRS_PC_G2", Const, 10}, + {"R_ARM_LDRS_SB_G0", Const, 10}, + {"R_ARM_LDRS_SB_G1", Const, 10}, + {"R_ARM_LDRS_SB_G2", Const, 10}, + {"R_ARM_LDR_PC_G1", Const, 10}, + {"R_ARM_LDR_PC_G2", Const, 10}, + {"R_ARM_LDR_SBREL_11_10_NC", Const, 10}, + {"R_ARM_LDR_SB_G0", Const, 10}, + {"R_ARM_LDR_SB_G1", Const, 10}, + {"R_ARM_LDR_SB_G2", Const, 10}, + {"R_ARM_ME_TOO", Const, 10}, + {"R_ARM_MOVT_ABS", Const, 10}, + {"R_ARM_MOVT_BREL", Const, 10}, + {"R_ARM_MOVT_PREL", Const, 10}, + {"R_ARM_MOVW_ABS_NC", Const, 10}, + {"R_ARM_MOVW_BREL", Const, 10}, + {"R_ARM_MOVW_BREL_NC", Const, 10}, + {"R_ARM_MOVW_PREL_NC", Const, 10}, + {"R_ARM_NONE", Const, 0}, + {"R_ARM_PC13", Const, 0}, + {"R_ARM_PC24", Const, 0}, + {"R_ARM_PLT32", Const, 0}, + {"R_ARM_PLT32_ABS", Const, 10}, + {"R_ARM_PREL31", Const, 10}, + {"R_ARM_PRIVATE_0", Const, 10}, + {"R_ARM_PRIVATE_1", Const, 10}, + {"R_ARM_PRIVATE_10", Const, 10}, + {"R_ARM_PRIVATE_11", Const, 10}, + {"R_ARM_PRIVATE_12", Const, 10}, + {"R_ARM_PRIVATE_13", Const, 10}, + {"R_ARM_PRIVATE_14", Const, 10}, + {"R_ARM_PRIVATE_15", Const, 10}, + {"R_ARM_PRIVATE_2", Const, 10}, + {"R_ARM_PRIVATE_3", Const, 10}, + {"R_ARM_PRIVATE_4", Const, 10}, + {"R_ARM_PRIVATE_5", Const, 10}, + {"R_ARM_PRIVATE_6", Const, 10}, + {"R_ARM_PRIVATE_7", Const, 10}, + {"R_ARM_PRIVATE_8", Const, 10}, + {"R_ARM_PRIVATE_9", Const, 10}, + {"R_ARM_RABS32", Const, 0}, + {"R_ARM_RBASE", Const, 0}, + {"R_ARM_REL32", Const, 0}, + {"R_ARM_REL32_NOI", Const, 10}, + {"R_ARM_RELATIVE", Const, 0}, + {"R_ARM_RPC24", Const, 0}, + {"R_ARM_RREL32", Const, 0}, + {"R_ARM_RSBREL32", Const, 0}, + {"R_ARM_RXPC25", Const, 10}, + {"R_ARM_SBREL31", Const, 10}, + {"R_ARM_SBREL32", Const, 0}, + {"R_ARM_SWI24", Const, 0}, + {"R_ARM_TARGET1", Const, 10}, + {"R_ARM_TARGET2", Const, 10}, + {"R_ARM_THM_ABS5", Const, 0}, + {"R_ARM_THM_ALU_ABS_G0_NC", Const, 10}, + {"R_ARM_THM_ALU_ABS_G1_NC", Const, 10}, + {"R_ARM_THM_ALU_ABS_G2_NC", Const, 10}, + {"R_ARM_THM_ALU_ABS_G3", Const, 10}, + {"R_ARM_THM_ALU_PREL_11_0", Const, 10}, + {"R_ARM_THM_GOT_BREL12", Const, 10}, + {"R_ARM_THM_JUMP11", Const, 10}, + {"R_ARM_THM_JUMP19", Const, 10}, + {"R_ARM_THM_JUMP24", Const, 10}, + {"R_ARM_THM_JUMP6", Const, 10}, + {"R_ARM_THM_JUMP8", Const, 10}, + {"R_ARM_THM_MOVT_ABS", Const, 10}, + {"R_ARM_THM_MOVT_BREL", Const, 10}, + {"R_ARM_THM_MOVT_PREL", Const, 10}, + {"R_ARM_THM_MOVW_ABS_NC", Const, 10}, + {"R_ARM_THM_MOVW_BREL", Const, 10}, + {"R_ARM_THM_MOVW_BREL_NC", Const, 10}, + {"R_ARM_THM_MOVW_PREL_NC", Const, 10}, + {"R_ARM_THM_PC12", Const, 10}, + {"R_ARM_THM_PC22", Const, 0}, + {"R_ARM_THM_PC8", Const, 0}, + {"R_ARM_THM_RPC22", Const, 0}, + {"R_ARM_THM_SWI8", Const, 0}, + {"R_ARM_THM_TLS_CALL", Const, 10}, + {"R_ARM_THM_TLS_DESCSEQ16", Const, 10}, + {"R_ARM_THM_TLS_DESCSEQ32", Const, 10}, + {"R_ARM_THM_XPC22", Const, 0}, + {"R_ARM_TLS_CALL", Const, 10}, + {"R_ARM_TLS_DESCSEQ", Const, 10}, + {"R_ARM_TLS_DTPMOD32", Const, 10}, + {"R_ARM_TLS_DTPOFF32", Const, 10}, + {"R_ARM_TLS_GD32", Const, 10}, + {"R_ARM_TLS_GOTDESC", Const, 10}, + {"R_ARM_TLS_IE12GP", Const, 10}, + {"R_ARM_TLS_IE32", Const, 10}, + {"R_ARM_TLS_LDM32", Const, 10}, + {"R_ARM_TLS_LDO12", Const, 10}, + {"R_ARM_TLS_LDO32", Const, 10}, + {"R_ARM_TLS_LE12", Const, 10}, + {"R_ARM_TLS_LE32", Const, 10}, + {"R_ARM_TLS_TPOFF32", Const, 10}, + {"R_ARM_V4BX", Const, 10}, + {"R_ARM_XPC25", Const, 0}, + {"R_INFO", Func, 0}, + {"R_INFO32", Func, 0}, + {"R_LARCH", Type, 19}, + {"R_LARCH_32", Const, 19}, + {"R_LARCH_32_PCREL", Const, 20}, + {"R_LARCH_64", Const, 19}, + {"R_LARCH_64_PCREL", Const, 22}, + {"R_LARCH_ABS64_HI12", Const, 20}, + {"R_LARCH_ABS64_LO20", Const, 20}, + {"R_LARCH_ABS_HI20", Const, 20}, + {"R_LARCH_ABS_LO12", Const, 20}, + {"R_LARCH_ADD16", Const, 19}, + {"R_LARCH_ADD24", Const, 19}, + {"R_LARCH_ADD32", Const, 19}, + {"R_LARCH_ADD6", Const, 22}, + {"R_LARCH_ADD64", Const, 19}, + {"R_LARCH_ADD8", Const, 19}, + {"R_LARCH_ADD_ULEB128", Const, 22}, + {"R_LARCH_ALIGN", Const, 22}, + {"R_LARCH_B16", Const, 20}, + {"R_LARCH_B21", Const, 20}, + {"R_LARCH_B26", Const, 20}, + {"R_LARCH_CFA", Const, 22}, + {"R_LARCH_COPY", Const, 19}, + {"R_LARCH_DELETE", Const, 22}, + {"R_LARCH_GNU_VTENTRY", Const, 20}, + {"R_LARCH_GNU_VTINHERIT", Const, 20}, + {"R_LARCH_GOT64_HI12", Const, 20}, + {"R_LARCH_GOT64_LO20", Const, 20}, + {"R_LARCH_GOT64_PC_HI12", Const, 20}, + {"R_LARCH_GOT64_PC_LO20", Const, 20}, + {"R_LARCH_GOT_HI20", Const, 20}, + {"R_LARCH_GOT_LO12", Const, 20}, + {"R_LARCH_GOT_PC_HI20", Const, 20}, + {"R_LARCH_GOT_PC_LO12", Const, 20}, + {"R_LARCH_IRELATIVE", Const, 19}, + {"R_LARCH_JUMP_SLOT", Const, 19}, + {"R_LARCH_MARK_LA", Const, 19}, + {"R_LARCH_MARK_PCREL", Const, 19}, + {"R_LARCH_NONE", Const, 19}, + {"R_LARCH_PCALA64_HI12", Const, 20}, + {"R_LARCH_PCALA64_LO20", Const, 20}, + {"R_LARCH_PCALA_HI20", Const, 20}, + {"R_LARCH_PCALA_LO12", Const, 20}, + {"R_LARCH_PCREL20_S2", Const, 22}, + {"R_LARCH_RELATIVE", Const, 19}, + {"R_LARCH_RELAX", Const, 20}, + {"R_LARCH_SOP_ADD", Const, 19}, + {"R_LARCH_SOP_AND", Const, 19}, + {"R_LARCH_SOP_ASSERT", Const, 19}, + {"R_LARCH_SOP_IF_ELSE", Const, 19}, + {"R_LARCH_SOP_NOT", Const, 19}, + {"R_LARCH_SOP_POP_32_S_0_10_10_16_S2", Const, 19}, + {"R_LARCH_SOP_POP_32_S_0_5_10_16_S2", Const, 19}, + {"R_LARCH_SOP_POP_32_S_10_12", Const, 19}, + {"R_LARCH_SOP_POP_32_S_10_16", Const, 19}, + {"R_LARCH_SOP_POP_32_S_10_16_S2", Const, 19}, + {"R_LARCH_SOP_POP_32_S_10_5", Const, 19}, + {"R_LARCH_SOP_POP_32_S_5_20", Const, 19}, + {"R_LARCH_SOP_POP_32_U", Const, 19}, + {"R_LARCH_SOP_POP_32_U_10_12", Const, 19}, + {"R_LARCH_SOP_PUSH_ABSOLUTE", Const, 19}, + {"R_LARCH_SOP_PUSH_DUP", Const, 19}, + {"R_LARCH_SOP_PUSH_GPREL", Const, 19}, + {"R_LARCH_SOP_PUSH_PCREL", Const, 19}, + {"R_LARCH_SOP_PUSH_PLT_PCREL", Const, 19}, + {"R_LARCH_SOP_PUSH_TLS_GD", Const, 19}, + {"R_LARCH_SOP_PUSH_TLS_GOT", Const, 19}, + {"R_LARCH_SOP_PUSH_TLS_TPREL", Const, 19}, + {"R_LARCH_SOP_SL", Const, 19}, + {"R_LARCH_SOP_SR", Const, 19}, + {"R_LARCH_SOP_SUB", Const, 19}, + {"R_LARCH_SUB16", Const, 19}, + {"R_LARCH_SUB24", Const, 19}, + {"R_LARCH_SUB32", Const, 19}, + {"R_LARCH_SUB6", Const, 22}, + {"R_LARCH_SUB64", Const, 19}, + {"R_LARCH_SUB8", Const, 19}, + {"R_LARCH_SUB_ULEB128", Const, 22}, + {"R_LARCH_TLS_DTPMOD32", Const, 19}, + {"R_LARCH_TLS_DTPMOD64", Const, 19}, + {"R_LARCH_TLS_DTPREL32", Const, 19}, + {"R_LARCH_TLS_DTPREL64", Const, 19}, + {"R_LARCH_TLS_GD_HI20", Const, 20}, + {"R_LARCH_TLS_GD_PC_HI20", Const, 20}, + {"R_LARCH_TLS_IE64_HI12", Const, 20}, + {"R_LARCH_TLS_IE64_LO20", Const, 20}, + {"R_LARCH_TLS_IE64_PC_HI12", Const, 20}, + {"R_LARCH_TLS_IE64_PC_LO20", Const, 20}, + {"R_LARCH_TLS_IE_HI20", Const, 20}, + {"R_LARCH_TLS_IE_LO12", Const, 20}, + {"R_LARCH_TLS_IE_PC_HI20", Const, 20}, + {"R_LARCH_TLS_IE_PC_LO12", Const, 20}, + {"R_LARCH_TLS_LD_HI20", Const, 20}, + {"R_LARCH_TLS_LD_PC_HI20", Const, 20}, + {"R_LARCH_TLS_LE64_HI12", Const, 20}, + {"R_LARCH_TLS_LE64_LO20", Const, 20}, + {"R_LARCH_TLS_LE_HI20", Const, 20}, + {"R_LARCH_TLS_LE_LO12", Const, 20}, + {"R_LARCH_TLS_TPREL32", Const, 19}, + {"R_LARCH_TLS_TPREL64", Const, 19}, + {"R_MIPS", Type, 6}, + {"R_MIPS_16", Const, 6}, + {"R_MIPS_26", Const, 6}, + {"R_MIPS_32", Const, 6}, + {"R_MIPS_64", Const, 6}, + {"R_MIPS_ADD_IMMEDIATE", Const, 6}, + {"R_MIPS_CALL16", Const, 6}, + {"R_MIPS_CALL_HI16", Const, 6}, + {"R_MIPS_CALL_LO16", Const, 6}, + {"R_MIPS_DELETE", Const, 6}, + {"R_MIPS_GOT16", Const, 6}, + {"R_MIPS_GOT_DISP", Const, 6}, + {"R_MIPS_GOT_HI16", Const, 6}, + {"R_MIPS_GOT_LO16", Const, 6}, + {"R_MIPS_GOT_OFST", Const, 6}, + {"R_MIPS_GOT_PAGE", Const, 6}, + {"R_MIPS_GPREL16", Const, 6}, + {"R_MIPS_GPREL32", Const, 6}, + {"R_MIPS_HI16", Const, 6}, + {"R_MIPS_HIGHER", Const, 6}, + {"R_MIPS_HIGHEST", Const, 6}, + {"R_MIPS_INSERT_A", Const, 6}, + {"R_MIPS_INSERT_B", Const, 6}, + {"R_MIPS_JALR", Const, 6}, + {"R_MIPS_LITERAL", Const, 6}, + {"R_MIPS_LO16", Const, 6}, + {"R_MIPS_NONE", Const, 6}, + {"R_MIPS_PC16", Const, 6}, + {"R_MIPS_PC32", Const, 22}, + {"R_MIPS_PJUMP", Const, 6}, + {"R_MIPS_REL16", Const, 6}, + {"R_MIPS_REL32", Const, 6}, + {"R_MIPS_RELGOT", Const, 6}, + {"R_MIPS_SCN_DISP", Const, 6}, + {"R_MIPS_SHIFT5", Const, 6}, + {"R_MIPS_SHIFT6", Const, 6}, + {"R_MIPS_SUB", Const, 6}, + {"R_MIPS_TLS_DTPMOD32", Const, 6}, + {"R_MIPS_TLS_DTPMOD64", Const, 6}, + {"R_MIPS_TLS_DTPREL32", Const, 6}, + {"R_MIPS_TLS_DTPREL64", Const, 6}, + {"R_MIPS_TLS_DTPREL_HI16", Const, 6}, + {"R_MIPS_TLS_DTPREL_LO16", Const, 6}, + {"R_MIPS_TLS_GD", Const, 6}, + {"R_MIPS_TLS_GOTTPREL", Const, 6}, + {"R_MIPS_TLS_LDM", Const, 6}, + {"R_MIPS_TLS_TPREL32", Const, 6}, + {"R_MIPS_TLS_TPREL64", Const, 6}, + {"R_MIPS_TLS_TPREL_HI16", Const, 6}, + {"R_MIPS_TLS_TPREL_LO16", Const, 6}, + {"R_PPC", Type, 0}, + {"R_PPC64", Type, 5}, + {"R_PPC64_ADDR14", Const, 5}, + {"R_PPC64_ADDR14_BRNTAKEN", Const, 5}, + {"R_PPC64_ADDR14_BRTAKEN", Const, 5}, + {"R_PPC64_ADDR16", Const, 5}, + {"R_PPC64_ADDR16_DS", Const, 5}, + {"R_PPC64_ADDR16_HA", Const, 5}, + {"R_PPC64_ADDR16_HI", Const, 5}, + {"R_PPC64_ADDR16_HIGH", Const, 10}, + {"R_PPC64_ADDR16_HIGHA", Const, 10}, + {"R_PPC64_ADDR16_HIGHER", Const, 5}, + {"R_PPC64_ADDR16_HIGHER34", Const, 20}, + {"R_PPC64_ADDR16_HIGHERA", Const, 5}, + {"R_PPC64_ADDR16_HIGHERA34", Const, 20}, + {"R_PPC64_ADDR16_HIGHEST", Const, 5}, + {"R_PPC64_ADDR16_HIGHEST34", Const, 20}, + {"R_PPC64_ADDR16_HIGHESTA", Const, 5}, + {"R_PPC64_ADDR16_HIGHESTA34", Const, 20}, + {"R_PPC64_ADDR16_LO", Const, 5}, + {"R_PPC64_ADDR16_LO_DS", Const, 5}, + {"R_PPC64_ADDR24", Const, 5}, + {"R_PPC64_ADDR32", Const, 5}, + {"R_PPC64_ADDR64", Const, 5}, + {"R_PPC64_ADDR64_LOCAL", Const, 10}, + {"R_PPC64_COPY", Const, 20}, + {"R_PPC64_D28", Const, 20}, + {"R_PPC64_D34", Const, 20}, + {"R_PPC64_D34_HA30", Const, 20}, + {"R_PPC64_D34_HI30", Const, 20}, + {"R_PPC64_D34_LO", Const, 20}, + {"R_PPC64_DTPMOD64", Const, 5}, + {"R_PPC64_DTPREL16", Const, 5}, + {"R_PPC64_DTPREL16_DS", Const, 5}, + {"R_PPC64_DTPREL16_HA", Const, 5}, + {"R_PPC64_DTPREL16_HI", Const, 5}, + {"R_PPC64_DTPREL16_HIGH", Const, 10}, + {"R_PPC64_DTPREL16_HIGHA", Const, 10}, + {"R_PPC64_DTPREL16_HIGHER", Const, 5}, + {"R_PPC64_DTPREL16_HIGHERA", Const, 5}, + {"R_PPC64_DTPREL16_HIGHEST", Const, 5}, + {"R_PPC64_DTPREL16_HIGHESTA", Const, 5}, + {"R_PPC64_DTPREL16_LO", Const, 5}, + {"R_PPC64_DTPREL16_LO_DS", Const, 5}, + {"R_PPC64_DTPREL34", Const, 20}, + {"R_PPC64_DTPREL64", Const, 5}, + {"R_PPC64_ENTRY", Const, 10}, + {"R_PPC64_GLOB_DAT", Const, 20}, + {"R_PPC64_GNU_VTENTRY", Const, 20}, + {"R_PPC64_GNU_VTINHERIT", Const, 20}, + {"R_PPC64_GOT16", Const, 5}, + {"R_PPC64_GOT16_DS", Const, 5}, + {"R_PPC64_GOT16_HA", Const, 5}, + {"R_PPC64_GOT16_HI", Const, 5}, + {"R_PPC64_GOT16_LO", Const, 5}, + {"R_PPC64_GOT16_LO_DS", Const, 5}, + {"R_PPC64_GOT_DTPREL16_DS", Const, 5}, + {"R_PPC64_GOT_DTPREL16_HA", Const, 5}, + {"R_PPC64_GOT_DTPREL16_HI", Const, 5}, + {"R_PPC64_GOT_DTPREL16_LO_DS", Const, 5}, + {"R_PPC64_GOT_DTPREL_PCREL34", Const, 20}, + {"R_PPC64_GOT_PCREL34", Const, 20}, + {"R_PPC64_GOT_TLSGD16", Const, 5}, + {"R_PPC64_GOT_TLSGD16_HA", Const, 5}, + {"R_PPC64_GOT_TLSGD16_HI", Const, 5}, + {"R_PPC64_GOT_TLSGD16_LO", Const, 5}, + {"R_PPC64_GOT_TLSGD_PCREL34", Const, 20}, + {"R_PPC64_GOT_TLSLD16", Const, 5}, + {"R_PPC64_GOT_TLSLD16_HA", Const, 5}, + {"R_PPC64_GOT_TLSLD16_HI", Const, 5}, + {"R_PPC64_GOT_TLSLD16_LO", Const, 5}, + {"R_PPC64_GOT_TLSLD_PCREL34", Const, 20}, + {"R_PPC64_GOT_TPREL16_DS", Const, 5}, + {"R_PPC64_GOT_TPREL16_HA", Const, 5}, + {"R_PPC64_GOT_TPREL16_HI", Const, 5}, + {"R_PPC64_GOT_TPREL16_LO_DS", Const, 5}, + {"R_PPC64_GOT_TPREL_PCREL34", Const, 20}, + {"R_PPC64_IRELATIVE", Const, 10}, + {"R_PPC64_JMP_IREL", Const, 10}, + {"R_PPC64_JMP_SLOT", Const, 5}, + {"R_PPC64_NONE", Const, 5}, + {"R_PPC64_PCREL28", Const, 20}, + {"R_PPC64_PCREL34", Const, 20}, + {"R_PPC64_PCREL_OPT", Const, 20}, + {"R_PPC64_PLT16_HA", Const, 20}, + {"R_PPC64_PLT16_HI", Const, 20}, + {"R_PPC64_PLT16_LO", Const, 20}, + {"R_PPC64_PLT16_LO_DS", Const, 10}, + {"R_PPC64_PLT32", Const, 20}, + {"R_PPC64_PLT64", Const, 20}, + {"R_PPC64_PLTCALL", Const, 20}, + {"R_PPC64_PLTCALL_NOTOC", Const, 20}, + {"R_PPC64_PLTGOT16", Const, 10}, + {"R_PPC64_PLTGOT16_DS", Const, 10}, + {"R_PPC64_PLTGOT16_HA", Const, 10}, + {"R_PPC64_PLTGOT16_HI", Const, 10}, + {"R_PPC64_PLTGOT16_LO", Const, 10}, + {"R_PPC64_PLTGOT_LO_DS", Const, 10}, + {"R_PPC64_PLTREL32", Const, 20}, + {"R_PPC64_PLTREL64", Const, 20}, + {"R_PPC64_PLTSEQ", Const, 20}, + {"R_PPC64_PLTSEQ_NOTOC", Const, 20}, + {"R_PPC64_PLT_PCREL34", Const, 20}, + {"R_PPC64_PLT_PCREL34_NOTOC", Const, 20}, + {"R_PPC64_REL14", Const, 5}, + {"R_PPC64_REL14_BRNTAKEN", Const, 5}, + {"R_PPC64_REL14_BRTAKEN", Const, 5}, + {"R_PPC64_REL16", Const, 5}, + {"R_PPC64_REL16DX_HA", Const, 10}, + {"R_PPC64_REL16_HA", Const, 5}, + {"R_PPC64_REL16_HI", Const, 5}, + {"R_PPC64_REL16_HIGH", Const, 20}, + {"R_PPC64_REL16_HIGHA", Const, 20}, + {"R_PPC64_REL16_HIGHER", Const, 20}, + {"R_PPC64_REL16_HIGHER34", Const, 20}, + {"R_PPC64_REL16_HIGHERA", Const, 20}, + {"R_PPC64_REL16_HIGHERA34", Const, 20}, + {"R_PPC64_REL16_HIGHEST", Const, 20}, + {"R_PPC64_REL16_HIGHEST34", Const, 20}, + {"R_PPC64_REL16_HIGHESTA", Const, 20}, + {"R_PPC64_REL16_HIGHESTA34", Const, 20}, + {"R_PPC64_REL16_LO", Const, 5}, + {"R_PPC64_REL24", Const, 5}, + {"R_PPC64_REL24_NOTOC", Const, 10}, + {"R_PPC64_REL24_P9NOTOC", Const, 21}, + {"R_PPC64_REL30", Const, 20}, + {"R_PPC64_REL32", Const, 5}, + {"R_PPC64_REL64", Const, 5}, + {"R_PPC64_RELATIVE", Const, 18}, + {"R_PPC64_SECTOFF", Const, 20}, + {"R_PPC64_SECTOFF_DS", Const, 10}, + {"R_PPC64_SECTOFF_HA", Const, 20}, + {"R_PPC64_SECTOFF_HI", Const, 20}, + {"R_PPC64_SECTOFF_LO", Const, 20}, + {"R_PPC64_SECTOFF_LO_DS", Const, 10}, + {"R_PPC64_TLS", Const, 5}, + {"R_PPC64_TLSGD", Const, 5}, + {"R_PPC64_TLSLD", Const, 5}, + {"R_PPC64_TOC", Const, 5}, + {"R_PPC64_TOC16", Const, 5}, + {"R_PPC64_TOC16_DS", Const, 5}, + {"R_PPC64_TOC16_HA", Const, 5}, + {"R_PPC64_TOC16_HI", Const, 5}, + {"R_PPC64_TOC16_LO", Const, 5}, + {"R_PPC64_TOC16_LO_DS", Const, 5}, + {"R_PPC64_TOCSAVE", Const, 10}, + {"R_PPC64_TPREL16", Const, 5}, + {"R_PPC64_TPREL16_DS", Const, 5}, + {"R_PPC64_TPREL16_HA", Const, 5}, + {"R_PPC64_TPREL16_HI", Const, 5}, + {"R_PPC64_TPREL16_HIGH", Const, 10}, + {"R_PPC64_TPREL16_HIGHA", Const, 10}, + {"R_PPC64_TPREL16_HIGHER", Const, 5}, + {"R_PPC64_TPREL16_HIGHERA", Const, 5}, + {"R_PPC64_TPREL16_HIGHEST", Const, 5}, + {"R_PPC64_TPREL16_HIGHESTA", Const, 5}, + {"R_PPC64_TPREL16_LO", Const, 5}, + {"R_PPC64_TPREL16_LO_DS", Const, 5}, + {"R_PPC64_TPREL34", Const, 20}, + {"R_PPC64_TPREL64", Const, 5}, + {"R_PPC64_UADDR16", Const, 20}, + {"R_PPC64_UADDR32", Const, 20}, + {"R_PPC64_UADDR64", Const, 20}, + {"R_PPC_ADDR14", Const, 0}, + {"R_PPC_ADDR14_BRNTAKEN", Const, 0}, + {"R_PPC_ADDR14_BRTAKEN", Const, 0}, + {"R_PPC_ADDR16", Const, 0}, + {"R_PPC_ADDR16_HA", Const, 0}, + {"R_PPC_ADDR16_HI", Const, 0}, + {"R_PPC_ADDR16_LO", Const, 0}, + {"R_PPC_ADDR24", Const, 0}, + {"R_PPC_ADDR32", Const, 0}, + {"R_PPC_COPY", Const, 0}, + {"R_PPC_DTPMOD32", Const, 0}, + {"R_PPC_DTPREL16", Const, 0}, + {"R_PPC_DTPREL16_HA", Const, 0}, + {"R_PPC_DTPREL16_HI", Const, 0}, + {"R_PPC_DTPREL16_LO", Const, 0}, + {"R_PPC_DTPREL32", Const, 0}, + {"R_PPC_EMB_BIT_FLD", Const, 0}, + {"R_PPC_EMB_MRKREF", Const, 0}, + {"R_PPC_EMB_NADDR16", Const, 0}, + {"R_PPC_EMB_NADDR16_HA", Const, 0}, + {"R_PPC_EMB_NADDR16_HI", Const, 0}, + {"R_PPC_EMB_NADDR16_LO", Const, 0}, + {"R_PPC_EMB_NADDR32", Const, 0}, + {"R_PPC_EMB_RELSDA", Const, 0}, + {"R_PPC_EMB_RELSEC16", Const, 0}, + {"R_PPC_EMB_RELST_HA", Const, 0}, + {"R_PPC_EMB_RELST_HI", Const, 0}, + {"R_PPC_EMB_RELST_LO", Const, 0}, + {"R_PPC_EMB_SDA21", Const, 0}, + {"R_PPC_EMB_SDA2I16", Const, 0}, + {"R_PPC_EMB_SDA2REL", Const, 0}, + {"R_PPC_EMB_SDAI16", Const, 0}, + {"R_PPC_GLOB_DAT", Const, 0}, + {"R_PPC_GOT16", Const, 0}, + {"R_PPC_GOT16_HA", Const, 0}, + {"R_PPC_GOT16_HI", Const, 0}, + {"R_PPC_GOT16_LO", Const, 0}, + {"R_PPC_GOT_TLSGD16", Const, 0}, + {"R_PPC_GOT_TLSGD16_HA", Const, 0}, + {"R_PPC_GOT_TLSGD16_HI", Const, 0}, + {"R_PPC_GOT_TLSGD16_LO", Const, 0}, + {"R_PPC_GOT_TLSLD16", Const, 0}, + {"R_PPC_GOT_TLSLD16_HA", Const, 0}, + {"R_PPC_GOT_TLSLD16_HI", Const, 0}, + {"R_PPC_GOT_TLSLD16_LO", Const, 0}, + {"R_PPC_GOT_TPREL16", Const, 0}, + {"R_PPC_GOT_TPREL16_HA", Const, 0}, + {"R_PPC_GOT_TPREL16_HI", Const, 0}, + {"R_PPC_GOT_TPREL16_LO", Const, 0}, + {"R_PPC_JMP_SLOT", Const, 0}, + {"R_PPC_LOCAL24PC", Const, 0}, + {"R_PPC_NONE", Const, 0}, + {"R_PPC_PLT16_HA", Const, 0}, + {"R_PPC_PLT16_HI", Const, 0}, + {"R_PPC_PLT16_LO", Const, 0}, + {"R_PPC_PLT32", Const, 0}, + {"R_PPC_PLTREL24", Const, 0}, + {"R_PPC_PLTREL32", Const, 0}, + {"R_PPC_REL14", Const, 0}, + {"R_PPC_REL14_BRNTAKEN", Const, 0}, + {"R_PPC_REL14_BRTAKEN", Const, 0}, + {"R_PPC_REL24", Const, 0}, + {"R_PPC_REL32", Const, 0}, + {"R_PPC_RELATIVE", Const, 0}, + {"R_PPC_SDAREL16", Const, 0}, + {"R_PPC_SECTOFF", Const, 0}, + {"R_PPC_SECTOFF_HA", Const, 0}, + {"R_PPC_SECTOFF_HI", Const, 0}, + {"R_PPC_SECTOFF_LO", Const, 0}, + {"R_PPC_TLS", Const, 0}, + {"R_PPC_TPREL16", Const, 0}, + {"R_PPC_TPREL16_HA", Const, 0}, + {"R_PPC_TPREL16_HI", Const, 0}, + {"R_PPC_TPREL16_LO", Const, 0}, + {"R_PPC_TPREL32", Const, 0}, + {"R_PPC_UADDR16", Const, 0}, + {"R_PPC_UADDR32", Const, 0}, + {"R_RISCV", Type, 11}, + {"R_RISCV_32", Const, 11}, + {"R_RISCV_32_PCREL", Const, 12}, + {"R_RISCV_64", Const, 11}, + {"R_RISCV_ADD16", Const, 11}, + {"R_RISCV_ADD32", Const, 11}, + {"R_RISCV_ADD64", Const, 11}, + {"R_RISCV_ADD8", Const, 11}, + {"R_RISCV_ALIGN", Const, 11}, + {"R_RISCV_BRANCH", Const, 11}, + {"R_RISCV_CALL", Const, 11}, + {"R_RISCV_CALL_PLT", Const, 11}, + {"R_RISCV_COPY", Const, 11}, + {"R_RISCV_GNU_VTENTRY", Const, 11}, + {"R_RISCV_GNU_VTINHERIT", Const, 11}, + {"R_RISCV_GOT_HI20", Const, 11}, + {"R_RISCV_GPREL_I", Const, 11}, + {"R_RISCV_GPREL_S", Const, 11}, + {"R_RISCV_HI20", Const, 11}, + {"R_RISCV_JAL", Const, 11}, + {"R_RISCV_JUMP_SLOT", Const, 11}, + {"R_RISCV_LO12_I", Const, 11}, + {"R_RISCV_LO12_S", Const, 11}, + {"R_RISCV_NONE", Const, 11}, + {"R_RISCV_PCREL_HI20", Const, 11}, + {"R_RISCV_PCREL_LO12_I", Const, 11}, + {"R_RISCV_PCREL_LO12_S", Const, 11}, + {"R_RISCV_RELATIVE", Const, 11}, + {"R_RISCV_RELAX", Const, 11}, + {"R_RISCV_RVC_BRANCH", Const, 11}, + {"R_RISCV_RVC_JUMP", Const, 11}, + {"R_RISCV_RVC_LUI", Const, 11}, + {"R_RISCV_SET16", Const, 11}, + {"R_RISCV_SET32", Const, 11}, + {"R_RISCV_SET6", Const, 11}, + {"R_RISCV_SET8", Const, 11}, + {"R_RISCV_SUB16", Const, 11}, + {"R_RISCV_SUB32", Const, 11}, + {"R_RISCV_SUB6", Const, 11}, + {"R_RISCV_SUB64", Const, 11}, + {"R_RISCV_SUB8", Const, 11}, + {"R_RISCV_TLS_DTPMOD32", Const, 11}, + {"R_RISCV_TLS_DTPMOD64", Const, 11}, + {"R_RISCV_TLS_DTPREL32", Const, 11}, + {"R_RISCV_TLS_DTPREL64", Const, 11}, + {"R_RISCV_TLS_GD_HI20", Const, 11}, + {"R_RISCV_TLS_GOT_HI20", Const, 11}, + {"R_RISCV_TLS_TPREL32", Const, 11}, + {"R_RISCV_TLS_TPREL64", Const, 11}, + {"R_RISCV_TPREL_ADD", Const, 11}, + {"R_RISCV_TPREL_HI20", Const, 11}, + {"R_RISCV_TPREL_I", Const, 11}, + {"R_RISCV_TPREL_LO12_I", Const, 11}, + {"R_RISCV_TPREL_LO12_S", Const, 11}, + {"R_RISCV_TPREL_S", Const, 11}, + {"R_SPARC", Type, 0}, + {"R_SPARC_10", Const, 0}, + {"R_SPARC_11", Const, 0}, + {"R_SPARC_13", Const, 0}, + {"R_SPARC_16", Const, 0}, + {"R_SPARC_22", Const, 0}, + {"R_SPARC_32", Const, 0}, + {"R_SPARC_5", Const, 0}, + {"R_SPARC_6", Const, 0}, + {"R_SPARC_64", Const, 0}, + {"R_SPARC_7", Const, 0}, + {"R_SPARC_8", Const, 0}, + {"R_SPARC_COPY", Const, 0}, + {"R_SPARC_DISP16", Const, 0}, + {"R_SPARC_DISP32", Const, 0}, + {"R_SPARC_DISP64", Const, 0}, + {"R_SPARC_DISP8", Const, 0}, + {"R_SPARC_GLOB_DAT", Const, 0}, + {"R_SPARC_GLOB_JMP", Const, 0}, + {"R_SPARC_GOT10", Const, 0}, + {"R_SPARC_GOT13", Const, 0}, + {"R_SPARC_GOT22", Const, 0}, + {"R_SPARC_H44", Const, 0}, + {"R_SPARC_HH22", Const, 0}, + {"R_SPARC_HI22", Const, 0}, + {"R_SPARC_HIPLT22", Const, 0}, + {"R_SPARC_HIX22", Const, 0}, + {"R_SPARC_HM10", Const, 0}, + {"R_SPARC_JMP_SLOT", Const, 0}, + {"R_SPARC_L44", Const, 0}, + {"R_SPARC_LM22", Const, 0}, + {"R_SPARC_LO10", Const, 0}, + {"R_SPARC_LOPLT10", Const, 0}, + {"R_SPARC_LOX10", Const, 0}, + {"R_SPARC_M44", Const, 0}, + {"R_SPARC_NONE", Const, 0}, + {"R_SPARC_OLO10", Const, 0}, + {"R_SPARC_PC10", Const, 0}, + {"R_SPARC_PC22", Const, 0}, + {"R_SPARC_PCPLT10", Const, 0}, + {"R_SPARC_PCPLT22", Const, 0}, + {"R_SPARC_PCPLT32", Const, 0}, + {"R_SPARC_PC_HH22", Const, 0}, + {"R_SPARC_PC_HM10", Const, 0}, + {"R_SPARC_PC_LM22", Const, 0}, + {"R_SPARC_PLT32", Const, 0}, + {"R_SPARC_PLT64", Const, 0}, + {"R_SPARC_REGISTER", Const, 0}, + {"R_SPARC_RELATIVE", Const, 0}, + {"R_SPARC_UA16", Const, 0}, + {"R_SPARC_UA32", Const, 0}, + {"R_SPARC_UA64", Const, 0}, + {"R_SPARC_WDISP16", Const, 0}, + {"R_SPARC_WDISP19", Const, 0}, + {"R_SPARC_WDISP22", Const, 0}, + {"R_SPARC_WDISP30", Const, 0}, + {"R_SPARC_WPLT30", Const, 0}, + {"R_SYM32", Func, 0}, + {"R_SYM64", Func, 0}, + {"R_TYPE32", Func, 0}, + {"R_TYPE64", Func, 0}, + {"R_X86_64", Type, 0}, + {"R_X86_64_16", Const, 0}, + {"R_X86_64_32", Const, 0}, + {"R_X86_64_32S", Const, 0}, + {"R_X86_64_64", Const, 0}, + {"R_X86_64_8", Const, 0}, + {"R_X86_64_COPY", Const, 0}, + {"R_X86_64_DTPMOD64", Const, 0}, + {"R_X86_64_DTPOFF32", Const, 0}, + {"R_X86_64_DTPOFF64", Const, 0}, + {"R_X86_64_GLOB_DAT", Const, 0}, + {"R_X86_64_GOT32", Const, 0}, + {"R_X86_64_GOT64", Const, 10}, + {"R_X86_64_GOTOFF64", Const, 10}, + {"R_X86_64_GOTPC32", Const, 10}, + {"R_X86_64_GOTPC32_TLSDESC", Const, 10}, + {"R_X86_64_GOTPC64", Const, 10}, + {"R_X86_64_GOTPCREL", Const, 0}, + {"R_X86_64_GOTPCREL64", Const, 10}, + {"R_X86_64_GOTPCRELX", Const, 10}, + {"R_X86_64_GOTPLT64", Const, 10}, + {"R_X86_64_GOTTPOFF", Const, 0}, + {"R_X86_64_IRELATIVE", Const, 10}, + {"R_X86_64_JMP_SLOT", Const, 0}, + {"R_X86_64_NONE", Const, 0}, + {"R_X86_64_PC16", Const, 0}, + {"R_X86_64_PC32", Const, 0}, + {"R_X86_64_PC32_BND", Const, 10}, + {"R_X86_64_PC64", Const, 10}, + {"R_X86_64_PC8", Const, 0}, + {"R_X86_64_PLT32", Const, 0}, + {"R_X86_64_PLT32_BND", Const, 10}, + {"R_X86_64_PLTOFF64", Const, 10}, + {"R_X86_64_RELATIVE", Const, 0}, + {"R_X86_64_RELATIVE64", Const, 10}, + {"R_X86_64_REX_GOTPCRELX", Const, 10}, + {"R_X86_64_SIZE32", Const, 10}, + {"R_X86_64_SIZE64", Const, 10}, + {"R_X86_64_TLSDESC", Const, 10}, + {"R_X86_64_TLSDESC_CALL", Const, 10}, + {"R_X86_64_TLSGD", Const, 0}, + {"R_X86_64_TLSLD", Const, 0}, + {"R_X86_64_TPOFF32", Const, 0}, + {"R_X86_64_TPOFF64", Const, 0}, + {"Rel32", Type, 0}, + {"Rel32.Info", Field, 0}, + {"Rel32.Off", Field, 0}, + {"Rel64", Type, 0}, + {"Rel64.Info", Field, 0}, + {"Rel64.Off", Field, 0}, + {"Rela32", Type, 0}, + {"Rela32.Addend", Field, 0}, + {"Rela32.Info", Field, 0}, + {"Rela32.Off", Field, 0}, + {"Rela64", Type, 0}, + {"Rela64.Addend", Field, 0}, + {"Rela64.Info", Field, 0}, + {"Rela64.Off", Field, 0}, + {"SHF_ALLOC", Const, 0}, + {"SHF_COMPRESSED", Const, 6}, + {"SHF_EXECINSTR", Const, 0}, + {"SHF_GROUP", Const, 0}, + {"SHF_INFO_LINK", Const, 0}, + {"SHF_LINK_ORDER", Const, 0}, + {"SHF_MASKOS", Const, 0}, + {"SHF_MASKPROC", Const, 0}, + {"SHF_MERGE", Const, 0}, + {"SHF_OS_NONCONFORMING", Const, 0}, + {"SHF_STRINGS", Const, 0}, + {"SHF_TLS", Const, 0}, + {"SHF_WRITE", Const, 0}, + {"SHN_ABS", Const, 0}, + {"SHN_COMMON", Const, 0}, + {"SHN_HIOS", Const, 0}, + {"SHN_HIPROC", Const, 0}, + {"SHN_HIRESERVE", Const, 0}, + {"SHN_LOOS", Const, 0}, + {"SHN_LOPROC", Const, 0}, + {"SHN_LORESERVE", Const, 0}, + {"SHN_UNDEF", Const, 0}, + {"SHN_XINDEX", Const, 0}, + {"SHT_DYNAMIC", Const, 0}, + {"SHT_DYNSYM", Const, 0}, + {"SHT_FINI_ARRAY", Const, 0}, + {"SHT_GNU_ATTRIBUTES", Const, 0}, + {"SHT_GNU_HASH", Const, 0}, + {"SHT_GNU_LIBLIST", Const, 0}, + {"SHT_GNU_VERDEF", Const, 0}, + {"SHT_GNU_VERNEED", Const, 0}, + {"SHT_GNU_VERSYM", Const, 0}, + {"SHT_GROUP", Const, 0}, + {"SHT_HASH", Const, 0}, + {"SHT_HIOS", Const, 0}, + {"SHT_HIPROC", Const, 0}, + {"SHT_HIUSER", Const, 0}, + {"SHT_INIT_ARRAY", Const, 0}, + {"SHT_LOOS", Const, 0}, + {"SHT_LOPROC", Const, 0}, + {"SHT_LOUSER", Const, 0}, + {"SHT_MIPS_ABIFLAGS", Const, 17}, + {"SHT_NOBITS", Const, 0}, + {"SHT_NOTE", Const, 0}, + {"SHT_NULL", Const, 0}, + {"SHT_PREINIT_ARRAY", Const, 0}, + {"SHT_PROGBITS", Const, 0}, + {"SHT_REL", Const, 0}, + {"SHT_RELA", Const, 0}, + {"SHT_SHLIB", Const, 0}, + {"SHT_STRTAB", Const, 0}, + {"SHT_SYMTAB", Const, 0}, + {"SHT_SYMTAB_SHNDX", Const, 0}, + {"STB_GLOBAL", Const, 0}, + {"STB_HIOS", Const, 0}, + {"STB_HIPROC", Const, 0}, + {"STB_LOCAL", Const, 0}, + {"STB_LOOS", Const, 0}, + {"STB_LOPROC", Const, 0}, + {"STB_WEAK", Const, 0}, + {"STT_COMMON", Const, 0}, + {"STT_FILE", Const, 0}, + {"STT_FUNC", Const, 0}, + {"STT_GNU_IFUNC", Const, 23}, + {"STT_HIOS", Const, 0}, + {"STT_HIPROC", Const, 0}, + {"STT_LOOS", Const, 0}, + {"STT_LOPROC", Const, 0}, + {"STT_NOTYPE", Const, 0}, + {"STT_OBJECT", Const, 0}, + {"STT_RELC", Const, 23}, + {"STT_SECTION", Const, 0}, + {"STT_SRELC", Const, 23}, + {"STT_TLS", Const, 0}, + {"STV_DEFAULT", Const, 0}, + {"STV_HIDDEN", Const, 0}, + {"STV_INTERNAL", Const, 0}, + {"STV_PROTECTED", Const, 0}, + {"ST_BIND", Func, 0}, + {"ST_INFO", Func, 0}, + {"ST_TYPE", Func, 0}, + {"ST_VISIBILITY", Func, 0}, + {"Section", Type, 0}, + {"Section.ReaderAt", Field, 0}, + {"Section.SectionHeader", Field, 0}, + {"Section32", Type, 0}, + {"Section32.Addr", Field, 0}, + {"Section32.Addralign", Field, 0}, + {"Section32.Entsize", Field, 0}, + {"Section32.Flags", Field, 0}, + {"Section32.Info", Field, 0}, + {"Section32.Link", Field, 0}, + {"Section32.Name", Field, 0}, + {"Section32.Off", Field, 0}, + {"Section32.Size", Field, 0}, + {"Section32.Type", Field, 0}, + {"Section64", Type, 0}, + {"Section64.Addr", Field, 0}, + {"Section64.Addralign", Field, 0}, + {"Section64.Entsize", Field, 0}, + {"Section64.Flags", Field, 0}, + {"Section64.Info", Field, 0}, + {"Section64.Link", Field, 0}, + {"Section64.Name", Field, 0}, + {"Section64.Off", Field, 0}, + {"Section64.Size", Field, 0}, + {"Section64.Type", Field, 0}, + {"SectionFlag", Type, 0}, + {"SectionHeader", Type, 0}, + {"SectionHeader.Addr", Field, 0}, + {"SectionHeader.Addralign", Field, 0}, + {"SectionHeader.Entsize", Field, 0}, + {"SectionHeader.FileSize", Field, 6}, + {"SectionHeader.Flags", Field, 0}, + {"SectionHeader.Info", Field, 0}, + {"SectionHeader.Link", Field, 0}, + {"SectionHeader.Name", Field, 0}, + {"SectionHeader.Offset", Field, 0}, + {"SectionHeader.Size", Field, 0}, + {"SectionHeader.Type", Field, 0}, + {"SectionIndex", Type, 0}, + {"SectionType", Type, 0}, + {"Sym32", Type, 0}, + {"Sym32.Info", Field, 0}, + {"Sym32.Name", Field, 0}, + {"Sym32.Other", Field, 0}, + {"Sym32.Shndx", Field, 0}, + {"Sym32.Size", Field, 0}, + {"Sym32.Value", Field, 0}, + {"Sym32Size", Const, 0}, + {"Sym64", Type, 0}, + {"Sym64.Info", Field, 0}, + {"Sym64.Name", Field, 0}, + {"Sym64.Other", Field, 0}, + {"Sym64.Shndx", Field, 0}, + {"Sym64.Size", Field, 0}, + {"Sym64.Value", Field, 0}, + {"Sym64Size", Const, 0}, + {"SymBind", Type, 0}, + {"SymType", Type, 0}, + {"SymVis", Type, 0}, + {"Symbol", Type, 0}, + {"Symbol.Info", Field, 0}, + {"Symbol.Library", Field, 13}, + {"Symbol.Name", Field, 0}, + {"Symbol.Other", Field, 0}, + {"Symbol.Section", Field, 0}, + {"Symbol.Size", Field, 0}, + {"Symbol.Value", Field, 0}, + {"Symbol.Version", Field, 13}, + {"Type", Type, 0}, + {"Version", Type, 0}, + }, + "debug/gosym": { + {"(*DecodingError).Error", Method, 0}, + {"(*LineTable).LineToPC", Method, 0}, + {"(*LineTable).PCToLine", Method, 0}, + {"(*Sym).BaseName", Method, 0}, + {"(*Sym).PackageName", Method, 0}, + {"(*Sym).ReceiverName", Method, 0}, + {"(*Sym).Static", Method, 0}, + {"(*Table).LineToPC", Method, 0}, + {"(*Table).LookupFunc", Method, 0}, + {"(*Table).LookupSym", Method, 0}, + {"(*Table).PCToFunc", Method, 0}, + {"(*Table).PCToLine", Method, 0}, + {"(*Table).SymByAddr", Method, 0}, + {"(*UnknownLineError).Error", Method, 0}, + {"(Func).BaseName", Method, 0}, + {"(Func).PackageName", Method, 0}, + {"(Func).ReceiverName", Method, 0}, + {"(Func).Static", Method, 0}, + {"(UnknownFileError).Error", Method, 0}, + {"DecodingError", Type, 0}, + {"Func", Type, 0}, + {"Func.End", Field, 0}, + {"Func.Entry", Field, 0}, + {"Func.FrameSize", Field, 0}, + {"Func.LineTable", Field, 0}, + {"Func.Locals", Field, 0}, + {"Func.Obj", Field, 0}, + {"Func.Params", Field, 0}, + {"Func.Sym", Field, 0}, + {"LineTable", Type, 0}, + {"LineTable.Data", Field, 0}, + {"LineTable.Line", Field, 0}, + {"LineTable.PC", Field, 0}, + {"NewLineTable", Func, 0}, + {"NewTable", Func, 0}, + {"Obj", Type, 0}, + {"Obj.Funcs", Field, 0}, + {"Obj.Paths", Field, 0}, + {"Sym", Type, 0}, + {"Sym.Func", Field, 0}, + {"Sym.GoType", Field, 0}, + {"Sym.Name", Field, 0}, + {"Sym.Type", Field, 0}, + {"Sym.Value", Field, 0}, + {"Table", Type, 0}, + {"Table.Files", Field, 0}, + {"Table.Funcs", Field, 0}, + {"Table.Objs", Field, 0}, + {"Table.Syms", Field, 0}, + {"UnknownFileError", Type, 0}, + {"UnknownLineError", Type, 0}, + {"UnknownLineError.File", Field, 0}, + {"UnknownLineError.Line", Field, 0}, + }, + "debug/macho": { + {"(*FatFile).Close", Method, 3}, + {"(*File).Close", Method, 0}, + {"(*File).DWARF", Method, 0}, + {"(*File).ImportedLibraries", Method, 0}, + {"(*File).ImportedSymbols", Method, 0}, + {"(*File).Section", Method, 0}, + {"(*File).Segment", Method, 0}, + {"(*FormatError).Error", Method, 0}, + {"(*Section).Data", Method, 0}, + {"(*Section).Open", Method, 0}, + {"(*Segment).Data", Method, 0}, + {"(*Segment).Open", Method, 0}, + {"(Cpu).GoString", Method, 0}, + {"(Cpu).String", Method, 0}, + {"(Dylib).Raw", Method, 0}, + {"(Dysymtab).Raw", Method, 0}, + {"(FatArch).Close", Method, 3}, + {"(FatArch).DWARF", Method, 3}, + {"(FatArch).ImportedLibraries", Method, 3}, + {"(FatArch).ImportedSymbols", Method, 3}, + {"(FatArch).Section", Method, 3}, + {"(FatArch).Segment", Method, 3}, + {"(LoadBytes).Raw", Method, 0}, + {"(LoadCmd).GoString", Method, 0}, + {"(LoadCmd).String", Method, 0}, + {"(RelocTypeARM).GoString", Method, 10}, + {"(RelocTypeARM).String", Method, 10}, + {"(RelocTypeARM64).GoString", Method, 10}, + {"(RelocTypeARM64).String", Method, 10}, + {"(RelocTypeGeneric).GoString", Method, 10}, + {"(RelocTypeGeneric).String", Method, 10}, + {"(RelocTypeX86_64).GoString", Method, 10}, + {"(RelocTypeX86_64).String", Method, 10}, + {"(Rpath).Raw", Method, 10}, + {"(Section).ReadAt", Method, 0}, + {"(Segment).Raw", Method, 0}, + {"(Segment).ReadAt", Method, 0}, + {"(Symtab).Raw", Method, 0}, + {"(Type).GoString", Method, 10}, + {"(Type).String", Method, 10}, + {"ARM64_RELOC_ADDEND", Const, 10}, + {"ARM64_RELOC_BRANCH26", Const, 10}, + {"ARM64_RELOC_GOT_LOAD_PAGE21", Const, 10}, + {"ARM64_RELOC_GOT_LOAD_PAGEOFF12", Const, 10}, + {"ARM64_RELOC_PAGE21", Const, 10}, + {"ARM64_RELOC_PAGEOFF12", Const, 10}, + {"ARM64_RELOC_POINTER_TO_GOT", Const, 10}, + {"ARM64_RELOC_SUBTRACTOR", Const, 10}, + {"ARM64_RELOC_TLVP_LOAD_PAGE21", Const, 10}, + {"ARM64_RELOC_TLVP_LOAD_PAGEOFF12", Const, 10}, + {"ARM64_RELOC_UNSIGNED", Const, 10}, + {"ARM_RELOC_BR24", Const, 10}, + {"ARM_RELOC_HALF", Const, 10}, + {"ARM_RELOC_HALF_SECTDIFF", Const, 10}, + {"ARM_RELOC_LOCAL_SECTDIFF", Const, 10}, + {"ARM_RELOC_PAIR", Const, 10}, + {"ARM_RELOC_PB_LA_PTR", Const, 10}, + {"ARM_RELOC_SECTDIFF", Const, 10}, + {"ARM_RELOC_VANILLA", Const, 10}, + {"ARM_THUMB_32BIT_BRANCH", Const, 10}, + {"ARM_THUMB_RELOC_BR22", Const, 10}, + {"Cpu", Type, 0}, + {"Cpu386", Const, 0}, + {"CpuAmd64", Const, 0}, + {"CpuArm", Const, 3}, + {"CpuArm64", Const, 11}, + {"CpuPpc", Const, 3}, + {"CpuPpc64", Const, 3}, + {"Dylib", Type, 0}, + {"Dylib.CompatVersion", Field, 0}, + {"Dylib.CurrentVersion", Field, 0}, + {"Dylib.LoadBytes", Field, 0}, + {"Dylib.Name", Field, 0}, + {"Dylib.Time", Field, 0}, + {"DylibCmd", Type, 0}, + {"DylibCmd.Cmd", Field, 0}, + {"DylibCmd.CompatVersion", Field, 0}, + {"DylibCmd.CurrentVersion", Field, 0}, + {"DylibCmd.Len", Field, 0}, + {"DylibCmd.Name", Field, 0}, + {"DylibCmd.Time", Field, 0}, + {"Dysymtab", Type, 0}, + {"Dysymtab.DysymtabCmd", Field, 0}, + {"Dysymtab.IndirectSyms", Field, 0}, + {"Dysymtab.LoadBytes", Field, 0}, + {"DysymtabCmd", Type, 0}, + {"DysymtabCmd.Cmd", Field, 0}, + {"DysymtabCmd.Extrefsymoff", Field, 0}, + {"DysymtabCmd.Extreloff", Field, 0}, + {"DysymtabCmd.Iextdefsym", Field, 0}, + {"DysymtabCmd.Ilocalsym", Field, 0}, + {"DysymtabCmd.Indirectsymoff", Field, 0}, + {"DysymtabCmd.Iundefsym", Field, 0}, + {"DysymtabCmd.Len", Field, 0}, + {"DysymtabCmd.Locreloff", Field, 0}, + {"DysymtabCmd.Modtaboff", Field, 0}, + {"DysymtabCmd.Nextdefsym", Field, 0}, + {"DysymtabCmd.Nextrefsyms", Field, 0}, + {"DysymtabCmd.Nextrel", Field, 0}, + {"DysymtabCmd.Nindirectsyms", Field, 0}, + {"DysymtabCmd.Nlocalsym", Field, 0}, + {"DysymtabCmd.Nlocrel", Field, 0}, + {"DysymtabCmd.Nmodtab", Field, 0}, + {"DysymtabCmd.Ntoc", Field, 0}, + {"DysymtabCmd.Nundefsym", Field, 0}, + {"DysymtabCmd.Tocoffset", Field, 0}, + {"ErrNotFat", Var, 3}, + {"FatArch", Type, 3}, + {"FatArch.FatArchHeader", Field, 3}, + {"FatArch.File", Field, 3}, + {"FatArchHeader", Type, 3}, + {"FatArchHeader.Align", Field, 3}, + {"FatArchHeader.Cpu", Field, 3}, + {"FatArchHeader.Offset", Field, 3}, + {"FatArchHeader.Size", Field, 3}, + {"FatArchHeader.SubCpu", Field, 3}, + {"FatFile", Type, 3}, + {"FatFile.Arches", Field, 3}, + {"FatFile.Magic", Field, 3}, + {"File", Type, 0}, + {"File.ByteOrder", Field, 0}, + {"File.Dysymtab", Field, 0}, + {"File.FileHeader", Field, 0}, + {"File.Loads", Field, 0}, + {"File.Sections", Field, 0}, + {"File.Symtab", Field, 0}, + {"FileHeader", Type, 0}, + {"FileHeader.Cmdsz", Field, 0}, + {"FileHeader.Cpu", Field, 0}, + {"FileHeader.Flags", Field, 0}, + {"FileHeader.Magic", Field, 0}, + {"FileHeader.Ncmd", Field, 0}, + {"FileHeader.SubCpu", Field, 0}, + {"FileHeader.Type", Field, 0}, + {"FlagAllModsBound", Const, 10}, + {"FlagAllowStackExecution", Const, 10}, + {"FlagAppExtensionSafe", Const, 10}, + {"FlagBindAtLoad", Const, 10}, + {"FlagBindsToWeak", Const, 10}, + {"FlagCanonical", Const, 10}, + {"FlagDeadStrippableDylib", Const, 10}, + {"FlagDyldLink", Const, 10}, + {"FlagForceFlat", Const, 10}, + {"FlagHasTLVDescriptors", Const, 10}, + {"FlagIncrLink", Const, 10}, + {"FlagLazyInit", Const, 10}, + {"FlagNoFixPrebinding", Const, 10}, + {"FlagNoHeapExecution", Const, 10}, + {"FlagNoMultiDefs", Const, 10}, + {"FlagNoReexportedDylibs", Const, 10}, + {"FlagNoUndefs", Const, 10}, + {"FlagPIE", Const, 10}, + {"FlagPrebindable", Const, 10}, + {"FlagPrebound", Const, 10}, + {"FlagRootSafe", Const, 10}, + {"FlagSetuidSafe", Const, 10}, + {"FlagSplitSegs", Const, 10}, + {"FlagSubsectionsViaSymbols", Const, 10}, + {"FlagTwoLevel", Const, 10}, + {"FlagWeakDefines", Const, 10}, + {"FormatError", Type, 0}, + {"GENERIC_RELOC_LOCAL_SECTDIFF", Const, 10}, + {"GENERIC_RELOC_PAIR", Const, 10}, + {"GENERIC_RELOC_PB_LA_PTR", Const, 10}, + {"GENERIC_RELOC_SECTDIFF", Const, 10}, + {"GENERIC_RELOC_TLV", Const, 10}, + {"GENERIC_RELOC_VANILLA", Const, 10}, + {"Load", Type, 0}, + {"LoadBytes", Type, 0}, + {"LoadCmd", Type, 0}, + {"LoadCmdDylib", Const, 0}, + {"LoadCmdDylinker", Const, 0}, + {"LoadCmdDysymtab", Const, 0}, + {"LoadCmdRpath", Const, 10}, + {"LoadCmdSegment", Const, 0}, + {"LoadCmdSegment64", Const, 0}, + {"LoadCmdSymtab", Const, 0}, + {"LoadCmdThread", Const, 0}, + {"LoadCmdUnixThread", Const, 0}, + {"Magic32", Const, 0}, + {"Magic64", Const, 0}, + {"MagicFat", Const, 3}, + {"NewFatFile", Func, 3}, + {"NewFile", Func, 0}, + {"Nlist32", Type, 0}, + {"Nlist32.Desc", Field, 0}, + {"Nlist32.Name", Field, 0}, + {"Nlist32.Sect", Field, 0}, + {"Nlist32.Type", Field, 0}, + {"Nlist32.Value", Field, 0}, + {"Nlist64", Type, 0}, + {"Nlist64.Desc", Field, 0}, + {"Nlist64.Name", Field, 0}, + {"Nlist64.Sect", Field, 0}, + {"Nlist64.Type", Field, 0}, + {"Nlist64.Value", Field, 0}, + {"Open", Func, 0}, + {"OpenFat", Func, 3}, + {"Regs386", Type, 0}, + {"Regs386.AX", Field, 0}, + {"Regs386.BP", Field, 0}, + {"Regs386.BX", Field, 0}, + {"Regs386.CS", Field, 0}, + {"Regs386.CX", Field, 0}, + {"Regs386.DI", Field, 0}, + {"Regs386.DS", Field, 0}, + {"Regs386.DX", Field, 0}, + {"Regs386.ES", Field, 0}, + {"Regs386.FLAGS", Field, 0}, + {"Regs386.FS", Field, 0}, + {"Regs386.GS", Field, 0}, + {"Regs386.IP", Field, 0}, + {"Regs386.SI", Field, 0}, + {"Regs386.SP", Field, 0}, + {"Regs386.SS", Field, 0}, + {"RegsAMD64", Type, 0}, + {"RegsAMD64.AX", Field, 0}, + {"RegsAMD64.BP", Field, 0}, + {"RegsAMD64.BX", Field, 0}, + {"RegsAMD64.CS", Field, 0}, + {"RegsAMD64.CX", Field, 0}, + {"RegsAMD64.DI", Field, 0}, + {"RegsAMD64.DX", Field, 0}, + {"RegsAMD64.FLAGS", Field, 0}, + {"RegsAMD64.FS", Field, 0}, + {"RegsAMD64.GS", Field, 0}, + {"RegsAMD64.IP", Field, 0}, + {"RegsAMD64.R10", Field, 0}, + {"RegsAMD64.R11", Field, 0}, + {"RegsAMD64.R12", Field, 0}, + {"RegsAMD64.R13", Field, 0}, + {"RegsAMD64.R14", Field, 0}, + {"RegsAMD64.R15", Field, 0}, + {"RegsAMD64.R8", Field, 0}, + {"RegsAMD64.R9", Field, 0}, + {"RegsAMD64.SI", Field, 0}, + {"RegsAMD64.SP", Field, 0}, + {"Reloc", Type, 10}, + {"Reloc.Addr", Field, 10}, + {"Reloc.Extern", Field, 10}, + {"Reloc.Len", Field, 10}, + {"Reloc.Pcrel", Field, 10}, + {"Reloc.Scattered", Field, 10}, + {"Reloc.Type", Field, 10}, + {"Reloc.Value", Field, 10}, + {"RelocTypeARM", Type, 10}, + {"RelocTypeARM64", Type, 10}, + {"RelocTypeGeneric", Type, 10}, + {"RelocTypeX86_64", Type, 10}, + {"Rpath", Type, 10}, + {"Rpath.LoadBytes", Field, 10}, + {"Rpath.Path", Field, 10}, + {"RpathCmd", Type, 10}, + {"RpathCmd.Cmd", Field, 10}, + {"RpathCmd.Len", Field, 10}, + {"RpathCmd.Path", Field, 10}, + {"Section", Type, 0}, + {"Section.ReaderAt", Field, 0}, + {"Section.Relocs", Field, 10}, + {"Section.SectionHeader", Field, 0}, + {"Section32", Type, 0}, + {"Section32.Addr", Field, 0}, + {"Section32.Align", Field, 0}, + {"Section32.Flags", Field, 0}, + {"Section32.Name", Field, 0}, + {"Section32.Nreloc", Field, 0}, + {"Section32.Offset", Field, 0}, + {"Section32.Reloff", Field, 0}, + {"Section32.Reserve1", Field, 0}, + {"Section32.Reserve2", Field, 0}, + {"Section32.Seg", Field, 0}, + {"Section32.Size", Field, 0}, + {"Section64", Type, 0}, + {"Section64.Addr", Field, 0}, + {"Section64.Align", Field, 0}, + {"Section64.Flags", Field, 0}, + {"Section64.Name", Field, 0}, + {"Section64.Nreloc", Field, 0}, + {"Section64.Offset", Field, 0}, + {"Section64.Reloff", Field, 0}, + {"Section64.Reserve1", Field, 0}, + {"Section64.Reserve2", Field, 0}, + {"Section64.Reserve3", Field, 0}, + {"Section64.Seg", Field, 0}, + {"Section64.Size", Field, 0}, + {"SectionHeader", Type, 0}, + {"SectionHeader.Addr", Field, 0}, + {"SectionHeader.Align", Field, 0}, + {"SectionHeader.Flags", Field, 0}, + {"SectionHeader.Name", Field, 0}, + {"SectionHeader.Nreloc", Field, 0}, + {"SectionHeader.Offset", Field, 0}, + {"SectionHeader.Reloff", Field, 0}, + {"SectionHeader.Seg", Field, 0}, + {"SectionHeader.Size", Field, 0}, + {"Segment", Type, 0}, + {"Segment.LoadBytes", Field, 0}, + {"Segment.ReaderAt", Field, 0}, + {"Segment.SegmentHeader", Field, 0}, + {"Segment32", Type, 0}, + {"Segment32.Addr", Field, 0}, + {"Segment32.Cmd", Field, 0}, + {"Segment32.Filesz", Field, 0}, + {"Segment32.Flag", Field, 0}, + {"Segment32.Len", Field, 0}, + {"Segment32.Maxprot", Field, 0}, + {"Segment32.Memsz", Field, 0}, + {"Segment32.Name", Field, 0}, + {"Segment32.Nsect", Field, 0}, + {"Segment32.Offset", Field, 0}, + {"Segment32.Prot", Field, 0}, + {"Segment64", Type, 0}, + {"Segment64.Addr", Field, 0}, + {"Segment64.Cmd", Field, 0}, + {"Segment64.Filesz", Field, 0}, + {"Segment64.Flag", Field, 0}, + {"Segment64.Len", Field, 0}, + {"Segment64.Maxprot", Field, 0}, + {"Segment64.Memsz", Field, 0}, + {"Segment64.Name", Field, 0}, + {"Segment64.Nsect", Field, 0}, + {"Segment64.Offset", Field, 0}, + {"Segment64.Prot", Field, 0}, + {"SegmentHeader", Type, 0}, + {"SegmentHeader.Addr", Field, 0}, + {"SegmentHeader.Cmd", Field, 0}, + {"SegmentHeader.Filesz", Field, 0}, + {"SegmentHeader.Flag", Field, 0}, + {"SegmentHeader.Len", Field, 0}, + {"SegmentHeader.Maxprot", Field, 0}, + {"SegmentHeader.Memsz", Field, 0}, + {"SegmentHeader.Name", Field, 0}, + {"SegmentHeader.Nsect", Field, 0}, + {"SegmentHeader.Offset", Field, 0}, + {"SegmentHeader.Prot", Field, 0}, + {"Symbol", Type, 0}, + {"Symbol.Desc", Field, 0}, + {"Symbol.Name", Field, 0}, + {"Symbol.Sect", Field, 0}, + {"Symbol.Type", Field, 0}, + {"Symbol.Value", Field, 0}, + {"Symtab", Type, 0}, + {"Symtab.LoadBytes", Field, 0}, + {"Symtab.Syms", Field, 0}, + {"Symtab.SymtabCmd", Field, 0}, + {"SymtabCmd", Type, 0}, + {"SymtabCmd.Cmd", Field, 0}, + {"SymtabCmd.Len", Field, 0}, + {"SymtabCmd.Nsyms", Field, 0}, + {"SymtabCmd.Stroff", Field, 0}, + {"SymtabCmd.Strsize", Field, 0}, + {"SymtabCmd.Symoff", Field, 0}, + {"Thread", Type, 0}, + {"Thread.Cmd", Field, 0}, + {"Thread.Data", Field, 0}, + {"Thread.Len", Field, 0}, + {"Thread.Type", Field, 0}, + {"Type", Type, 0}, + {"TypeBundle", Const, 3}, + {"TypeDylib", Const, 3}, + {"TypeExec", Const, 0}, + {"TypeObj", Const, 0}, + {"X86_64_RELOC_BRANCH", Const, 10}, + {"X86_64_RELOC_GOT", Const, 10}, + {"X86_64_RELOC_GOT_LOAD", Const, 10}, + {"X86_64_RELOC_SIGNED", Const, 10}, + {"X86_64_RELOC_SIGNED_1", Const, 10}, + {"X86_64_RELOC_SIGNED_2", Const, 10}, + {"X86_64_RELOC_SIGNED_4", Const, 10}, + {"X86_64_RELOC_SUBTRACTOR", Const, 10}, + {"X86_64_RELOC_TLV", Const, 10}, + {"X86_64_RELOC_UNSIGNED", Const, 10}, + }, + "debug/pe": { + {"(*COFFSymbol).FullName", Method, 8}, + {"(*File).COFFSymbolReadSectionDefAux", Method, 19}, + {"(*File).Close", Method, 0}, + {"(*File).DWARF", Method, 0}, + {"(*File).ImportedLibraries", Method, 0}, + {"(*File).ImportedSymbols", Method, 0}, + {"(*File).Section", Method, 0}, + {"(*FormatError).Error", Method, 0}, + {"(*Section).Data", Method, 0}, + {"(*Section).Open", Method, 0}, + {"(Section).ReadAt", Method, 0}, + {"(StringTable).String", Method, 8}, + {"COFFSymbol", Type, 1}, + {"COFFSymbol.Name", Field, 1}, + {"COFFSymbol.NumberOfAuxSymbols", Field, 1}, + {"COFFSymbol.SectionNumber", Field, 1}, + {"COFFSymbol.StorageClass", Field, 1}, + {"COFFSymbol.Type", Field, 1}, + {"COFFSymbol.Value", Field, 1}, + {"COFFSymbolAuxFormat5", Type, 19}, + {"COFFSymbolAuxFormat5.Checksum", Field, 19}, + {"COFFSymbolAuxFormat5.NumLineNumbers", Field, 19}, + {"COFFSymbolAuxFormat5.NumRelocs", Field, 19}, + {"COFFSymbolAuxFormat5.SecNum", Field, 19}, + {"COFFSymbolAuxFormat5.Selection", Field, 19}, + {"COFFSymbolAuxFormat5.Size", Field, 19}, + {"COFFSymbolSize", Const, 1}, + {"DataDirectory", Type, 3}, + {"DataDirectory.Size", Field, 3}, + {"DataDirectory.VirtualAddress", Field, 3}, + {"File", Type, 0}, + {"File.COFFSymbols", Field, 8}, + {"File.FileHeader", Field, 0}, + {"File.OptionalHeader", Field, 3}, + {"File.Sections", Field, 0}, + {"File.StringTable", Field, 8}, + {"File.Symbols", Field, 1}, + {"FileHeader", Type, 0}, + {"FileHeader.Characteristics", Field, 0}, + {"FileHeader.Machine", Field, 0}, + {"FileHeader.NumberOfSections", Field, 0}, + {"FileHeader.NumberOfSymbols", Field, 0}, + {"FileHeader.PointerToSymbolTable", Field, 0}, + {"FileHeader.SizeOfOptionalHeader", Field, 0}, + {"FileHeader.TimeDateStamp", Field, 0}, + {"FormatError", Type, 0}, + {"IMAGE_COMDAT_SELECT_ANY", Const, 19}, + {"IMAGE_COMDAT_SELECT_ASSOCIATIVE", Const, 19}, + {"IMAGE_COMDAT_SELECT_EXACT_MATCH", Const, 19}, + {"IMAGE_COMDAT_SELECT_LARGEST", Const, 19}, + {"IMAGE_COMDAT_SELECT_NODUPLICATES", Const, 19}, + {"IMAGE_COMDAT_SELECT_SAME_SIZE", Const, 19}, + {"IMAGE_DIRECTORY_ENTRY_ARCHITECTURE", Const, 11}, + {"IMAGE_DIRECTORY_ENTRY_BASERELOC", Const, 11}, + {"IMAGE_DIRECTORY_ENTRY_BOUND_IMPORT", Const, 11}, + {"IMAGE_DIRECTORY_ENTRY_COM_DESCRIPTOR", Const, 11}, + {"IMAGE_DIRECTORY_ENTRY_DEBUG", Const, 11}, + {"IMAGE_DIRECTORY_ENTRY_DELAY_IMPORT", Const, 11}, + {"IMAGE_DIRECTORY_ENTRY_EXCEPTION", Const, 11}, + {"IMAGE_DIRECTORY_ENTRY_EXPORT", Const, 11}, + {"IMAGE_DIRECTORY_ENTRY_GLOBALPTR", Const, 11}, + {"IMAGE_DIRECTORY_ENTRY_IAT", Const, 11}, + {"IMAGE_DIRECTORY_ENTRY_IMPORT", Const, 11}, + {"IMAGE_DIRECTORY_ENTRY_LOAD_CONFIG", Const, 11}, + {"IMAGE_DIRECTORY_ENTRY_RESOURCE", Const, 11}, + {"IMAGE_DIRECTORY_ENTRY_SECURITY", Const, 11}, + {"IMAGE_DIRECTORY_ENTRY_TLS", Const, 11}, + {"IMAGE_DLLCHARACTERISTICS_APPCONTAINER", Const, 15}, + {"IMAGE_DLLCHARACTERISTICS_DYNAMIC_BASE", Const, 15}, + {"IMAGE_DLLCHARACTERISTICS_FORCE_INTEGRITY", Const, 15}, + {"IMAGE_DLLCHARACTERISTICS_GUARD_CF", Const, 15}, + {"IMAGE_DLLCHARACTERISTICS_HIGH_ENTROPY_VA", Const, 15}, + {"IMAGE_DLLCHARACTERISTICS_NO_BIND", Const, 15}, + {"IMAGE_DLLCHARACTERISTICS_NO_ISOLATION", Const, 15}, + {"IMAGE_DLLCHARACTERISTICS_NO_SEH", Const, 15}, + {"IMAGE_DLLCHARACTERISTICS_NX_COMPAT", Const, 15}, + {"IMAGE_DLLCHARACTERISTICS_TERMINAL_SERVER_AWARE", Const, 15}, + {"IMAGE_DLLCHARACTERISTICS_WDM_DRIVER", Const, 15}, + {"IMAGE_FILE_32BIT_MACHINE", Const, 15}, + {"IMAGE_FILE_AGGRESIVE_WS_TRIM", Const, 15}, + {"IMAGE_FILE_BYTES_REVERSED_HI", Const, 15}, + {"IMAGE_FILE_BYTES_REVERSED_LO", Const, 15}, + {"IMAGE_FILE_DEBUG_STRIPPED", Const, 15}, + {"IMAGE_FILE_DLL", Const, 15}, + {"IMAGE_FILE_EXECUTABLE_IMAGE", Const, 15}, + {"IMAGE_FILE_LARGE_ADDRESS_AWARE", Const, 15}, + {"IMAGE_FILE_LINE_NUMS_STRIPPED", Const, 15}, + {"IMAGE_FILE_LOCAL_SYMS_STRIPPED", Const, 15}, + {"IMAGE_FILE_MACHINE_AM33", Const, 0}, + {"IMAGE_FILE_MACHINE_AMD64", Const, 0}, + {"IMAGE_FILE_MACHINE_ARM", Const, 0}, + {"IMAGE_FILE_MACHINE_ARM64", Const, 11}, + {"IMAGE_FILE_MACHINE_ARMNT", Const, 12}, + {"IMAGE_FILE_MACHINE_EBC", Const, 0}, + {"IMAGE_FILE_MACHINE_I386", Const, 0}, + {"IMAGE_FILE_MACHINE_IA64", Const, 0}, + {"IMAGE_FILE_MACHINE_LOONGARCH32", Const, 19}, + {"IMAGE_FILE_MACHINE_LOONGARCH64", Const, 19}, + {"IMAGE_FILE_MACHINE_M32R", Const, 0}, + {"IMAGE_FILE_MACHINE_MIPS16", Const, 0}, + {"IMAGE_FILE_MACHINE_MIPSFPU", Const, 0}, + {"IMAGE_FILE_MACHINE_MIPSFPU16", Const, 0}, + {"IMAGE_FILE_MACHINE_POWERPC", Const, 0}, + {"IMAGE_FILE_MACHINE_POWERPCFP", Const, 0}, + {"IMAGE_FILE_MACHINE_R4000", Const, 0}, + {"IMAGE_FILE_MACHINE_RISCV128", Const, 20}, + {"IMAGE_FILE_MACHINE_RISCV32", Const, 20}, + {"IMAGE_FILE_MACHINE_RISCV64", Const, 20}, + {"IMAGE_FILE_MACHINE_SH3", Const, 0}, + {"IMAGE_FILE_MACHINE_SH3DSP", Const, 0}, + {"IMAGE_FILE_MACHINE_SH4", Const, 0}, + {"IMAGE_FILE_MACHINE_SH5", Const, 0}, + {"IMAGE_FILE_MACHINE_THUMB", Const, 0}, + {"IMAGE_FILE_MACHINE_UNKNOWN", Const, 0}, + {"IMAGE_FILE_MACHINE_WCEMIPSV2", Const, 0}, + {"IMAGE_FILE_NET_RUN_FROM_SWAP", Const, 15}, + {"IMAGE_FILE_RELOCS_STRIPPED", Const, 15}, + {"IMAGE_FILE_REMOVABLE_RUN_FROM_SWAP", Const, 15}, + {"IMAGE_FILE_SYSTEM", Const, 15}, + {"IMAGE_FILE_UP_SYSTEM_ONLY", Const, 15}, + {"IMAGE_SCN_CNT_CODE", Const, 19}, + {"IMAGE_SCN_CNT_INITIALIZED_DATA", Const, 19}, + {"IMAGE_SCN_CNT_UNINITIALIZED_DATA", Const, 19}, + {"IMAGE_SCN_LNK_COMDAT", Const, 19}, + {"IMAGE_SCN_MEM_DISCARDABLE", Const, 19}, + {"IMAGE_SCN_MEM_EXECUTE", Const, 19}, + {"IMAGE_SCN_MEM_READ", Const, 19}, + {"IMAGE_SCN_MEM_WRITE", Const, 19}, + {"IMAGE_SUBSYSTEM_EFI_APPLICATION", Const, 15}, + {"IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER", Const, 15}, + {"IMAGE_SUBSYSTEM_EFI_ROM", Const, 15}, + {"IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER", Const, 15}, + {"IMAGE_SUBSYSTEM_NATIVE", Const, 15}, + {"IMAGE_SUBSYSTEM_NATIVE_WINDOWS", Const, 15}, + {"IMAGE_SUBSYSTEM_OS2_CUI", Const, 15}, + {"IMAGE_SUBSYSTEM_POSIX_CUI", Const, 15}, + {"IMAGE_SUBSYSTEM_UNKNOWN", Const, 15}, + {"IMAGE_SUBSYSTEM_WINDOWS_BOOT_APPLICATION", Const, 15}, + {"IMAGE_SUBSYSTEM_WINDOWS_CE_GUI", Const, 15}, + {"IMAGE_SUBSYSTEM_WINDOWS_CUI", Const, 15}, + {"IMAGE_SUBSYSTEM_WINDOWS_GUI", Const, 15}, + {"IMAGE_SUBSYSTEM_XBOX", Const, 15}, + {"ImportDirectory", Type, 0}, + {"ImportDirectory.FirstThunk", Field, 0}, + {"ImportDirectory.ForwarderChain", Field, 0}, + {"ImportDirectory.Name", Field, 0}, + {"ImportDirectory.OriginalFirstThunk", Field, 0}, + {"ImportDirectory.TimeDateStamp", Field, 0}, + {"NewFile", Func, 0}, + {"Open", Func, 0}, + {"OptionalHeader32", Type, 3}, + {"OptionalHeader32.AddressOfEntryPoint", Field, 3}, + {"OptionalHeader32.BaseOfCode", Field, 3}, + {"OptionalHeader32.BaseOfData", Field, 3}, + {"OptionalHeader32.CheckSum", Field, 3}, + {"OptionalHeader32.DataDirectory", Field, 3}, + {"OptionalHeader32.DllCharacteristics", Field, 3}, + {"OptionalHeader32.FileAlignment", Field, 3}, + {"OptionalHeader32.ImageBase", Field, 3}, + {"OptionalHeader32.LoaderFlags", Field, 3}, + {"OptionalHeader32.Magic", Field, 3}, + {"OptionalHeader32.MajorImageVersion", Field, 3}, + {"OptionalHeader32.MajorLinkerVersion", Field, 3}, + {"OptionalHeader32.MajorOperatingSystemVersion", Field, 3}, + {"OptionalHeader32.MajorSubsystemVersion", Field, 3}, + {"OptionalHeader32.MinorImageVersion", Field, 3}, + {"OptionalHeader32.MinorLinkerVersion", Field, 3}, + {"OptionalHeader32.MinorOperatingSystemVersion", Field, 3}, + {"OptionalHeader32.MinorSubsystemVersion", Field, 3}, + {"OptionalHeader32.NumberOfRvaAndSizes", Field, 3}, + {"OptionalHeader32.SectionAlignment", Field, 3}, + {"OptionalHeader32.SizeOfCode", Field, 3}, + {"OptionalHeader32.SizeOfHeaders", Field, 3}, + {"OptionalHeader32.SizeOfHeapCommit", Field, 3}, + {"OptionalHeader32.SizeOfHeapReserve", Field, 3}, + {"OptionalHeader32.SizeOfImage", Field, 3}, + {"OptionalHeader32.SizeOfInitializedData", Field, 3}, + {"OptionalHeader32.SizeOfStackCommit", Field, 3}, + {"OptionalHeader32.SizeOfStackReserve", Field, 3}, + {"OptionalHeader32.SizeOfUninitializedData", Field, 3}, + {"OptionalHeader32.Subsystem", Field, 3}, + {"OptionalHeader32.Win32VersionValue", Field, 3}, + {"OptionalHeader64", Type, 3}, + {"OptionalHeader64.AddressOfEntryPoint", Field, 3}, + {"OptionalHeader64.BaseOfCode", Field, 3}, + {"OptionalHeader64.CheckSum", Field, 3}, + {"OptionalHeader64.DataDirectory", Field, 3}, + {"OptionalHeader64.DllCharacteristics", Field, 3}, + {"OptionalHeader64.FileAlignment", Field, 3}, + {"OptionalHeader64.ImageBase", Field, 3}, + {"OptionalHeader64.LoaderFlags", Field, 3}, + {"OptionalHeader64.Magic", Field, 3}, + {"OptionalHeader64.MajorImageVersion", Field, 3}, + {"OptionalHeader64.MajorLinkerVersion", Field, 3}, + {"OptionalHeader64.MajorOperatingSystemVersion", Field, 3}, + {"OptionalHeader64.MajorSubsystemVersion", Field, 3}, + {"OptionalHeader64.MinorImageVersion", Field, 3}, + {"OptionalHeader64.MinorLinkerVersion", Field, 3}, + {"OptionalHeader64.MinorOperatingSystemVersion", Field, 3}, + {"OptionalHeader64.MinorSubsystemVersion", Field, 3}, + {"OptionalHeader64.NumberOfRvaAndSizes", Field, 3}, + {"OptionalHeader64.SectionAlignment", Field, 3}, + {"OptionalHeader64.SizeOfCode", Field, 3}, + {"OptionalHeader64.SizeOfHeaders", Field, 3}, + {"OptionalHeader64.SizeOfHeapCommit", Field, 3}, + {"OptionalHeader64.SizeOfHeapReserve", Field, 3}, + {"OptionalHeader64.SizeOfImage", Field, 3}, + {"OptionalHeader64.SizeOfInitializedData", Field, 3}, + {"OptionalHeader64.SizeOfStackCommit", Field, 3}, + {"OptionalHeader64.SizeOfStackReserve", Field, 3}, + {"OptionalHeader64.SizeOfUninitializedData", Field, 3}, + {"OptionalHeader64.Subsystem", Field, 3}, + {"OptionalHeader64.Win32VersionValue", Field, 3}, + {"Reloc", Type, 8}, + {"Reloc.SymbolTableIndex", Field, 8}, + {"Reloc.Type", Field, 8}, + {"Reloc.VirtualAddress", Field, 8}, + {"Section", Type, 0}, + {"Section.ReaderAt", Field, 0}, + {"Section.Relocs", Field, 8}, + {"Section.SectionHeader", Field, 0}, + {"SectionHeader", Type, 0}, + {"SectionHeader.Characteristics", Field, 0}, + {"SectionHeader.Name", Field, 0}, + {"SectionHeader.NumberOfLineNumbers", Field, 0}, + {"SectionHeader.NumberOfRelocations", Field, 0}, + {"SectionHeader.Offset", Field, 0}, + {"SectionHeader.PointerToLineNumbers", Field, 0}, + {"SectionHeader.PointerToRelocations", Field, 0}, + {"SectionHeader.Size", Field, 0}, + {"SectionHeader.VirtualAddress", Field, 0}, + {"SectionHeader.VirtualSize", Field, 0}, + {"SectionHeader32", Type, 0}, + {"SectionHeader32.Characteristics", Field, 0}, + {"SectionHeader32.Name", Field, 0}, + {"SectionHeader32.NumberOfLineNumbers", Field, 0}, + {"SectionHeader32.NumberOfRelocations", Field, 0}, + {"SectionHeader32.PointerToLineNumbers", Field, 0}, + {"SectionHeader32.PointerToRawData", Field, 0}, + {"SectionHeader32.PointerToRelocations", Field, 0}, + {"SectionHeader32.SizeOfRawData", Field, 0}, + {"SectionHeader32.VirtualAddress", Field, 0}, + {"SectionHeader32.VirtualSize", Field, 0}, + {"StringTable", Type, 8}, + {"Symbol", Type, 1}, + {"Symbol.Name", Field, 1}, + {"Symbol.SectionNumber", Field, 1}, + {"Symbol.StorageClass", Field, 1}, + {"Symbol.Type", Field, 1}, + {"Symbol.Value", Field, 1}, + }, + "debug/plan9obj": { + {"(*File).Close", Method, 3}, + {"(*File).Section", Method, 3}, + {"(*File).Symbols", Method, 3}, + {"(*Section).Data", Method, 3}, + {"(*Section).Open", Method, 3}, + {"(Section).ReadAt", Method, 3}, + {"ErrNoSymbols", Var, 18}, + {"File", Type, 3}, + {"File.FileHeader", Field, 3}, + {"File.Sections", Field, 3}, + {"FileHeader", Type, 3}, + {"FileHeader.Bss", Field, 3}, + {"FileHeader.Entry", Field, 3}, + {"FileHeader.HdrSize", Field, 4}, + {"FileHeader.LoadAddress", Field, 4}, + {"FileHeader.Magic", Field, 3}, + {"FileHeader.PtrSize", Field, 3}, + {"Magic386", Const, 3}, + {"Magic64", Const, 3}, + {"MagicAMD64", Const, 3}, + {"MagicARM", Const, 3}, + {"NewFile", Func, 3}, + {"Open", Func, 3}, + {"Section", Type, 3}, + {"Section.ReaderAt", Field, 3}, + {"Section.SectionHeader", Field, 3}, + {"SectionHeader", Type, 3}, + {"SectionHeader.Name", Field, 3}, + {"SectionHeader.Offset", Field, 3}, + {"SectionHeader.Size", Field, 3}, + {"Sym", Type, 3}, + {"Sym.Name", Field, 3}, + {"Sym.Type", Field, 3}, + {"Sym.Value", Field, 3}, + }, + "embed": { + {"(FS).Open", Method, 16}, + {"(FS).ReadDir", Method, 16}, + {"(FS).ReadFile", Method, 16}, + {"FS", Type, 16}, + }, + "encoding": { + {"BinaryMarshaler", Type, 2}, + {"BinaryUnmarshaler", Type, 2}, + {"TextMarshaler", Type, 2}, + {"TextUnmarshaler", Type, 2}, + }, + "encoding/ascii85": { + {"(CorruptInputError).Error", Method, 0}, + {"CorruptInputError", Type, 0}, + {"Decode", Func, 0}, + {"Encode", Func, 0}, + {"MaxEncodedLen", Func, 0}, + {"NewDecoder", Func, 0}, + {"NewEncoder", Func, 0}, + }, + "encoding/asn1": { + {"(BitString).At", Method, 0}, + {"(BitString).RightAlign", Method, 0}, + {"(ObjectIdentifier).Equal", Method, 0}, + {"(ObjectIdentifier).String", Method, 3}, + {"(StructuralError).Error", Method, 0}, + {"(SyntaxError).Error", Method, 0}, + {"BitString", Type, 0}, + {"BitString.BitLength", Field, 0}, + {"BitString.Bytes", Field, 0}, + {"ClassApplication", Const, 6}, + {"ClassContextSpecific", Const, 6}, + {"ClassPrivate", Const, 6}, + {"ClassUniversal", Const, 6}, + {"Enumerated", Type, 0}, + {"Flag", Type, 0}, + {"Marshal", Func, 0}, + {"MarshalWithParams", Func, 10}, + {"NullBytes", Var, 9}, + {"NullRawValue", Var, 9}, + {"ObjectIdentifier", Type, 0}, + {"RawContent", Type, 0}, + {"RawValue", Type, 0}, + {"RawValue.Bytes", Field, 0}, + {"RawValue.Class", Field, 0}, + {"RawValue.FullBytes", Field, 0}, + {"RawValue.IsCompound", Field, 0}, + {"RawValue.Tag", Field, 0}, + {"StructuralError", Type, 0}, + {"StructuralError.Msg", Field, 0}, + {"SyntaxError", Type, 0}, + {"SyntaxError.Msg", Field, 0}, + {"TagBMPString", Const, 14}, + {"TagBitString", Const, 6}, + {"TagBoolean", Const, 6}, + {"TagEnum", Const, 6}, + {"TagGeneralString", Const, 6}, + {"TagGeneralizedTime", Const, 6}, + {"TagIA5String", Const, 6}, + {"TagInteger", Const, 6}, + {"TagNull", Const, 9}, + {"TagNumericString", Const, 10}, + {"TagOID", Const, 6}, + {"TagOctetString", Const, 6}, + {"TagPrintableString", Const, 6}, + {"TagSequence", Const, 6}, + {"TagSet", Const, 6}, + {"TagT61String", Const, 6}, + {"TagUTCTime", Const, 6}, + {"TagUTF8String", Const, 6}, + {"Unmarshal", Func, 0}, + {"UnmarshalWithParams", Func, 0}, + }, + "encoding/base32": { + {"(*Encoding).AppendDecode", Method, 22}, + {"(*Encoding).AppendEncode", Method, 22}, + {"(*Encoding).Decode", Method, 0}, + {"(*Encoding).DecodeString", Method, 0}, + {"(*Encoding).DecodedLen", Method, 0}, + {"(*Encoding).Encode", Method, 0}, + {"(*Encoding).EncodeToString", Method, 0}, + {"(*Encoding).EncodedLen", Method, 0}, + {"(CorruptInputError).Error", Method, 0}, + {"(Encoding).WithPadding", Method, 9}, + {"CorruptInputError", Type, 0}, + {"Encoding", Type, 0}, + {"HexEncoding", Var, 0}, + {"NewDecoder", Func, 0}, + {"NewEncoder", Func, 0}, + {"NewEncoding", Func, 0}, + {"NoPadding", Const, 9}, + {"StdEncoding", Var, 0}, + {"StdPadding", Const, 9}, + }, + "encoding/base64": { + {"(*Encoding).AppendDecode", Method, 22}, + {"(*Encoding).AppendEncode", Method, 22}, + {"(*Encoding).Decode", Method, 0}, + {"(*Encoding).DecodeString", Method, 0}, + {"(*Encoding).DecodedLen", Method, 0}, + {"(*Encoding).Encode", Method, 0}, + {"(*Encoding).EncodeToString", Method, 0}, + {"(*Encoding).EncodedLen", Method, 0}, + {"(CorruptInputError).Error", Method, 0}, + {"(Encoding).Strict", Method, 8}, + {"(Encoding).WithPadding", Method, 5}, + {"CorruptInputError", Type, 0}, + {"Encoding", Type, 0}, + {"NewDecoder", Func, 0}, + {"NewEncoder", Func, 0}, + {"NewEncoding", Func, 0}, + {"NoPadding", Const, 5}, + {"RawStdEncoding", Var, 5}, + {"RawURLEncoding", Var, 5}, + {"StdEncoding", Var, 0}, + {"StdPadding", Const, 5}, + {"URLEncoding", Var, 0}, + }, + "encoding/binary": { + {"Append", Func, 23}, + {"AppendByteOrder", Type, 19}, + {"AppendUvarint", Func, 19}, + {"AppendVarint", Func, 19}, + {"BigEndian", Var, 0}, + {"ByteOrder", Type, 0}, + {"Decode", Func, 23}, + {"Encode", Func, 23}, + {"LittleEndian", Var, 0}, + {"MaxVarintLen16", Const, 0}, + {"MaxVarintLen32", Const, 0}, + {"MaxVarintLen64", Const, 0}, + {"NativeEndian", Var, 21}, + {"PutUvarint", Func, 0}, + {"PutVarint", Func, 0}, + {"Read", Func, 0}, + {"ReadUvarint", Func, 0}, + {"ReadVarint", Func, 0}, + {"Size", Func, 0}, + {"Uvarint", Func, 0}, + {"Varint", Func, 0}, + {"Write", Func, 0}, + }, + "encoding/csv": { + {"(*ParseError).Error", Method, 0}, + {"(*ParseError).Unwrap", Method, 13}, + {"(*Reader).FieldPos", Method, 17}, + {"(*Reader).InputOffset", Method, 19}, + {"(*Reader).Read", Method, 0}, + {"(*Reader).ReadAll", Method, 0}, + {"(*Writer).Error", Method, 1}, + {"(*Writer).Flush", Method, 0}, + {"(*Writer).Write", Method, 0}, + {"(*Writer).WriteAll", Method, 0}, + {"ErrBareQuote", Var, 0}, + {"ErrFieldCount", Var, 0}, + {"ErrQuote", Var, 0}, + {"ErrTrailingComma", Var, 0}, + {"NewReader", Func, 0}, + {"NewWriter", Func, 0}, + {"ParseError", Type, 0}, + {"ParseError.Column", Field, 0}, + {"ParseError.Err", Field, 0}, + {"ParseError.Line", Field, 0}, + {"ParseError.StartLine", Field, 10}, + {"Reader", Type, 0}, + {"Reader.Comma", Field, 0}, + {"Reader.Comment", Field, 0}, + {"Reader.FieldsPerRecord", Field, 0}, + {"Reader.LazyQuotes", Field, 0}, + {"Reader.ReuseRecord", Field, 9}, + {"Reader.TrailingComma", Field, 0}, + {"Reader.TrimLeadingSpace", Field, 0}, + {"Writer", Type, 0}, + {"Writer.Comma", Field, 0}, + {"Writer.UseCRLF", Field, 0}, + }, + "encoding/gob": { + {"(*Decoder).Decode", Method, 0}, + {"(*Decoder).DecodeValue", Method, 0}, + {"(*Encoder).Encode", Method, 0}, + {"(*Encoder).EncodeValue", Method, 0}, + {"CommonType", Type, 0}, + {"CommonType.Id", Field, 0}, + {"CommonType.Name", Field, 0}, + {"Decoder", Type, 0}, + {"Encoder", Type, 0}, + {"GobDecoder", Type, 0}, + {"GobEncoder", Type, 0}, + {"NewDecoder", Func, 0}, + {"NewEncoder", Func, 0}, + {"Register", Func, 0}, + {"RegisterName", Func, 0}, + }, + "encoding/hex": { + {"(InvalidByteError).Error", Method, 0}, + {"AppendDecode", Func, 22}, + {"AppendEncode", Func, 22}, + {"Decode", Func, 0}, + {"DecodeString", Func, 0}, + {"DecodedLen", Func, 0}, + {"Dump", Func, 0}, + {"Dumper", Func, 0}, + {"Encode", Func, 0}, + {"EncodeToString", Func, 0}, + {"EncodedLen", Func, 0}, + {"ErrLength", Var, 0}, + {"InvalidByteError", Type, 0}, + {"NewDecoder", Func, 10}, + {"NewEncoder", Func, 10}, + }, + "encoding/json": { + {"(*Decoder).Buffered", Method, 1}, + {"(*Decoder).Decode", Method, 0}, + {"(*Decoder).DisallowUnknownFields", Method, 10}, + {"(*Decoder).InputOffset", Method, 14}, + {"(*Decoder).More", Method, 5}, + {"(*Decoder).Token", Method, 5}, + {"(*Decoder).UseNumber", Method, 1}, + {"(*Encoder).Encode", Method, 0}, + {"(*Encoder).SetEscapeHTML", Method, 7}, + {"(*Encoder).SetIndent", Method, 7}, + {"(*InvalidUTF8Error).Error", Method, 0}, + {"(*InvalidUnmarshalError).Error", Method, 0}, + {"(*MarshalerError).Error", Method, 0}, + {"(*MarshalerError).Unwrap", Method, 13}, + {"(*RawMessage).MarshalJSON", Method, 0}, + {"(*RawMessage).UnmarshalJSON", Method, 0}, + {"(*SyntaxError).Error", Method, 0}, + {"(*UnmarshalFieldError).Error", Method, 0}, + {"(*UnmarshalTypeError).Error", Method, 0}, + {"(*UnsupportedTypeError).Error", Method, 0}, + {"(*UnsupportedValueError).Error", Method, 0}, + {"(Delim).String", Method, 5}, + {"(Number).Float64", Method, 1}, + {"(Number).Int64", Method, 1}, + {"(Number).String", Method, 1}, + {"(RawMessage).MarshalJSON", Method, 8}, + {"Compact", Func, 0}, + {"Decoder", Type, 0}, + {"Delim", Type, 5}, + {"Encoder", Type, 0}, + {"HTMLEscape", Func, 0}, + {"Indent", Func, 0}, + {"InvalidUTF8Error", Type, 0}, + {"InvalidUTF8Error.S", Field, 0}, + {"InvalidUnmarshalError", Type, 0}, + {"InvalidUnmarshalError.Type", Field, 0}, + {"Marshal", Func, 0}, + {"MarshalIndent", Func, 0}, + {"Marshaler", Type, 0}, + {"MarshalerError", Type, 0}, + {"MarshalerError.Err", Field, 0}, + {"MarshalerError.Type", Field, 0}, + {"NewDecoder", Func, 0}, + {"NewEncoder", Func, 0}, + {"Number", Type, 1}, + {"RawMessage", Type, 0}, + {"SyntaxError", Type, 0}, + {"SyntaxError.Offset", Field, 0}, + {"Token", Type, 5}, + {"Unmarshal", Func, 0}, + {"UnmarshalFieldError", Type, 0}, + {"UnmarshalFieldError.Field", Field, 0}, + {"UnmarshalFieldError.Key", Field, 0}, + {"UnmarshalFieldError.Type", Field, 0}, + {"UnmarshalTypeError", Type, 0}, + {"UnmarshalTypeError.Field", Field, 8}, + {"UnmarshalTypeError.Offset", Field, 5}, + {"UnmarshalTypeError.Struct", Field, 8}, + {"UnmarshalTypeError.Type", Field, 0}, + {"UnmarshalTypeError.Value", Field, 0}, + {"Unmarshaler", Type, 0}, + {"UnsupportedTypeError", Type, 0}, + {"UnsupportedTypeError.Type", Field, 0}, + {"UnsupportedValueError", Type, 0}, + {"UnsupportedValueError.Str", Field, 0}, + {"UnsupportedValueError.Value", Field, 0}, + {"Valid", Func, 9}, + }, + "encoding/pem": { + {"Block", Type, 0}, + {"Block.Bytes", Field, 0}, + {"Block.Headers", Field, 0}, + {"Block.Type", Field, 0}, + {"Decode", Func, 0}, + {"Encode", Func, 0}, + {"EncodeToMemory", Func, 0}, + }, + "encoding/xml": { + {"(*Decoder).Decode", Method, 0}, + {"(*Decoder).DecodeElement", Method, 0}, + {"(*Decoder).InputOffset", Method, 4}, + {"(*Decoder).InputPos", Method, 19}, + {"(*Decoder).RawToken", Method, 0}, + {"(*Decoder).Skip", Method, 0}, + {"(*Decoder).Token", Method, 0}, + {"(*Encoder).Close", Method, 20}, + {"(*Encoder).Encode", Method, 0}, + {"(*Encoder).EncodeElement", Method, 2}, + {"(*Encoder).EncodeToken", Method, 2}, + {"(*Encoder).Flush", Method, 2}, + {"(*Encoder).Indent", Method, 1}, + {"(*SyntaxError).Error", Method, 0}, + {"(*TagPathError).Error", Method, 0}, + {"(*UnsupportedTypeError).Error", Method, 0}, + {"(CharData).Copy", Method, 0}, + {"(Comment).Copy", Method, 0}, + {"(Directive).Copy", Method, 0}, + {"(ProcInst).Copy", Method, 0}, + {"(StartElement).Copy", Method, 0}, + {"(StartElement).End", Method, 2}, + {"(UnmarshalError).Error", Method, 0}, + {"Attr", Type, 0}, + {"Attr.Name", Field, 0}, + {"Attr.Value", Field, 0}, + {"CharData", Type, 0}, + {"Comment", Type, 0}, + {"CopyToken", Func, 0}, + {"Decoder", Type, 0}, + {"Decoder.AutoClose", Field, 0}, + {"Decoder.CharsetReader", Field, 0}, + {"Decoder.DefaultSpace", Field, 1}, + {"Decoder.Entity", Field, 0}, + {"Decoder.Strict", Field, 0}, + {"Directive", Type, 0}, + {"Encoder", Type, 0}, + {"EndElement", Type, 0}, + {"EndElement.Name", Field, 0}, + {"Escape", Func, 0}, + {"EscapeText", Func, 1}, + {"HTMLAutoClose", Var, 0}, + {"HTMLEntity", Var, 0}, + {"Header", Const, 0}, + {"Marshal", Func, 0}, + {"MarshalIndent", Func, 0}, + {"Marshaler", Type, 2}, + {"MarshalerAttr", Type, 2}, + {"Name", Type, 0}, + {"Name.Local", Field, 0}, + {"Name.Space", Field, 0}, + {"NewDecoder", Func, 0}, + {"NewEncoder", Func, 0}, + {"NewTokenDecoder", Func, 10}, + {"ProcInst", Type, 0}, + {"ProcInst.Inst", Field, 0}, + {"ProcInst.Target", Field, 0}, + {"StartElement", Type, 0}, + {"StartElement.Attr", Field, 0}, + {"StartElement.Name", Field, 0}, + {"SyntaxError", Type, 0}, + {"SyntaxError.Line", Field, 0}, + {"SyntaxError.Msg", Field, 0}, + {"TagPathError", Type, 0}, + {"TagPathError.Field1", Field, 0}, + {"TagPathError.Field2", Field, 0}, + {"TagPathError.Struct", Field, 0}, + {"TagPathError.Tag1", Field, 0}, + {"TagPathError.Tag2", Field, 0}, + {"Token", Type, 0}, + {"TokenReader", Type, 10}, + {"Unmarshal", Func, 0}, + {"UnmarshalError", Type, 0}, + {"Unmarshaler", Type, 2}, + {"UnmarshalerAttr", Type, 2}, + {"UnsupportedTypeError", Type, 0}, + {"UnsupportedTypeError.Type", Field, 0}, + }, + "errors": { + {"As", Func, 13}, + {"ErrUnsupported", Var, 21}, + {"Is", Func, 13}, + {"Join", Func, 20}, + {"New", Func, 0}, + {"Unwrap", Func, 13}, + }, + "expvar": { + {"(*Float).Add", Method, 0}, + {"(*Float).Set", Method, 0}, + {"(*Float).String", Method, 0}, + {"(*Float).Value", Method, 8}, + {"(*Int).Add", Method, 0}, + {"(*Int).Set", Method, 0}, + {"(*Int).String", Method, 0}, + {"(*Int).Value", Method, 8}, + {"(*Map).Add", Method, 0}, + {"(*Map).AddFloat", Method, 0}, + {"(*Map).Delete", Method, 12}, + {"(*Map).Do", Method, 0}, + {"(*Map).Get", Method, 0}, + {"(*Map).Init", Method, 0}, + {"(*Map).Set", Method, 0}, + {"(*Map).String", Method, 0}, + {"(*String).Set", Method, 0}, + {"(*String).String", Method, 0}, + {"(*String).Value", Method, 8}, + {"(Func).String", Method, 0}, + {"(Func).Value", Method, 8}, + {"Do", Func, 0}, + {"Float", Type, 0}, + {"Func", Type, 0}, + {"Get", Func, 0}, + {"Handler", Func, 8}, + {"Int", Type, 0}, + {"KeyValue", Type, 0}, + {"KeyValue.Key", Field, 0}, + {"KeyValue.Value", Field, 0}, + {"Map", Type, 0}, + {"NewFloat", Func, 0}, + {"NewInt", Func, 0}, + {"NewMap", Func, 0}, + {"NewString", Func, 0}, + {"Publish", Func, 0}, + {"String", Type, 0}, + {"Var", Type, 0}, + }, + "flag": { + {"(*FlagSet).Arg", Method, 0}, + {"(*FlagSet).Args", Method, 0}, + {"(*FlagSet).Bool", Method, 0}, + {"(*FlagSet).BoolFunc", Method, 21}, + {"(*FlagSet).BoolVar", Method, 0}, + {"(*FlagSet).Duration", Method, 0}, + {"(*FlagSet).DurationVar", Method, 0}, + {"(*FlagSet).ErrorHandling", Method, 10}, + {"(*FlagSet).Float64", Method, 0}, + {"(*FlagSet).Float64Var", Method, 0}, + {"(*FlagSet).Func", Method, 16}, + {"(*FlagSet).Init", Method, 0}, + {"(*FlagSet).Int", Method, 0}, + {"(*FlagSet).Int64", Method, 0}, + {"(*FlagSet).Int64Var", Method, 0}, + {"(*FlagSet).IntVar", Method, 0}, + {"(*FlagSet).Lookup", Method, 0}, + {"(*FlagSet).NArg", Method, 0}, + {"(*FlagSet).NFlag", Method, 0}, + {"(*FlagSet).Name", Method, 10}, + {"(*FlagSet).Output", Method, 10}, + {"(*FlagSet).Parse", Method, 0}, + {"(*FlagSet).Parsed", Method, 0}, + {"(*FlagSet).PrintDefaults", Method, 0}, + {"(*FlagSet).Set", Method, 0}, + {"(*FlagSet).SetOutput", Method, 0}, + {"(*FlagSet).String", Method, 0}, + {"(*FlagSet).StringVar", Method, 0}, + {"(*FlagSet).TextVar", Method, 19}, + {"(*FlagSet).Uint", Method, 0}, + {"(*FlagSet).Uint64", Method, 0}, + {"(*FlagSet).Uint64Var", Method, 0}, + {"(*FlagSet).UintVar", Method, 0}, + {"(*FlagSet).Var", Method, 0}, + {"(*FlagSet).Visit", Method, 0}, + {"(*FlagSet).VisitAll", Method, 0}, + {"Arg", Func, 0}, + {"Args", Func, 0}, + {"Bool", Func, 0}, + {"BoolFunc", Func, 21}, + {"BoolVar", Func, 0}, + {"CommandLine", Var, 2}, + {"ContinueOnError", Const, 0}, + {"Duration", Func, 0}, + {"DurationVar", Func, 0}, + {"ErrHelp", Var, 0}, + {"ErrorHandling", Type, 0}, + {"ExitOnError", Const, 0}, + {"Flag", Type, 0}, + {"Flag.DefValue", Field, 0}, + {"Flag.Name", Field, 0}, + {"Flag.Usage", Field, 0}, + {"Flag.Value", Field, 0}, + {"FlagSet", Type, 0}, + {"FlagSet.Usage", Field, 0}, + {"Float64", Func, 0}, + {"Float64Var", Func, 0}, + {"Func", Func, 16}, + {"Getter", Type, 2}, + {"Int", Func, 0}, + {"Int64", Func, 0}, + {"Int64Var", Func, 0}, + {"IntVar", Func, 0}, + {"Lookup", Func, 0}, + {"NArg", Func, 0}, + {"NFlag", Func, 0}, + {"NewFlagSet", Func, 0}, + {"PanicOnError", Const, 0}, + {"Parse", Func, 0}, + {"Parsed", Func, 0}, + {"PrintDefaults", Func, 0}, + {"Set", Func, 0}, + {"String", Func, 0}, + {"StringVar", Func, 0}, + {"TextVar", Func, 19}, + {"Uint", Func, 0}, + {"Uint64", Func, 0}, + {"Uint64Var", Func, 0}, + {"UintVar", Func, 0}, + {"UnquoteUsage", Func, 5}, + {"Usage", Var, 0}, + {"Value", Type, 0}, + {"Var", Func, 0}, + {"Visit", Func, 0}, + {"VisitAll", Func, 0}, + }, + "fmt": { + {"Append", Func, 19}, + {"Appendf", Func, 19}, + {"Appendln", Func, 19}, + {"Errorf", Func, 0}, + {"FormatString", Func, 20}, + {"Formatter", Type, 0}, + {"Fprint", Func, 0}, + {"Fprintf", Func, 0}, + {"Fprintln", Func, 0}, + {"Fscan", Func, 0}, + {"Fscanf", Func, 0}, + {"Fscanln", Func, 0}, + {"GoStringer", Type, 0}, + {"Print", Func, 0}, + {"Printf", Func, 0}, + {"Println", Func, 0}, + {"Scan", Func, 0}, + {"ScanState", Type, 0}, + {"Scanf", Func, 0}, + {"Scanln", Func, 0}, + {"Scanner", Type, 0}, + {"Sprint", Func, 0}, + {"Sprintf", Func, 0}, + {"Sprintln", Func, 0}, + {"Sscan", Func, 0}, + {"Sscanf", Func, 0}, + {"Sscanln", Func, 0}, + {"State", Type, 0}, + {"Stringer", Type, 0}, + }, + "go/ast": { + {"(*ArrayType).End", Method, 0}, + {"(*ArrayType).Pos", Method, 0}, + {"(*AssignStmt).End", Method, 0}, + {"(*AssignStmt).Pos", Method, 0}, + {"(*BadDecl).End", Method, 0}, + {"(*BadDecl).Pos", Method, 0}, + {"(*BadExpr).End", Method, 0}, + {"(*BadExpr).Pos", Method, 0}, + {"(*BadStmt).End", Method, 0}, + {"(*BadStmt).Pos", Method, 0}, + {"(*BasicLit).End", Method, 0}, + {"(*BasicLit).Pos", Method, 0}, + {"(*BinaryExpr).End", Method, 0}, + {"(*BinaryExpr).Pos", Method, 0}, + {"(*BlockStmt).End", Method, 0}, + {"(*BlockStmt).Pos", Method, 0}, + {"(*BranchStmt).End", Method, 0}, + {"(*BranchStmt).Pos", Method, 0}, + {"(*CallExpr).End", Method, 0}, + {"(*CallExpr).Pos", Method, 0}, + {"(*CaseClause).End", Method, 0}, + {"(*CaseClause).Pos", Method, 0}, + {"(*ChanType).End", Method, 0}, + {"(*ChanType).Pos", Method, 0}, + {"(*CommClause).End", Method, 0}, + {"(*CommClause).Pos", Method, 0}, + {"(*Comment).End", Method, 0}, + {"(*Comment).Pos", Method, 0}, + {"(*CommentGroup).End", Method, 0}, + {"(*CommentGroup).Pos", Method, 0}, + {"(*CommentGroup).Text", Method, 0}, + {"(*CompositeLit).End", Method, 0}, + {"(*CompositeLit).Pos", Method, 0}, + {"(*DeclStmt).End", Method, 0}, + {"(*DeclStmt).Pos", Method, 0}, + {"(*DeferStmt).End", Method, 0}, + {"(*DeferStmt).Pos", Method, 0}, + {"(*Ellipsis).End", Method, 0}, + {"(*Ellipsis).Pos", Method, 0}, + {"(*EmptyStmt).End", Method, 0}, + {"(*EmptyStmt).Pos", Method, 0}, + {"(*ExprStmt).End", Method, 0}, + {"(*ExprStmt).Pos", Method, 0}, + {"(*Field).End", Method, 0}, + {"(*Field).Pos", Method, 0}, + {"(*FieldList).End", Method, 0}, + {"(*FieldList).NumFields", Method, 0}, + {"(*FieldList).Pos", Method, 0}, + {"(*File).End", Method, 0}, + {"(*File).Pos", Method, 0}, + {"(*ForStmt).End", Method, 0}, + {"(*ForStmt).Pos", Method, 0}, + {"(*FuncDecl).End", Method, 0}, + {"(*FuncDecl).Pos", Method, 0}, + {"(*FuncLit).End", Method, 0}, + {"(*FuncLit).Pos", Method, 0}, + {"(*FuncType).End", Method, 0}, + {"(*FuncType).Pos", Method, 0}, + {"(*GenDecl).End", Method, 0}, + {"(*GenDecl).Pos", Method, 0}, + {"(*GoStmt).End", Method, 0}, + {"(*GoStmt).Pos", Method, 0}, + {"(*Ident).End", Method, 0}, + {"(*Ident).IsExported", Method, 0}, + {"(*Ident).Pos", Method, 0}, + {"(*Ident).String", Method, 0}, + {"(*IfStmt).End", Method, 0}, + {"(*IfStmt).Pos", Method, 0}, + {"(*ImportSpec).End", Method, 0}, + {"(*ImportSpec).Pos", Method, 0}, + {"(*IncDecStmt).End", Method, 0}, + {"(*IncDecStmt).Pos", Method, 0}, + {"(*IndexExpr).End", Method, 0}, + {"(*IndexExpr).Pos", Method, 0}, + {"(*IndexListExpr).End", Method, 18}, + {"(*IndexListExpr).Pos", Method, 18}, + {"(*InterfaceType).End", Method, 0}, + {"(*InterfaceType).Pos", Method, 0}, + {"(*KeyValueExpr).End", Method, 0}, + {"(*KeyValueExpr).Pos", Method, 0}, + {"(*LabeledStmt).End", Method, 0}, + {"(*LabeledStmt).Pos", Method, 0}, + {"(*MapType).End", Method, 0}, + {"(*MapType).Pos", Method, 0}, + {"(*Object).Pos", Method, 0}, + {"(*Package).End", Method, 0}, + {"(*Package).Pos", Method, 0}, + {"(*ParenExpr).End", Method, 0}, + {"(*ParenExpr).Pos", Method, 0}, + {"(*RangeStmt).End", Method, 0}, + {"(*RangeStmt).Pos", Method, 0}, + {"(*ReturnStmt).End", Method, 0}, + {"(*ReturnStmt).Pos", Method, 0}, + {"(*Scope).Insert", Method, 0}, + {"(*Scope).Lookup", Method, 0}, + {"(*Scope).String", Method, 0}, + {"(*SelectStmt).End", Method, 0}, + {"(*SelectStmt).Pos", Method, 0}, + {"(*SelectorExpr).End", Method, 0}, + {"(*SelectorExpr).Pos", Method, 0}, + {"(*SendStmt).End", Method, 0}, + {"(*SendStmt).Pos", Method, 0}, + {"(*SliceExpr).End", Method, 0}, + {"(*SliceExpr).Pos", Method, 0}, + {"(*StarExpr).End", Method, 0}, + {"(*StarExpr).Pos", Method, 0}, + {"(*StructType).End", Method, 0}, + {"(*StructType).Pos", Method, 0}, + {"(*SwitchStmt).End", Method, 0}, + {"(*SwitchStmt).Pos", Method, 0}, + {"(*TypeAssertExpr).End", Method, 0}, + {"(*TypeAssertExpr).Pos", Method, 0}, + {"(*TypeSpec).End", Method, 0}, + {"(*TypeSpec).Pos", Method, 0}, + {"(*TypeSwitchStmt).End", Method, 0}, + {"(*TypeSwitchStmt).Pos", Method, 0}, + {"(*UnaryExpr).End", Method, 0}, + {"(*UnaryExpr).Pos", Method, 0}, + {"(*ValueSpec).End", Method, 0}, + {"(*ValueSpec).Pos", Method, 0}, + {"(CommentMap).Comments", Method, 1}, + {"(CommentMap).Filter", Method, 1}, + {"(CommentMap).String", Method, 1}, + {"(CommentMap).Update", Method, 1}, + {"(ObjKind).String", Method, 0}, + {"ArrayType", Type, 0}, + {"ArrayType.Elt", Field, 0}, + {"ArrayType.Lbrack", Field, 0}, + {"ArrayType.Len", Field, 0}, + {"AssignStmt", Type, 0}, + {"AssignStmt.Lhs", Field, 0}, + {"AssignStmt.Rhs", Field, 0}, + {"AssignStmt.Tok", Field, 0}, + {"AssignStmt.TokPos", Field, 0}, + {"Bad", Const, 0}, + {"BadDecl", Type, 0}, + {"BadDecl.From", Field, 0}, + {"BadDecl.To", Field, 0}, + {"BadExpr", Type, 0}, + {"BadExpr.From", Field, 0}, + {"BadExpr.To", Field, 0}, + {"BadStmt", Type, 0}, + {"BadStmt.From", Field, 0}, + {"BadStmt.To", Field, 0}, + {"BasicLit", Type, 0}, + {"BasicLit.Kind", Field, 0}, + {"BasicLit.Value", Field, 0}, + {"BasicLit.ValuePos", Field, 0}, + {"BinaryExpr", Type, 0}, + {"BinaryExpr.Op", Field, 0}, + {"BinaryExpr.OpPos", Field, 0}, + {"BinaryExpr.X", Field, 0}, + {"BinaryExpr.Y", Field, 0}, + {"BlockStmt", Type, 0}, + {"BlockStmt.Lbrace", Field, 0}, + {"BlockStmt.List", Field, 0}, + {"BlockStmt.Rbrace", Field, 0}, + {"BranchStmt", Type, 0}, + {"BranchStmt.Label", Field, 0}, + {"BranchStmt.Tok", Field, 0}, + {"BranchStmt.TokPos", Field, 0}, + {"CallExpr", Type, 0}, + {"CallExpr.Args", Field, 0}, + {"CallExpr.Ellipsis", Field, 0}, + {"CallExpr.Fun", Field, 0}, + {"CallExpr.Lparen", Field, 0}, + {"CallExpr.Rparen", Field, 0}, + {"CaseClause", Type, 0}, + {"CaseClause.Body", Field, 0}, + {"CaseClause.Case", Field, 0}, + {"CaseClause.Colon", Field, 0}, + {"CaseClause.List", Field, 0}, + {"ChanDir", Type, 0}, + {"ChanType", Type, 0}, + {"ChanType.Arrow", Field, 1}, + {"ChanType.Begin", Field, 0}, + {"ChanType.Dir", Field, 0}, + {"ChanType.Value", Field, 0}, + {"CommClause", Type, 0}, + {"CommClause.Body", Field, 0}, + {"CommClause.Case", Field, 0}, + {"CommClause.Colon", Field, 0}, + {"CommClause.Comm", Field, 0}, + {"Comment", Type, 0}, + {"Comment.Slash", Field, 0}, + {"Comment.Text", Field, 0}, + {"CommentGroup", Type, 0}, + {"CommentGroup.List", Field, 0}, + {"CommentMap", Type, 1}, + {"CompositeLit", Type, 0}, + {"CompositeLit.Elts", Field, 0}, + {"CompositeLit.Incomplete", Field, 11}, + {"CompositeLit.Lbrace", Field, 0}, + {"CompositeLit.Rbrace", Field, 0}, + {"CompositeLit.Type", Field, 0}, + {"Con", Const, 0}, + {"Decl", Type, 0}, + {"DeclStmt", Type, 0}, + {"DeclStmt.Decl", Field, 0}, + {"DeferStmt", Type, 0}, + {"DeferStmt.Call", Field, 0}, + {"DeferStmt.Defer", Field, 0}, + {"Ellipsis", Type, 0}, + {"Ellipsis.Ellipsis", Field, 0}, + {"Ellipsis.Elt", Field, 0}, + {"EmptyStmt", Type, 0}, + {"EmptyStmt.Implicit", Field, 5}, + {"EmptyStmt.Semicolon", Field, 0}, + {"Expr", Type, 0}, + {"ExprStmt", Type, 0}, + {"ExprStmt.X", Field, 0}, + {"Field", Type, 0}, + {"Field.Comment", Field, 0}, + {"Field.Doc", Field, 0}, + {"Field.Names", Field, 0}, + {"Field.Tag", Field, 0}, + {"Field.Type", Field, 0}, + {"FieldFilter", Type, 0}, + {"FieldList", Type, 0}, + {"FieldList.Closing", Field, 0}, + {"FieldList.List", Field, 0}, + {"FieldList.Opening", Field, 0}, + {"File", Type, 0}, + {"File.Comments", Field, 0}, + {"File.Decls", Field, 0}, + {"File.Doc", Field, 0}, + {"File.FileEnd", Field, 20}, + {"File.FileStart", Field, 20}, + {"File.GoVersion", Field, 21}, + {"File.Imports", Field, 0}, + {"File.Name", Field, 0}, + {"File.Package", Field, 0}, + {"File.Scope", Field, 0}, + {"File.Unresolved", Field, 0}, + {"FileExports", Func, 0}, + {"Filter", Type, 0}, + {"FilterDecl", Func, 0}, + {"FilterFile", Func, 0}, + {"FilterFuncDuplicates", Const, 0}, + {"FilterImportDuplicates", Const, 0}, + {"FilterPackage", Func, 0}, + {"FilterUnassociatedComments", Const, 0}, + {"ForStmt", Type, 0}, + {"ForStmt.Body", Field, 0}, + {"ForStmt.Cond", Field, 0}, + {"ForStmt.For", Field, 0}, + {"ForStmt.Init", Field, 0}, + {"ForStmt.Post", Field, 0}, + {"Fprint", Func, 0}, + {"Fun", Const, 0}, + {"FuncDecl", Type, 0}, + {"FuncDecl.Body", Field, 0}, + {"FuncDecl.Doc", Field, 0}, + {"FuncDecl.Name", Field, 0}, + {"FuncDecl.Recv", Field, 0}, + {"FuncDecl.Type", Field, 0}, + {"FuncLit", Type, 0}, + {"FuncLit.Body", Field, 0}, + {"FuncLit.Type", Field, 0}, + {"FuncType", Type, 0}, + {"FuncType.Func", Field, 0}, + {"FuncType.Params", Field, 0}, + {"FuncType.Results", Field, 0}, + {"FuncType.TypeParams", Field, 18}, + {"GenDecl", Type, 0}, + {"GenDecl.Doc", Field, 0}, + {"GenDecl.Lparen", Field, 0}, + {"GenDecl.Rparen", Field, 0}, + {"GenDecl.Specs", Field, 0}, + {"GenDecl.Tok", Field, 0}, + {"GenDecl.TokPos", Field, 0}, + {"GoStmt", Type, 0}, + {"GoStmt.Call", Field, 0}, + {"GoStmt.Go", Field, 0}, + {"Ident", Type, 0}, + {"Ident.Name", Field, 0}, + {"Ident.NamePos", Field, 0}, + {"Ident.Obj", Field, 0}, + {"IfStmt", Type, 0}, + {"IfStmt.Body", Field, 0}, + {"IfStmt.Cond", Field, 0}, + {"IfStmt.Else", Field, 0}, + {"IfStmt.If", Field, 0}, + {"IfStmt.Init", Field, 0}, + {"ImportSpec", Type, 0}, + {"ImportSpec.Comment", Field, 0}, + {"ImportSpec.Doc", Field, 0}, + {"ImportSpec.EndPos", Field, 0}, + {"ImportSpec.Name", Field, 0}, + {"ImportSpec.Path", Field, 0}, + {"Importer", Type, 0}, + {"IncDecStmt", Type, 0}, + {"IncDecStmt.Tok", Field, 0}, + {"IncDecStmt.TokPos", Field, 0}, + {"IncDecStmt.X", Field, 0}, + {"IndexExpr", Type, 0}, + {"IndexExpr.Index", Field, 0}, + {"IndexExpr.Lbrack", Field, 0}, + {"IndexExpr.Rbrack", Field, 0}, + {"IndexExpr.X", Field, 0}, + {"IndexListExpr", Type, 18}, + {"IndexListExpr.Indices", Field, 18}, + {"IndexListExpr.Lbrack", Field, 18}, + {"IndexListExpr.Rbrack", Field, 18}, + {"IndexListExpr.X", Field, 18}, + {"Inspect", Func, 0}, + {"InterfaceType", Type, 0}, + {"InterfaceType.Incomplete", Field, 0}, + {"InterfaceType.Interface", Field, 0}, + {"InterfaceType.Methods", Field, 0}, + {"IsExported", Func, 0}, + {"IsGenerated", Func, 21}, + {"KeyValueExpr", Type, 0}, + {"KeyValueExpr.Colon", Field, 0}, + {"KeyValueExpr.Key", Field, 0}, + {"KeyValueExpr.Value", Field, 0}, + {"LabeledStmt", Type, 0}, + {"LabeledStmt.Colon", Field, 0}, + {"LabeledStmt.Label", Field, 0}, + {"LabeledStmt.Stmt", Field, 0}, + {"Lbl", Const, 0}, + {"MapType", Type, 0}, + {"MapType.Key", Field, 0}, + {"MapType.Map", Field, 0}, + {"MapType.Value", Field, 0}, + {"MergeMode", Type, 0}, + {"MergePackageFiles", Func, 0}, + {"NewCommentMap", Func, 1}, + {"NewIdent", Func, 0}, + {"NewObj", Func, 0}, + {"NewPackage", Func, 0}, + {"NewScope", Func, 0}, + {"Node", Type, 0}, + {"NotNilFilter", Func, 0}, + {"ObjKind", Type, 0}, + {"Object", Type, 0}, + {"Object.Data", Field, 0}, + {"Object.Decl", Field, 0}, + {"Object.Kind", Field, 0}, + {"Object.Name", Field, 0}, + {"Object.Type", Field, 0}, + {"Package", Type, 0}, + {"Package.Files", Field, 0}, + {"Package.Imports", Field, 0}, + {"Package.Name", Field, 0}, + {"Package.Scope", Field, 0}, + {"PackageExports", Func, 0}, + {"ParenExpr", Type, 0}, + {"ParenExpr.Lparen", Field, 0}, + {"ParenExpr.Rparen", Field, 0}, + {"ParenExpr.X", Field, 0}, + {"Pkg", Const, 0}, + {"Preorder", Func, 23}, + {"Print", Func, 0}, + {"RECV", Const, 0}, + {"RangeStmt", Type, 0}, + {"RangeStmt.Body", Field, 0}, + {"RangeStmt.For", Field, 0}, + {"RangeStmt.Key", Field, 0}, + {"RangeStmt.Range", Field, 20}, + {"RangeStmt.Tok", Field, 0}, + {"RangeStmt.TokPos", Field, 0}, + {"RangeStmt.Value", Field, 0}, + {"RangeStmt.X", Field, 0}, + {"ReturnStmt", Type, 0}, + {"ReturnStmt.Results", Field, 0}, + {"ReturnStmt.Return", Field, 0}, + {"SEND", Const, 0}, + {"Scope", Type, 0}, + {"Scope.Objects", Field, 0}, + {"Scope.Outer", Field, 0}, + {"SelectStmt", Type, 0}, + {"SelectStmt.Body", Field, 0}, + {"SelectStmt.Select", Field, 0}, + {"SelectorExpr", Type, 0}, + {"SelectorExpr.Sel", Field, 0}, + {"SelectorExpr.X", Field, 0}, + {"SendStmt", Type, 0}, + {"SendStmt.Arrow", Field, 0}, + {"SendStmt.Chan", Field, 0}, + {"SendStmt.Value", Field, 0}, + {"SliceExpr", Type, 0}, + {"SliceExpr.High", Field, 0}, + {"SliceExpr.Lbrack", Field, 0}, + {"SliceExpr.Low", Field, 0}, + {"SliceExpr.Max", Field, 2}, + {"SliceExpr.Rbrack", Field, 0}, + {"SliceExpr.Slice3", Field, 2}, + {"SliceExpr.X", Field, 0}, + {"SortImports", Func, 0}, + {"Spec", Type, 0}, + {"StarExpr", Type, 0}, + {"StarExpr.Star", Field, 0}, + {"StarExpr.X", Field, 0}, + {"Stmt", Type, 0}, + {"StructType", Type, 0}, + {"StructType.Fields", Field, 0}, + {"StructType.Incomplete", Field, 0}, + {"StructType.Struct", Field, 0}, + {"SwitchStmt", Type, 0}, + {"SwitchStmt.Body", Field, 0}, + {"SwitchStmt.Init", Field, 0}, + {"SwitchStmt.Switch", Field, 0}, + {"SwitchStmt.Tag", Field, 0}, + {"Typ", Const, 0}, + {"TypeAssertExpr", Type, 0}, + {"TypeAssertExpr.Lparen", Field, 2}, + {"TypeAssertExpr.Rparen", Field, 2}, + {"TypeAssertExpr.Type", Field, 0}, + {"TypeAssertExpr.X", Field, 0}, + {"TypeSpec", Type, 0}, + {"TypeSpec.Assign", Field, 9}, + {"TypeSpec.Comment", Field, 0}, + {"TypeSpec.Doc", Field, 0}, + {"TypeSpec.Name", Field, 0}, + {"TypeSpec.Type", Field, 0}, + {"TypeSpec.TypeParams", Field, 18}, + {"TypeSwitchStmt", Type, 0}, + {"TypeSwitchStmt.Assign", Field, 0}, + {"TypeSwitchStmt.Body", Field, 0}, + {"TypeSwitchStmt.Init", Field, 0}, + {"TypeSwitchStmt.Switch", Field, 0}, + {"UnaryExpr", Type, 0}, + {"UnaryExpr.Op", Field, 0}, + {"UnaryExpr.OpPos", Field, 0}, + {"UnaryExpr.X", Field, 0}, + {"Unparen", Func, 22}, + {"ValueSpec", Type, 0}, + {"ValueSpec.Comment", Field, 0}, + {"ValueSpec.Doc", Field, 0}, + {"ValueSpec.Names", Field, 0}, + {"ValueSpec.Type", Field, 0}, + {"ValueSpec.Values", Field, 0}, + {"Var", Const, 0}, + {"Visitor", Type, 0}, + {"Walk", Func, 0}, + }, + "go/build": { + {"(*Context).Import", Method, 0}, + {"(*Context).ImportDir", Method, 0}, + {"(*Context).MatchFile", Method, 2}, + {"(*Context).SrcDirs", Method, 0}, + {"(*MultiplePackageError).Error", Method, 4}, + {"(*NoGoError).Error", Method, 0}, + {"(*Package).IsCommand", Method, 0}, + {"AllowBinary", Const, 0}, + {"ArchChar", Func, 0}, + {"Context", Type, 0}, + {"Context.BuildTags", Field, 0}, + {"Context.CgoEnabled", Field, 0}, + {"Context.Compiler", Field, 0}, + {"Context.Dir", Field, 14}, + {"Context.GOARCH", Field, 0}, + {"Context.GOOS", Field, 0}, + {"Context.GOPATH", Field, 0}, + {"Context.GOROOT", Field, 0}, + {"Context.HasSubdir", Field, 0}, + {"Context.InstallSuffix", Field, 1}, + {"Context.IsAbsPath", Field, 0}, + {"Context.IsDir", Field, 0}, + {"Context.JoinPath", Field, 0}, + {"Context.OpenFile", Field, 0}, + {"Context.ReadDir", Field, 0}, + {"Context.ReleaseTags", Field, 1}, + {"Context.SplitPathList", Field, 0}, + {"Context.ToolTags", Field, 17}, + {"Context.UseAllFiles", Field, 0}, + {"Default", Var, 0}, + {"Directive", Type, 21}, + {"Directive.Pos", Field, 21}, + {"Directive.Text", Field, 21}, + {"FindOnly", Const, 0}, + {"IgnoreVendor", Const, 6}, + {"Import", Func, 0}, + {"ImportComment", Const, 4}, + {"ImportDir", Func, 0}, + {"ImportMode", Type, 0}, + {"IsLocalImport", Func, 0}, + {"MultiplePackageError", Type, 4}, + {"MultiplePackageError.Dir", Field, 4}, + {"MultiplePackageError.Files", Field, 4}, + {"MultiplePackageError.Packages", Field, 4}, + {"NoGoError", Type, 0}, + {"NoGoError.Dir", Field, 0}, + {"Package", Type, 0}, + {"Package.AllTags", Field, 2}, + {"Package.BinDir", Field, 0}, + {"Package.BinaryOnly", Field, 7}, + {"Package.CFiles", Field, 0}, + {"Package.CXXFiles", Field, 2}, + {"Package.CgoCFLAGS", Field, 0}, + {"Package.CgoCPPFLAGS", Field, 2}, + {"Package.CgoCXXFLAGS", Field, 2}, + {"Package.CgoFFLAGS", Field, 7}, + {"Package.CgoFiles", Field, 0}, + {"Package.CgoLDFLAGS", Field, 0}, + {"Package.CgoPkgConfig", Field, 0}, + {"Package.ConflictDir", Field, 2}, + {"Package.Dir", Field, 0}, + {"Package.Directives", Field, 21}, + {"Package.Doc", Field, 0}, + {"Package.EmbedPatternPos", Field, 16}, + {"Package.EmbedPatterns", Field, 16}, + {"Package.FFiles", Field, 7}, + {"Package.GoFiles", Field, 0}, + {"Package.Goroot", Field, 0}, + {"Package.HFiles", Field, 0}, + {"Package.IgnoredGoFiles", Field, 1}, + {"Package.IgnoredOtherFiles", Field, 16}, + {"Package.ImportComment", Field, 4}, + {"Package.ImportPath", Field, 0}, + {"Package.ImportPos", Field, 0}, + {"Package.Imports", Field, 0}, + {"Package.InvalidGoFiles", Field, 6}, + {"Package.MFiles", Field, 3}, + {"Package.Name", Field, 0}, + {"Package.PkgObj", Field, 0}, + {"Package.PkgRoot", Field, 0}, + {"Package.PkgTargetRoot", Field, 5}, + {"Package.Root", Field, 0}, + {"Package.SFiles", Field, 0}, + {"Package.SrcRoot", Field, 0}, + {"Package.SwigCXXFiles", Field, 1}, + {"Package.SwigFiles", Field, 1}, + {"Package.SysoFiles", Field, 0}, + {"Package.TestDirectives", Field, 21}, + {"Package.TestEmbedPatternPos", Field, 16}, + {"Package.TestEmbedPatterns", Field, 16}, + {"Package.TestGoFiles", Field, 0}, + {"Package.TestImportPos", Field, 0}, + {"Package.TestImports", Field, 0}, + {"Package.XTestDirectives", Field, 21}, + {"Package.XTestEmbedPatternPos", Field, 16}, + {"Package.XTestEmbedPatterns", Field, 16}, + {"Package.XTestGoFiles", Field, 0}, + {"Package.XTestImportPos", Field, 0}, + {"Package.XTestImports", Field, 0}, + {"ToolDir", Var, 0}, + }, + "go/build/constraint": { + {"(*AndExpr).Eval", Method, 16}, + {"(*AndExpr).String", Method, 16}, + {"(*NotExpr).Eval", Method, 16}, + {"(*NotExpr).String", Method, 16}, + {"(*OrExpr).Eval", Method, 16}, + {"(*OrExpr).String", Method, 16}, + {"(*SyntaxError).Error", Method, 16}, + {"(*TagExpr).Eval", Method, 16}, + {"(*TagExpr).String", Method, 16}, + {"AndExpr", Type, 16}, + {"AndExpr.X", Field, 16}, + {"AndExpr.Y", Field, 16}, + {"Expr", Type, 16}, + {"GoVersion", Func, 21}, + {"IsGoBuild", Func, 16}, + {"IsPlusBuild", Func, 16}, + {"NotExpr", Type, 16}, + {"NotExpr.X", Field, 16}, + {"OrExpr", Type, 16}, + {"OrExpr.X", Field, 16}, + {"OrExpr.Y", Field, 16}, + {"Parse", Func, 16}, + {"PlusBuildLines", Func, 16}, + {"SyntaxError", Type, 16}, + {"SyntaxError.Err", Field, 16}, + {"SyntaxError.Offset", Field, 16}, + {"TagExpr", Type, 16}, + {"TagExpr.Tag", Field, 16}, + }, + "go/constant": { + {"(Kind).String", Method, 18}, + {"BinaryOp", Func, 5}, + {"BitLen", Func, 5}, + {"Bool", Const, 5}, + {"BoolVal", Func, 5}, + {"Bytes", Func, 5}, + {"Compare", Func, 5}, + {"Complex", Const, 5}, + {"Denom", Func, 5}, + {"Float", Const, 5}, + {"Float32Val", Func, 5}, + {"Float64Val", Func, 5}, + {"Imag", Func, 5}, + {"Int", Const, 5}, + {"Int64Val", Func, 5}, + {"Kind", Type, 5}, + {"Make", Func, 13}, + {"MakeBool", Func, 5}, + {"MakeFloat64", Func, 5}, + {"MakeFromBytes", Func, 5}, + {"MakeFromLiteral", Func, 5}, + {"MakeImag", Func, 5}, + {"MakeInt64", Func, 5}, + {"MakeString", Func, 5}, + {"MakeUint64", Func, 5}, + {"MakeUnknown", Func, 5}, + {"Num", Func, 5}, + {"Real", Func, 5}, + {"Shift", Func, 5}, + {"Sign", Func, 5}, + {"String", Const, 5}, + {"StringVal", Func, 5}, + {"ToComplex", Func, 6}, + {"ToFloat", Func, 6}, + {"ToInt", Func, 6}, + {"Uint64Val", Func, 5}, + {"UnaryOp", Func, 5}, + {"Unknown", Const, 5}, + {"Val", Func, 13}, + {"Value", Type, 5}, + }, + "go/doc": { + {"(*Package).Filter", Method, 0}, + {"(*Package).HTML", Method, 19}, + {"(*Package).Markdown", Method, 19}, + {"(*Package).Parser", Method, 19}, + {"(*Package).Printer", Method, 19}, + {"(*Package).Synopsis", Method, 19}, + {"(*Package).Text", Method, 19}, + {"AllDecls", Const, 0}, + {"AllMethods", Const, 0}, + {"Example", Type, 0}, + {"Example.Code", Field, 0}, + {"Example.Comments", Field, 0}, + {"Example.Doc", Field, 0}, + {"Example.EmptyOutput", Field, 1}, + {"Example.Name", Field, 0}, + {"Example.Order", Field, 1}, + {"Example.Output", Field, 0}, + {"Example.Play", Field, 1}, + {"Example.Suffix", Field, 14}, + {"Example.Unordered", Field, 7}, + {"Examples", Func, 0}, + {"Filter", Type, 0}, + {"Func", Type, 0}, + {"Func.Decl", Field, 0}, + {"Func.Doc", Field, 0}, + {"Func.Examples", Field, 14}, + {"Func.Level", Field, 0}, + {"Func.Name", Field, 0}, + {"Func.Orig", Field, 0}, + {"Func.Recv", Field, 0}, + {"IllegalPrefixes", Var, 1}, + {"IsPredeclared", Func, 8}, + {"Mode", Type, 0}, + {"New", Func, 0}, + {"NewFromFiles", Func, 14}, + {"Note", Type, 1}, + {"Note.Body", Field, 1}, + {"Note.End", Field, 1}, + {"Note.Pos", Field, 1}, + {"Note.UID", Field, 1}, + {"Package", Type, 0}, + {"Package.Bugs", Field, 0}, + {"Package.Consts", Field, 0}, + {"Package.Doc", Field, 0}, + {"Package.Examples", Field, 14}, + {"Package.Filenames", Field, 0}, + {"Package.Funcs", Field, 0}, + {"Package.ImportPath", Field, 0}, + {"Package.Imports", Field, 0}, + {"Package.Name", Field, 0}, + {"Package.Notes", Field, 1}, + {"Package.Types", Field, 0}, + {"Package.Vars", Field, 0}, + {"PreserveAST", Const, 12}, + {"Synopsis", Func, 0}, + {"ToHTML", Func, 0}, + {"ToText", Func, 0}, + {"Type", Type, 0}, + {"Type.Consts", Field, 0}, + {"Type.Decl", Field, 0}, + {"Type.Doc", Field, 0}, + {"Type.Examples", Field, 14}, + {"Type.Funcs", Field, 0}, + {"Type.Methods", Field, 0}, + {"Type.Name", Field, 0}, + {"Type.Vars", Field, 0}, + {"Value", Type, 0}, + {"Value.Decl", Field, 0}, + {"Value.Doc", Field, 0}, + {"Value.Names", Field, 0}, + }, + "go/doc/comment": { + {"(*DocLink).DefaultURL", Method, 19}, + {"(*Heading).DefaultID", Method, 19}, + {"(*List).BlankBefore", Method, 19}, + {"(*List).BlankBetween", Method, 19}, + {"(*Parser).Parse", Method, 19}, + {"(*Printer).Comment", Method, 19}, + {"(*Printer).HTML", Method, 19}, + {"(*Printer).Markdown", Method, 19}, + {"(*Printer).Text", Method, 19}, + {"Block", Type, 19}, + {"Code", Type, 19}, + {"Code.Text", Field, 19}, + {"DefaultLookupPackage", Func, 19}, + {"Doc", Type, 19}, + {"Doc.Content", Field, 19}, + {"Doc.Links", Field, 19}, + {"DocLink", Type, 19}, + {"DocLink.ImportPath", Field, 19}, + {"DocLink.Name", Field, 19}, + {"DocLink.Recv", Field, 19}, + {"DocLink.Text", Field, 19}, + {"Heading", Type, 19}, + {"Heading.Text", Field, 19}, + {"Italic", Type, 19}, + {"Link", Type, 19}, + {"Link.Auto", Field, 19}, + {"Link.Text", Field, 19}, + {"Link.URL", Field, 19}, + {"LinkDef", Type, 19}, + {"LinkDef.Text", Field, 19}, + {"LinkDef.URL", Field, 19}, + {"LinkDef.Used", Field, 19}, + {"List", Type, 19}, + {"List.ForceBlankBefore", Field, 19}, + {"List.ForceBlankBetween", Field, 19}, + {"List.Items", Field, 19}, + {"ListItem", Type, 19}, + {"ListItem.Content", Field, 19}, + {"ListItem.Number", Field, 19}, + {"Paragraph", Type, 19}, + {"Paragraph.Text", Field, 19}, + {"Parser", Type, 19}, + {"Parser.LookupPackage", Field, 19}, + {"Parser.LookupSym", Field, 19}, + {"Parser.Words", Field, 19}, + {"Plain", Type, 19}, + {"Printer", Type, 19}, + {"Printer.DocLinkBaseURL", Field, 19}, + {"Printer.DocLinkURL", Field, 19}, + {"Printer.HeadingID", Field, 19}, + {"Printer.HeadingLevel", Field, 19}, + {"Printer.TextCodePrefix", Field, 19}, + {"Printer.TextPrefix", Field, 19}, + {"Printer.TextWidth", Field, 19}, + {"Text", Type, 19}, + }, + "go/format": { + {"Node", Func, 1}, + {"Source", Func, 1}, + }, + "go/importer": { + {"Default", Func, 5}, + {"For", Func, 5}, + {"ForCompiler", Func, 12}, + {"Lookup", Type, 5}, + }, + "go/parser": { + {"AllErrors", Const, 1}, + {"DeclarationErrors", Const, 0}, + {"ImportsOnly", Const, 0}, + {"Mode", Type, 0}, + {"PackageClauseOnly", Const, 0}, + {"ParseComments", Const, 0}, + {"ParseDir", Func, 0}, + {"ParseExpr", Func, 0}, + {"ParseExprFrom", Func, 5}, + {"ParseFile", Func, 0}, + {"SkipObjectResolution", Const, 17}, + {"SpuriousErrors", Const, 0}, + {"Trace", Const, 0}, + }, + "go/printer": { + {"(*Config).Fprint", Method, 0}, + {"CommentedNode", Type, 0}, + {"CommentedNode.Comments", Field, 0}, + {"CommentedNode.Node", Field, 0}, + {"Config", Type, 0}, + {"Config.Indent", Field, 1}, + {"Config.Mode", Field, 0}, + {"Config.Tabwidth", Field, 0}, + {"Fprint", Func, 0}, + {"Mode", Type, 0}, + {"RawFormat", Const, 0}, + {"SourcePos", Const, 0}, + {"TabIndent", Const, 0}, + {"UseSpaces", Const, 0}, + }, + "go/scanner": { + {"(*ErrorList).Add", Method, 0}, + {"(*ErrorList).RemoveMultiples", Method, 0}, + {"(*ErrorList).Reset", Method, 0}, + {"(*Scanner).Init", Method, 0}, + {"(*Scanner).Scan", Method, 0}, + {"(Error).Error", Method, 0}, + {"(ErrorList).Err", Method, 0}, + {"(ErrorList).Error", Method, 0}, + {"(ErrorList).Len", Method, 0}, + {"(ErrorList).Less", Method, 0}, + {"(ErrorList).Sort", Method, 0}, + {"(ErrorList).Swap", Method, 0}, + {"Error", Type, 0}, + {"Error.Msg", Field, 0}, + {"Error.Pos", Field, 0}, + {"ErrorHandler", Type, 0}, + {"ErrorList", Type, 0}, + {"Mode", Type, 0}, + {"PrintError", Func, 0}, + {"ScanComments", Const, 0}, + {"Scanner", Type, 0}, + {"Scanner.ErrorCount", Field, 0}, + }, + "go/token": { + {"(*File).AddLine", Method, 0}, + {"(*File).AddLineColumnInfo", Method, 11}, + {"(*File).AddLineInfo", Method, 0}, + {"(*File).Base", Method, 0}, + {"(*File).Line", Method, 0}, + {"(*File).LineCount", Method, 0}, + {"(*File).LineStart", Method, 12}, + {"(*File).Lines", Method, 21}, + {"(*File).MergeLine", Method, 2}, + {"(*File).Name", Method, 0}, + {"(*File).Offset", Method, 0}, + {"(*File).Pos", Method, 0}, + {"(*File).Position", Method, 0}, + {"(*File).PositionFor", Method, 4}, + {"(*File).SetLines", Method, 0}, + {"(*File).SetLinesForContent", Method, 0}, + {"(*File).Size", Method, 0}, + {"(*FileSet).AddFile", Method, 0}, + {"(*FileSet).Base", Method, 0}, + {"(*FileSet).File", Method, 0}, + {"(*FileSet).Iterate", Method, 0}, + {"(*FileSet).Position", Method, 0}, + {"(*FileSet).PositionFor", Method, 4}, + {"(*FileSet).Read", Method, 0}, + {"(*FileSet).RemoveFile", Method, 20}, + {"(*FileSet).Write", Method, 0}, + {"(*Position).IsValid", Method, 0}, + {"(Pos).IsValid", Method, 0}, + {"(Position).String", Method, 0}, + {"(Token).IsKeyword", Method, 0}, + {"(Token).IsLiteral", Method, 0}, + {"(Token).IsOperator", Method, 0}, + {"(Token).Precedence", Method, 0}, + {"(Token).String", Method, 0}, + {"ADD", Const, 0}, + {"ADD_ASSIGN", Const, 0}, + {"AND", Const, 0}, + {"AND_ASSIGN", Const, 0}, + {"AND_NOT", Const, 0}, + {"AND_NOT_ASSIGN", Const, 0}, + {"ARROW", Const, 0}, + {"ASSIGN", Const, 0}, + {"BREAK", Const, 0}, + {"CASE", Const, 0}, + {"CHAN", Const, 0}, + {"CHAR", Const, 0}, + {"COLON", Const, 0}, + {"COMMA", Const, 0}, + {"COMMENT", Const, 0}, + {"CONST", Const, 0}, + {"CONTINUE", Const, 0}, + {"DEC", Const, 0}, + {"DEFAULT", Const, 0}, + {"DEFER", Const, 0}, + {"DEFINE", Const, 0}, + {"ELLIPSIS", Const, 0}, + {"ELSE", Const, 0}, + {"EOF", Const, 0}, + {"EQL", Const, 0}, + {"FALLTHROUGH", Const, 0}, + {"FLOAT", Const, 0}, + {"FOR", Const, 0}, + {"FUNC", Const, 0}, + {"File", Type, 0}, + {"FileSet", Type, 0}, + {"GEQ", Const, 0}, + {"GO", Const, 0}, + {"GOTO", Const, 0}, + {"GTR", Const, 0}, + {"HighestPrec", Const, 0}, + {"IDENT", Const, 0}, + {"IF", Const, 0}, + {"ILLEGAL", Const, 0}, + {"IMAG", Const, 0}, + {"IMPORT", Const, 0}, + {"INC", Const, 0}, + {"INT", Const, 0}, + {"INTERFACE", Const, 0}, + {"IsExported", Func, 13}, + {"IsIdentifier", Func, 13}, + {"IsKeyword", Func, 13}, + {"LAND", Const, 0}, + {"LBRACE", Const, 0}, + {"LBRACK", Const, 0}, + {"LEQ", Const, 0}, + {"LOR", Const, 0}, + {"LPAREN", Const, 0}, + {"LSS", Const, 0}, + {"Lookup", Func, 0}, + {"LowestPrec", Const, 0}, + {"MAP", Const, 0}, + {"MUL", Const, 0}, + {"MUL_ASSIGN", Const, 0}, + {"NEQ", Const, 0}, + {"NOT", Const, 0}, + {"NewFileSet", Func, 0}, + {"NoPos", Const, 0}, + {"OR", Const, 0}, + {"OR_ASSIGN", Const, 0}, + {"PACKAGE", Const, 0}, + {"PERIOD", Const, 0}, + {"Pos", Type, 0}, + {"Position", Type, 0}, + {"Position.Column", Field, 0}, + {"Position.Filename", Field, 0}, + {"Position.Line", Field, 0}, + {"Position.Offset", Field, 0}, + {"QUO", Const, 0}, + {"QUO_ASSIGN", Const, 0}, + {"RANGE", Const, 0}, + {"RBRACE", Const, 0}, + {"RBRACK", Const, 0}, + {"REM", Const, 0}, + {"REM_ASSIGN", Const, 0}, + {"RETURN", Const, 0}, + {"RPAREN", Const, 0}, + {"SELECT", Const, 0}, + {"SEMICOLON", Const, 0}, + {"SHL", Const, 0}, + {"SHL_ASSIGN", Const, 0}, + {"SHR", Const, 0}, + {"SHR_ASSIGN", Const, 0}, + {"STRING", Const, 0}, + {"STRUCT", Const, 0}, + {"SUB", Const, 0}, + {"SUB_ASSIGN", Const, 0}, + {"SWITCH", Const, 0}, + {"TILDE", Const, 18}, + {"TYPE", Const, 0}, + {"Token", Type, 0}, + {"UnaryPrec", Const, 0}, + {"VAR", Const, 0}, + {"XOR", Const, 0}, + {"XOR_ASSIGN", Const, 0}, + }, + "go/types": { + {"(*Alias).Obj", Method, 22}, + {"(*Alias).Origin", Method, 23}, + {"(*Alias).Rhs", Method, 23}, + {"(*Alias).SetTypeParams", Method, 23}, + {"(*Alias).String", Method, 22}, + {"(*Alias).TypeArgs", Method, 23}, + {"(*Alias).TypeParams", Method, 23}, + {"(*Alias).Underlying", Method, 22}, + {"(*ArgumentError).Error", Method, 18}, + {"(*ArgumentError).Unwrap", Method, 18}, + {"(*Array).Elem", Method, 5}, + {"(*Array).Len", Method, 5}, + {"(*Array).String", Method, 5}, + {"(*Array).Underlying", Method, 5}, + {"(*Basic).Info", Method, 5}, + {"(*Basic).Kind", Method, 5}, + {"(*Basic).Name", Method, 5}, + {"(*Basic).String", Method, 5}, + {"(*Basic).Underlying", Method, 5}, + {"(*Builtin).Exported", Method, 5}, + {"(*Builtin).Id", Method, 5}, + {"(*Builtin).Name", Method, 5}, + {"(*Builtin).Parent", Method, 5}, + {"(*Builtin).Pkg", Method, 5}, + {"(*Builtin).Pos", Method, 5}, + {"(*Builtin).String", Method, 5}, + {"(*Builtin).Type", Method, 5}, + {"(*Chan).Dir", Method, 5}, + {"(*Chan).Elem", Method, 5}, + {"(*Chan).String", Method, 5}, + {"(*Chan).Underlying", Method, 5}, + {"(*Checker).Files", Method, 5}, + {"(*Config).Check", Method, 5}, + {"(*Const).Exported", Method, 5}, + {"(*Const).Id", Method, 5}, + {"(*Const).Name", Method, 5}, + {"(*Const).Parent", Method, 5}, + {"(*Const).Pkg", Method, 5}, + {"(*Const).Pos", Method, 5}, + {"(*Const).String", Method, 5}, + {"(*Const).Type", Method, 5}, + {"(*Const).Val", Method, 5}, + {"(*Func).Exported", Method, 5}, + {"(*Func).FullName", Method, 5}, + {"(*Func).Id", Method, 5}, + {"(*Func).Name", Method, 5}, + {"(*Func).Origin", Method, 19}, + {"(*Func).Parent", Method, 5}, + {"(*Func).Pkg", Method, 5}, + {"(*Func).Pos", Method, 5}, + {"(*Func).Scope", Method, 5}, + {"(*Func).Signature", Method, 23}, + {"(*Func).String", Method, 5}, + {"(*Func).Type", Method, 5}, + {"(*Info).ObjectOf", Method, 5}, + {"(*Info).PkgNameOf", Method, 22}, + {"(*Info).TypeOf", Method, 5}, + {"(*Initializer).String", Method, 5}, + {"(*Interface).Complete", Method, 5}, + {"(*Interface).Embedded", Method, 5}, + {"(*Interface).EmbeddedType", Method, 11}, + {"(*Interface).Empty", Method, 5}, + {"(*Interface).ExplicitMethod", Method, 5}, + {"(*Interface).IsComparable", Method, 18}, + {"(*Interface).IsImplicit", Method, 18}, + {"(*Interface).IsMethodSet", Method, 18}, + {"(*Interface).MarkImplicit", Method, 18}, + {"(*Interface).Method", Method, 5}, + {"(*Interface).NumEmbeddeds", Method, 5}, + {"(*Interface).NumExplicitMethods", Method, 5}, + {"(*Interface).NumMethods", Method, 5}, + {"(*Interface).String", Method, 5}, + {"(*Interface).Underlying", Method, 5}, + {"(*Label).Exported", Method, 5}, + {"(*Label).Id", Method, 5}, + {"(*Label).Name", Method, 5}, + {"(*Label).Parent", Method, 5}, + {"(*Label).Pkg", Method, 5}, + {"(*Label).Pos", Method, 5}, + {"(*Label).String", Method, 5}, + {"(*Label).Type", Method, 5}, + {"(*Map).Elem", Method, 5}, + {"(*Map).Key", Method, 5}, + {"(*Map).String", Method, 5}, + {"(*Map).Underlying", Method, 5}, + {"(*MethodSet).At", Method, 5}, + {"(*MethodSet).Len", Method, 5}, + {"(*MethodSet).Lookup", Method, 5}, + {"(*MethodSet).String", Method, 5}, + {"(*Named).AddMethod", Method, 5}, + {"(*Named).Method", Method, 5}, + {"(*Named).NumMethods", Method, 5}, + {"(*Named).Obj", Method, 5}, + {"(*Named).Origin", Method, 18}, + {"(*Named).SetTypeParams", Method, 18}, + {"(*Named).SetUnderlying", Method, 5}, + {"(*Named).String", Method, 5}, + {"(*Named).TypeArgs", Method, 18}, + {"(*Named).TypeParams", Method, 18}, + {"(*Named).Underlying", Method, 5}, + {"(*Nil).Exported", Method, 5}, + {"(*Nil).Id", Method, 5}, + {"(*Nil).Name", Method, 5}, + {"(*Nil).Parent", Method, 5}, + {"(*Nil).Pkg", Method, 5}, + {"(*Nil).Pos", Method, 5}, + {"(*Nil).String", Method, 5}, + {"(*Nil).Type", Method, 5}, + {"(*Package).Complete", Method, 5}, + {"(*Package).GoVersion", Method, 21}, + {"(*Package).Imports", Method, 5}, + {"(*Package).MarkComplete", Method, 5}, + {"(*Package).Name", Method, 5}, + {"(*Package).Path", Method, 5}, + {"(*Package).Scope", Method, 5}, + {"(*Package).SetImports", Method, 5}, + {"(*Package).SetName", Method, 6}, + {"(*Package).String", Method, 5}, + {"(*PkgName).Exported", Method, 5}, + {"(*PkgName).Id", Method, 5}, + {"(*PkgName).Imported", Method, 5}, + {"(*PkgName).Name", Method, 5}, + {"(*PkgName).Parent", Method, 5}, + {"(*PkgName).Pkg", Method, 5}, + {"(*PkgName).Pos", Method, 5}, + {"(*PkgName).String", Method, 5}, + {"(*PkgName).Type", Method, 5}, + {"(*Pointer).Elem", Method, 5}, + {"(*Pointer).String", Method, 5}, + {"(*Pointer).Underlying", Method, 5}, + {"(*Scope).Child", Method, 5}, + {"(*Scope).Contains", Method, 5}, + {"(*Scope).End", Method, 5}, + {"(*Scope).Innermost", Method, 5}, + {"(*Scope).Insert", Method, 5}, + {"(*Scope).Len", Method, 5}, + {"(*Scope).Lookup", Method, 5}, + {"(*Scope).LookupParent", Method, 5}, + {"(*Scope).Names", Method, 5}, + {"(*Scope).NumChildren", Method, 5}, + {"(*Scope).Parent", Method, 5}, + {"(*Scope).Pos", Method, 5}, + {"(*Scope).String", Method, 5}, + {"(*Scope).WriteTo", Method, 5}, + {"(*Selection).Index", Method, 5}, + {"(*Selection).Indirect", Method, 5}, + {"(*Selection).Kind", Method, 5}, + {"(*Selection).Obj", Method, 5}, + {"(*Selection).Recv", Method, 5}, + {"(*Selection).String", Method, 5}, + {"(*Selection).Type", Method, 5}, + {"(*Signature).Params", Method, 5}, + {"(*Signature).Recv", Method, 5}, + {"(*Signature).RecvTypeParams", Method, 18}, + {"(*Signature).Results", Method, 5}, + {"(*Signature).String", Method, 5}, + {"(*Signature).TypeParams", Method, 18}, + {"(*Signature).Underlying", Method, 5}, + {"(*Signature).Variadic", Method, 5}, + {"(*Slice).Elem", Method, 5}, + {"(*Slice).String", Method, 5}, + {"(*Slice).Underlying", Method, 5}, + {"(*StdSizes).Alignof", Method, 5}, + {"(*StdSizes).Offsetsof", Method, 5}, + {"(*StdSizes).Sizeof", Method, 5}, + {"(*Struct).Field", Method, 5}, + {"(*Struct).NumFields", Method, 5}, + {"(*Struct).String", Method, 5}, + {"(*Struct).Tag", Method, 5}, + {"(*Struct).Underlying", Method, 5}, + {"(*Term).String", Method, 18}, + {"(*Term).Tilde", Method, 18}, + {"(*Term).Type", Method, 18}, + {"(*Tuple).At", Method, 5}, + {"(*Tuple).Len", Method, 5}, + {"(*Tuple).String", Method, 5}, + {"(*Tuple).Underlying", Method, 5}, + {"(*TypeList).At", Method, 18}, + {"(*TypeList).Len", Method, 18}, + {"(*TypeName).Exported", Method, 5}, + {"(*TypeName).Id", Method, 5}, + {"(*TypeName).IsAlias", Method, 9}, + {"(*TypeName).Name", Method, 5}, + {"(*TypeName).Parent", Method, 5}, + {"(*TypeName).Pkg", Method, 5}, + {"(*TypeName).Pos", Method, 5}, + {"(*TypeName).String", Method, 5}, + {"(*TypeName).Type", Method, 5}, + {"(*TypeParam).Constraint", Method, 18}, + {"(*TypeParam).Index", Method, 18}, + {"(*TypeParam).Obj", Method, 18}, + {"(*TypeParam).SetConstraint", Method, 18}, + {"(*TypeParam).String", Method, 18}, + {"(*TypeParam).Underlying", Method, 18}, + {"(*TypeParamList).At", Method, 18}, + {"(*TypeParamList).Len", Method, 18}, + {"(*Union).Len", Method, 18}, + {"(*Union).String", Method, 18}, + {"(*Union).Term", Method, 18}, + {"(*Union).Underlying", Method, 18}, + {"(*Var).Anonymous", Method, 5}, + {"(*Var).Embedded", Method, 11}, + {"(*Var).Exported", Method, 5}, + {"(*Var).Id", Method, 5}, + {"(*Var).IsField", Method, 5}, + {"(*Var).Name", Method, 5}, + {"(*Var).Origin", Method, 19}, + {"(*Var).Parent", Method, 5}, + {"(*Var).Pkg", Method, 5}, + {"(*Var).Pos", Method, 5}, + {"(*Var).String", Method, 5}, + {"(*Var).Type", Method, 5}, + {"(Checker).ObjectOf", Method, 5}, + {"(Checker).PkgNameOf", Method, 22}, + {"(Checker).TypeOf", Method, 5}, + {"(Error).Error", Method, 5}, + {"(TypeAndValue).Addressable", Method, 5}, + {"(TypeAndValue).Assignable", Method, 5}, + {"(TypeAndValue).HasOk", Method, 5}, + {"(TypeAndValue).IsBuiltin", Method, 5}, + {"(TypeAndValue).IsNil", Method, 5}, + {"(TypeAndValue).IsType", Method, 5}, + {"(TypeAndValue).IsValue", Method, 5}, + {"(TypeAndValue).IsVoid", Method, 5}, + {"Alias", Type, 22}, + {"ArgumentError", Type, 18}, + {"ArgumentError.Err", Field, 18}, + {"ArgumentError.Index", Field, 18}, + {"Array", Type, 5}, + {"AssertableTo", Func, 5}, + {"AssignableTo", Func, 5}, + {"Basic", Type, 5}, + {"BasicInfo", Type, 5}, + {"BasicKind", Type, 5}, + {"Bool", Const, 5}, + {"Builtin", Type, 5}, + {"Byte", Const, 5}, + {"Chan", Type, 5}, + {"ChanDir", Type, 5}, + {"CheckExpr", Func, 13}, + {"Checker", Type, 5}, + {"Checker.Info", Field, 5}, + {"Comparable", Func, 5}, + {"Complex128", Const, 5}, + {"Complex64", Const, 5}, + {"Config", Type, 5}, + {"Config.Context", Field, 18}, + {"Config.DisableUnusedImportCheck", Field, 5}, + {"Config.Error", Field, 5}, + {"Config.FakeImportC", Field, 5}, + {"Config.GoVersion", Field, 18}, + {"Config.IgnoreFuncBodies", Field, 5}, + {"Config.Importer", Field, 5}, + {"Config.Sizes", Field, 5}, + {"Const", Type, 5}, + {"Context", Type, 18}, + {"ConvertibleTo", Func, 5}, + {"DefPredeclaredTestFuncs", Func, 5}, + {"Default", Func, 8}, + {"Error", Type, 5}, + {"Error.Fset", Field, 5}, + {"Error.Msg", Field, 5}, + {"Error.Pos", Field, 5}, + {"Error.Soft", Field, 5}, + {"Eval", Func, 5}, + {"ExprString", Func, 5}, + {"FieldVal", Const, 5}, + {"Float32", Const, 5}, + {"Float64", Const, 5}, + {"Func", Type, 5}, + {"Id", Func, 5}, + {"Identical", Func, 5}, + {"IdenticalIgnoreTags", Func, 8}, + {"Implements", Func, 5}, + {"ImportMode", Type, 6}, + {"Importer", Type, 5}, + {"ImporterFrom", Type, 6}, + {"Info", Type, 5}, + {"Info.Defs", Field, 5}, + {"Info.FileVersions", Field, 22}, + {"Info.Implicits", Field, 5}, + {"Info.InitOrder", Field, 5}, + {"Info.Instances", Field, 18}, + {"Info.Scopes", Field, 5}, + {"Info.Selections", Field, 5}, + {"Info.Types", Field, 5}, + {"Info.Uses", Field, 5}, + {"Initializer", Type, 5}, + {"Initializer.Lhs", Field, 5}, + {"Initializer.Rhs", Field, 5}, + {"Instance", Type, 18}, + {"Instance.Type", Field, 18}, + {"Instance.TypeArgs", Field, 18}, + {"Instantiate", Func, 18}, + {"Int", Const, 5}, + {"Int16", Const, 5}, + {"Int32", Const, 5}, + {"Int64", Const, 5}, + {"Int8", Const, 5}, + {"Interface", Type, 5}, + {"Invalid", Const, 5}, + {"IsBoolean", Const, 5}, + {"IsComplex", Const, 5}, + {"IsConstType", Const, 5}, + {"IsFloat", Const, 5}, + {"IsInteger", Const, 5}, + {"IsInterface", Func, 5}, + {"IsNumeric", Const, 5}, + {"IsOrdered", Const, 5}, + {"IsString", Const, 5}, + {"IsUnsigned", Const, 5}, + {"IsUntyped", Const, 5}, + {"Label", Type, 5}, + {"LookupFieldOrMethod", Func, 5}, + {"Map", Type, 5}, + {"MethodExpr", Const, 5}, + {"MethodSet", Type, 5}, + {"MethodVal", Const, 5}, + {"MissingMethod", Func, 5}, + {"Named", Type, 5}, + {"NewAlias", Func, 22}, + {"NewArray", Func, 5}, + {"NewChan", Func, 5}, + {"NewChecker", Func, 5}, + {"NewConst", Func, 5}, + {"NewContext", Func, 18}, + {"NewField", Func, 5}, + {"NewFunc", Func, 5}, + {"NewInterface", Func, 5}, + {"NewInterfaceType", Func, 11}, + {"NewLabel", Func, 5}, + {"NewMap", Func, 5}, + {"NewMethodSet", Func, 5}, + {"NewNamed", Func, 5}, + {"NewPackage", Func, 5}, + {"NewParam", Func, 5}, + {"NewPkgName", Func, 5}, + {"NewPointer", Func, 5}, + {"NewScope", Func, 5}, + {"NewSignature", Func, 5}, + {"NewSignatureType", Func, 18}, + {"NewSlice", Func, 5}, + {"NewStruct", Func, 5}, + {"NewTerm", Func, 18}, + {"NewTuple", Func, 5}, + {"NewTypeName", Func, 5}, + {"NewTypeParam", Func, 18}, + {"NewUnion", Func, 18}, + {"NewVar", Func, 5}, + {"Nil", Type, 5}, + {"Object", Type, 5}, + {"ObjectString", Func, 5}, + {"Package", Type, 5}, + {"PkgName", Type, 5}, + {"Pointer", Type, 5}, + {"Qualifier", Type, 5}, + {"RecvOnly", Const, 5}, + {"RelativeTo", Func, 5}, + {"Rune", Const, 5}, + {"Satisfies", Func, 20}, + {"Scope", Type, 5}, + {"Selection", Type, 5}, + {"SelectionKind", Type, 5}, + {"SelectionString", Func, 5}, + {"SendOnly", Const, 5}, + {"SendRecv", Const, 5}, + {"Signature", Type, 5}, + {"Sizes", Type, 5}, + {"SizesFor", Func, 9}, + {"Slice", Type, 5}, + {"StdSizes", Type, 5}, + {"StdSizes.MaxAlign", Field, 5}, + {"StdSizes.WordSize", Field, 5}, + {"String", Const, 5}, + {"Struct", Type, 5}, + {"Term", Type, 18}, + {"Tuple", Type, 5}, + {"Typ", Var, 5}, + {"Type", Type, 5}, + {"TypeAndValue", Type, 5}, + {"TypeAndValue.Type", Field, 5}, + {"TypeAndValue.Value", Field, 5}, + {"TypeList", Type, 18}, + {"TypeName", Type, 5}, + {"TypeParam", Type, 18}, + {"TypeParamList", Type, 18}, + {"TypeString", Func, 5}, + {"Uint", Const, 5}, + {"Uint16", Const, 5}, + {"Uint32", Const, 5}, + {"Uint64", Const, 5}, + {"Uint8", Const, 5}, + {"Uintptr", Const, 5}, + {"Unalias", Func, 22}, + {"Union", Type, 18}, + {"Universe", Var, 5}, + {"Unsafe", Var, 5}, + {"UnsafePointer", Const, 5}, + {"UntypedBool", Const, 5}, + {"UntypedComplex", Const, 5}, + {"UntypedFloat", Const, 5}, + {"UntypedInt", Const, 5}, + {"UntypedNil", Const, 5}, + {"UntypedRune", Const, 5}, + {"UntypedString", Const, 5}, + {"Var", Type, 5}, + {"WriteExpr", Func, 5}, + {"WriteSignature", Func, 5}, + {"WriteType", Func, 5}, + }, + "go/version": { + {"Compare", Func, 22}, + {"IsValid", Func, 22}, + {"Lang", Func, 22}, + }, + "hash": { + {"Hash", Type, 0}, + {"Hash32", Type, 0}, + {"Hash64", Type, 0}, + }, + "hash/adler32": { + {"Checksum", Func, 0}, + {"New", Func, 0}, + {"Size", Const, 0}, + }, + "hash/crc32": { + {"Castagnoli", Const, 0}, + {"Checksum", Func, 0}, + {"ChecksumIEEE", Func, 0}, + {"IEEE", Const, 0}, + {"IEEETable", Var, 0}, + {"Koopman", Const, 0}, + {"MakeTable", Func, 0}, + {"New", Func, 0}, + {"NewIEEE", Func, 0}, + {"Size", Const, 0}, + {"Table", Type, 0}, + {"Update", Func, 0}, + }, + "hash/crc64": { + {"Checksum", Func, 0}, + {"ECMA", Const, 0}, + {"ISO", Const, 0}, + {"MakeTable", Func, 0}, + {"New", Func, 0}, + {"Size", Const, 0}, + {"Table", Type, 0}, + {"Update", Func, 0}, + }, + "hash/fnv": { + {"New128", Func, 9}, + {"New128a", Func, 9}, + {"New32", Func, 0}, + {"New32a", Func, 0}, + {"New64", Func, 0}, + {"New64a", Func, 0}, + }, + "hash/maphash": { + {"(*Hash).BlockSize", Method, 14}, + {"(*Hash).Reset", Method, 14}, + {"(*Hash).Seed", Method, 14}, + {"(*Hash).SetSeed", Method, 14}, + {"(*Hash).Size", Method, 14}, + {"(*Hash).Sum", Method, 14}, + {"(*Hash).Sum64", Method, 14}, + {"(*Hash).Write", Method, 14}, + {"(*Hash).WriteByte", Method, 14}, + {"(*Hash).WriteString", Method, 14}, + {"Bytes", Func, 19}, + {"Hash", Type, 14}, + {"MakeSeed", Func, 14}, + {"Seed", Type, 14}, + {"String", Func, 19}, + }, + "html": { + {"EscapeString", Func, 0}, + {"UnescapeString", Func, 0}, + }, + "html/template": { + {"(*Error).Error", Method, 0}, + {"(*Template).AddParseTree", Method, 0}, + {"(*Template).Clone", Method, 0}, + {"(*Template).DefinedTemplates", Method, 6}, + {"(*Template).Delims", Method, 0}, + {"(*Template).Execute", Method, 0}, + {"(*Template).ExecuteTemplate", Method, 0}, + {"(*Template).Funcs", Method, 0}, + {"(*Template).Lookup", Method, 0}, + {"(*Template).Name", Method, 0}, + {"(*Template).New", Method, 0}, + {"(*Template).Option", Method, 5}, + {"(*Template).Parse", Method, 0}, + {"(*Template).ParseFS", Method, 16}, + {"(*Template).ParseFiles", Method, 0}, + {"(*Template).ParseGlob", Method, 0}, + {"(*Template).Templates", Method, 0}, + {"CSS", Type, 0}, + {"ErrAmbigContext", Const, 0}, + {"ErrBadHTML", Const, 0}, + {"ErrBranchEnd", Const, 0}, + {"ErrEndContext", Const, 0}, + {"ErrJSTemplate", Const, 21}, + {"ErrNoSuchTemplate", Const, 0}, + {"ErrOutputContext", Const, 0}, + {"ErrPartialCharset", Const, 0}, + {"ErrPartialEscape", Const, 0}, + {"ErrPredefinedEscaper", Const, 9}, + {"ErrRangeLoopReentry", Const, 0}, + {"ErrSlashAmbig", Const, 0}, + {"Error", Type, 0}, + {"Error.Description", Field, 0}, + {"Error.ErrorCode", Field, 0}, + {"Error.Line", Field, 0}, + {"Error.Name", Field, 0}, + {"Error.Node", Field, 4}, + {"ErrorCode", Type, 0}, + {"FuncMap", Type, 0}, + {"HTML", Type, 0}, + {"HTMLAttr", Type, 0}, + {"HTMLEscape", Func, 0}, + {"HTMLEscapeString", Func, 0}, + {"HTMLEscaper", Func, 0}, + {"IsTrue", Func, 6}, + {"JS", Type, 0}, + {"JSEscape", Func, 0}, + {"JSEscapeString", Func, 0}, + {"JSEscaper", Func, 0}, + {"JSStr", Type, 0}, + {"Must", Func, 0}, + {"New", Func, 0}, + {"OK", Const, 0}, + {"ParseFS", Func, 16}, + {"ParseFiles", Func, 0}, + {"ParseGlob", Func, 0}, + {"Srcset", Type, 10}, + {"Template", Type, 0}, + {"Template.Tree", Field, 2}, + {"URL", Type, 0}, + {"URLQueryEscaper", Func, 0}, + }, + "image": { + {"(*Alpha).AlphaAt", Method, 4}, + {"(*Alpha).At", Method, 0}, + {"(*Alpha).Bounds", Method, 0}, + {"(*Alpha).ColorModel", Method, 0}, + {"(*Alpha).Opaque", Method, 0}, + {"(*Alpha).PixOffset", Method, 0}, + {"(*Alpha).RGBA64At", Method, 17}, + {"(*Alpha).Set", Method, 0}, + {"(*Alpha).SetAlpha", Method, 0}, + {"(*Alpha).SetRGBA64", Method, 17}, + {"(*Alpha).SubImage", Method, 0}, + {"(*Alpha16).Alpha16At", Method, 4}, + {"(*Alpha16).At", Method, 0}, + {"(*Alpha16).Bounds", Method, 0}, + {"(*Alpha16).ColorModel", Method, 0}, + {"(*Alpha16).Opaque", Method, 0}, + {"(*Alpha16).PixOffset", Method, 0}, + {"(*Alpha16).RGBA64At", Method, 17}, + {"(*Alpha16).Set", Method, 0}, + {"(*Alpha16).SetAlpha16", Method, 0}, + {"(*Alpha16).SetRGBA64", Method, 17}, + {"(*Alpha16).SubImage", Method, 0}, + {"(*CMYK).At", Method, 5}, + {"(*CMYK).Bounds", Method, 5}, + {"(*CMYK).CMYKAt", Method, 5}, + {"(*CMYK).ColorModel", Method, 5}, + {"(*CMYK).Opaque", Method, 5}, + {"(*CMYK).PixOffset", Method, 5}, + {"(*CMYK).RGBA64At", Method, 17}, + {"(*CMYK).Set", Method, 5}, + {"(*CMYK).SetCMYK", Method, 5}, + {"(*CMYK).SetRGBA64", Method, 17}, + {"(*CMYK).SubImage", Method, 5}, + {"(*Gray).At", Method, 0}, + {"(*Gray).Bounds", Method, 0}, + {"(*Gray).ColorModel", Method, 0}, + {"(*Gray).GrayAt", Method, 4}, + {"(*Gray).Opaque", Method, 0}, + {"(*Gray).PixOffset", Method, 0}, + {"(*Gray).RGBA64At", Method, 17}, + {"(*Gray).Set", Method, 0}, + {"(*Gray).SetGray", Method, 0}, + {"(*Gray).SetRGBA64", Method, 17}, + {"(*Gray).SubImage", Method, 0}, + {"(*Gray16).At", Method, 0}, + {"(*Gray16).Bounds", Method, 0}, + {"(*Gray16).ColorModel", Method, 0}, + {"(*Gray16).Gray16At", Method, 4}, + {"(*Gray16).Opaque", Method, 0}, + {"(*Gray16).PixOffset", Method, 0}, + {"(*Gray16).RGBA64At", Method, 17}, + {"(*Gray16).Set", Method, 0}, + {"(*Gray16).SetGray16", Method, 0}, + {"(*Gray16).SetRGBA64", Method, 17}, + {"(*Gray16).SubImage", Method, 0}, + {"(*NRGBA).At", Method, 0}, + {"(*NRGBA).Bounds", Method, 0}, + {"(*NRGBA).ColorModel", Method, 0}, + {"(*NRGBA).NRGBAAt", Method, 4}, + {"(*NRGBA).Opaque", Method, 0}, + {"(*NRGBA).PixOffset", Method, 0}, + {"(*NRGBA).RGBA64At", Method, 17}, + {"(*NRGBA).Set", Method, 0}, + {"(*NRGBA).SetNRGBA", Method, 0}, + {"(*NRGBA).SetRGBA64", Method, 17}, + {"(*NRGBA).SubImage", Method, 0}, + {"(*NRGBA64).At", Method, 0}, + {"(*NRGBA64).Bounds", Method, 0}, + {"(*NRGBA64).ColorModel", Method, 0}, + {"(*NRGBA64).NRGBA64At", Method, 4}, + {"(*NRGBA64).Opaque", Method, 0}, + {"(*NRGBA64).PixOffset", Method, 0}, + {"(*NRGBA64).RGBA64At", Method, 17}, + {"(*NRGBA64).Set", Method, 0}, + {"(*NRGBA64).SetNRGBA64", Method, 0}, + {"(*NRGBA64).SetRGBA64", Method, 17}, + {"(*NRGBA64).SubImage", Method, 0}, + {"(*NYCbCrA).AOffset", Method, 6}, + {"(*NYCbCrA).At", Method, 6}, + {"(*NYCbCrA).Bounds", Method, 6}, + {"(*NYCbCrA).COffset", Method, 6}, + {"(*NYCbCrA).ColorModel", Method, 6}, + {"(*NYCbCrA).NYCbCrAAt", Method, 6}, + {"(*NYCbCrA).Opaque", Method, 6}, + {"(*NYCbCrA).RGBA64At", Method, 17}, + {"(*NYCbCrA).SubImage", Method, 6}, + {"(*NYCbCrA).YCbCrAt", Method, 6}, + {"(*NYCbCrA).YOffset", Method, 6}, + {"(*Paletted).At", Method, 0}, + {"(*Paletted).Bounds", Method, 0}, + {"(*Paletted).ColorIndexAt", Method, 0}, + {"(*Paletted).ColorModel", Method, 0}, + {"(*Paletted).Opaque", Method, 0}, + {"(*Paletted).PixOffset", Method, 0}, + {"(*Paletted).RGBA64At", Method, 17}, + {"(*Paletted).Set", Method, 0}, + {"(*Paletted).SetColorIndex", Method, 0}, + {"(*Paletted).SetRGBA64", Method, 17}, + {"(*Paletted).SubImage", Method, 0}, + {"(*RGBA).At", Method, 0}, + {"(*RGBA).Bounds", Method, 0}, + {"(*RGBA).ColorModel", Method, 0}, + {"(*RGBA).Opaque", Method, 0}, + {"(*RGBA).PixOffset", Method, 0}, + {"(*RGBA).RGBA64At", Method, 17}, + {"(*RGBA).RGBAAt", Method, 4}, + {"(*RGBA).Set", Method, 0}, + {"(*RGBA).SetRGBA", Method, 0}, + {"(*RGBA).SetRGBA64", Method, 17}, + {"(*RGBA).SubImage", Method, 0}, + {"(*RGBA64).At", Method, 0}, + {"(*RGBA64).Bounds", Method, 0}, + {"(*RGBA64).ColorModel", Method, 0}, + {"(*RGBA64).Opaque", Method, 0}, + {"(*RGBA64).PixOffset", Method, 0}, + {"(*RGBA64).RGBA64At", Method, 4}, + {"(*RGBA64).Set", Method, 0}, + {"(*RGBA64).SetRGBA64", Method, 0}, + {"(*RGBA64).SubImage", Method, 0}, + {"(*Uniform).At", Method, 0}, + {"(*Uniform).Bounds", Method, 0}, + {"(*Uniform).ColorModel", Method, 0}, + {"(*Uniform).Convert", Method, 0}, + {"(*Uniform).Opaque", Method, 0}, + {"(*Uniform).RGBA", Method, 0}, + {"(*Uniform).RGBA64At", Method, 17}, + {"(*YCbCr).At", Method, 0}, + {"(*YCbCr).Bounds", Method, 0}, + {"(*YCbCr).COffset", Method, 0}, + {"(*YCbCr).ColorModel", Method, 0}, + {"(*YCbCr).Opaque", Method, 0}, + {"(*YCbCr).RGBA64At", Method, 17}, + {"(*YCbCr).SubImage", Method, 0}, + {"(*YCbCr).YCbCrAt", Method, 4}, + {"(*YCbCr).YOffset", Method, 0}, + {"(Point).Add", Method, 0}, + {"(Point).Div", Method, 0}, + {"(Point).Eq", Method, 0}, + {"(Point).In", Method, 0}, + {"(Point).Mod", Method, 0}, + {"(Point).Mul", Method, 0}, + {"(Point).String", Method, 0}, + {"(Point).Sub", Method, 0}, + {"(Rectangle).Add", Method, 0}, + {"(Rectangle).At", Method, 5}, + {"(Rectangle).Bounds", Method, 5}, + {"(Rectangle).Canon", Method, 0}, + {"(Rectangle).ColorModel", Method, 5}, + {"(Rectangle).Dx", Method, 0}, + {"(Rectangle).Dy", Method, 0}, + {"(Rectangle).Empty", Method, 0}, + {"(Rectangle).Eq", Method, 0}, + {"(Rectangle).In", Method, 0}, + {"(Rectangle).Inset", Method, 0}, + {"(Rectangle).Intersect", Method, 0}, + {"(Rectangle).Overlaps", Method, 0}, + {"(Rectangle).RGBA64At", Method, 17}, + {"(Rectangle).Size", Method, 0}, + {"(Rectangle).String", Method, 0}, + {"(Rectangle).Sub", Method, 0}, + {"(Rectangle).Union", Method, 0}, + {"(YCbCrSubsampleRatio).String", Method, 0}, + {"Alpha", Type, 0}, + {"Alpha.Pix", Field, 0}, + {"Alpha.Rect", Field, 0}, + {"Alpha.Stride", Field, 0}, + {"Alpha16", Type, 0}, + {"Alpha16.Pix", Field, 0}, + {"Alpha16.Rect", Field, 0}, + {"Alpha16.Stride", Field, 0}, + {"Black", Var, 0}, + {"CMYK", Type, 5}, + {"CMYK.Pix", Field, 5}, + {"CMYK.Rect", Field, 5}, + {"CMYK.Stride", Field, 5}, + {"Config", Type, 0}, + {"Config.ColorModel", Field, 0}, + {"Config.Height", Field, 0}, + {"Config.Width", Field, 0}, + {"Decode", Func, 0}, + {"DecodeConfig", Func, 0}, + {"ErrFormat", Var, 0}, + {"Gray", Type, 0}, + {"Gray.Pix", Field, 0}, + {"Gray.Rect", Field, 0}, + {"Gray.Stride", Field, 0}, + {"Gray16", Type, 0}, + {"Gray16.Pix", Field, 0}, + {"Gray16.Rect", Field, 0}, + {"Gray16.Stride", Field, 0}, + {"Image", Type, 0}, + {"NRGBA", Type, 0}, + {"NRGBA.Pix", Field, 0}, + {"NRGBA.Rect", Field, 0}, + {"NRGBA.Stride", Field, 0}, + {"NRGBA64", Type, 0}, + {"NRGBA64.Pix", Field, 0}, + {"NRGBA64.Rect", Field, 0}, + {"NRGBA64.Stride", Field, 0}, + {"NYCbCrA", Type, 6}, + {"NYCbCrA.A", Field, 6}, + {"NYCbCrA.AStride", Field, 6}, + {"NYCbCrA.YCbCr", Field, 6}, + {"NewAlpha", Func, 0}, + {"NewAlpha16", Func, 0}, + {"NewCMYK", Func, 5}, + {"NewGray", Func, 0}, + {"NewGray16", Func, 0}, + {"NewNRGBA", Func, 0}, + {"NewNRGBA64", Func, 0}, + {"NewNYCbCrA", Func, 6}, + {"NewPaletted", Func, 0}, + {"NewRGBA", Func, 0}, + {"NewRGBA64", Func, 0}, + {"NewUniform", Func, 0}, + {"NewYCbCr", Func, 0}, + {"Opaque", Var, 0}, + {"Paletted", Type, 0}, + {"Paletted.Palette", Field, 0}, + {"Paletted.Pix", Field, 0}, + {"Paletted.Rect", Field, 0}, + {"Paletted.Stride", Field, 0}, + {"PalettedImage", Type, 0}, + {"Point", Type, 0}, + {"Point.X", Field, 0}, + {"Point.Y", Field, 0}, + {"Pt", Func, 0}, + {"RGBA", Type, 0}, + {"RGBA.Pix", Field, 0}, + {"RGBA.Rect", Field, 0}, + {"RGBA.Stride", Field, 0}, + {"RGBA64", Type, 0}, + {"RGBA64.Pix", Field, 0}, + {"RGBA64.Rect", Field, 0}, + {"RGBA64.Stride", Field, 0}, + {"RGBA64Image", Type, 17}, + {"Rect", Func, 0}, + {"Rectangle", Type, 0}, + {"Rectangle.Max", Field, 0}, + {"Rectangle.Min", Field, 0}, + {"RegisterFormat", Func, 0}, + {"Transparent", Var, 0}, + {"Uniform", Type, 0}, + {"Uniform.C", Field, 0}, + {"White", Var, 0}, + {"YCbCr", Type, 0}, + {"YCbCr.CStride", Field, 0}, + {"YCbCr.Cb", Field, 0}, + {"YCbCr.Cr", Field, 0}, + {"YCbCr.Rect", Field, 0}, + {"YCbCr.SubsampleRatio", Field, 0}, + {"YCbCr.Y", Field, 0}, + {"YCbCr.YStride", Field, 0}, + {"YCbCrSubsampleRatio", Type, 0}, + {"YCbCrSubsampleRatio410", Const, 5}, + {"YCbCrSubsampleRatio411", Const, 5}, + {"YCbCrSubsampleRatio420", Const, 0}, + {"YCbCrSubsampleRatio422", Const, 0}, + {"YCbCrSubsampleRatio440", Const, 1}, + {"YCbCrSubsampleRatio444", Const, 0}, + {"ZP", Var, 0}, + {"ZR", Var, 0}, + }, + "image/color": { + {"(Alpha).RGBA", Method, 0}, + {"(Alpha16).RGBA", Method, 0}, + {"(CMYK).RGBA", Method, 5}, + {"(Gray).RGBA", Method, 0}, + {"(Gray16).RGBA", Method, 0}, + {"(NRGBA).RGBA", Method, 0}, + {"(NRGBA64).RGBA", Method, 0}, + {"(NYCbCrA).RGBA", Method, 6}, + {"(Palette).Convert", Method, 0}, + {"(Palette).Index", Method, 0}, + {"(RGBA).RGBA", Method, 0}, + {"(RGBA64).RGBA", Method, 0}, + {"(YCbCr).RGBA", Method, 0}, + {"Alpha", Type, 0}, + {"Alpha.A", Field, 0}, + {"Alpha16", Type, 0}, + {"Alpha16.A", Field, 0}, + {"Alpha16Model", Var, 0}, + {"AlphaModel", Var, 0}, + {"Black", Var, 0}, + {"CMYK", Type, 5}, + {"CMYK.C", Field, 5}, + {"CMYK.K", Field, 5}, + {"CMYK.M", Field, 5}, + {"CMYK.Y", Field, 5}, + {"CMYKModel", Var, 5}, + {"CMYKToRGB", Func, 5}, + {"Color", Type, 0}, + {"Gray", Type, 0}, + {"Gray.Y", Field, 0}, + {"Gray16", Type, 0}, + {"Gray16.Y", Field, 0}, + {"Gray16Model", Var, 0}, + {"GrayModel", Var, 0}, + {"Model", Type, 0}, + {"ModelFunc", Func, 0}, + {"NRGBA", Type, 0}, + {"NRGBA.A", Field, 0}, + {"NRGBA.B", Field, 0}, + {"NRGBA.G", Field, 0}, + {"NRGBA.R", Field, 0}, + {"NRGBA64", Type, 0}, + {"NRGBA64.A", Field, 0}, + {"NRGBA64.B", Field, 0}, + {"NRGBA64.G", Field, 0}, + {"NRGBA64.R", Field, 0}, + {"NRGBA64Model", Var, 0}, + {"NRGBAModel", Var, 0}, + {"NYCbCrA", Type, 6}, + {"NYCbCrA.A", Field, 6}, + {"NYCbCrA.YCbCr", Field, 6}, + {"NYCbCrAModel", Var, 6}, + {"Opaque", Var, 0}, + {"Palette", Type, 0}, + {"RGBA", Type, 0}, + {"RGBA.A", Field, 0}, + {"RGBA.B", Field, 0}, + {"RGBA.G", Field, 0}, + {"RGBA.R", Field, 0}, + {"RGBA64", Type, 0}, + {"RGBA64.A", Field, 0}, + {"RGBA64.B", Field, 0}, + {"RGBA64.G", Field, 0}, + {"RGBA64.R", Field, 0}, + {"RGBA64Model", Var, 0}, + {"RGBAModel", Var, 0}, + {"RGBToCMYK", Func, 5}, + {"RGBToYCbCr", Func, 0}, + {"Transparent", Var, 0}, + {"White", Var, 0}, + {"YCbCr", Type, 0}, + {"YCbCr.Cb", Field, 0}, + {"YCbCr.Cr", Field, 0}, + {"YCbCr.Y", Field, 0}, + {"YCbCrModel", Var, 0}, + {"YCbCrToRGB", Func, 0}, + }, + "image/color/palette": { + {"Plan9", Var, 2}, + {"WebSafe", Var, 2}, + }, + "image/draw": { + {"(Op).Draw", Method, 2}, + {"Draw", Func, 0}, + {"DrawMask", Func, 0}, + {"Drawer", Type, 2}, + {"FloydSteinberg", Var, 2}, + {"Image", Type, 0}, + {"Op", Type, 0}, + {"Over", Const, 0}, + {"Quantizer", Type, 2}, + {"RGBA64Image", Type, 17}, + {"Src", Const, 0}, + }, + "image/gif": { + {"Decode", Func, 0}, + {"DecodeAll", Func, 0}, + {"DecodeConfig", Func, 0}, + {"DisposalBackground", Const, 5}, + {"DisposalNone", Const, 5}, + {"DisposalPrevious", Const, 5}, + {"Encode", Func, 2}, + {"EncodeAll", Func, 2}, + {"GIF", Type, 0}, + {"GIF.BackgroundIndex", Field, 5}, + {"GIF.Config", Field, 5}, + {"GIF.Delay", Field, 0}, + {"GIF.Disposal", Field, 5}, + {"GIF.Image", Field, 0}, + {"GIF.LoopCount", Field, 0}, + {"Options", Type, 2}, + {"Options.Drawer", Field, 2}, + {"Options.NumColors", Field, 2}, + {"Options.Quantizer", Field, 2}, + }, + "image/jpeg": { + {"(FormatError).Error", Method, 0}, + {"(UnsupportedError).Error", Method, 0}, + {"Decode", Func, 0}, + {"DecodeConfig", Func, 0}, + {"DefaultQuality", Const, 0}, + {"Encode", Func, 0}, + {"FormatError", Type, 0}, + {"Options", Type, 0}, + {"Options.Quality", Field, 0}, + {"Reader", Type, 0}, + {"UnsupportedError", Type, 0}, + }, + "image/png": { + {"(*Encoder).Encode", Method, 4}, + {"(FormatError).Error", Method, 0}, + {"(UnsupportedError).Error", Method, 0}, + {"BestCompression", Const, 4}, + {"BestSpeed", Const, 4}, + {"CompressionLevel", Type, 4}, + {"Decode", Func, 0}, + {"DecodeConfig", Func, 0}, + {"DefaultCompression", Const, 4}, + {"Encode", Func, 0}, + {"Encoder", Type, 4}, + {"Encoder.BufferPool", Field, 9}, + {"Encoder.CompressionLevel", Field, 4}, + {"EncoderBuffer", Type, 9}, + {"EncoderBufferPool", Type, 9}, + {"FormatError", Type, 0}, + {"NoCompression", Const, 4}, + {"UnsupportedError", Type, 0}, + }, + "index/suffixarray": { + {"(*Index).Bytes", Method, 0}, + {"(*Index).FindAllIndex", Method, 0}, + {"(*Index).Lookup", Method, 0}, + {"(*Index).Read", Method, 0}, + {"(*Index).Write", Method, 0}, + {"Index", Type, 0}, + {"New", Func, 0}, + }, + "io": { + {"(*LimitedReader).Read", Method, 0}, + {"(*OffsetWriter).Seek", Method, 20}, + {"(*OffsetWriter).Write", Method, 20}, + {"(*OffsetWriter).WriteAt", Method, 20}, + {"(*PipeReader).Close", Method, 0}, + {"(*PipeReader).CloseWithError", Method, 0}, + {"(*PipeReader).Read", Method, 0}, + {"(*PipeWriter).Close", Method, 0}, + {"(*PipeWriter).CloseWithError", Method, 0}, + {"(*PipeWriter).Write", Method, 0}, + {"(*SectionReader).Outer", Method, 22}, + {"(*SectionReader).Read", Method, 0}, + {"(*SectionReader).ReadAt", Method, 0}, + {"(*SectionReader).Seek", Method, 0}, + {"(*SectionReader).Size", Method, 0}, + {"ByteReader", Type, 0}, + {"ByteScanner", Type, 0}, + {"ByteWriter", Type, 1}, + {"Closer", Type, 0}, + {"Copy", Func, 0}, + {"CopyBuffer", Func, 5}, + {"CopyN", Func, 0}, + {"Discard", Var, 16}, + {"EOF", Var, 0}, + {"ErrClosedPipe", Var, 0}, + {"ErrNoProgress", Var, 1}, + {"ErrShortBuffer", Var, 0}, + {"ErrShortWrite", Var, 0}, + {"ErrUnexpectedEOF", Var, 0}, + {"LimitReader", Func, 0}, + {"LimitedReader", Type, 0}, + {"LimitedReader.N", Field, 0}, + {"LimitedReader.R", Field, 0}, + {"MultiReader", Func, 0}, + {"MultiWriter", Func, 0}, + {"NewOffsetWriter", Func, 20}, + {"NewSectionReader", Func, 0}, + {"NopCloser", Func, 16}, + {"OffsetWriter", Type, 20}, + {"Pipe", Func, 0}, + {"PipeReader", Type, 0}, + {"PipeWriter", Type, 0}, + {"ReadAll", Func, 16}, + {"ReadAtLeast", Func, 0}, + {"ReadCloser", Type, 0}, + {"ReadFull", Func, 0}, + {"ReadSeekCloser", Type, 16}, + {"ReadSeeker", Type, 0}, + {"ReadWriteCloser", Type, 0}, + {"ReadWriteSeeker", Type, 0}, + {"ReadWriter", Type, 0}, + {"Reader", Type, 0}, + {"ReaderAt", Type, 0}, + {"ReaderFrom", Type, 0}, + {"RuneReader", Type, 0}, + {"RuneScanner", Type, 0}, + {"SectionReader", Type, 0}, + {"SeekCurrent", Const, 7}, + {"SeekEnd", Const, 7}, + {"SeekStart", Const, 7}, + {"Seeker", Type, 0}, + {"StringWriter", Type, 12}, + {"TeeReader", Func, 0}, + {"WriteCloser", Type, 0}, + {"WriteSeeker", Type, 0}, + {"WriteString", Func, 0}, + {"Writer", Type, 0}, + {"WriterAt", Type, 0}, + {"WriterTo", Type, 0}, + }, + "io/fs": { + {"(*PathError).Error", Method, 16}, + {"(*PathError).Timeout", Method, 16}, + {"(*PathError).Unwrap", Method, 16}, + {"(FileMode).IsDir", Method, 16}, + {"(FileMode).IsRegular", Method, 16}, + {"(FileMode).Perm", Method, 16}, + {"(FileMode).String", Method, 16}, + {"(FileMode).Type", Method, 16}, + {"DirEntry", Type, 16}, + {"ErrClosed", Var, 16}, + {"ErrExist", Var, 16}, + {"ErrInvalid", Var, 16}, + {"ErrNotExist", Var, 16}, + {"ErrPermission", Var, 16}, + {"FS", Type, 16}, + {"File", Type, 16}, + {"FileInfo", Type, 16}, + {"FileInfoToDirEntry", Func, 17}, + {"FileMode", Type, 16}, + {"FormatDirEntry", Func, 21}, + {"FormatFileInfo", Func, 21}, + {"Glob", Func, 16}, + {"GlobFS", Type, 16}, + {"ModeAppend", Const, 16}, + {"ModeCharDevice", Const, 16}, + {"ModeDevice", Const, 16}, + {"ModeDir", Const, 16}, + {"ModeExclusive", Const, 16}, + {"ModeIrregular", Const, 16}, + {"ModeNamedPipe", Const, 16}, + {"ModePerm", Const, 16}, + {"ModeSetgid", Const, 16}, + {"ModeSetuid", Const, 16}, + {"ModeSocket", Const, 16}, + {"ModeSticky", Const, 16}, + {"ModeSymlink", Const, 16}, + {"ModeTemporary", Const, 16}, + {"ModeType", Const, 16}, + {"PathError", Type, 16}, + {"PathError.Err", Field, 16}, + {"PathError.Op", Field, 16}, + {"PathError.Path", Field, 16}, + {"ReadDir", Func, 16}, + {"ReadDirFS", Type, 16}, + {"ReadDirFile", Type, 16}, + {"ReadFile", Func, 16}, + {"ReadFileFS", Type, 16}, + {"SkipAll", Var, 20}, + {"SkipDir", Var, 16}, + {"Stat", Func, 16}, + {"StatFS", Type, 16}, + {"Sub", Func, 16}, + {"SubFS", Type, 16}, + {"ValidPath", Func, 16}, + {"WalkDir", Func, 16}, + {"WalkDirFunc", Type, 16}, + }, + "io/ioutil": { + {"Discard", Var, 0}, + {"NopCloser", Func, 0}, + {"ReadAll", Func, 0}, + {"ReadDir", Func, 0}, + {"ReadFile", Func, 0}, + {"TempDir", Func, 0}, + {"TempFile", Func, 0}, + {"WriteFile", Func, 0}, + }, + "iter": { + {"Pull", Func, 23}, + {"Pull2", Func, 23}, + {"Seq", Type, 23}, + {"Seq2", Type, 23}, + }, + "log": { + {"(*Logger).Fatal", Method, 0}, + {"(*Logger).Fatalf", Method, 0}, + {"(*Logger).Fatalln", Method, 0}, + {"(*Logger).Flags", Method, 0}, + {"(*Logger).Output", Method, 0}, + {"(*Logger).Panic", Method, 0}, + {"(*Logger).Panicf", Method, 0}, + {"(*Logger).Panicln", Method, 0}, + {"(*Logger).Prefix", Method, 0}, + {"(*Logger).Print", Method, 0}, + {"(*Logger).Printf", Method, 0}, + {"(*Logger).Println", Method, 0}, + {"(*Logger).SetFlags", Method, 0}, + {"(*Logger).SetOutput", Method, 5}, + {"(*Logger).SetPrefix", Method, 0}, + {"(*Logger).Writer", Method, 12}, + {"Default", Func, 16}, + {"Fatal", Func, 0}, + {"Fatalf", Func, 0}, + {"Fatalln", Func, 0}, + {"Flags", Func, 0}, + {"LUTC", Const, 5}, + {"Ldate", Const, 0}, + {"Llongfile", Const, 0}, + {"Lmicroseconds", Const, 0}, + {"Lmsgprefix", Const, 14}, + {"Logger", Type, 0}, + {"Lshortfile", Const, 0}, + {"LstdFlags", Const, 0}, + {"Ltime", Const, 0}, + {"New", Func, 0}, + {"Output", Func, 5}, + {"Panic", Func, 0}, + {"Panicf", Func, 0}, + {"Panicln", Func, 0}, + {"Prefix", Func, 0}, + {"Print", Func, 0}, + {"Printf", Func, 0}, + {"Println", Func, 0}, + {"SetFlags", Func, 0}, + {"SetOutput", Func, 0}, + {"SetPrefix", Func, 0}, + {"Writer", Func, 13}, + }, + "log/slog": { + {"(*JSONHandler).Enabled", Method, 21}, + {"(*JSONHandler).Handle", Method, 21}, + {"(*JSONHandler).WithAttrs", Method, 21}, + {"(*JSONHandler).WithGroup", Method, 21}, + {"(*Level).UnmarshalJSON", Method, 21}, + {"(*Level).UnmarshalText", Method, 21}, + {"(*LevelVar).Level", Method, 21}, + {"(*LevelVar).MarshalText", Method, 21}, + {"(*LevelVar).Set", Method, 21}, + {"(*LevelVar).String", Method, 21}, + {"(*LevelVar).UnmarshalText", Method, 21}, + {"(*Logger).Debug", Method, 21}, + {"(*Logger).DebugContext", Method, 21}, + {"(*Logger).Enabled", Method, 21}, + {"(*Logger).Error", Method, 21}, + {"(*Logger).ErrorContext", Method, 21}, + {"(*Logger).Handler", Method, 21}, + {"(*Logger).Info", Method, 21}, + {"(*Logger).InfoContext", Method, 21}, + {"(*Logger).Log", Method, 21}, + {"(*Logger).LogAttrs", Method, 21}, + {"(*Logger).Warn", Method, 21}, + {"(*Logger).WarnContext", Method, 21}, + {"(*Logger).With", Method, 21}, + {"(*Logger).WithGroup", Method, 21}, + {"(*Record).Add", Method, 21}, + {"(*Record).AddAttrs", Method, 21}, + {"(*TextHandler).Enabled", Method, 21}, + {"(*TextHandler).Handle", Method, 21}, + {"(*TextHandler).WithAttrs", Method, 21}, + {"(*TextHandler).WithGroup", Method, 21}, + {"(Attr).Equal", Method, 21}, + {"(Attr).String", Method, 21}, + {"(Kind).String", Method, 21}, + {"(Level).Level", Method, 21}, + {"(Level).MarshalJSON", Method, 21}, + {"(Level).MarshalText", Method, 21}, + {"(Level).String", Method, 21}, + {"(Record).Attrs", Method, 21}, + {"(Record).Clone", Method, 21}, + {"(Record).NumAttrs", Method, 21}, + {"(Value).Any", Method, 21}, + {"(Value).Bool", Method, 21}, + {"(Value).Duration", Method, 21}, + {"(Value).Equal", Method, 21}, + {"(Value).Float64", Method, 21}, + {"(Value).Group", Method, 21}, + {"(Value).Int64", Method, 21}, + {"(Value).Kind", Method, 21}, + {"(Value).LogValuer", Method, 21}, + {"(Value).Resolve", Method, 21}, + {"(Value).String", Method, 21}, + {"(Value).Time", Method, 21}, + {"(Value).Uint64", Method, 21}, + {"Any", Func, 21}, + {"AnyValue", Func, 21}, + {"Attr", Type, 21}, + {"Attr.Key", Field, 21}, + {"Attr.Value", Field, 21}, + {"Bool", Func, 21}, + {"BoolValue", Func, 21}, + {"Debug", Func, 21}, + {"DebugContext", Func, 21}, + {"Default", Func, 21}, + {"Duration", Func, 21}, + {"DurationValue", Func, 21}, + {"Error", Func, 21}, + {"ErrorContext", Func, 21}, + {"Float64", Func, 21}, + {"Float64Value", Func, 21}, + {"Group", Func, 21}, + {"GroupValue", Func, 21}, + {"Handler", Type, 21}, + {"HandlerOptions", Type, 21}, + {"HandlerOptions.AddSource", Field, 21}, + {"HandlerOptions.Level", Field, 21}, + {"HandlerOptions.ReplaceAttr", Field, 21}, + {"Info", Func, 21}, + {"InfoContext", Func, 21}, + {"Int", Func, 21}, + {"Int64", Func, 21}, + {"Int64Value", Func, 21}, + {"IntValue", Func, 21}, + {"JSONHandler", Type, 21}, + {"Kind", Type, 21}, + {"KindAny", Const, 21}, + {"KindBool", Const, 21}, + {"KindDuration", Const, 21}, + {"KindFloat64", Const, 21}, + {"KindGroup", Const, 21}, + {"KindInt64", Const, 21}, + {"KindLogValuer", Const, 21}, + {"KindString", Const, 21}, + {"KindTime", Const, 21}, + {"KindUint64", Const, 21}, + {"Level", Type, 21}, + {"LevelDebug", Const, 21}, + {"LevelError", Const, 21}, + {"LevelInfo", Const, 21}, + {"LevelKey", Const, 21}, + {"LevelVar", Type, 21}, + {"LevelWarn", Const, 21}, + {"Leveler", Type, 21}, + {"Log", Func, 21}, + {"LogAttrs", Func, 21}, + {"LogValuer", Type, 21}, + {"Logger", Type, 21}, + {"MessageKey", Const, 21}, + {"New", Func, 21}, + {"NewJSONHandler", Func, 21}, + {"NewLogLogger", Func, 21}, + {"NewRecord", Func, 21}, + {"NewTextHandler", Func, 21}, + {"Record", Type, 21}, + {"Record.Level", Field, 21}, + {"Record.Message", Field, 21}, + {"Record.PC", Field, 21}, + {"Record.Time", Field, 21}, + {"SetDefault", Func, 21}, + {"SetLogLoggerLevel", Func, 22}, + {"Source", Type, 21}, + {"Source.File", Field, 21}, + {"Source.Function", Field, 21}, + {"Source.Line", Field, 21}, + {"SourceKey", Const, 21}, + {"String", Func, 21}, + {"StringValue", Func, 21}, + {"TextHandler", Type, 21}, + {"Time", Func, 21}, + {"TimeKey", Const, 21}, + {"TimeValue", Func, 21}, + {"Uint64", Func, 21}, + {"Uint64Value", Func, 21}, + {"Value", Type, 21}, + {"Warn", Func, 21}, + {"WarnContext", Func, 21}, + {"With", Func, 21}, + }, + "log/syslog": { + {"(*Writer).Alert", Method, 0}, + {"(*Writer).Close", Method, 0}, + {"(*Writer).Crit", Method, 0}, + {"(*Writer).Debug", Method, 0}, + {"(*Writer).Emerg", Method, 0}, + {"(*Writer).Err", Method, 0}, + {"(*Writer).Info", Method, 0}, + {"(*Writer).Notice", Method, 0}, + {"(*Writer).Warning", Method, 0}, + {"(*Writer).Write", Method, 0}, + {"Dial", Func, 0}, + {"LOG_ALERT", Const, 0}, + {"LOG_AUTH", Const, 1}, + {"LOG_AUTHPRIV", Const, 1}, + {"LOG_CRIT", Const, 0}, + {"LOG_CRON", Const, 1}, + {"LOG_DAEMON", Const, 1}, + {"LOG_DEBUG", Const, 0}, + {"LOG_EMERG", Const, 0}, + {"LOG_ERR", Const, 0}, + {"LOG_FTP", Const, 1}, + {"LOG_INFO", Const, 0}, + {"LOG_KERN", Const, 1}, + {"LOG_LOCAL0", Const, 1}, + {"LOG_LOCAL1", Const, 1}, + {"LOG_LOCAL2", Const, 1}, + {"LOG_LOCAL3", Const, 1}, + {"LOG_LOCAL4", Const, 1}, + {"LOG_LOCAL5", Const, 1}, + {"LOG_LOCAL6", Const, 1}, + {"LOG_LOCAL7", Const, 1}, + {"LOG_LPR", Const, 1}, + {"LOG_MAIL", Const, 1}, + {"LOG_NEWS", Const, 1}, + {"LOG_NOTICE", Const, 0}, + {"LOG_SYSLOG", Const, 1}, + {"LOG_USER", Const, 1}, + {"LOG_UUCP", Const, 1}, + {"LOG_WARNING", Const, 0}, + {"New", Func, 0}, + {"NewLogger", Func, 0}, + {"Priority", Type, 0}, + {"Writer", Type, 0}, + }, + "maps": { + {"All", Func, 23}, + {"Clone", Func, 21}, + {"Collect", Func, 23}, + {"Copy", Func, 21}, + {"DeleteFunc", Func, 21}, + {"Equal", Func, 21}, + {"EqualFunc", Func, 21}, + {"Insert", Func, 23}, + {"Keys", Func, 23}, + {"Values", Func, 23}, + }, + "math": { + {"Abs", Func, 0}, + {"Acos", Func, 0}, + {"Acosh", Func, 0}, + {"Asin", Func, 0}, + {"Asinh", Func, 0}, + {"Atan", Func, 0}, + {"Atan2", Func, 0}, + {"Atanh", Func, 0}, + {"Cbrt", Func, 0}, + {"Ceil", Func, 0}, + {"Copysign", Func, 0}, + {"Cos", Func, 0}, + {"Cosh", Func, 0}, + {"Dim", Func, 0}, + {"E", Const, 0}, + {"Erf", Func, 0}, + {"Erfc", Func, 0}, + {"Erfcinv", Func, 10}, + {"Erfinv", Func, 10}, + {"Exp", Func, 0}, + {"Exp2", Func, 0}, + {"Expm1", Func, 0}, + {"FMA", Func, 14}, + {"Float32bits", Func, 0}, + {"Float32frombits", Func, 0}, + {"Float64bits", Func, 0}, + {"Float64frombits", Func, 0}, + {"Floor", Func, 0}, + {"Frexp", Func, 0}, + {"Gamma", Func, 0}, + {"Hypot", Func, 0}, + {"Ilogb", Func, 0}, + {"Inf", Func, 0}, + {"IsInf", Func, 0}, + {"IsNaN", Func, 0}, + {"J0", Func, 0}, + {"J1", Func, 0}, + {"Jn", Func, 0}, + {"Ldexp", Func, 0}, + {"Lgamma", Func, 0}, + {"Ln10", Const, 0}, + {"Ln2", Const, 0}, + {"Log", Func, 0}, + {"Log10", Func, 0}, + {"Log10E", Const, 0}, + {"Log1p", Func, 0}, + {"Log2", Func, 0}, + {"Log2E", Const, 0}, + {"Logb", Func, 0}, + {"Max", Func, 0}, + {"MaxFloat32", Const, 0}, + {"MaxFloat64", Const, 0}, + {"MaxInt", Const, 17}, + {"MaxInt16", Const, 0}, + {"MaxInt32", Const, 0}, + {"MaxInt64", Const, 0}, + {"MaxInt8", Const, 0}, + {"MaxUint", Const, 17}, + {"MaxUint16", Const, 0}, + {"MaxUint32", Const, 0}, + {"MaxUint64", Const, 0}, + {"MaxUint8", Const, 0}, + {"Min", Func, 0}, + {"MinInt", Const, 17}, + {"MinInt16", Const, 0}, + {"MinInt32", Const, 0}, + {"MinInt64", Const, 0}, + {"MinInt8", Const, 0}, + {"Mod", Func, 0}, + {"Modf", Func, 0}, + {"NaN", Func, 0}, + {"Nextafter", Func, 0}, + {"Nextafter32", Func, 4}, + {"Phi", Const, 0}, + {"Pi", Const, 0}, + {"Pow", Func, 0}, + {"Pow10", Func, 0}, + {"Remainder", Func, 0}, + {"Round", Func, 10}, + {"RoundToEven", Func, 10}, + {"Signbit", Func, 0}, + {"Sin", Func, 0}, + {"Sincos", Func, 0}, + {"Sinh", Func, 0}, + {"SmallestNonzeroFloat32", Const, 0}, + {"SmallestNonzeroFloat64", Const, 0}, + {"Sqrt", Func, 0}, + {"Sqrt2", Const, 0}, + {"SqrtE", Const, 0}, + {"SqrtPhi", Const, 0}, + {"SqrtPi", Const, 0}, + {"Tan", Func, 0}, + {"Tanh", Func, 0}, + {"Trunc", Func, 0}, + {"Y0", Func, 0}, + {"Y1", Func, 0}, + {"Yn", Func, 0}, + }, + "math/big": { + {"(*Float).Abs", Method, 5}, + {"(*Float).Acc", Method, 5}, + {"(*Float).Add", Method, 5}, + {"(*Float).Append", Method, 5}, + {"(*Float).Cmp", Method, 5}, + {"(*Float).Copy", Method, 5}, + {"(*Float).Float32", Method, 5}, + {"(*Float).Float64", Method, 5}, + {"(*Float).Format", Method, 5}, + {"(*Float).GobDecode", Method, 7}, + {"(*Float).GobEncode", Method, 7}, + {"(*Float).Int", Method, 5}, + {"(*Float).Int64", Method, 5}, + {"(*Float).IsInf", Method, 5}, + {"(*Float).IsInt", Method, 5}, + {"(*Float).MantExp", Method, 5}, + {"(*Float).MarshalText", Method, 6}, + {"(*Float).MinPrec", Method, 5}, + {"(*Float).Mode", Method, 5}, + {"(*Float).Mul", Method, 5}, + {"(*Float).Neg", Method, 5}, + {"(*Float).Parse", Method, 5}, + {"(*Float).Prec", Method, 5}, + {"(*Float).Quo", Method, 5}, + {"(*Float).Rat", Method, 5}, + {"(*Float).Scan", Method, 8}, + {"(*Float).Set", Method, 5}, + {"(*Float).SetFloat64", Method, 5}, + {"(*Float).SetInf", Method, 5}, + {"(*Float).SetInt", Method, 5}, + {"(*Float).SetInt64", Method, 5}, + {"(*Float).SetMantExp", Method, 5}, + {"(*Float).SetMode", Method, 5}, + {"(*Float).SetPrec", Method, 5}, + {"(*Float).SetRat", Method, 5}, + {"(*Float).SetString", Method, 5}, + {"(*Float).SetUint64", Method, 5}, + {"(*Float).Sign", Method, 5}, + {"(*Float).Signbit", Method, 5}, + {"(*Float).Sqrt", Method, 10}, + {"(*Float).String", Method, 5}, + {"(*Float).Sub", Method, 5}, + {"(*Float).Text", Method, 5}, + {"(*Float).Uint64", Method, 5}, + {"(*Float).UnmarshalText", Method, 6}, + {"(*Int).Abs", Method, 0}, + {"(*Int).Add", Method, 0}, + {"(*Int).And", Method, 0}, + {"(*Int).AndNot", Method, 0}, + {"(*Int).Append", Method, 6}, + {"(*Int).Binomial", Method, 0}, + {"(*Int).Bit", Method, 0}, + {"(*Int).BitLen", Method, 0}, + {"(*Int).Bits", Method, 0}, + {"(*Int).Bytes", Method, 0}, + {"(*Int).Cmp", Method, 0}, + {"(*Int).CmpAbs", Method, 10}, + {"(*Int).Div", Method, 0}, + {"(*Int).DivMod", Method, 0}, + {"(*Int).Exp", Method, 0}, + {"(*Int).FillBytes", Method, 15}, + {"(*Int).Float64", Method, 21}, + {"(*Int).Format", Method, 0}, + {"(*Int).GCD", Method, 0}, + {"(*Int).GobDecode", Method, 0}, + {"(*Int).GobEncode", Method, 0}, + {"(*Int).Int64", Method, 0}, + {"(*Int).IsInt64", Method, 9}, + {"(*Int).IsUint64", Method, 9}, + {"(*Int).Lsh", Method, 0}, + {"(*Int).MarshalJSON", Method, 1}, + {"(*Int).MarshalText", Method, 3}, + {"(*Int).Mod", Method, 0}, + {"(*Int).ModInverse", Method, 0}, + {"(*Int).ModSqrt", Method, 5}, + {"(*Int).Mul", Method, 0}, + {"(*Int).MulRange", Method, 0}, + {"(*Int).Neg", Method, 0}, + {"(*Int).Not", Method, 0}, + {"(*Int).Or", Method, 0}, + {"(*Int).ProbablyPrime", Method, 0}, + {"(*Int).Quo", Method, 0}, + {"(*Int).QuoRem", Method, 0}, + {"(*Int).Rand", Method, 0}, + {"(*Int).Rem", Method, 0}, + {"(*Int).Rsh", Method, 0}, + {"(*Int).Scan", Method, 0}, + {"(*Int).Set", Method, 0}, + {"(*Int).SetBit", Method, 0}, + {"(*Int).SetBits", Method, 0}, + {"(*Int).SetBytes", Method, 0}, + {"(*Int).SetInt64", Method, 0}, + {"(*Int).SetString", Method, 0}, + {"(*Int).SetUint64", Method, 1}, + {"(*Int).Sign", Method, 0}, + {"(*Int).Sqrt", Method, 8}, + {"(*Int).String", Method, 0}, + {"(*Int).Sub", Method, 0}, + {"(*Int).Text", Method, 6}, + {"(*Int).TrailingZeroBits", Method, 13}, + {"(*Int).Uint64", Method, 1}, + {"(*Int).UnmarshalJSON", Method, 1}, + {"(*Int).UnmarshalText", Method, 3}, + {"(*Int).Xor", Method, 0}, + {"(*Rat).Abs", Method, 0}, + {"(*Rat).Add", Method, 0}, + {"(*Rat).Cmp", Method, 0}, + {"(*Rat).Denom", Method, 0}, + {"(*Rat).Float32", Method, 4}, + {"(*Rat).Float64", Method, 1}, + {"(*Rat).FloatPrec", Method, 22}, + {"(*Rat).FloatString", Method, 0}, + {"(*Rat).GobDecode", Method, 0}, + {"(*Rat).GobEncode", Method, 0}, + {"(*Rat).Inv", Method, 0}, + {"(*Rat).IsInt", Method, 0}, + {"(*Rat).MarshalText", Method, 3}, + {"(*Rat).Mul", Method, 0}, + {"(*Rat).Neg", Method, 0}, + {"(*Rat).Num", Method, 0}, + {"(*Rat).Quo", Method, 0}, + {"(*Rat).RatString", Method, 0}, + {"(*Rat).Scan", Method, 0}, + {"(*Rat).Set", Method, 0}, + {"(*Rat).SetFloat64", Method, 1}, + {"(*Rat).SetFrac", Method, 0}, + {"(*Rat).SetFrac64", Method, 0}, + {"(*Rat).SetInt", Method, 0}, + {"(*Rat).SetInt64", Method, 0}, + {"(*Rat).SetString", Method, 0}, + {"(*Rat).SetUint64", Method, 13}, + {"(*Rat).Sign", Method, 0}, + {"(*Rat).String", Method, 0}, + {"(*Rat).Sub", Method, 0}, + {"(*Rat).UnmarshalText", Method, 3}, + {"(Accuracy).String", Method, 5}, + {"(ErrNaN).Error", Method, 5}, + {"(RoundingMode).String", Method, 5}, + {"Above", Const, 5}, + {"Accuracy", Type, 5}, + {"AwayFromZero", Const, 5}, + {"Below", Const, 5}, + {"ErrNaN", Type, 5}, + {"Exact", Const, 5}, + {"Float", Type, 5}, + {"Int", Type, 0}, + {"Jacobi", Func, 5}, + {"MaxBase", Const, 0}, + {"MaxExp", Const, 5}, + {"MaxPrec", Const, 5}, + {"MinExp", Const, 5}, + {"NewFloat", Func, 5}, + {"NewInt", Func, 0}, + {"NewRat", Func, 0}, + {"ParseFloat", Func, 5}, + {"Rat", Type, 0}, + {"RoundingMode", Type, 5}, + {"ToNearestAway", Const, 5}, + {"ToNearestEven", Const, 5}, + {"ToNegativeInf", Const, 5}, + {"ToPositiveInf", Const, 5}, + {"ToZero", Const, 5}, + {"Word", Type, 0}, + }, + "math/bits": { + {"Add", Func, 12}, + {"Add32", Func, 12}, + {"Add64", Func, 12}, + {"Div", Func, 12}, + {"Div32", Func, 12}, + {"Div64", Func, 12}, + {"LeadingZeros", Func, 9}, + {"LeadingZeros16", Func, 9}, + {"LeadingZeros32", Func, 9}, + {"LeadingZeros64", Func, 9}, + {"LeadingZeros8", Func, 9}, + {"Len", Func, 9}, + {"Len16", Func, 9}, + {"Len32", Func, 9}, + {"Len64", Func, 9}, + {"Len8", Func, 9}, + {"Mul", Func, 12}, + {"Mul32", Func, 12}, + {"Mul64", Func, 12}, + {"OnesCount", Func, 9}, + {"OnesCount16", Func, 9}, + {"OnesCount32", Func, 9}, + {"OnesCount64", Func, 9}, + {"OnesCount8", Func, 9}, + {"Rem", Func, 14}, + {"Rem32", Func, 14}, + {"Rem64", Func, 14}, + {"Reverse", Func, 9}, + {"Reverse16", Func, 9}, + {"Reverse32", Func, 9}, + {"Reverse64", Func, 9}, + {"Reverse8", Func, 9}, + {"ReverseBytes", Func, 9}, + {"ReverseBytes16", Func, 9}, + {"ReverseBytes32", Func, 9}, + {"ReverseBytes64", Func, 9}, + {"RotateLeft", Func, 9}, + {"RotateLeft16", Func, 9}, + {"RotateLeft32", Func, 9}, + {"RotateLeft64", Func, 9}, + {"RotateLeft8", Func, 9}, + {"Sub", Func, 12}, + {"Sub32", Func, 12}, + {"Sub64", Func, 12}, + {"TrailingZeros", Func, 9}, + {"TrailingZeros16", Func, 9}, + {"TrailingZeros32", Func, 9}, + {"TrailingZeros64", Func, 9}, + {"TrailingZeros8", Func, 9}, + {"UintSize", Const, 9}, + }, + "math/cmplx": { + {"Abs", Func, 0}, + {"Acos", Func, 0}, + {"Acosh", Func, 0}, + {"Asin", Func, 0}, + {"Asinh", Func, 0}, + {"Atan", Func, 0}, + {"Atanh", Func, 0}, + {"Conj", Func, 0}, + {"Cos", Func, 0}, + {"Cosh", Func, 0}, + {"Cot", Func, 0}, + {"Exp", Func, 0}, + {"Inf", Func, 0}, + {"IsInf", Func, 0}, + {"IsNaN", Func, 0}, + {"Log", Func, 0}, + {"Log10", Func, 0}, + {"NaN", Func, 0}, + {"Phase", Func, 0}, + {"Polar", Func, 0}, + {"Pow", Func, 0}, + {"Rect", Func, 0}, + {"Sin", Func, 0}, + {"Sinh", Func, 0}, + {"Sqrt", Func, 0}, + {"Tan", Func, 0}, + {"Tanh", Func, 0}, + }, + "math/rand": { + {"(*Rand).ExpFloat64", Method, 0}, + {"(*Rand).Float32", Method, 0}, + {"(*Rand).Float64", Method, 0}, + {"(*Rand).Int", Method, 0}, + {"(*Rand).Int31", Method, 0}, + {"(*Rand).Int31n", Method, 0}, + {"(*Rand).Int63", Method, 0}, + {"(*Rand).Int63n", Method, 0}, + {"(*Rand).Intn", Method, 0}, + {"(*Rand).NormFloat64", Method, 0}, + {"(*Rand).Perm", Method, 0}, + {"(*Rand).Read", Method, 6}, + {"(*Rand).Seed", Method, 0}, + {"(*Rand).Shuffle", Method, 10}, + {"(*Rand).Uint32", Method, 0}, + {"(*Rand).Uint64", Method, 8}, + {"(*Zipf).Uint64", Method, 0}, + {"ExpFloat64", Func, 0}, + {"Float32", Func, 0}, + {"Float64", Func, 0}, + {"Int", Func, 0}, + {"Int31", Func, 0}, + {"Int31n", Func, 0}, + {"Int63", Func, 0}, + {"Int63n", Func, 0}, + {"Intn", Func, 0}, + {"New", Func, 0}, + {"NewSource", Func, 0}, + {"NewZipf", Func, 0}, + {"NormFloat64", Func, 0}, + {"Perm", Func, 0}, + {"Rand", Type, 0}, + {"Read", Func, 6}, + {"Seed", Func, 0}, + {"Shuffle", Func, 10}, + {"Source", Type, 0}, + {"Source64", Type, 8}, + {"Uint32", Func, 0}, + {"Uint64", Func, 8}, + {"Zipf", Type, 0}, + }, + "math/rand/v2": { + {"(*ChaCha8).MarshalBinary", Method, 22}, + {"(*ChaCha8).Read", Method, 23}, + {"(*ChaCha8).Seed", Method, 22}, + {"(*ChaCha8).Uint64", Method, 22}, + {"(*ChaCha8).UnmarshalBinary", Method, 22}, + {"(*PCG).MarshalBinary", Method, 22}, + {"(*PCG).Seed", Method, 22}, + {"(*PCG).Uint64", Method, 22}, + {"(*PCG).UnmarshalBinary", Method, 22}, + {"(*Rand).ExpFloat64", Method, 22}, + {"(*Rand).Float32", Method, 22}, + {"(*Rand).Float64", Method, 22}, + {"(*Rand).Int", Method, 22}, + {"(*Rand).Int32", Method, 22}, + {"(*Rand).Int32N", Method, 22}, + {"(*Rand).Int64", Method, 22}, + {"(*Rand).Int64N", Method, 22}, + {"(*Rand).IntN", Method, 22}, + {"(*Rand).NormFloat64", Method, 22}, + {"(*Rand).Perm", Method, 22}, + {"(*Rand).Shuffle", Method, 22}, + {"(*Rand).Uint", Method, 23}, + {"(*Rand).Uint32", Method, 22}, + {"(*Rand).Uint32N", Method, 22}, + {"(*Rand).Uint64", Method, 22}, + {"(*Rand).Uint64N", Method, 22}, + {"(*Rand).UintN", Method, 22}, + {"(*Zipf).Uint64", Method, 22}, + {"ChaCha8", Type, 22}, + {"ExpFloat64", Func, 22}, + {"Float32", Func, 22}, + {"Float64", Func, 22}, + {"Int", Func, 22}, + {"Int32", Func, 22}, + {"Int32N", Func, 22}, + {"Int64", Func, 22}, + {"Int64N", Func, 22}, + {"IntN", Func, 22}, + {"N", Func, 22}, + {"New", Func, 22}, + {"NewChaCha8", Func, 22}, + {"NewPCG", Func, 22}, + {"NewZipf", Func, 22}, + {"NormFloat64", Func, 22}, + {"PCG", Type, 22}, + {"Perm", Func, 22}, + {"Rand", Type, 22}, + {"Shuffle", Func, 22}, + {"Source", Type, 22}, + {"Uint", Func, 23}, + {"Uint32", Func, 22}, + {"Uint32N", Func, 22}, + {"Uint64", Func, 22}, + {"Uint64N", Func, 22}, + {"UintN", Func, 22}, + {"Zipf", Type, 22}, + }, + "mime": { + {"(*WordDecoder).Decode", Method, 5}, + {"(*WordDecoder).DecodeHeader", Method, 5}, + {"(WordEncoder).Encode", Method, 5}, + {"AddExtensionType", Func, 0}, + {"BEncoding", Const, 5}, + {"ErrInvalidMediaParameter", Var, 9}, + {"ExtensionsByType", Func, 5}, + {"FormatMediaType", Func, 0}, + {"ParseMediaType", Func, 0}, + {"QEncoding", Const, 5}, + {"TypeByExtension", Func, 0}, + {"WordDecoder", Type, 5}, + {"WordDecoder.CharsetReader", Field, 5}, + {"WordEncoder", Type, 5}, + }, + "mime/multipart": { + {"(*FileHeader).Open", Method, 0}, + {"(*Form).RemoveAll", Method, 0}, + {"(*Part).Close", Method, 0}, + {"(*Part).FileName", Method, 0}, + {"(*Part).FormName", Method, 0}, + {"(*Part).Read", Method, 0}, + {"(*Reader).NextPart", Method, 0}, + {"(*Reader).NextRawPart", Method, 14}, + {"(*Reader).ReadForm", Method, 0}, + {"(*Writer).Boundary", Method, 0}, + {"(*Writer).Close", Method, 0}, + {"(*Writer).CreateFormField", Method, 0}, + {"(*Writer).CreateFormFile", Method, 0}, + {"(*Writer).CreatePart", Method, 0}, + {"(*Writer).FormDataContentType", Method, 0}, + {"(*Writer).SetBoundary", Method, 1}, + {"(*Writer).WriteField", Method, 0}, + {"ErrMessageTooLarge", Var, 9}, + {"File", Type, 0}, + {"FileHeader", Type, 0}, + {"FileHeader.Filename", Field, 0}, + {"FileHeader.Header", Field, 0}, + {"FileHeader.Size", Field, 9}, + {"Form", Type, 0}, + {"Form.File", Field, 0}, + {"Form.Value", Field, 0}, + {"NewReader", Func, 0}, + {"NewWriter", Func, 0}, + {"Part", Type, 0}, + {"Part.Header", Field, 0}, + {"Reader", Type, 0}, + {"Writer", Type, 0}, + }, + "mime/quotedprintable": { + {"(*Reader).Read", Method, 5}, + {"(*Writer).Close", Method, 5}, + {"(*Writer).Write", Method, 5}, + {"NewReader", Func, 5}, + {"NewWriter", Func, 5}, + {"Reader", Type, 5}, + {"Writer", Type, 5}, + {"Writer.Binary", Field, 5}, + }, + "net": { + {"(*AddrError).Error", Method, 0}, + {"(*AddrError).Temporary", Method, 0}, + {"(*AddrError).Timeout", Method, 0}, + {"(*Buffers).Read", Method, 8}, + {"(*Buffers).WriteTo", Method, 8}, + {"(*DNSConfigError).Error", Method, 0}, + {"(*DNSConfigError).Temporary", Method, 0}, + {"(*DNSConfigError).Timeout", Method, 0}, + {"(*DNSConfigError).Unwrap", Method, 13}, + {"(*DNSError).Error", Method, 0}, + {"(*DNSError).Temporary", Method, 0}, + {"(*DNSError).Timeout", Method, 0}, + {"(*DNSError).Unwrap", Method, 23}, + {"(*Dialer).Dial", Method, 1}, + {"(*Dialer).DialContext", Method, 7}, + {"(*Dialer).MultipathTCP", Method, 21}, + {"(*Dialer).SetMultipathTCP", Method, 21}, + {"(*IP).UnmarshalText", Method, 2}, + {"(*IPAddr).Network", Method, 0}, + {"(*IPAddr).String", Method, 0}, + {"(*IPConn).Close", Method, 0}, + {"(*IPConn).File", Method, 0}, + {"(*IPConn).LocalAddr", Method, 0}, + {"(*IPConn).Read", Method, 0}, + {"(*IPConn).ReadFrom", Method, 0}, + {"(*IPConn).ReadFromIP", Method, 0}, + {"(*IPConn).ReadMsgIP", Method, 1}, + {"(*IPConn).RemoteAddr", Method, 0}, + {"(*IPConn).SetDeadline", Method, 0}, + {"(*IPConn).SetReadBuffer", Method, 0}, + {"(*IPConn).SetReadDeadline", Method, 0}, + {"(*IPConn).SetWriteBuffer", Method, 0}, + {"(*IPConn).SetWriteDeadline", Method, 0}, + {"(*IPConn).SyscallConn", Method, 9}, + {"(*IPConn).Write", Method, 0}, + {"(*IPConn).WriteMsgIP", Method, 1}, + {"(*IPConn).WriteTo", Method, 0}, + {"(*IPConn).WriteToIP", Method, 0}, + {"(*IPNet).Contains", Method, 0}, + {"(*IPNet).Network", Method, 0}, + {"(*IPNet).String", Method, 0}, + {"(*Interface).Addrs", Method, 0}, + {"(*Interface).MulticastAddrs", Method, 0}, + {"(*ListenConfig).Listen", Method, 11}, + {"(*ListenConfig).ListenPacket", Method, 11}, + {"(*ListenConfig).MultipathTCP", Method, 21}, + {"(*ListenConfig).SetMultipathTCP", Method, 21}, + {"(*OpError).Error", Method, 0}, + {"(*OpError).Temporary", Method, 0}, + {"(*OpError).Timeout", Method, 0}, + {"(*OpError).Unwrap", Method, 13}, + {"(*ParseError).Error", Method, 0}, + {"(*ParseError).Temporary", Method, 17}, + {"(*ParseError).Timeout", Method, 17}, + {"(*Resolver).LookupAddr", Method, 8}, + {"(*Resolver).LookupCNAME", Method, 8}, + {"(*Resolver).LookupHost", Method, 8}, + {"(*Resolver).LookupIP", Method, 15}, + {"(*Resolver).LookupIPAddr", Method, 8}, + {"(*Resolver).LookupMX", Method, 8}, + {"(*Resolver).LookupNS", Method, 8}, + {"(*Resolver).LookupNetIP", Method, 18}, + {"(*Resolver).LookupPort", Method, 8}, + {"(*Resolver).LookupSRV", Method, 8}, + {"(*Resolver).LookupTXT", Method, 8}, + {"(*TCPAddr).AddrPort", Method, 18}, + {"(*TCPAddr).Network", Method, 0}, + {"(*TCPAddr).String", Method, 0}, + {"(*TCPConn).Close", Method, 0}, + {"(*TCPConn).CloseRead", Method, 0}, + {"(*TCPConn).CloseWrite", Method, 0}, + {"(*TCPConn).File", Method, 0}, + {"(*TCPConn).LocalAddr", Method, 0}, + {"(*TCPConn).MultipathTCP", Method, 21}, + {"(*TCPConn).Read", Method, 0}, + {"(*TCPConn).ReadFrom", Method, 0}, + {"(*TCPConn).RemoteAddr", Method, 0}, + {"(*TCPConn).SetDeadline", Method, 0}, + {"(*TCPConn).SetKeepAlive", Method, 0}, + {"(*TCPConn).SetKeepAliveConfig", Method, 23}, + {"(*TCPConn).SetKeepAlivePeriod", Method, 2}, + {"(*TCPConn).SetLinger", Method, 0}, + {"(*TCPConn).SetNoDelay", Method, 0}, + {"(*TCPConn).SetReadBuffer", Method, 0}, + {"(*TCPConn).SetReadDeadline", Method, 0}, + {"(*TCPConn).SetWriteBuffer", Method, 0}, + {"(*TCPConn).SetWriteDeadline", Method, 0}, + {"(*TCPConn).SyscallConn", Method, 9}, + {"(*TCPConn).Write", Method, 0}, + {"(*TCPConn).WriteTo", Method, 22}, + {"(*TCPListener).Accept", Method, 0}, + {"(*TCPListener).AcceptTCP", Method, 0}, + {"(*TCPListener).Addr", Method, 0}, + {"(*TCPListener).Close", Method, 0}, + {"(*TCPListener).File", Method, 0}, + {"(*TCPListener).SetDeadline", Method, 0}, + {"(*TCPListener).SyscallConn", Method, 10}, + {"(*UDPAddr).AddrPort", Method, 18}, + {"(*UDPAddr).Network", Method, 0}, + {"(*UDPAddr).String", Method, 0}, + {"(*UDPConn).Close", Method, 0}, + {"(*UDPConn).File", Method, 0}, + {"(*UDPConn).LocalAddr", Method, 0}, + {"(*UDPConn).Read", Method, 0}, + {"(*UDPConn).ReadFrom", Method, 0}, + {"(*UDPConn).ReadFromUDP", Method, 0}, + {"(*UDPConn).ReadFromUDPAddrPort", Method, 18}, + {"(*UDPConn).ReadMsgUDP", Method, 1}, + {"(*UDPConn).ReadMsgUDPAddrPort", Method, 18}, + {"(*UDPConn).RemoteAddr", Method, 0}, + {"(*UDPConn).SetDeadline", Method, 0}, + {"(*UDPConn).SetReadBuffer", Method, 0}, + {"(*UDPConn).SetReadDeadline", Method, 0}, + {"(*UDPConn).SetWriteBuffer", Method, 0}, + {"(*UDPConn).SetWriteDeadline", Method, 0}, + {"(*UDPConn).SyscallConn", Method, 9}, + {"(*UDPConn).Write", Method, 0}, + {"(*UDPConn).WriteMsgUDP", Method, 1}, + {"(*UDPConn).WriteMsgUDPAddrPort", Method, 18}, + {"(*UDPConn).WriteTo", Method, 0}, + {"(*UDPConn).WriteToUDP", Method, 0}, + {"(*UDPConn).WriteToUDPAddrPort", Method, 18}, + {"(*UnixAddr).Network", Method, 0}, + {"(*UnixAddr).String", Method, 0}, + {"(*UnixConn).Close", Method, 0}, + {"(*UnixConn).CloseRead", Method, 1}, + {"(*UnixConn).CloseWrite", Method, 1}, + {"(*UnixConn).File", Method, 0}, + {"(*UnixConn).LocalAddr", Method, 0}, + {"(*UnixConn).Read", Method, 0}, + {"(*UnixConn).ReadFrom", Method, 0}, + {"(*UnixConn).ReadFromUnix", Method, 0}, + {"(*UnixConn).ReadMsgUnix", Method, 0}, + {"(*UnixConn).RemoteAddr", Method, 0}, + {"(*UnixConn).SetDeadline", Method, 0}, + {"(*UnixConn).SetReadBuffer", Method, 0}, + {"(*UnixConn).SetReadDeadline", Method, 0}, + {"(*UnixConn).SetWriteBuffer", Method, 0}, + {"(*UnixConn).SetWriteDeadline", Method, 0}, + {"(*UnixConn).SyscallConn", Method, 9}, + {"(*UnixConn).Write", Method, 0}, + {"(*UnixConn).WriteMsgUnix", Method, 0}, + {"(*UnixConn).WriteTo", Method, 0}, + {"(*UnixConn).WriteToUnix", Method, 0}, + {"(*UnixListener).Accept", Method, 0}, + {"(*UnixListener).AcceptUnix", Method, 0}, + {"(*UnixListener).Addr", Method, 0}, + {"(*UnixListener).Close", Method, 0}, + {"(*UnixListener).File", Method, 0}, + {"(*UnixListener).SetDeadline", Method, 0}, + {"(*UnixListener).SetUnlinkOnClose", Method, 8}, + {"(*UnixListener).SyscallConn", Method, 10}, + {"(Flags).String", Method, 0}, + {"(HardwareAddr).String", Method, 0}, + {"(IP).DefaultMask", Method, 0}, + {"(IP).Equal", Method, 0}, + {"(IP).IsGlobalUnicast", Method, 0}, + {"(IP).IsInterfaceLocalMulticast", Method, 0}, + {"(IP).IsLinkLocalMulticast", Method, 0}, + {"(IP).IsLinkLocalUnicast", Method, 0}, + {"(IP).IsLoopback", Method, 0}, + {"(IP).IsMulticast", Method, 0}, + {"(IP).IsPrivate", Method, 17}, + {"(IP).IsUnspecified", Method, 0}, + {"(IP).MarshalText", Method, 2}, + {"(IP).Mask", Method, 0}, + {"(IP).String", Method, 0}, + {"(IP).To16", Method, 0}, + {"(IP).To4", Method, 0}, + {"(IPMask).Size", Method, 0}, + {"(IPMask).String", Method, 0}, + {"(InvalidAddrError).Error", Method, 0}, + {"(InvalidAddrError).Temporary", Method, 0}, + {"(InvalidAddrError).Timeout", Method, 0}, + {"(UnknownNetworkError).Error", Method, 0}, + {"(UnknownNetworkError).Temporary", Method, 0}, + {"(UnknownNetworkError).Timeout", Method, 0}, + {"Addr", Type, 0}, + {"AddrError", Type, 0}, + {"AddrError.Addr", Field, 0}, + {"AddrError.Err", Field, 0}, + {"Buffers", Type, 8}, + {"CIDRMask", Func, 0}, + {"Conn", Type, 0}, + {"DNSConfigError", Type, 0}, + {"DNSConfigError.Err", Field, 0}, + {"DNSError", Type, 0}, + {"DNSError.Err", Field, 0}, + {"DNSError.IsNotFound", Field, 13}, + {"DNSError.IsTemporary", Field, 6}, + {"DNSError.IsTimeout", Field, 0}, + {"DNSError.Name", Field, 0}, + {"DNSError.Server", Field, 0}, + {"DNSError.UnwrapErr", Field, 23}, + {"DefaultResolver", Var, 8}, + {"Dial", Func, 0}, + {"DialIP", Func, 0}, + {"DialTCP", Func, 0}, + {"DialTimeout", Func, 0}, + {"DialUDP", Func, 0}, + {"DialUnix", Func, 0}, + {"Dialer", Type, 1}, + {"Dialer.Cancel", Field, 6}, + {"Dialer.Control", Field, 11}, + {"Dialer.ControlContext", Field, 20}, + {"Dialer.Deadline", Field, 1}, + {"Dialer.DualStack", Field, 2}, + {"Dialer.FallbackDelay", Field, 5}, + {"Dialer.KeepAlive", Field, 3}, + {"Dialer.KeepAliveConfig", Field, 23}, + {"Dialer.LocalAddr", Field, 1}, + {"Dialer.Resolver", Field, 8}, + {"Dialer.Timeout", Field, 1}, + {"ErrClosed", Var, 16}, + {"ErrWriteToConnected", Var, 0}, + {"Error", Type, 0}, + {"FileConn", Func, 0}, + {"FileListener", Func, 0}, + {"FilePacketConn", Func, 0}, + {"FlagBroadcast", Const, 0}, + {"FlagLoopback", Const, 0}, + {"FlagMulticast", Const, 0}, + {"FlagPointToPoint", Const, 0}, + {"FlagRunning", Const, 20}, + {"FlagUp", Const, 0}, + {"Flags", Type, 0}, + {"HardwareAddr", Type, 0}, + {"IP", Type, 0}, + {"IPAddr", Type, 0}, + {"IPAddr.IP", Field, 0}, + {"IPAddr.Zone", Field, 1}, + {"IPConn", Type, 0}, + {"IPMask", Type, 0}, + {"IPNet", Type, 0}, + {"IPNet.IP", Field, 0}, + {"IPNet.Mask", Field, 0}, + {"IPv4", Func, 0}, + {"IPv4Mask", Func, 0}, + {"IPv4allrouter", Var, 0}, + {"IPv4allsys", Var, 0}, + {"IPv4bcast", Var, 0}, + {"IPv4len", Const, 0}, + {"IPv4zero", Var, 0}, + {"IPv6interfacelocalallnodes", Var, 0}, + {"IPv6len", Const, 0}, + {"IPv6linklocalallnodes", Var, 0}, + {"IPv6linklocalallrouters", Var, 0}, + {"IPv6loopback", Var, 0}, + {"IPv6unspecified", Var, 0}, + {"IPv6zero", Var, 0}, + {"Interface", Type, 0}, + {"Interface.Flags", Field, 0}, + {"Interface.HardwareAddr", Field, 0}, + {"Interface.Index", Field, 0}, + {"Interface.MTU", Field, 0}, + {"Interface.Name", Field, 0}, + {"InterfaceAddrs", Func, 0}, + {"InterfaceByIndex", Func, 0}, + {"InterfaceByName", Func, 0}, + {"Interfaces", Func, 0}, + {"InvalidAddrError", Type, 0}, + {"JoinHostPort", Func, 0}, + {"KeepAliveConfig", Type, 23}, + {"KeepAliveConfig.Count", Field, 23}, + {"KeepAliveConfig.Enable", Field, 23}, + {"KeepAliveConfig.Idle", Field, 23}, + {"KeepAliveConfig.Interval", Field, 23}, + {"Listen", Func, 0}, + {"ListenConfig", Type, 11}, + {"ListenConfig.Control", Field, 11}, + {"ListenConfig.KeepAlive", Field, 13}, + {"ListenConfig.KeepAliveConfig", Field, 23}, + {"ListenIP", Func, 0}, + {"ListenMulticastUDP", Func, 0}, + {"ListenPacket", Func, 0}, + {"ListenTCP", Func, 0}, + {"ListenUDP", Func, 0}, + {"ListenUnix", Func, 0}, + {"ListenUnixgram", Func, 0}, + {"Listener", Type, 0}, + {"LookupAddr", Func, 0}, + {"LookupCNAME", Func, 0}, + {"LookupHost", Func, 0}, + {"LookupIP", Func, 0}, + {"LookupMX", Func, 0}, + {"LookupNS", Func, 1}, + {"LookupPort", Func, 0}, + {"LookupSRV", Func, 0}, + {"LookupTXT", Func, 0}, + {"MX", Type, 0}, + {"MX.Host", Field, 0}, + {"MX.Pref", Field, 0}, + {"NS", Type, 1}, + {"NS.Host", Field, 1}, + {"OpError", Type, 0}, + {"OpError.Addr", Field, 0}, + {"OpError.Err", Field, 0}, + {"OpError.Net", Field, 0}, + {"OpError.Op", Field, 0}, + {"OpError.Source", Field, 5}, + {"PacketConn", Type, 0}, + {"ParseCIDR", Func, 0}, + {"ParseError", Type, 0}, + {"ParseError.Text", Field, 0}, + {"ParseError.Type", Field, 0}, + {"ParseIP", Func, 0}, + {"ParseMAC", Func, 0}, + {"Pipe", Func, 0}, + {"ResolveIPAddr", Func, 0}, + {"ResolveTCPAddr", Func, 0}, + {"ResolveUDPAddr", Func, 0}, + {"ResolveUnixAddr", Func, 0}, + {"Resolver", Type, 8}, + {"Resolver.Dial", Field, 9}, + {"Resolver.PreferGo", Field, 8}, + {"Resolver.StrictErrors", Field, 9}, + {"SRV", Type, 0}, + {"SRV.Port", Field, 0}, + {"SRV.Priority", Field, 0}, + {"SRV.Target", Field, 0}, + {"SRV.Weight", Field, 0}, + {"SplitHostPort", Func, 0}, + {"TCPAddr", Type, 0}, + {"TCPAddr.IP", Field, 0}, + {"TCPAddr.Port", Field, 0}, + {"TCPAddr.Zone", Field, 1}, + {"TCPAddrFromAddrPort", Func, 18}, + {"TCPConn", Type, 0}, + {"TCPListener", Type, 0}, + {"UDPAddr", Type, 0}, + {"UDPAddr.IP", Field, 0}, + {"UDPAddr.Port", Field, 0}, + {"UDPAddr.Zone", Field, 1}, + {"UDPAddrFromAddrPort", Func, 18}, + {"UDPConn", Type, 0}, + {"UnixAddr", Type, 0}, + {"UnixAddr.Name", Field, 0}, + {"UnixAddr.Net", Field, 0}, + {"UnixConn", Type, 0}, + {"UnixListener", Type, 0}, + {"UnknownNetworkError", Type, 0}, + }, + "net/http": { + {"(*Client).CloseIdleConnections", Method, 12}, + {"(*Client).Do", Method, 0}, + {"(*Client).Get", Method, 0}, + {"(*Client).Head", Method, 0}, + {"(*Client).Post", Method, 0}, + {"(*Client).PostForm", Method, 0}, + {"(*Cookie).String", Method, 0}, + {"(*Cookie).Valid", Method, 18}, + {"(*MaxBytesError).Error", Method, 19}, + {"(*ProtocolError).Error", Method, 0}, + {"(*ProtocolError).Is", Method, 21}, + {"(*Request).AddCookie", Method, 0}, + {"(*Request).BasicAuth", Method, 4}, + {"(*Request).Clone", Method, 13}, + {"(*Request).Context", Method, 7}, + {"(*Request).Cookie", Method, 0}, + {"(*Request).Cookies", Method, 0}, + {"(*Request).CookiesNamed", Method, 23}, + {"(*Request).FormFile", Method, 0}, + {"(*Request).FormValue", Method, 0}, + {"(*Request).MultipartReader", Method, 0}, + {"(*Request).ParseForm", Method, 0}, + {"(*Request).ParseMultipartForm", Method, 0}, + {"(*Request).PathValue", Method, 22}, + {"(*Request).PostFormValue", Method, 1}, + {"(*Request).ProtoAtLeast", Method, 0}, + {"(*Request).Referer", Method, 0}, + {"(*Request).SetBasicAuth", Method, 0}, + {"(*Request).SetPathValue", Method, 22}, + {"(*Request).UserAgent", Method, 0}, + {"(*Request).WithContext", Method, 7}, + {"(*Request).Write", Method, 0}, + {"(*Request).WriteProxy", Method, 0}, + {"(*Response).Cookies", Method, 0}, + {"(*Response).Location", Method, 0}, + {"(*Response).ProtoAtLeast", Method, 0}, + {"(*Response).Write", Method, 0}, + {"(*ResponseController).EnableFullDuplex", Method, 21}, + {"(*ResponseController).Flush", Method, 20}, + {"(*ResponseController).Hijack", Method, 20}, + {"(*ResponseController).SetReadDeadline", Method, 20}, + {"(*ResponseController).SetWriteDeadline", Method, 20}, + {"(*ServeMux).Handle", Method, 0}, + {"(*ServeMux).HandleFunc", Method, 0}, + {"(*ServeMux).Handler", Method, 1}, + {"(*ServeMux).ServeHTTP", Method, 0}, + {"(*Server).Close", Method, 8}, + {"(*Server).ListenAndServe", Method, 0}, + {"(*Server).ListenAndServeTLS", Method, 0}, + {"(*Server).RegisterOnShutdown", Method, 9}, + {"(*Server).Serve", Method, 0}, + {"(*Server).ServeTLS", Method, 9}, + {"(*Server).SetKeepAlivesEnabled", Method, 3}, + {"(*Server).Shutdown", Method, 8}, + {"(*Transport).CancelRequest", Method, 1}, + {"(*Transport).Clone", Method, 13}, + {"(*Transport).CloseIdleConnections", Method, 0}, + {"(*Transport).RegisterProtocol", Method, 0}, + {"(*Transport).RoundTrip", Method, 0}, + {"(ConnState).String", Method, 3}, + {"(Dir).Open", Method, 0}, + {"(HandlerFunc).ServeHTTP", Method, 0}, + {"(Header).Add", Method, 0}, + {"(Header).Clone", Method, 13}, + {"(Header).Del", Method, 0}, + {"(Header).Get", Method, 0}, + {"(Header).Set", Method, 0}, + {"(Header).Values", Method, 14}, + {"(Header).Write", Method, 0}, + {"(Header).WriteSubset", Method, 0}, + {"AllowQuerySemicolons", Func, 17}, + {"CanonicalHeaderKey", Func, 0}, + {"Client", Type, 0}, + {"Client.CheckRedirect", Field, 0}, + {"Client.Jar", Field, 0}, + {"Client.Timeout", Field, 3}, + {"Client.Transport", Field, 0}, + {"CloseNotifier", Type, 1}, + {"ConnState", Type, 3}, + {"Cookie", Type, 0}, + {"Cookie.Domain", Field, 0}, + {"Cookie.Expires", Field, 0}, + {"Cookie.HttpOnly", Field, 0}, + {"Cookie.MaxAge", Field, 0}, + {"Cookie.Name", Field, 0}, + {"Cookie.Partitioned", Field, 23}, + {"Cookie.Path", Field, 0}, + {"Cookie.Quoted", Field, 23}, + {"Cookie.Raw", Field, 0}, + {"Cookie.RawExpires", Field, 0}, + {"Cookie.SameSite", Field, 11}, + {"Cookie.Secure", Field, 0}, + {"Cookie.Unparsed", Field, 0}, + {"Cookie.Value", Field, 0}, + {"CookieJar", Type, 0}, + {"DefaultClient", Var, 0}, + {"DefaultMaxHeaderBytes", Const, 0}, + {"DefaultMaxIdleConnsPerHost", Const, 0}, + {"DefaultServeMux", Var, 0}, + {"DefaultTransport", Var, 0}, + {"DetectContentType", Func, 0}, + {"Dir", Type, 0}, + {"ErrAbortHandler", Var, 8}, + {"ErrBodyNotAllowed", Var, 0}, + {"ErrBodyReadAfterClose", Var, 0}, + {"ErrContentLength", Var, 0}, + {"ErrHandlerTimeout", Var, 0}, + {"ErrHeaderTooLong", Var, 0}, + {"ErrHijacked", Var, 0}, + {"ErrLineTooLong", Var, 0}, + {"ErrMissingBoundary", Var, 0}, + {"ErrMissingContentLength", Var, 0}, + {"ErrMissingFile", Var, 0}, + {"ErrNoCookie", Var, 0}, + {"ErrNoLocation", Var, 0}, + {"ErrNotMultipart", Var, 0}, + {"ErrNotSupported", Var, 0}, + {"ErrSchemeMismatch", Var, 21}, + {"ErrServerClosed", Var, 8}, + {"ErrShortBody", Var, 0}, + {"ErrSkipAltProtocol", Var, 6}, + {"ErrUnexpectedTrailer", Var, 0}, + {"ErrUseLastResponse", Var, 7}, + {"ErrWriteAfterFlush", Var, 0}, + {"Error", Func, 0}, + {"FS", Func, 16}, + {"File", Type, 0}, + {"FileServer", Func, 0}, + {"FileServerFS", Func, 22}, + {"FileSystem", Type, 0}, + {"Flusher", Type, 0}, + {"Get", Func, 0}, + {"Handle", Func, 0}, + {"HandleFunc", Func, 0}, + {"Handler", Type, 0}, + {"HandlerFunc", Type, 0}, + {"Head", Func, 0}, + {"Header", Type, 0}, + {"Hijacker", Type, 0}, + {"ListenAndServe", Func, 0}, + {"ListenAndServeTLS", Func, 0}, + {"LocalAddrContextKey", Var, 7}, + {"MaxBytesError", Type, 19}, + {"MaxBytesError.Limit", Field, 19}, + {"MaxBytesHandler", Func, 18}, + {"MaxBytesReader", Func, 0}, + {"MethodConnect", Const, 6}, + {"MethodDelete", Const, 6}, + {"MethodGet", Const, 6}, + {"MethodHead", Const, 6}, + {"MethodOptions", Const, 6}, + {"MethodPatch", Const, 6}, + {"MethodPost", Const, 6}, + {"MethodPut", Const, 6}, + {"MethodTrace", Const, 6}, + {"NewFileTransport", Func, 0}, + {"NewFileTransportFS", Func, 22}, + {"NewRequest", Func, 0}, + {"NewRequestWithContext", Func, 13}, + {"NewResponseController", Func, 20}, + {"NewServeMux", Func, 0}, + {"NoBody", Var, 8}, + {"NotFound", Func, 0}, + {"NotFoundHandler", Func, 0}, + {"ParseCookie", Func, 23}, + {"ParseHTTPVersion", Func, 0}, + {"ParseSetCookie", Func, 23}, + {"ParseTime", Func, 1}, + {"Post", Func, 0}, + {"PostForm", Func, 0}, + {"ProtocolError", Type, 0}, + {"ProtocolError.ErrorString", Field, 0}, + {"ProxyFromEnvironment", Func, 0}, + {"ProxyURL", Func, 0}, + {"PushOptions", Type, 8}, + {"PushOptions.Header", Field, 8}, + {"PushOptions.Method", Field, 8}, + {"Pusher", Type, 8}, + {"ReadRequest", Func, 0}, + {"ReadResponse", Func, 0}, + {"Redirect", Func, 0}, + {"RedirectHandler", Func, 0}, + {"Request", Type, 0}, + {"Request.Body", Field, 0}, + {"Request.Cancel", Field, 5}, + {"Request.Close", Field, 0}, + {"Request.ContentLength", Field, 0}, + {"Request.Form", Field, 0}, + {"Request.GetBody", Field, 8}, + {"Request.Header", Field, 0}, + {"Request.Host", Field, 0}, + {"Request.Method", Field, 0}, + {"Request.MultipartForm", Field, 0}, + {"Request.Pattern", Field, 23}, + {"Request.PostForm", Field, 1}, + {"Request.Proto", Field, 0}, + {"Request.ProtoMajor", Field, 0}, + {"Request.ProtoMinor", Field, 0}, + {"Request.RemoteAddr", Field, 0}, + {"Request.RequestURI", Field, 0}, + {"Request.Response", Field, 7}, + {"Request.TLS", Field, 0}, + {"Request.Trailer", Field, 0}, + {"Request.TransferEncoding", Field, 0}, + {"Request.URL", Field, 0}, + {"Response", Type, 0}, + {"Response.Body", Field, 0}, + {"Response.Close", Field, 0}, + {"Response.ContentLength", Field, 0}, + {"Response.Header", Field, 0}, + {"Response.Proto", Field, 0}, + {"Response.ProtoMajor", Field, 0}, + {"Response.ProtoMinor", Field, 0}, + {"Response.Request", Field, 0}, + {"Response.Status", Field, 0}, + {"Response.StatusCode", Field, 0}, + {"Response.TLS", Field, 3}, + {"Response.Trailer", Field, 0}, + {"Response.TransferEncoding", Field, 0}, + {"Response.Uncompressed", Field, 7}, + {"ResponseController", Type, 20}, + {"ResponseWriter", Type, 0}, + {"RoundTripper", Type, 0}, + {"SameSite", Type, 11}, + {"SameSiteDefaultMode", Const, 11}, + {"SameSiteLaxMode", Const, 11}, + {"SameSiteNoneMode", Const, 13}, + {"SameSiteStrictMode", Const, 11}, + {"Serve", Func, 0}, + {"ServeContent", Func, 0}, + {"ServeFile", Func, 0}, + {"ServeFileFS", Func, 22}, + {"ServeMux", Type, 0}, + {"ServeTLS", Func, 9}, + {"Server", Type, 0}, + {"Server.Addr", Field, 0}, + {"Server.BaseContext", Field, 13}, + {"Server.ConnContext", Field, 13}, + {"Server.ConnState", Field, 3}, + {"Server.DisableGeneralOptionsHandler", Field, 20}, + {"Server.ErrorLog", Field, 3}, + {"Server.Handler", Field, 0}, + {"Server.IdleTimeout", Field, 8}, + {"Server.MaxHeaderBytes", Field, 0}, + {"Server.ReadHeaderTimeout", Field, 8}, + {"Server.ReadTimeout", Field, 0}, + {"Server.TLSConfig", Field, 0}, + {"Server.TLSNextProto", Field, 1}, + {"Server.WriteTimeout", Field, 0}, + {"ServerContextKey", Var, 7}, + {"SetCookie", Func, 0}, + {"StateActive", Const, 3}, + {"StateClosed", Const, 3}, + {"StateHijacked", Const, 3}, + {"StateIdle", Const, 3}, + {"StateNew", Const, 3}, + {"StatusAccepted", Const, 0}, + {"StatusAlreadyReported", Const, 7}, + {"StatusBadGateway", Const, 0}, + {"StatusBadRequest", Const, 0}, + {"StatusConflict", Const, 0}, + {"StatusContinue", Const, 0}, + {"StatusCreated", Const, 0}, + {"StatusEarlyHints", Const, 13}, + {"StatusExpectationFailed", Const, 0}, + {"StatusFailedDependency", Const, 7}, + {"StatusForbidden", Const, 0}, + {"StatusFound", Const, 0}, + {"StatusGatewayTimeout", Const, 0}, + {"StatusGone", Const, 0}, + {"StatusHTTPVersionNotSupported", Const, 0}, + {"StatusIMUsed", Const, 7}, + {"StatusInsufficientStorage", Const, 7}, + {"StatusInternalServerError", Const, 0}, + {"StatusLengthRequired", Const, 0}, + {"StatusLocked", Const, 7}, + {"StatusLoopDetected", Const, 7}, + {"StatusMethodNotAllowed", Const, 0}, + {"StatusMisdirectedRequest", Const, 11}, + {"StatusMovedPermanently", Const, 0}, + {"StatusMultiStatus", Const, 7}, + {"StatusMultipleChoices", Const, 0}, + {"StatusNetworkAuthenticationRequired", Const, 6}, + {"StatusNoContent", Const, 0}, + {"StatusNonAuthoritativeInfo", Const, 0}, + {"StatusNotAcceptable", Const, 0}, + {"StatusNotExtended", Const, 7}, + {"StatusNotFound", Const, 0}, + {"StatusNotImplemented", Const, 0}, + {"StatusNotModified", Const, 0}, + {"StatusOK", Const, 0}, + {"StatusPartialContent", Const, 0}, + {"StatusPaymentRequired", Const, 0}, + {"StatusPermanentRedirect", Const, 7}, + {"StatusPreconditionFailed", Const, 0}, + {"StatusPreconditionRequired", Const, 6}, + {"StatusProcessing", Const, 7}, + {"StatusProxyAuthRequired", Const, 0}, + {"StatusRequestEntityTooLarge", Const, 0}, + {"StatusRequestHeaderFieldsTooLarge", Const, 6}, + {"StatusRequestTimeout", Const, 0}, + {"StatusRequestURITooLong", Const, 0}, + {"StatusRequestedRangeNotSatisfiable", Const, 0}, + {"StatusResetContent", Const, 0}, + {"StatusSeeOther", Const, 0}, + {"StatusServiceUnavailable", Const, 0}, + {"StatusSwitchingProtocols", Const, 0}, + {"StatusTeapot", Const, 0}, + {"StatusTemporaryRedirect", Const, 0}, + {"StatusText", Func, 0}, + {"StatusTooEarly", Const, 12}, + {"StatusTooManyRequests", Const, 6}, + {"StatusUnauthorized", Const, 0}, + {"StatusUnavailableForLegalReasons", Const, 6}, + {"StatusUnprocessableEntity", Const, 7}, + {"StatusUnsupportedMediaType", Const, 0}, + {"StatusUpgradeRequired", Const, 7}, + {"StatusUseProxy", Const, 0}, + {"StatusVariantAlsoNegotiates", Const, 7}, + {"StripPrefix", Func, 0}, + {"TimeFormat", Const, 0}, + {"TimeoutHandler", Func, 0}, + {"TrailerPrefix", Const, 8}, + {"Transport", Type, 0}, + {"Transport.Dial", Field, 0}, + {"Transport.DialContext", Field, 7}, + {"Transport.DialTLS", Field, 4}, + {"Transport.DialTLSContext", Field, 14}, + {"Transport.DisableCompression", Field, 0}, + {"Transport.DisableKeepAlives", Field, 0}, + {"Transport.ExpectContinueTimeout", Field, 6}, + {"Transport.ForceAttemptHTTP2", Field, 13}, + {"Transport.GetProxyConnectHeader", Field, 16}, + {"Transport.IdleConnTimeout", Field, 7}, + {"Transport.MaxConnsPerHost", Field, 11}, + {"Transport.MaxIdleConns", Field, 7}, + {"Transport.MaxIdleConnsPerHost", Field, 0}, + {"Transport.MaxResponseHeaderBytes", Field, 7}, + {"Transport.OnProxyConnectResponse", Field, 20}, + {"Transport.Proxy", Field, 0}, + {"Transport.ProxyConnectHeader", Field, 8}, + {"Transport.ReadBufferSize", Field, 13}, + {"Transport.ResponseHeaderTimeout", Field, 1}, + {"Transport.TLSClientConfig", Field, 0}, + {"Transport.TLSHandshakeTimeout", Field, 3}, + {"Transport.TLSNextProto", Field, 6}, + {"Transport.WriteBufferSize", Field, 13}, + }, + "net/http/cgi": { + {"(*Handler).ServeHTTP", Method, 0}, + {"Handler", Type, 0}, + {"Handler.Args", Field, 0}, + {"Handler.Dir", Field, 0}, + {"Handler.Env", Field, 0}, + {"Handler.InheritEnv", Field, 0}, + {"Handler.Logger", Field, 0}, + {"Handler.Path", Field, 0}, + {"Handler.PathLocationHandler", Field, 0}, + {"Handler.Root", Field, 0}, + {"Handler.Stderr", Field, 7}, + {"Request", Func, 0}, + {"RequestFromMap", Func, 0}, + {"Serve", Func, 0}, + }, + "net/http/cookiejar": { + {"(*Jar).Cookies", Method, 1}, + {"(*Jar).SetCookies", Method, 1}, + {"Jar", Type, 1}, + {"New", Func, 1}, + {"Options", Type, 1}, + {"Options.PublicSuffixList", Field, 1}, + {"PublicSuffixList", Type, 1}, + }, + "net/http/fcgi": { + {"ErrConnClosed", Var, 5}, + {"ErrRequestAborted", Var, 5}, + {"ProcessEnv", Func, 9}, + {"Serve", Func, 0}, + }, + "net/http/httptest": { + {"(*ResponseRecorder).Flush", Method, 0}, + {"(*ResponseRecorder).Header", Method, 0}, + {"(*ResponseRecorder).Result", Method, 7}, + {"(*ResponseRecorder).Write", Method, 0}, + {"(*ResponseRecorder).WriteHeader", Method, 0}, + {"(*ResponseRecorder).WriteString", Method, 6}, + {"(*Server).Certificate", Method, 9}, + {"(*Server).Client", Method, 9}, + {"(*Server).Close", Method, 0}, + {"(*Server).CloseClientConnections", Method, 0}, + {"(*Server).Start", Method, 0}, + {"(*Server).StartTLS", Method, 0}, + {"DefaultRemoteAddr", Const, 0}, + {"NewRecorder", Func, 0}, + {"NewRequest", Func, 7}, + {"NewRequestWithContext", Func, 23}, + {"NewServer", Func, 0}, + {"NewTLSServer", Func, 0}, + {"NewUnstartedServer", Func, 0}, + {"ResponseRecorder", Type, 0}, + {"ResponseRecorder.Body", Field, 0}, + {"ResponseRecorder.Code", Field, 0}, + {"ResponseRecorder.Flushed", Field, 0}, + {"ResponseRecorder.HeaderMap", Field, 0}, + {"Server", Type, 0}, + {"Server.Config", Field, 0}, + {"Server.EnableHTTP2", Field, 14}, + {"Server.Listener", Field, 0}, + {"Server.TLS", Field, 0}, + {"Server.URL", Field, 0}, + }, + "net/http/httptrace": { + {"ClientTrace", Type, 7}, + {"ClientTrace.ConnectDone", Field, 7}, + {"ClientTrace.ConnectStart", Field, 7}, + {"ClientTrace.DNSDone", Field, 7}, + {"ClientTrace.DNSStart", Field, 7}, + {"ClientTrace.GetConn", Field, 7}, + {"ClientTrace.Got100Continue", Field, 7}, + {"ClientTrace.Got1xxResponse", Field, 11}, + {"ClientTrace.GotConn", Field, 7}, + {"ClientTrace.GotFirstResponseByte", Field, 7}, + {"ClientTrace.PutIdleConn", Field, 7}, + {"ClientTrace.TLSHandshakeDone", Field, 8}, + {"ClientTrace.TLSHandshakeStart", Field, 8}, + {"ClientTrace.Wait100Continue", Field, 7}, + {"ClientTrace.WroteHeaderField", Field, 11}, + {"ClientTrace.WroteHeaders", Field, 7}, + {"ClientTrace.WroteRequest", Field, 7}, + {"ContextClientTrace", Func, 7}, + {"DNSDoneInfo", Type, 7}, + {"DNSDoneInfo.Addrs", Field, 7}, + {"DNSDoneInfo.Coalesced", Field, 7}, + {"DNSDoneInfo.Err", Field, 7}, + {"DNSStartInfo", Type, 7}, + {"DNSStartInfo.Host", Field, 7}, + {"GotConnInfo", Type, 7}, + {"GotConnInfo.Conn", Field, 7}, + {"GotConnInfo.IdleTime", Field, 7}, + {"GotConnInfo.Reused", Field, 7}, + {"GotConnInfo.WasIdle", Field, 7}, + {"WithClientTrace", Func, 7}, + {"WroteRequestInfo", Type, 7}, + {"WroteRequestInfo.Err", Field, 7}, + }, + "net/http/httputil": { + {"(*ClientConn).Close", Method, 0}, + {"(*ClientConn).Do", Method, 0}, + {"(*ClientConn).Hijack", Method, 0}, + {"(*ClientConn).Pending", Method, 0}, + {"(*ClientConn).Read", Method, 0}, + {"(*ClientConn).Write", Method, 0}, + {"(*ProxyRequest).SetURL", Method, 20}, + {"(*ProxyRequest).SetXForwarded", Method, 20}, + {"(*ReverseProxy).ServeHTTP", Method, 0}, + {"(*ServerConn).Close", Method, 0}, + {"(*ServerConn).Hijack", Method, 0}, + {"(*ServerConn).Pending", Method, 0}, + {"(*ServerConn).Read", Method, 0}, + {"(*ServerConn).Write", Method, 0}, + {"BufferPool", Type, 6}, + {"ClientConn", Type, 0}, + {"DumpRequest", Func, 0}, + {"DumpRequestOut", Func, 0}, + {"DumpResponse", Func, 0}, + {"ErrClosed", Var, 0}, + {"ErrLineTooLong", Var, 0}, + {"ErrPersistEOF", Var, 0}, + {"ErrPipeline", Var, 0}, + {"NewChunkedReader", Func, 0}, + {"NewChunkedWriter", Func, 0}, + {"NewClientConn", Func, 0}, + {"NewProxyClientConn", Func, 0}, + {"NewServerConn", Func, 0}, + {"NewSingleHostReverseProxy", Func, 0}, + {"ProxyRequest", Type, 20}, + {"ProxyRequest.In", Field, 20}, + {"ProxyRequest.Out", Field, 20}, + {"ReverseProxy", Type, 0}, + {"ReverseProxy.BufferPool", Field, 6}, + {"ReverseProxy.Director", Field, 0}, + {"ReverseProxy.ErrorHandler", Field, 11}, + {"ReverseProxy.ErrorLog", Field, 4}, + {"ReverseProxy.FlushInterval", Field, 0}, + {"ReverseProxy.ModifyResponse", Field, 8}, + {"ReverseProxy.Rewrite", Field, 20}, + {"ReverseProxy.Transport", Field, 0}, + {"ServerConn", Type, 0}, + }, + "net/http/pprof": { + {"Cmdline", Func, 0}, + {"Handler", Func, 0}, + {"Index", Func, 0}, + {"Profile", Func, 0}, + {"Symbol", Func, 0}, + {"Trace", Func, 5}, + }, + "net/mail": { + {"(*Address).String", Method, 0}, + {"(*AddressParser).Parse", Method, 5}, + {"(*AddressParser).ParseList", Method, 5}, + {"(Header).AddressList", Method, 0}, + {"(Header).Date", Method, 0}, + {"(Header).Get", Method, 0}, + {"Address", Type, 0}, + {"Address.Address", Field, 0}, + {"Address.Name", Field, 0}, + {"AddressParser", Type, 5}, + {"AddressParser.WordDecoder", Field, 5}, + {"ErrHeaderNotPresent", Var, 0}, + {"Header", Type, 0}, + {"Message", Type, 0}, + {"Message.Body", Field, 0}, + {"Message.Header", Field, 0}, + {"ParseAddress", Func, 1}, + {"ParseAddressList", Func, 1}, + {"ParseDate", Func, 8}, + {"ReadMessage", Func, 0}, + }, + "net/netip": { + {"(*Addr).UnmarshalBinary", Method, 18}, + {"(*Addr).UnmarshalText", Method, 18}, + {"(*AddrPort).UnmarshalBinary", Method, 18}, + {"(*AddrPort).UnmarshalText", Method, 18}, + {"(*Prefix).UnmarshalBinary", Method, 18}, + {"(*Prefix).UnmarshalText", Method, 18}, + {"(Addr).AppendTo", Method, 18}, + {"(Addr).As16", Method, 18}, + {"(Addr).As4", Method, 18}, + {"(Addr).AsSlice", Method, 18}, + {"(Addr).BitLen", Method, 18}, + {"(Addr).Compare", Method, 18}, + {"(Addr).Is4", Method, 18}, + {"(Addr).Is4In6", Method, 18}, + {"(Addr).Is6", Method, 18}, + {"(Addr).IsGlobalUnicast", Method, 18}, + {"(Addr).IsInterfaceLocalMulticast", Method, 18}, + {"(Addr).IsLinkLocalMulticast", Method, 18}, + {"(Addr).IsLinkLocalUnicast", Method, 18}, + {"(Addr).IsLoopback", Method, 18}, + {"(Addr).IsMulticast", Method, 18}, + {"(Addr).IsPrivate", Method, 18}, + {"(Addr).IsUnspecified", Method, 18}, + {"(Addr).IsValid", Method, 18}, + {"(Addr).Less", Method, 18}, + {"(Addr).MarshalBinary", Method, 18}, + {"(Addr).MarshalText", Method, 18}, + {"(Addr).Next", Method, 18}, + {"(Addr).Prefix", Method, 18}, + {"(Addr).Prev", Method, 18}, + {"(Addr).String", Method, 18}, + {"(Addr).StringExpanded", Method, 18}, + {"(Addr).Unmap", Method, 18}, + {"(Addr).WithZone", Method, 18}, + {"(Addr).Zone", Method, 18}, + {"(AddrPort).Addr", Method, 18}, + {"(AddrPort).AppendTo", Method, 18}, + {"(AddrPort).Compare", Method, 22}, + {"(AddrPort).IsValid", Method, 18}, + {"(AddrPort).MarshalBinary", Method, 18}, + {"(AddrPort).MarshalText", Method, 18}, + {"(AddrPort).Port", Method, 18}, + {"(AddrPort).String", Method, 18}, + {"(Prefix).Addr", Method, 18}, + {"(Prefix).AppendTo", Method, 18}, + {"(Prefix).Bits", Method, 18}, + {"(Prefix).Contains", Method, 18}, + {"(Prefix).IsSingleIP", Method, 18}, + {"(Prefix).IsValid", Method, 18}, + {"(Prefix).MarshalBinary", Method, 18}, + {"(Prefix).MarshalText", Method, 18}, + {"(Prefix).Masked", Method, 18}, + {"(Prefix).Overlaps", Method, 18}, + {"(Prefix).String", Method, 18}, + {"Addr", Type, 18}, + {"AddrFrom16", Func, 18}, + {"AddrFrom4", Func, 18}, + {"AddrFromSlice", Func, 18}, + {"AddrPort", Type, 18}, + {"AddrPortFrom", Func, 18}, + {"IPv4Unspecified", Func, 18}, + {"IPv6LinkLocalAllNodes", Func, 18}, + {"IPv6LinkLocalAllRouters", Func, 20}, + {"IPv6Loopback", Func, 20}, + {"IPv6Unspecified", Func, 18}, + {"MustParseAddr", Func, 18}, + {"MustParseAddrPort", Func, 18}, + {"MustParsePrefix", Func, 18}, + {"ParseAddr", Func, 18}, + {"ParseAddrPort", Func, 18}, + {"ParsePrefix", Func, 18}, + {"Prefix", Type, 18}, + {"PrefixFrom", Func, 18}, + }, + "net/rpc": { + {"(*Client).Call", Method, 0}, + {"(*Client).Close", Method, 0}, + {"(*Client).Go", Method, 0}, + {"(*Server).Accept", Method, 0}, + {"(*Server).HandleHTTP", Method, 0}, + {"(*Server).Register", Method, 0}, + {"(*Server).RegisterName", Method, 0}, + {"(*Server).ServeCodec", Method, 0}, + {"(*Server).ServeConn", Method, 0}, + {"(*Server).ServeHTTP", Method, 0}, + {"(*Server).ServeRequest", Method, 0}, + {"(ServerError).Error", Method, 0}, + {"Accept", Func, 0}, + {"Call", Type, 0}, + {"Call.Args", Field, 0}, + {"Call.Done", Field, 0}, + {"Call.Error", Field, 0}, + {"Call.Reply", Field, 0}, + {"Call.ServiceMethod", Field, 0}, + {"Client", Type, 0}, + {"ClientCodec", Type, 0}, + {"DefaultDebugPath", Const, 0}, + {"DefaultRPCPath", Const, 0}, + {"DefaultServer", Var, 0}, + {"Dial", Func, 0}, + {"DialHTTP", Func, 0}, + {"DialHTTPPath", Func, 0}, + {"ErrShutdown", Var, 0}, + {"HandleHTTP", Func, 0}, + {"NewClient", Func, 0}, + {"NewClientWithCodec", Func, 0}, + {"NewServer", Func, 0}, + {"Register", Func, 0}, + {"RegisterName", Func, 0}, + {"Request", Type, 0}, + {"Request.Seq", Field, 0}, + {"Request.ServiceMethod", Field, 0}, + {"Response", Type, 0}, + {"Response.Error", Field, 0}, + {"Response.Seq", Field, 0}, + {"Response.ServiceMethod", Field, 0}, + {"ServeCodec", Func, 0}, + {"ServeConn", Func, 0}, + {"ServeRequest", Func, 0}, + {"Server", Type, 0}, + {"ServerCodec", Type, 0}, + {"ServerError", Type, 0}, + }, + "net/rpc/jsonrpc": { + {"Dial", Func, 0}, + {"NewClient", Func, 0}, + {"NewClientCodec", Func, 0}, + {"NewServerCodec", Func, 0}, + {"ServeConn", Func, 0}, + }, + "net/smtp": { + {"(*Client).Auth", Method, 0}, + {"(*Client).Close", Method, 2}, + {"(*Client).Data", Method, 0}, + {"(*Client).Extension", Method, 0}, + {"(*Client).Hello", Method, 1}, + {"(*Client).Mail", Method, 0}, + {"(*Client).Noop", Method, 10}, + {"(*Client).Quit", Method, 0}, + {"(*Client).Rcpt", Method, 0}, + {"(*Client).Reset", Method, 0}, + {"(*Client).StartTLS", Method, 0}, + {"(*Client).TLSConnectionState", Method, 5}, + {"(*Client).Verify", Method, 0}, + {"Auth", Type, 0}, + {"CRAMMD5Auth", Func, 0}, + {"Client", Type, 0}, + {"Client.Text", Field, 0}, + {"Dial", Func, 0}, + {"NewClient", Func, 0}, + {"PlainAuth", Func, 0}, + {"SendMail", Func, 0}, + {"ServerInfo", Type, 0}, + {"ServerInfo.Auth", Field, 0}, + {"ServerInfo.Name", Field, 0}, + {"ServerInfo.TLS", Field, 0}, + }, + "net/textproto": { + {"(*Conn).Close", Method, 0}, + {"(*Conn).Cmd", Method, 0}, + {"(*Conn).DotReader", Method, 0}, + {"(*Conn).DotWriter", Method, 0}, + {"(*Conn).EndRequest", Method, 0}, + {"(*Conn).EndResponse", Method, 0}, + {"(*Conn).Next", Method, 0}, + {"(*Conn).PrintfLine", Method, 0}, + {"(*Conn).ReadCodeLine", Method, 0}, + {"(*Conn).ReadContinuedLine", Method, 0}, + {"(*Conn).ReadContinuedLineBytes", Method, 0}, + {"(*Conn).ReadDotBytes", Method, 0}, + {"(*Conn).ReadDotLines", Method, 0}, + {"(*Conn).ReadLine", Method, 0}, + {"(*Conn).ReadLineBytes", Method, 0}, + {"(*Conn).ReadMIMEHeader", Method, 0}, + {"(*Conn).ReadResponse", Method, 0}, + {"(*Conn).StartRequest", Method, 0}, + {"(*Conn).StartResponse", Method, 0}, + {"(*Error).Error", Method, 0}, + {"(*Pipeline).EndRequest", Method, 0}, + {"(*Pipeline).EndResponse", Method, 0}, + {"(*Pipeline).Next", Method, 0}, + {"(*Pipeline).StartRequest", Method, 0}, + {"(*Pipeline).StartResponse", Method, 0}, + {"(*Reader).DotReader", Method, 0}, + {"(*Reader).ReadCodeLine", Method, 0}, + {"(*Reader).ReadContinuedLine", Method, 0}, + {"(*Reader).ReadContinuedLineBytes", Method, 0}, + {"(*Reader).ReadDotBytes", Method, 0}, + {"(*Reader).ReadDotLines", Method, 0}, + {"(*Reader).ReadLine", Method, 0}, + {"(*Reader).ReadLineBytes", Method, 0}, + {"(*Reader).ReadMIMEHeader", Method, 0}, + {"(*Reader).ReadResponse", Method, 0}, + {"(*Writer).DotWriter", Method, 0}, + {"(*Writer).PrintfLine", Method, 0}, + {"(MIMEHeader).Add", Method, 0}, + {"(MIMEHeader).Del", Method, 0}, + {"(MIMEHeader).Get", Method, 0}, + {"(MIMEHeader).Set", Method, 0}, + {"(MIMEHeader).Values", Method, 14}, + {"(ProtocolError).Error", Method, 0}, + {"CanonicalMIMEHeaderKey", Func, 0}, + {"Conn", Type, 0}, + {"Conn.Pipeline", Field, 0}, + {"Conn.Reader", Field, 0}, + {"Conn.Writer", Field, 0}, + {"Dial", Func, 0}, + {"Error", Type, 0}, + {"Error.Code", Field, 0}, + {"Error.Msg", Field, 0}, + {"MIMEHeader", Type, 0}, + {"NewConn", Func, 0}, + {"NewReader", Func, 0}, + {"NewWriter", Func, 0}, + {"Pipeline", Type, 0}, + {"ProtocolError", Type, 0}, + {"Reader", Type, 0}, + {"Reader.R", Field, 0}, + {"TrimBytes", Func, 1}, + {"TrimString", Func, 1}, + {"Writer", Type, 0}, + {"Writer.W", Field, 0}, + }, + "net/url": { + {"(*Error).Error", Method, 0}, + {"(*Error).Temporary", Method, 6}, + {"(*Error).Timeout", Method, 6}, + {"(*Error).Unwrap", Method, 13}, + {"(*URL).EscapedFragment", Method, 15}, + {"(*URL).EscapedPath", Method, 5}, + {"(*URL).Hostname", Method, 8}, + {"(*URL).IsAbs", Method, 0}, + {"(*URL).JoinPath", Method, 19}, + {"(*URL).MarshalBinary", Method, 8}, + {"(*URL).Parse", Method, 0}, + {"(*URL).Port", Method, 8}, + {"(*URL).Query", Method, 0}, + {"(*URL).Redacted", Method, 15}, + {"(*URL).RequestURI", Method, 0}, + {"(*URL).ResolveReference", Method, 0}, + {"(*URL).String", Method, 0}, + {"(*URL).UnmarshalBinary", Method, 8}, + {"(*Userinfo).Password", Method, 0}, + {"(*Userinfo).String", Method, 0}, + {"(*Userinfo).Username", Method, 0}, + {"(EscapeError).Error", Method, 0}, + {"(InvalidHostError).Error", Method, 6}, + {"(Values).Add", Method, 0}, + {"(Values).Del", Method, 0}, + {"(Values).Encode", Method, 0}, + {"(Values).Get", Method, 0}, + {"(Values).Has", Method, 17}, + {"(Values).Set", Method, 0}, + {"Error", Type, 0}, + {"Error.Err", Field, 0}, + {"Error.Op", Field, 0}, + {"Error.URL", Field, 0}, + {"EscapeError", Type, 0}, + {"InvalidHostError", Type, 6}, + {"JoinPath", Func, 19}, + {"Parse", Func, 0}, + {"ParseQuery", Func, 0}, + {"ParseRequestURI", Func, 0}, + {"PathEscape", Func, 8}, + {"PathUnescape", Func, 8}, + {"QueryEscape", Func, 0}, + {"QueryUnescape", Func, 0}, + {"URL", Type, 0}, + {"URL.ForceQuery", Field, 7}, + {"URL.Fragment", Field, 0}, + {"URL.Host", Field, 0}, + {"URL.OmitHost", Field, 19}, + {"URL.Opaque", Field, 0}, + {"URL.Path", Field, 0}, + {"URL.RawFragment", Field, 15}, + {"URL.RawPath", Field, 5}, + {"URL.RawQuery", Field, 0}, + {"URL.Scheme", Field, 0}, + {"URL.User", Field, 0}, + {"User", Func, 0}, + {"UserPassword", Func, 0}, + {"Userinfo", Type, 0}, + {"Values", Type, 0}, + }, + "os": { + {"(*File).Chdir", Method, 0}, + {"(*File).Chmod", Method, 0}, + {"(*File).Chown", Method, 0}, + {"(*File).Close", Method, 0}, + {"(*File).Fd", Method, 0}, + {"(*File).Name", Method, 0}, + {"(*File).Read", Method, 0}, + {"(*File).ReadAt", Method, 0}, + {"(*File).ReadDir", Method, 16}, + {"(*File).ReadFrom", Method, 15}, + {"(*File).Readdir", Method, 0}, + {"(*File).Readdirnames", Method, 0}, + {"(*File).Seek", Method, 0}, + {"(*File).SetDeadline", Method, 10}, + {"(*File).SetReadDeadline", Method, 10}, + {"(*File).SetWriteDeadline", Method, 10}, + {"(*File).Stat", Method, 0}, + {"(*File).Sync", Method, 0}, + {"(*File).SyscallConn", Method, 12}, + {"(*File).Truncate", Method, 0}, + {"(*File).Write", Method, 0}, + {"(*File).WriteAt", Method, 0}, + {"(*File).WriteString", Method, 0}, + {"(*File).WriteTo", Method, 22}, + {"(*LinkError).Error", Method, 0}, + {"(*LinkError).Unwrap", Method, 13}, + {"(*PathError).Error", Method, 0}, + {"(*PathError).Timeout", Method, 10}, + {"(*PathError).Unwrap", Method, 13}, + {"(*Process).Kill", Method, 0}, + {"(*Process).Release", Method, 0}, + {"(*Process).Signal", Method, 0}, + {"(*Process).Wait", Method, 0}, + {"(*ProcessState).ExitCode", Method, 12}, + {"(*ProcessState).Exited", Method, 0}, + {"(*ProcessState).Pid", Method, 0}, + {"(*ProcessState).String", Method, 0}, + {"(*ProcessState).Success", Method, 0}, + {"(*ProcessState).Sys", Method, 0}, + {"(*ProcessState).SysUsage", Method, 0}, + {"(*ProcessState).SystemTime", Method, 0}, + {"(*ProcessState).UserTime", Method, 0}, + {"(*SyscallError).Error", Method, 0}, + {"(*SyscallError).Timeout", Method, 10}, + {"(*SyscallError).Unwrap", Method, 13}, + {"(FileMode).IsDir", Method, 0}, + {"(FileMode).IsRegular", Method, 1}, + {"(FileMode).Perm", Method, 0}, + {"(FileMode).String", Method, 0}, + {"Args", Var, 0}, + {"Chdir", Func, 0}, + {"Chmod", Func, 0}, + {"Chown", Func, 0}, + {"Chtimes", Func, 0}, + {"Clearenv", Func, 0}, + {"CopyFS", Func, 23}, + {"Create", Func, 0}, + {"CreateTemp", Func, 16}, + {"DevNull", Const, 0}, + {"DirEntry", Type, 16}, + {"DirFS", Func, 16}, + {"Environ", Func, 0}, + {"ErrClosed", Var, 8}, + {"ErrDeadlineExceeded", Var, 15}, + {"ErrExist", Var, 0}, + {"ErrInvalid", Var, 0}, + {"ErrNoDeadline", Var, 10}, + {"ErrNotExist", Var, 0}, + {"ErrPermission", Var, 0}, + {"ErrProcessDone", Var, 16}, + {"Executable", Func, 8}, + {"Exit", Func, 0}, + {"Expand", Func, 0}, + {"ExpandEnv", Func, 0}, + {"File", Type, 0}, + {"FileInfo", Type, 0}, + {"FileMode", Type, 0}, + {"FindProcess", Func, 0}, + {"Getegid", Func, 0}, + {"Getenv", Func, 0}, + {"Geteuid", Func, 0}, + {"Getgid", Func, 0}, + {"Getgroups", Func, 0}, + {"Getpagesize", Func, 0}, + {"Getpid", Func, 0}, + {"Getppid", Func, 0}, + {"Getuid", Func, 0}, + {"Getwd", Func, 0}, + {"Hostname", Func, 0}, + {"Interrupt", Var, 0}, + {"IsExist", Func, 0}, + {"IsNotExist", Func, 0}, + {"IsPathSeparator", Func, 0}, + {"IsPermission", Func, 0}, + {"IsTimeout", Func, 10}, + {"Kill", Var, 0}, + {"Lchown", Func, 0}, + {"Link", Func, 0}, + {"LinkError", Type, 0}, + {"LinkError.Err", Field, 0}, + {"LinkError.New", Field, 0}, + {"LinkError.Old", Field, 0}, + {"LinkError.Op", Field, 0}, + {"LookupEnv", Func, 5}, + {"Lstat", Func, 0}, + {"Mkdir", Func, 0}, + {"MkdirAll", Func, 0}, + {"MkdirTemp", Func, 16}, + {"ModeAppend", Const, 0}, + {"ModeCharDevice", Const, 0}, + {"ModeDevice", Const, 0}, + {"ModeDir", Const, 0}, + {"ModeExclusive", Const, 0}, + {"ModeIrregular", Const, 11}, + {"ModeNamedPipe", Const, 0}, + {"ModePerm", Const, 0}, + {"ModeSetgid", Const, 0}, + {"ModeSetuid", Const, 0}, + {"ModeSocket", Const, 0}, + {"ModeSticky", Const, 0}, + {"ModeSymlink", Const, 0}, + {"ModeTemporary", Const, 0}, + {"ModeType", Const, 0}, + {"NewFile", Func, 0}, + {"NewSyscallError", Func, 0}, + {"O_APPEND", Const, 0}, + {"O_CREATE", Const, 0}, + {"O_EXCL", Const, 0}, + {"O_RDONLY", Const, 0}, + {"O_RDWR", Const, 0}, + {"O_SYNC", Const, 0}, + {"O_TRUNC", Const, 0}, + {"O_WRONLY", Const, 0}, + {"Open", Func, 0}, + {"OpenFile", Func, 0}, + {"PathError", Type, 0}, + {"PathError.Err", Field, 0}, + {"PathError.Op", Field, 0}, + {"PathError.Path", Field, 0}, + {"PathListSeparator", Const, 0}, + {"PathSeparator", Const, 0}, + {"Pipe", Func, 0}, + {"ProcAttr", Type, 0}, + {"ProcAttr.Dir", Field, 0}, + {"ProcAttr.Env", Field, 0}, + {"ProcAttr.Files", Field, 0}, + {"ProcAttr.Sys", Field, 0}, + {"Process", Type, 0}, + {"Process.Pid", Field, 0}, + {"ProcessState", Type, 0}, + {"ReadDir", Func, 16}, + {"ReadFile", Func, 16}, + {"Readlink", Func, 0}, + {"Remove", Func, 0}, + {"RemoveAll", Func, 0}, + {"Rename", Func, 0}, + {"SEEK_CUR", Const, 0}, + {"SEEK_END", Const, 0}, + {"SEEK_SET", Const, 0}, + {"SameFile", Func, 0}, + {"Setenv", Func, 0}, + {"Signal", Type, 0}, + {"StartProcess", Func, 0}, + {"Stat", Func, 0}, + {"Stderr", Var, 0}, + {"Stdin", Var, 0}, + {"Stdout", Var, 0}, + {"Symlink", Func, 0}, + {"SyscallError", Type, 0}, + {"SyscallError.Err", Field, 0}, + {"SyscallError.Syscall", Field, 0}, + {"TempDir", Func, 0}, + {"Truncate", Func, 0}, + {"Unsetenv", Func, 4}, + {"UserCacheDir", Func, 11}, + {"UserConfigDir", Func, 13}, + {"UserHomeDir", Func, 12}, + {"WriteFile", Func, 16}, + }, + "os/exec": { + {"(*Cmd).CombinedOutput", Method, 0}, + {"(*Cmd).Environ", Method, 19}, + {"(*Cmd).Output", Method, 0}, + {"(*Cmd).Run", Method, 0}, + {"(*Cmd).Start", Method, 0}, + {"(*Cmd).StderrPipe", Method, 0}, + {"(*Cmd).StdinPipe", Method, 0}, + {"(*Cmd).StdoutPipe", Method, 0}, + {"(*Cmd).String", Method, 13}, + {"(*Cmd).Wait", Method, 0}, + {"(*Error).Error", Method, 0}, + {"(*Error).Unwrap", Method, 13}, + {"(*ExitError).Error", Method, 0}, + {"(ExitError).ExitCode", Method, 12}, + {"(ExitError).Exited", Method, 0}, + {"(ExitError).Pid", Method, 0}, + {"(ExitError).String", Method, 0}, + {"(ExitError).Success", Method, 0}, + {"(ExitError).Sys", Method, 0}, + {"(ExitError).SysUsage", Method, 0}, + {"(ExitError).SystemTime", Method, 0}, + {"(ExitError).UserTime", Method, 0}, + {"Cmd", Type, 0}, + {"Cmd.Args", Field, 0}, + {"Cmd.Cancel", Field, 20}, + {"Cmd.Dir", Field, 0}, + {"Cmd.Env", Field, 0}, + {"Cmd.Err", Field, 19}, + {"Cmd.ExtraFiles", Field, 0}, + {"Cmd.Path", Field, 0}, + {"Cmd.Process", Field, 0}, + {"Cmd.ProcessState", Field, 0}, + {"Cmd.Stderr", Field, 0}, + {"Cmd.Stdin", Field, 0}, + {"Cmd.Stdout", Field, 0}, + {"Cmd.SysProcAttr", Field, 0}, + {"Cmd.WaitDelay", Field, 20}, + {"Command", Func, 0}, + {"CommandContext", Func, 7}, + {"ErrDot", Var, 19}, + {"ErrNotFound", Var, 0}, + {"ErrWaitDelay", Var, 20}, + {"Error", Type, 0}, + {"Error.Err", Field, 0}, + {"Error.Name", Field, 0}, + {"ExitError", Type, 0}, + {"ExitError.ProcessState", Field, 0}, + {"ExitError.Stderr", Field, 6}, + {"LookPath", Func, 0}, + }, + "os/signal": { + {"Ignore", Func, 5}, + {"Ignored", Func, 11}, + {"Notify", Func, 0}, + {"NotifyContext", Func, 16}, + {"Reset", Func, 5}, + {"Stop", Func, 1}, + }, + "os/user": { + {"(*User).GroupIds", Method, 7}, + {"(UnknownGroupError).Error", Method, 7}, + {"(UnknownGroupIdError).Error", Method, 7}, + {"(UnknownUserError).Error", Method, 0}, + {"(UnknownUserIdError).Error", Method, 0}, + {"Current", Func, 0}, + {"Group", Type, 7}, + {"Group.Gid", Field, 7}, + {"Group.Name", Field, 7}, + {"Lookup", Func, 0}, + {"LookupGroup", Func, 7}, + {"LookupGroupId", Func, 7}, + {"LookupId", Func, 0}, + {"UnknownGroupError", Type, 7}, + {"UnknownGroupIdError", Type, 7}, + {"UnknownUserError", Type, 0}, + {"UnknownUserIdError", Type, 0}, + {"User", Type, 0}, + {"User.Gid", Field, 0}, + {"User.HomeDir", Field, 0}, + {"User.Name", Field, 0}, + {"User.Uid", Field, 0}, + {"User.Username", Field, 0}, + }, + "path": { + {"Base", Func, 0}, + {"Clean", Func, 0}, + {"Dir", Func, 0}, + {"ErrBadPattern", Var, 0}, + {"Ext", Func, 0}, + {"IsAbs", Func, 0}, + {"Join", Func, 0}, + {"Match", Func, 0}, + {"Split", Func, 0}, + }, + "path/filepath": { + {"Abs", Func, 0}, + {"Base", Func, 0}, + {"Clean", Func, 0}, + {"Dir", Func, 0}, + {"ErrBadPattern", Var, 0}, + {"EvalSymlinks", Func, 0}, + {"Ext", Func, 0}, + {"FromSlash", Func, 0}, + {"Glob", Func, 0}, + {"HasPrefix", Func, 0}, + {"IsAbs", Func, 0}, + {"IsLocal", Func, 20}, + {"Join", Func, 0}, + {"ListSeparator", Const, 0}, + {"Localize", Func, 23}, + {"Match", Func, 0}, + {"Rel", Func, 0}, + {"Separator", Const, 0}, + {"SkipAll", Var, 20}, + {"SkipDir", Var, 0}, + {"Split", Func, 0}, + {"SplitList", Func, 0}, + {"ToSlash", Func, 0}, + {"VolumeName", Func, 0}, + {"Walk", Func, 0}, + {"WalkDir", Func, 16}, + {"WalkFunc", Type, 0}, + }, + "plugin": { + {"(*Plugin).Lookup", Method, 8}, + {"Open", Func, 8}, + {"Plugin", Type, 8}, + {"Symbol", Type, 8}, + }, + "reflect": { + {"(*MapIter).Key", Method, 12}, + {"(*MapIter).Next", Method, 12}, + {"(*MapIter).Reset", Method, 18}, + {"(*MapIter).Value", Method, 12}, + {"(*ValueError).Error", Method, 0}, + {"(ChanDir).String", Method, 0}, + {"(Kind).String", Method, 0}, + {"(Method).IsExported", Method, 17}, + {"(StructField).IsExported", Method, 17}, + {"(StructTag).Get", Method, 0}, + {"(StructTag).Lookup", Method, 7}, + {"(Value).Addr", Method, 0}, + {"(Value).Bool", Method, 0}, + {"(Value).Bytes", Method, 0}, + {"(Value).Call", Method, 0}, + {"(Value).CallSlice", Method, 0}, + {"(Value).CanAddr", Method, 0}, + {"(Value).CanComplex", Method, 18}, + {"(Value).CanConvert", Method, 17}, + {"(Value).CanFloat", Method, 18}, + {"(Value).CanInt", Method, 18}, + {"(Value).CanInterface", Method, 0}, + {"(Value).CanSet", Method, 0}, + {"(Value).CanUint", Method, 18}, + {"(Value).Cap", Method, 0}, + {"(Value).Clear", Method, 21}, + {"(Value).Close", Method, 0}, + {"(Value).Comparable", Method, 20}, + {"(Value).Complex", Method, 0}, + {"(Value).Convert", Method, 1}, + {"(Value).Elem", Method, 0}, + {"(Value).Equal", Method, 20}, + {"(Value).Field", Method, 0}, + {"(Value).FieldByIndex", Method, 0}, + {"(Value).FieldByIndexErr", Method, 18}, + {"(Value).FieldByName", Method, 0}, + {"(Value).FieldByNameFunc", Method, 0}, + {"(Value).Float", Method, 0}, + {"(Value).Grow", Method, 20}, + {"(Value).Index", Method, 0}, + {"(Value).Int", Method, 0}, + {"(Value).Interface", Method, 0}, + {"(Value).InterfaceData", Method, 0}, + {"(Value).IsNil", Method, 0}, + {"(Value).IsValid", Method, 0}, + {"(Value).IsZero", Method, 13}, + {"(Value).Kind", Method, 0}, + {"(Value).Len", Method, 0}, + {"(Value).MapIndex", Method, 0}, + {"(Value).MapKeys", Method, 0}, + {"(Value).MapRange", Method, 12}, + {"(Value).Method", Method, 0}, + {"(Value).MethodByName", Method, 0}, + {"(Value).NumField", Method, 0}, + {"(Value).NumMethod", Method, 0}, + {"(Value).OverflowComplex", Method, 0}, + {"(Value).OverflowFloat", Method, 0}, + {"(Value).OverflowInt", Method, 0}, + {"(Value).OverflowUint", Method, 0}, + {"(Value).Pointer", Method, 0}, + {"(Value).Recv", Method, 0}, + {"(Value).Send", Method, 0}, + {"(Value).Seq", Method, 23}, + {"(Value).Seq2", Method, 23}, + {"(Value).Set", Method, 0}, + {"(Value).SetBool", Method, 0}, + {"(Value).SetBytes", Method, 0}, + {"(Value).SetCap", Method, 2}, + {"(Value).SetComplex", Method, 0}, + {"(Value).SetFloat", Method, 0}, + {"(Value).SetInt", Method, 0}, + {"(Value).SetIterKey", Method, 18}, + {"(Value).SetIterValue", Method, 18}, + {"(Value).SetLen", Method, 0}, + {"(Value).SetMapIndex", Method, 0}, + {"(Value).SetPointer", Method, 0}, + {"(Value).SetString", Method, 0}, + {"(Value).SetUint", Method, 0}, + {"(Value).SetZero", Method, 20}, + {"(Value).Slice", Method, 0}, + {"(Value).Slice3", Method, 2}, + {"(Value).String", Method, 0}, + {"(Value).TryRecv", Method, 0}, + {"(Value).TrySend", Method, 0}, + {"(Value).Type", Method, 0}, + {"(Value).Uint", Method, 0}, + {"(Value).UnsafeAddr", Method, 0}, + {"(Value).UnsafePointer", Method, 18}, + {"Append", Func, 0}, + {"AppendSlice", Func, 0}, + {"Array", Const, 0}, + {"ArrayOf", Func, 5}, + {"Bool", Const, 0}, + {"BothDir", Const, 0}, + {"Chan", Const, 0}, + {"ChanDir", Type, 0}, + {"ChanOf", Func, 1}, + {"Complex128", Const, 0}, + {"Complex64", Const, 0}, + {"Copy", Func, 0}, + {"DeepEqual", Func, 0}, + {"Float32", Const, 0}, + {"Float64", Const, 0}, + {"Func", Const, 0}, + {"FuncOf", Func, 5}, + {"Indirect", Func, 0}, + {"Int", Const, 0}, + {"Int16", Const, 0}, + {"Int32", Const, 0}, + {"Int64", Const, 0}, + {"Int8", Const, 0}, + {"Interface", Const, 0}, + {"Invalid", Const, 0}, + {"Kind", Type, 0}, + {"MakeChan", Func, 0}, + {"MakeFunc", Func, 1}, + {"MakeMap", Func, 0}, + {"MakeMapWithSize", Func, 9}, + {"MakeSlice", Func, 0}, + {"Map", Const, 0}, + {"MapIter", Type, 12}, + {"MapOf", Func, 1}, + {"Method", Type, 0}, + {"Method.Func", Field, 0}, + {"Method.Index", Field, 0}, + {"Method.Name", Field, 0}, + {"Method.PkgPath", Field, 0}, + {"Method.Type", Field, 0}, + {"New", Func, 0}, + {"NewAt", Func, 0}, + {"Pointer", Const, 18}, + {"PointerTo", Func, 18}, + {"Ptr", Const, 0}, + {"PtrTo", Func, 0}, + {"RecvDir", Const, 0}, + {"Select", Func, 1}, + {"SelectCase", Type, 1}, + {"SelectCase.Chan", Field, 1}, + {"SelectCase.Dir", Field, 1}, + {"SelectCase.Send", Field, 1}, + {"SelectDefault", Const, 1}, + {"SelectDir", Type, 1}, + {"SelectRecv", Const, 1}, + {"SelectSend", Const, 1}, + {"SendDir", Const, 0}, + {"Slice", Const, 0}, + {"SliceAt", Func, 23}, + {"SliceHeader", Type, 0}, + {"SliceHeader.Cap", Field, 0}, + {"SliceHeader.Data", Field, 0}, + {"SliceHeader.Len", Field, 0}, + {"SliceOf", Func, 1}, + {"String", Const, 0}, + {"StringHeader", Type, 0}, + {"StringHeader.Data", Field, 0}, + {"StringHeader.Len", Field, 0}, + {"Struct", Const, 0}, + {"StructField", Type, 0}, + {"StructField.Anonymous", Field, 0}, + {"StructField.Index", Field, 0}, + {"StructField.Name", Field, 0}, + {"StructField.Offset", Field, 0}, + {"StructField.PkgPath", Field, 0}, + {"StructField.Tag", Field, 0}, + {"StructField.Type", Field, 0}, + {"StructOf", Func, 7}, + {"StructTag", Type, 0}, + {"Swapper", Func, 8}, + {"Type", Type, 0}, + {"TypeFor", Func, 22}, + {"TypeOf", Func, 0}, + {"Uint", Const, 0}, + {"Uint16", Const, 0}, + {"Uint32", Const, 0}, + {"Uint64", Const, 0}, + {"Uint8", Const, 0}, + {"Uintptr", Const, 0}, + {"UnsafePointer", Const, 0}, + {"Value", Type, 0}, + {"ValueError", Type, 0}, + {"ValueError.Kind", Field, 0}, + {"ValueError.Method", Field, 0}, + {"ValueOf", Func, 0}, + {"VisibleFields", Func, 17}, + {"Zero", Func, 0}, + }, + "regexp": { + {"(*Regexp).Copy", Method, 6}, + {"(*Regexp).Expand", Method, 0}, + {"(*Regexp).ExpandString", Method, 0}, + {"(*Regexp).Find", Method, 0}, + {"(*Regexp).FindAll", Method, 0}, + {"(*Regexp).FindAllIndex", Method, 0}, + {"(*Regexp).FindAllString", Method, 0}, + {"(*Regexp).FindAllStringIndex", Method, 0}, + {"(*Regexp).FindAllStringSubmatch", Method, 0}, + {"(*Regexp).FindAllStringSubmatchIndex", Method, 0}, + {"(*Regexp).FindAllSubmatch", Method, 0}, + {"(*Regexp).FindAllSubmatchIndex", Method, 0}, + {"(*Regexp).FindIndex", Method, 0}, + {"(*Regexp).FindReaderIndex", Method, 0}, + {"(*Regexp).FindReaderSubmatchIndex", Method, 0}, + {"(*Regexp).FindString", Method, 0}, + {"(*Regexp).FindStringIndex", Method, 0}, + {"(*Regexp).FindStringSubmatch", Method, 0}, + {"(*Regexp).FindStringSubmatchIndex", Method, 0}, + {"(*Regexp).FindSubmatch", Method, 0}, + {"(*Regexp).FindSubmatchIndex", Method, 0}, + {"(*Regexp).LiteralPrefix", Method, 0}, + {"(*Regexp).Longest", Method, 1}, + {"(*Regexp).MarshalText", Method, 21}, + {"(*Regexp).Match", Method, 0}, + {"(*Regexp).MatchReader", Method, 0}, + {"(*Regexp).MatchString", Method, 0}, + {"(*Regexp).NumSubexp", Method, 0}, + {"(*Regexp).ReplaceAll", Method, 0}, + {"(*Regexp).ReplaceAllFunc", Method, 0}, + {"(*Regexp).ReplaceAllLiteral", Method, 0}, + {"(*Regexp).ReplaceAllLiteralString", Method, 0}, + {"(*Regexp).ReplaceAllString", Method, 0}, + {"(*Regexp).ReplaceAllStringFunc", Method, 0}, + {"(*Regexp).Split", Method, 1}, + {"(*Regexp).String", Method, 0}, + {"(*Regexp).SubexpIndex", Method, 15}, + {"(*Regexp).SubexpNames", Method, 0}, + {"(*Regexp).UnmarshalText", Method, 21}, + {"Compile", Func, 0}, + {"CompilePOSIX", Func, 0}, + {"Match", Func, 0}, + {"MatchReader", Func, 0}, + {"MatchString", Func, 0}, + {"MustCompile", Func, 0}, + {"MustCompilePOSIX", Func, 0}, + {"QuoteMeta", Func, 0}, + {"Regexp", Type, 0}, + }, + "regexp/syntax": { + {"(*Error).Error", Method, 0}, + {"(*Inst).MatchEmptyWidth", Method, 0}, + {"(*Inst).MatchRune", Method, 0}, + {"(*Inst).MatchRunePos", Method, 3}, + {"(*Inst).String", Method, 0}, + {"(*Prog).Prefix", Method, 0}, + {"(*Prog).StartCond", Method, 0}, + {"(*Prog).String", Method, 0}, + {"(*Regexp).CapNames", Method, 0}, + {"(*Regexp).Equal", Method, 0}, + {"(*Regexp).MaxCap", Method, 0}, + {"(*Regexp).Simplify", Method, 0}, + {"(*Regexp).String", Method, 0}, + {"(ErrorCode).String", Method, 0}, + {"(InstOp).String", Method, 3}, + {"(Op).String", Method, 11}, + {"ClassNL", Const, 0}, + {"Compile", Func, 0}, + {"DotNL", Const, 0}, + {"EmptyBeginLine", Const, 0}, + {"EmptyBeginText", Const, 0}, + {"EmptyEndLine", Const, 0}, + {"EmptyEndText", Const, 0}, + {"EmptyNoWordBoundary", Const, 0}, + {"EmptyOp", Type, 0}, + {"EmptyOpContext", Func, 0}, + {"EmptyWordBoundary", Const, 0}, + {"ErrInternalError", Const, 0}, + {"ErrInvalidCharClass", Const, 0}, + {"ErrInvalidCharRange", Const, 0}, + {"ErrInvalidEscape", Const, 0}, + {"ErrInvalidNamedCapture", Const, 0}, + {"ErrInvalidPerlOp", Const, 0}, + {"ErrInvalidRepeatOp", Const, 0}, + {"ErrInvalidRepeatSize", Const, 0}, + {"ErrInvalidUTF8", Const, 0}, + {"ErrLarge", Const, 20}, + {"ErrMissingBracket", Const, 0}, + {"ErrMissingParen", Const, 0}, + {"ErrMissingRepeatArgument", Const, 0}, + {"ErrNestingDepth", Const, 19}, + {"ErrTrailingBackslash", Const, 0}, + {"ErrUnexpectedParen", Const, 1}, + {"Error", Type, 0}, + {"Error.Code", Field, 0}, + {"Error.Expr", Field, 0}, + {"ErrorCode", Type, 0}, + {"Flags", Type, 0}, + {"FoldCase", Const, 0}, + {"Inst", Type, 0}, + {"Inst.Arg", Field, 0}, + {"Inst.Op", Field, 0}, + {"Inst.Out", Field, 0}, + {"Inst.Rune", Field, 0}, + {"InstAlt", Const, 0}, + {"InstAltMatch", Const, 0}, + {"InstCapture", Const, 0}, + {"InstEmptyWidth", Const, 0}, + {"InstFail", Const, 0}, + {"InstMatch", Const, 0}, + {"InstNop", Const, 0}, + {"InstOp", Type, 0}, + {"InstRune", Const, 0}, + {"InstRune1", Const, 0}, + {"InstRuneAny", Const, 0}, + {"InstRuneAnyNotNL", Const, 0}, + {"IsWordChar", Func, 0}, + {"Literal", Const, 0}, + {"MatchNL", Const, 0}, + {"NonGreedy", Const, 0}, + {"OneLine", Const, 0}, + {"Op", Type, 0}, + {"OpAlternate", Const, 0}, + {"OpAnyChar", Const, 0}, + {"OpAnyCharNotNL", Const, 0}, + {"OpBeginLine", Const, 0}, + {"OpBeginText", Const, 0}, + {"OpCapture", Const, 0}, + {"OpCharClass", Const, 0}, + {"OpConcat", Const, 0}, + {"OpEmptyMatch", Const, 0}, + {"OpEndLine", Const, 0}, + {"OpEndText", Const, 0}, + {"OpLiteral", Const, 0}, + {"OpNoMatch", Const, 0}, + {"OpNoWordBoundary", Const, 0}, + {"OpPlus", Const, 0}, + {"OpQuest", Const, 0}, + {"OpRepeat", Const, 0}, + {"OpStar", Const, 0}, + {"OpWordBoundary", Const, 0}, + {"POSIX", Const, 0}, + {"Parse", Func, 0}, + {"Perl", Const, 0}, + {"PerlX", Const, 0}, + {"Prog", Type, 0}, + {"Prog.Inst", Field, 0}, + {"Prog.NumCap", Field, 0}, + {"Prog.Start", Field, 0}, + {"Regexp", Type, 0}, + {"Regexp.Cap", Field, 0}, + {"Regexp.Flags", Field, 0}, + {"Regexp.Max", Field, 0}, + {"Regexp.Min", Field, 0}, + {"Regexp.Name", Field, 0}, + {"Regexp.Op", Field, 0}, + {"Regexp.Rune", Field, 0}, + {"Regexp.Rune0", Field, 0}, + {"Regexp.Sub", Field, 0}, + {"Regexp.Sub0", Field, 0}, + {"Simple", Const, 0}, + {"UnicodeGroups", Const, 0}, + {"WasDollar", Const, 0}, + }, + "runtime": { + {"(*BlockProfileRecord).Stack", Method, 1}, + {"(*Frames).Next", Method, 7}, + {"(*Func).Entry", Method, 0}, + {"(*Func).FileLine", Method, 0}, + {"(*Func).Name", Method, 0}, + {"(*MemProfileRecord).InUseBytes", Method, 0}, + {"(*MemProfileRecord).InUseObjects", Method, 0}, + {"(*MemProfileRecord).Stack", Method, 0}, + {"(*PanicNilError).Error", Method, 21}, + {"(*PanicNilError).RuntimeError", Method, 21}, + {"(*Pinner).Pin", Method, 21}, + {"(*Pinner).Unpin", Method, 21}, + {"(*StackRecord).Stack", Method, 0}, + {"(*TypeAssertionError).Error", Method, 0}, + {"(*TypeAssertionError).RuntimeError", Method, 0}, + {"BlockProfile", Func, 1}, + {"BlockProfileRecord", Type, 1}, + {"BlockProfileRecord.Count", Field, 1}, + {"BlockProfileRecord.Cycles", Field, 1}, + {"BlockProfileRecord.StackRecord", Field, 1}, + {"Breakpoint", Func, 0}, + {"CPUProfile", Func, 0}, + {"Caller", Func, 0}, + {"Callers", Func, 0}, + {"CallersFrames", Func, 7}, + {"Compiler", Const, 0}, + {"Error", Type, 0}, + {"Frame", Type, 7}, + {"Frame.Entry", Field, 7}, + {"Frame.File", Field, 7}, + {"Frame.Func", Field, 7}, + {"Frame.Function", Field, 7}, + {"Frame.Line", Field, 7}, + {"Frame.PC", Field, 7}, + {"Frames", Type, 7}, + {"Func", Type, 0}, + {"FuncForPC", Func, 0}, + {"GC", Func, 0}, + {"GOARCH", Const, 0}, + {"GOMAXPROCS", Func, 0}, + {"GOOS", Const, 0}, + {"GOROOT", Func, 0}, + {"Goexit", Func, 0}, + {"GoroutineProfile", Func, 0}, + {"Gosched", Func, 0}, + {"KeepAlive", Func, 7}, + {"LockOSThread", Func, 0}, + {"MemProfile", Func, 0}, + {"MemProfileRate", Var, 0}, + {"MemProfileRecord", Type, 0}, + {"MemProfileRecord.AllocBytes", Field, 0}, + {"MemProfileRecord.AllocObjects", Field, 0}, + {"MemProfileRecord.FreeBytes", Field, 0}, + {"MemProfileRecord.FreeObjects", Field, 0}, + {"MemProfileRecord.Stack0", Field, 0}, + {"MemStats", Type, 0}, + {"MemStats.Alloc", Field, 0}, + {"MemStats.BuckHashSys", Field, 0}, + {"MemStats.BySize", Field, 0}, + {"MemStats.DebugGC", Field, 0}, + {"MemStats.EnableGC", Field, 0}, + {"MemStats.Frees", Field, 0}, + {"MemStats.GCCPUFraction", Field, 5}, + {"MemStats.GCSys", Field, 2}, + {"MemStats.HeapAlloc", Field, 0}, + {"MemStats.HeapIdle", Field, 0}, + {"MemStats.HeapInuse", Field, 0}, + {"MemStats.HeapObjects", Field, 0}, + {"MemStats.HeapReleased", Field, 0}, + {"MemStats.HeapSys", Field, 0}, + {"MemStats.LastGC", Field, 0}, + {"MemStats.Lookups", Field, 0}, + {"MemStats.MCacheInuse", Field, 0}, + {"MemStats.MCacheSys", Field, 0}, + {"MemStats.MSpanInuse", Field, 0}, + {"MemStats.MSpanSys", Field, 0}, + {"MemStats.Mallocs", Field, 0}, + {"MemStats.NextGC", Field, 0}, + {"MemStats.NumForcedGC", Field, 8}, + {"MemStats.NumGC", Field, 0}, + {"MemStats.OtherSys", Field, 2}, + {"MemStats.PauseEnd", Field, 4}, + {"MemStats.PauseNs", Field, 0}, + {"MemStats.PauseTotalNs", Field, 0}, + {"MemStats.StackInuse", Field, 0}, + {"MemStats.StackSys", Field, 0}, + {"MemStats.Sys", Field, 0}, + {"MemStats.TotalAlloc", Field, 0}, + {"MutexProfile", Func, 8}, + {"NumCPU", Func, 0}, + {"NumCgoCall", Func, 0}, + {"NumGoroutine", Func, 0}, + {"PanicNilError", Type, 21}, + {"Pinner", Type, 21}, + {"ReadMemStats", Func, 0}, + {"ReadTrace", Func, 5}, + {"SetBlockProfileRate", Func, 1}, + {"SetCPUProfileRate", Func, 0}, + {"SetCgoTraceback", Func, 7}, + {"SetFinalizer", Func, 0}, + {"SetMutexProfileFraction", Func, 8}, + {"Stack", Func, 0}, + {"StackRecord", Type, 0}, + {"StackRecord.Stack0", Field, 0}, + {"StartTrace", Func, 5}, + {"StopTrace", Func, 5}, + {"ThreadCreateProfile", Func, 0}, + {"TypeAssertionError", Type, 0}, + {"UnlockOSThread", Func, 0}, + {"Version", Func, 0}, + }, + "runtime/cgo": { + {"(Handle).Delete", Method, 17}, + {"(Handle).Value", Method, 17}, + {"Handle", Type, 17}, + {"Incomplete", Type, 20}, + {"NewHandle", Func, 17}, + }, + "runtime/coverage": { + {"ClearCounters", Func, 20}, + {"WriteCounters", Func, 20}, + {"WriteCountersDir", Func, 20}, + {"WriteMeta", Func, 20}, + {"WriteMetaDir", Func, 20}, + }, + "runtime/debug": { + {"(*BuildInfo).String", Method, 18}, + {"BuildInfo", Type, 12}, + {"BuildInfo.Deps", Field, 12}, + {"BuildInfo.GoVersion", Field, 18}, + {"BuildInfo.Main", Field, 12}, + {"BuildInfo.Path", Field, 12}, + {"BuildInfo.Settings", Field, 18}, + {"BuildSetting", Type, 18}, + {"BuildSetting.Key", Field, 18}, + {"BuildSetting.Value", Field, 18}, + {"CrashOptions", Type, 23}, + {"FreeOSMemory", Func, 1}, + {"GCStats", Type, 1}, + {"GCStats.LastGC", Field, 1}, + {"GCStats.NumGC", Field, 1}, + {"GCStats.Pause", Field, 1}, + {"GCStats.PauseEnd", Field, 4}, + {"GCStats.PauseQuantiles", Field, 1}, + {"GCStats.PauseTotal", Field, 1}, + {"Module", Type, 12}, + {"Module.Path", Field, 12}, + {"Module.Replace", Field, 12}, + {"Module.Sum", Field, 12}, + {"Module.Version", Field, 12}, + {"ParseBuildInfo", Func, 18}, + {"PrintStack", Func, 0}, + {"ReadBuildInfo", Func, 12}, + {"ReadGCStats", Func, 1}, + {"SetCrashOutput", Func, 23}, + {"SetGCPercent", Func, 1}, + {"SetMaxStack", Func, 2}, + {"SetMaxThreads", Func, 2}, + {"SetMemoryLimit", Func, 19}, + {"SetPanicOnFault", Func, 3}, + {"SetTraceback", Func, 6}, + {"Stack", Func, 0}, + {"WriteHeapDump", Func, 3}, + }, + "runtime/metrics": { + {"(Value).Float64", Method, 16}, + {"(Value).Float64Histogram", Method, 16}, + {"(Value).Kind", Method, 16}, + {"(Value).Uint64", Method, 16}, + {"All", Func, 16}, + {"Description", Type, 16}, + {"Description.Cumulative", Field, 16}, + {"Description.Description", Field, 16}, + {"Description.Kind", Field, 16}, + {"Description.Name", Field, 16}, + {"Float64Histogram", Type, 16}, + {"Float64Histogram.Buckets", Field, 16}, + {"Float64Histogram.Counts", Field, 16}, + {"KindBad", Const, 16}, + {"KindFloat64", Const, 16}, + {"KindFloat64Histogram", Const, 16}, + {"KindUint64", Const, 16}, + {"Read", Func, 16}, + {"Sample", Type, 16}, + {"Sample.Name", Field, 16}, + {"Sample.Value", Field, 16}, + {"Value", Type, 16}, + {"ValueKind", Type, 16}, + }, + "runtime/pprof": { + {"(*Profile).Add", Method, 0}, + {"(*Profile).Count", Method, 0}, + {"(*Profile).Name", Method, 0}, + {"(*Profile).Remove", Method, 0}, + {"(*Profile).WriteTo", Method, 0}, + {"Do", Func, 9}, + {"ForLabels", Func, 9}, + {"Label", Func, 9}, + {"LabelSet", Type, 9}, + {"Labels", Func, 9}, + {"Lookup", Func, 0}, + {"NewProfile", Func, 0}, + {"Profile", Type, 0}, + {"Profiles", Func, 0}, + {"SetGoroutineLabels", Func, 9}, + {"StartCPUProfile", Func, 0}, + {"StopCPUProfile", Func, 0}, + {"WithLabels", Func, 9}, + {"WriteHeapProfile", Func, 0}, + }, + "runtime/trace": { + {"(*Region).End", Method, 11}, + {"(*Task).End", Method, 11}, + {"IsEnabled", Func, 11}, + {"Log", Func, 11}, + {"Logf", Func, 11}, + {"NewTask", Func, 11}, + {"Region", Type, 11}, + {"Start", Func, 5}, + {"StartRegion", Func, 11}, + {"Stop", Func, 5}, + {"Task", Type, 11}, + {"WithRegion", Func, 11}, + }, + "slices": { + {"All", Func, 23}, + {"AppendSeq", Func, 23}, + {"Backward", Func, 23}, + {"BinarySearch", Func, 21}, + {"BinarySearchFunc", Func, 21}, + {"Chunk", Func, 23}, + {"Clip", Func, 21}, + {"Clone", Func, 21}, + {"Collect", Func, 23}, + {"Compact", Func, 21}, + {"CompactFunc", Func, 21}, + {"Compare", Func, 21}, + {"CompareFunc", Func, 21}, + {"Concat", Func, 22}, + {"Contains", Func, 21}, + {"ContainsFunc", Func, 21}, + {"Delete", Func, 21}, + {"DeleteFunc", Func, 21}, + {"Equal", Func, 21}, + {"EqualFunc", Func, 21}, + {"Grow", Func, 21}, + {"Index", Func, 21}, + {"IndexFunc", Func, 21}, + {"Insert", Func, 21}, + {"IsSorted", Func, 21}, + {"IsSortedFunc", Func, 21}, + {"Max", Func, 21}, + {"MaxFunc", Func, 21}, + {"Min", Func, 21}, + {"MinFunc", Func, 21}, + {"Repeat", Func, 23}, + {"Replace", Func, 21}, + {"Reverse", Func, 21}, + {"Sort", Func, 21}, + {"SortFunc", Func, 21}, + {"SortStableFunc", Func, 21}, + {"Sorted", Func, 23}, + {"SortedFunc", Func, 23}, + {"SortedStableFunc", Func, 23}, + {"Values", Func, 23}, + }, + "sort": { + {"(Float64Slice).Len", Method, 0}, + {"(Float64Slice).Less", Method, 0}, + {"(Float64Slice).Search", Method, 0}, + {"(Float64Slice).Sort", Method, 0}, + {"(Float64Slice).Swap", Method, 0}, + {"(IntSlice).Len", Method, 0}, + {"(IntSlice).Less", Method, 0}, + {"(IntSlice).Search", Method, 0}, + {"(IntSlice).Sort", Method, 0}, + {"(IntSlice).Swap", Method, 0}, + {"(StringSlice).Len", Method, 0}, + {"(StringSlice).Less", Method, 0}, + {"(StringSlice).Search", Method, 0}, + {"(StringSlice).Sort", Method, 0}, + {"(StringSlice).Swap", Method, 0}, + {"Find", Func, 19}, + {"Float64Slice", Type, 0}, + {"Float64s", Func, 0}, + {"Float64sAreSorted", Func, 0}, + {"IntSlice", Type, 0}, + {"Interface", Type, 0}, + {"Ints", Func, 0}, + {"IntsAreSorted", Func, 0}, + {"IsSorted", Func, 0}, + {"Reverse", Func, 1}, + {"Search", Func, 0}, + {"SearchFloat64s", Func, 0}, + {"SearchInts", Func, 0}, + {"SearchStrings", Func, 0}, + {"Slice", Func, 8}, + {"SliceIsSorted", Func, 8}, + {"SliceStable", Func, 8}, + {"Sort", Func, 0}, + {"Stable", Func, 2}, + {"StringSlice", Type, 0}, + {"Strings", Func, 0}, + {"StringsAreSorted", Func, 0}, + }, + "strconv": { + {"(*NumError).Error", Method, 0}, + {"(*NumError).Unwrap", Method, 14}, + {"AppendBool", Func, 0}, + {"AppendFloat", Func, 0}, + {"AppendInt", Func, 0}, + {"AppendQuote", Func, 0}, + {"AppendQuoteRune", Func, 0}, + {"AppendQuoteRuneToASCII", Func, 0}, + {"AppendQuoteRuneToGraphic", Func, 6}, + {"AppendQuoteToASCII", Func, 0}, + {"AppendQuoteToGraphic", Func, 6}, + {"AppendUint", Func, 0}, + {"Atoi", Func, 0}, + {"CanBackquote", Func, 0}, + {"ErrRange", Var, 0}, + {"ErrSyntax", Var, 0}, + {"FormatBool", Func, 0}, + {"FormatComplex", Func, 15}, + {"FormatFloat", Func, 0}, + {"FormatInt", Func, 0}, + {"FormatUint", Func, 0}, + {"IntSize", Const, 0}, + {"IsGraphic", Func, 6}, + {"IsPrint", Func, 0}, + {"Itoa", Func, 0}, + {"NumError", Type, 0}, + {"NumError.Err", Field, 0}, + {"NumError.Func", Field, 0}, + {"NumError.Num", Field, 0}, + {"ParseBool", Func, 0}, + {"ParseComplex", Func, 15}, + {"ParseFloat", Func, 0}, + {"ParseInt", Func, 0}, + {"ParseUint", Func, 0}, + {"Quote", Func, 0}, + {"QuoteRune", Func, 0}, + {"QuoteRuneToASCII", Func, 0}, + {"QuoteRuneToGraphic", Func, 6}, + {"QuoteToASCII", Func, 0}, + {"QuoteToGraphic", Func, 6}, + {"QuotedPrefix", Func, 17}, + {"Unquote", Func, 0}, + {"UnquoteChar", Func, 0}, + }, + "strings": { + {"(*Builder).Cap", Method, 12}, + {"(*Builder).Grow", Method, 10}, + {"(*Builder).Len", Method, 10}, + {"(*Builder).Reset", Method, 10}, + {"(*Builder).String", Method, 10}, + {"(*Builder).Write", Method, 10}, + {"(*Builder).WriteByte", Method, 10}, + {"(*Builder).WriteRune", Method, 10}, + {"(*Builder).WriteString", Method, 10}, + {"(*Reader).Len", Method, 0}, + {"(*Reader).Read", Method, 0}, + {"(*Reader).ReadAt", Method, 0}, + {"(*Reader).ReadByte", Method, 0}, + {"(*Reader).ReadRune", Method, 0}, + {"(*Reader).Reset", Method, 7}, + {"(*Reader).Seek", Method, 0}, + {"(*Reader).Size", Method, 5}, + {"(*Reader).UnreadByte", Method, 0}, + {"(*Reader).UnreadRune", Method, 0}, + {"(*Reader).WriteTo", Method, 1}, + {"(*Replacer).Replace", Method, 0}, + {"(*Replacer).WriteString", Method, 0}, + {"Builder", Type, 10}, + {"Clone", Func, 18}, + {"Compare", Func, 5}, + {"Contains", Func, 0}, + {"ContainsAny", Func, 0}, + {"ContainsFunc", Func, 21}, + {"ContainsRune", Func, 0}, + {"Count", Func, 0}, + {"Cut", Func, 18}, + {"CutPrefix", Func, 20}, + {"CutSuffix", Func, 20}, + {"EqualFold", Func, 0}, + {"Fields", Func, 0}, + {"FieldsFunc", Func, 0}, + {"HasPrefix", Func, 0}, + {"HasSuffix", Func, 0}, + {"Index", Func, 0}, + {"IndexAny", Func, 0}, + {"IndexByte", Func, 2}, + {"IndexFunc", Func, 0}, + {"IndexRune", Func, 0}, + {"Join", Func, 0}, + {"LastIndex", Func, 0}, + {"LastIndexAny", Func, 0}, + {"LastIndexByte", Func, 5}, + {"LastIndexFunc", Func, 0}, + {"Map", Func, 0}, + {"NewReader", Func, 0}, + {"NewReplacer", Func, 0}, + {"Reader", Type, 0}, + {"Repeat", Func, 0}, + {"Replace", Func, 0}, + {"ReplaceAll", Func, 12}, + {"Replacer", Type, 0}, + {"Split", Func, 0}, + {"SplitAfter", Func, 0}, + {"SplitAfterN", Func, 0}, + {"SplitN", Func, 0}, + {"Title", Func, 0}, + {"ToLower", Func, 0}, + {"ToLowerSpecial", Func, 0}, + {"ToTitle", Func, 0}, + {"ToTitleSpecial", Func, 0}, + {"ToUpper", Func, 0}, + {"ToUpperSpecial", Func, 0}, + {"ToValidUTF8", Func, 13}, + {"Trim", Func, 0}, + {"TrimFunc", Func, 0}, + {"TrimLeft", Func, 0}, + {"TrimLeftFunc", Func, 0}, + {"TrimPrefix", Func, 1}, + {"TrimRight", Func, 0}, + {"TrimRightFunc", Func, 0}, + {"TrimSpace", Func, 0}, + {"TrimSuffix", Func, 1}, + }, + "structs": { + {"HostLayout", Type, 23}, + }, + "sync": { + {"(*Cond).Broadcast", Method, 0}, + {"(*Cond).Signal", Method, 0}, + {"(*Cond).Wait", Method, 0}, + {"(*Map).Clear", Method, 23}, + {"(*Map).CompareAndDelete", Method, 20}, + {"(*Map).CompareAndSwap", Method, 20}, + {"(*Map).Delete", Method, 9}, + {"(*Map).Load", Method, 9}, + {"(*Map).LoadAndDelete", Method, 15}, + {"(*Map).LoadOrStore", Method, 9}, + {"(*Map).Range", Method, 9}, + {"(*Map).Store", Method, 9}, + {"(*Map).Swap", Method, 20}, + {"(*Mutex).Lock", Method, 0}, + {"(*Mutex).TryLock", Method, 18}, + {"(*Mutex).Unlock", Method, 0}, + {"(*Once).Do", Method, 0}, + {"(*Pool).Get", Method, 3}, + {"(*Pool).Put", Method, 3}, + {"(*RWMutex).Lock", Method, 0}, + {"(*RWMutex).RLock", Method, 0}, + {"(*RWMutex).RLocker", Method, 0}, + {"(*RWMutex).RUnlock", Method, 0}, + {"(*RWMutex).TryLock", Method, 18}, + {"(*RWMutex).TryRLock", Method, 18}, + {"(*RWMutex).Unlock", Method, 0}, + {"(*WaitGroup).Add", Method, 0}, + {"(*WaitGroup).Done", Method, 0}, + {"(*WaitGroup).Wait", Method, 0}, + {"Cond", Type, 0}, + {"Cond.L", Field, 0}, + {"Locker", Type, 0}, + {"Map", Type, 9}, + {"Mutex", Type, 0}, + {"NewCond", Func, 0}, + {"Once", Type, 0}, + {"OnceFunc", Func, 21}, + {"OnceValue", Func, 21}, + {"OnceValues", Func, 21}, + {"Pool", Type, 3}, + {"Pool.New", Field, 3}, + {"RWMutex", Type, 0}, + {"WaitGroup", Type, 0}, + }, + "sync/atomic": { + {"(*Bool).CompareAndSwap", Method, 19}, + {"(*Bool).Load", Method, 19}, + {"(*Bool).Store", Method, 19}, + {"(*Bool).Swap", Method, 19}, + {"(*Int32).Add", Method, 19}, + {"(*Int32).And", Method, 23}, + {"(*Int32).CompareAndSwap", Method, 19}, + {"(*Int32).Load", Method, 19}, + {"(*Int32).Or", Method, 23}, + {"(*Int32).Store", Method, 19}, + {"(*Int32).Swap", Method, 19}, + {"(*Int64).Add", Method, 19}, + {"(*Int64).And", Method, 23}, + {"(*Int64).CompareAndSwap", Method, 19}, + {"(*Int64).Load", Method, 19}, + {"(*Int64).Or", Method, 23}, + {"(*Int64).Store", Method, 19}, + {"(*Int64).Swap", Method, 19}, + {"(*Pointer).CompareAndSwap", Method, 19}, + {"(*Pointer).Load", Method, 19}, + {"(*Pointer).Store", Method, 19}, + {"(*Pointer).Swap", Method, 19}, + {"(*Uint32).Add", Method, 19}, + {"(*Uint32).And", Method, 23}, + {"(*Uint32).CompareAndSwap", Method, 19}, + {"(*Uint32).Load", Method, 19}, + {"(*Uint32).Or", Method, 23}, + {"(*Uint32).Store", Method, 19}, + {"(*Uint32).Swap", Method, 19}, + {"(*Uint64).Add", Method, 19}, + {"(*Uint64).And", Method, 23}, + {"(*Uint64).CompareAndSwap", Method, 19}, + {"(*Uint64).Load", Method, 19}, + {"(*Uint64).Or", Method, 23}, + {"(*Uint64).Store", Method, 19}, + {"(*Uint64).Swap", Method, 19}, + {"(*Uintptr).Add", Method, 19}, + {"(*Uintptr).And", Method, 23}, + {"(*Uintptr).CompareAndSwap", Method, 19}, + {"(*Uintptr).Load", Method, 19}, + {"(*Uintptr).Or", Method, 23}, + {"(*Uintptr).Store", Method, 19}, + {"(*Uintptr).Swap", Method, 19}, + {"(*Value).CompareAndSwap", Method, 17}, + {"(*Value).Load", Method, 4}, + {"(*Value).Store", Method, 4}, + {"(*Value).Swap", Method, 17}, + {"AddInt32", Func, 0}, + {"AddInt64", Func, 0}, + {"AddUint32", Func, 0}, + {"AddUint64", Func, 0}, + {"AddUintptr", Func, 0}, + {"AndInt32", Func, 23}, + {"AndInt64", Func, 23}, + {"AndUint32", Func, 23}, + {"AndUint64", Func, 23}, + {"AndUintptr", Func, 23}, + {"Bool", Type, 19}, + {"CompareAndSwapInt32", Func, 0}, + {"CompareAndSwapInt64", Func, 0}, + {"CompareAndSwapPointer", Func, 0}, + {"CompareAndSwapUint32", Func, 0}, + {"CompareAndSwapUint64", Func, 0}, + {"CompareAndSwapUintptr", Func, 0}, + {"Int32", Type, 19}, + {"Int64", Type, 19}, + {"LoadInt32", Func, 0}, + {"LoadInt64", Func, 0}, + {"LoadPointer", Func, 0}, + {"LoadUint32", Func, 0}, + {"LoadUint64", Func, 0}, + {"LoadUintptr", Func, 0}, + {"OrInt32", Func, 23}, + {"OrInt64", Func, 23}, + {"OrUint32", Func, 23}, + {"OrUint64", Func, 23}, + {"OrUintptr", Func, 23}, + {"Pointer", Type, 19}, + {"StoreInt32", Func, 0}, + {"StoreInt64", Func, 0}, + {"StorePointer", Func, 0}, + {"StoreUint32", Func, 0}, + {"StoreUint64", Func, 0}, + {"StoreUintptr", Func, 0}, + {"SwapInt32", Func, 2}, + {"SwapInt64", Func, 2}, + {"SwapPointer", Func, 2}, + {"SwapUint32", Func, 2}, + {"SwapUint64", Func, 2}, + {"SwapUintptr", Func, 2}, + {"Uint32", Type, 19}, + {"Uint64", Type, 19}, + {"Uintptr", Type, 19}, + {"Value", Type, 4}, + }, + "syscall": { + {"(*Cmsghdr).SetLen", Method, 0}, + {"(*DLL).FindProc", Method, 0}, + {"(*DLL).MustFindProc", Method, 0}, + {"(*DLL).Release", Method, 0}, + {"(*DLLError).Error", Method, 0}, + {"(*DLLError).Unwrap", Method, 16}, + {"(*Filetime).Nanoseconds", Method, 0}, + {"(*Iovec).SetLen", Method, 0}, + {"(*LazyDLL).Handle", Method, 0}, + {"(*LazyDLL).Load", Method, 0}, + {"(*LazyDLL).NewProc", Method, 0}, + {"(*LazyProc).Addr", Method, 0}, + {"(*LazyProc).Call", Method, 0}, + {"(*LazyProc).Find", Method, 0}, + {"(*Msghdr).SetControllen", Method, 0}, + {"(*Proc).Addr", Method, 0}, + {"(*Proc).Call", Method, 0}, + {"(*PtraceRegs).PC", Method, 0}, + {"(*PtraceRegs).SetPC", Method, 0}, + {"(*RawSockaddrAny).Sockaddr", Method, 0}, + {"(*SID).Copy", Method, 0}, + {"(*SID).Len", Method, 0}, + {"(*SID).LookupAccount", Method, 0}, + {"(*SID).String", Method, 0}, + {"(*Timespec).Nano", Method, 0}, + {"(*Timespec).Unix", Method, 0}, + {"(*Timeval).Nano", Method, 0}, + {"(*Timeval).Nanoseconds", Method, 0}, + {"(*Timeval).Unix", Method, 0}, + {"(Errno).Error", Method, 0}, + {"(Errno).Is", Method, 13}, + {"(Errno).Temporary", Method, 0}, + {"(Errno).Timeout", Method, 0}, + {"(Signal).Signal", Method, 0}, + {"(Signal).String", Method, 0}, + {"(Token).Close", Method, 0}, + {"(Token).GetTokenPrimaryGroup", Method, 0}, + {"(Token).GetTokenUser", Method, 0}, + {"(Token).GetUserProfileDirectory", Method, 0}, + {"(WaitStatus).Continued", Method, 0}, + {"(WaitStatus).CoreDump", Method, 0}, + {"(WaitStatus).ExitStatus", Method, 0}, + {"(WaitStatus).Exited", Method, 0}, + {"(WaitStatus).Signal", Method, 0}, + {"(WaitStatus).Signaled", Method, 0}, + {"(WaitStatus).StopSignal", Method, 0}, + {"(WaitStatus).Stopped", Method, 0}, + {"(WaitStatus).TrapCause", Method, 0}, + {"AF_ALG", Const, 0}, + {"AF_APPLETALK", Const, 0}, + {"AF_ARP", Const, 0}, + {"AF_ASH", Const, 0}, + {"AF_ATM", Const, 0}, + {"AF_ATMPVC", Const, 0}, + {"AF_ATMSVC", Const, 0}, + {"AF_AX25", Const, 0}, + {"AF_BLUETOOTH", Const, 0}, + {"AF_BRIDGE", Const, 0}, + {"AF_CAIF", Const, 0}, + {"AF_CAN", Const, 0}, + {"AF_CCITT", Const, 0}, + {"AF_CHAOS", Const, 0}, + {"AF_CNT", Const, 0}, + {"AF_COIP", Const, 0}, + {"AF_DATAKIT", Const, 0}, + {"AF_DECnet", Const, 0}, + {"AF_DLI", Const, 0}, + {"AF_E164", Const, 0}, + {"AF_ECMA", Const, 0}, + {"AF_ECONET", Const, 0}, + {"AF_ENCAP", Const, 1}, + {"AF_FILE", Const, 0}, + {"AF_HYLINK", Const, 0}, + {"AF_IEEE80211", Const, 0}, + {"AF_IEEE802154", Const, 0}, + {"AF_IMPLINK", Const, 0}, + {"AF_INET", Const, 0}, + {"AF_INET6", Const, 0}, + {"AF_INET6_SDP", Const, 3}, + {"AF_INET_SDP", Const, 3}, + {"AF_IPX", Const, 0}, + {"AF_IRDA", Const, 0}, + {"AF_ISDN", Const, 0}, + {"AF_ISO", Const, 0}, + {"AF_IUCV", Const, 0}, + {"AF_KEY", Const, 0}, + {"AF_LAT", Const, 0}, + {"AF_LINK", Const, 0}, + {"AF_LLC", Const, 0}, + {"AF_LOCAL", Const, 0}, + {"AF_MAX", Const, 0}, + {"AF_MPLS", Const, 1}, + {"AF_NATM", Const, 0}, + {"AF_NDRV", Const, 0}, + {"AF_NETBEUI", Const, 0}, + {"AF_NETBIOS", Const, 0}, + {"AF_NETGRAPH", Const, 0}, + {"AF_NETLINK", Const, 0}, + {"AF_NETROM", Const, 0}, + {"AF_NS", Const, 0}, + {"AF_OROUTE", Const, 1}, + {"AF_OSI", Const, 0}, + {"AF_PACKET", Const, 0}, + {"AF_PHONET", Const, 0}, + {"AF_PPP", Const, 0}, + {"AF_PPPOX", Const, 0}, + {"AF_PUP", Const, 0}, + {"AF_RDS", Const, 0}, + {"AF_RESERVED_36", Const, 0}, + {"AF_ROSE", Const, 0}, + {"AF_ROUTE", Const, 0}, + {"AF_RXRPC", Const, 0}, + {"AF_SCLUSTER", Const, 0}, + {"AF_SECURITY", Const, 0}, + {"AF_SIP", Const, 0}, + {"AF_SLOW", Const, 0}, + {"AF_SNA", Const, 0}, + {"AF_SYSTEM", Const, 0}, + {"AF_TIPC", Const, 0}, + {"AF_UNIX", Const, 0}, + {"AF_UNSPEC", Const, 0}, + {"AF_UTUN", Const, 16}, + {"AF_VENDOR00", Const, 0}, + {"AF_VENDOR01", Const, 0}, + {"AF_VENDOR02", Const, 0}, + {"AF_VENDOR03", Const, 0}, + {"AF_VENDOR04", Const, 0}, + {"AF_VENDOR05", Const, 0}, + {"AF_VENDOR06", Const, 0}, + {"AF_VENDOR07", Const, 0}, + {"AF_VENDOR08", Const, 0}, + {"AF_VENDOR09", Const, 0}, + {"AF_VENDOR10", Const, 0}, + {"AF_VENDOR11", Const, 0}, + {"AF_VENDOR12", Const, 0}, + {"AF_VENDOR13", Const, 0}, + {"AF_VENDOR14", Const, 0}, + {"AF_VENDOR15", Const, 0}, + {"AF_VENDOR16", Const, 0}, + {"AF_VENDOR17", Const, 0}, + {"AF_VENDOR18", Const, 0}, + {"AF_VENDOR19", Const, 0}, + {"AF_VENDOR20", Const, 0}, + {"AF_VENDOR21", Const, 0}, + {"AF_VENDOR22", Const, 0}, + {"AF_VENDOR23", Const, 0}, + {"AF_VENDOR24", Const, 0}, + {"AF_VENDOR25", Const, 0}, + {"AF_VENDOR26", Const, 0}, + {"AF_VENDOR27", Const, 0}, + {"AF_VENDOR28", Const, 0}, + {"AF_VENDOR29", Const, 0}, + {"AF_VENDOR30", Const, 0}, + {"AF_VENDOR31", Const, 0}, + {"AF_VENDOR32", Const, 0}, + {"AF_VENDOR33", Const, 0}, + {"AF_VENDOR34", Const, 0}, + {"AF_VENDOR35", Const, 0}, + {"AF_VENDOR36", Const, 0}, + {"AF_VENDOR37", Const, 0}, + {"AF_VENDOR38", Const, 0}, + {"AF_VENDOR39", Const, 0}, + {"AF_VENDOR40", Const, 0}, + {"AF_VENDOR41", Const, 0}, + {"AF_VENDOR42", Const, 0}, + {"AF_VENDOR43", Const, 0}, + {"AF_VENDOR44", Const, 0}, + {"AF_VENDOR45", Const, 0}, + {"AF_VENDOR46", Const, 0}, + {"AF_VENDOR47", Const, 0}, + {"AF_WANPIPE", Const, 0}, + {"AF_X25", Const, 0}, + {"AI_CANONNAME", Const, 1}, + {"AI_NUMERICHOST", Const, 1}, + {"AI_PASSIVE", Const, 1}, + {"APPLICATION_ERROR", Const, 0}, + {"ARPHRD_ADAPT", Const, 0}, + {"ARPHRD_APPLETLK", Const, 0}, + {"ARPHRD_ARCNET", Const, 0}, + {"ARPHRD_ASH", Const, 0}, + {"ARPHRD_ATM", Const, 0}, + {"ARPHRD_AX25", Const, 0}, + {"ARPHRD_BIF", Const, 0}, + {"ARPHRD_CHAOS", Const, 0}, + {"ARPHRD_CISCO", Const, 0}, + {"ARPHRD_CSLIP", Const, 0}, + {"ARPHRD_CSLIP6", Const, 0}, + {"ARPHRD_DDCMP", Const, 0}, + {"ARPHRD_DLCI", Const, 0}, + {"ARPHRD_ECONET", Const, 0}, + {"ARPHRD_EETHER", Const, 0}, + {"ARPHRD_ETHER", Const, 0}, + {"ARPHRD_EUI64", Const, 0}, + {"ARPHRD_FCAL", Const, 0}, + {"ARPHRD_FCFABRIC", Const, 0}, + {"ARPHRD_FCPL", Const, 0}, + {"ARPHRD_FCPP", Const, 0}, + {"ARPHRD_FDDI", Const, 0}, + {"ARPHRD_FRAD", Const, 0}, + {"ARPHRD_FRELAY", Const, 1}, + {"ARPHRD_HDLC", Const, 0}, + {"ARPHRD_HIPPI", Const, 0}, + {"ARPHRD_HWX25", Const, 0}, + {"ARPHRD_IEEE1394", Const, 0}, + {"ARPHRD_IEEE802", Const, 0}, + {"ARPHRD_IEEE80211", Const, 0}, + {"ARPHRD_IEEE80211_PRISM", Const, 0}, + {"ARPHRD_IEEE80211_RADIOTAP", Const, 0}, + {"ARPHRD_IEEE802154", Const, 0}, + {"ARPHRD_IEEE802154_PHY", Const, 0}, + {"ARPHRD_IEEE802_TR", Const, 0}, + {"ARPHRD_INFINIBAND", Const, 0}, + {"ARPHRD_IPDDP", Const, 0}, + {"ARPHRD_IPGRE", Const, 0}, + {"ARPHRD_IRDA", Const, 0}, + {"ARPHRD_LAPB", Const, 0}, + {"ARPHRD_LOCALTLK", Const, 0}, + {"ARPHRD_LOOPBACK", Const, 0}, + {"ARPHRD_METRICOM", Const, 0}, + {"ARPHRD_NETROM", Const, 0}, + {"ARPHRD_NONE", Const, 0}, + {"ARPHRD_PIMREG", Const, 0}, + {"ARPHRD_PPP", Const, 0}, + {"ARPHRD_PRONET", Const, 0}, + {"ARPHRD_RAWHDLC", Const, 0}, + {"ARPHRD_ROSE", Const, 0}, + {"ARPHRD_RSRVD", Const, 0}, + {"ARPHRD_SIT", Const, 0}, + {"ARPHRD_SKIP", Const, 0}, + {"ARPHRD_SLIP", Const, 0}, + {"ARPHRD_SLIP6", Const, 0}, + {"ARPHRD_STRIP", Const, 1}, + {"ARPHRD_TUNNEL", Const, 0}, + {"ARPHRD_TUNNEL6", Const, 0}, + {"ARPHRD_VOID", Const, 0}, + {"ARPHRD_X25", Const, 0}, + {"AUTHTYPE_CLIENT", Const, 0}, + {"AUTHTYPE_SERVER", Const, 0}, + {"Accept", Func, 0}, + {"Accept4", Func, 1}, + {"AcceptEx", Func, 0}, + {"Access", Func, 0}, + {"Acct", Func, 0}, + {"AddrinfoW", Type, 1}, + {"AddrinfoW.Addr", Field, 1}, + {"AddrinfoW.Addrlen", Field, 1}, + {"AddrinfoW.Canonname", Field, 1}, + {"AddrinfoW.Family", Field, 1}, + {"AddrinfoW.Flags", Field, 1}, + {"AddrinfoW.Next", Field, 1}, + {"AddrinfoW.Protocol", Field, 1}, + {"AddrinfoW.Socktype", Field, 1}, + {"Adjtime", Func, 0}, + {"Adjtimex", Func, 0}, + {"AllThreadsSyscall", Func, 16}, + {"AllThreadsSyscall6", Func, 16}, + {"AttachLsf", Func, 0}, + {"B0", Const, 0}, + {"B1000000", Const, 0}, + {"B110", Const, 0}, + {"B115200", Const, 0}, + {"B1152000", Const, 0}, + {"B1200", Const, 0}, + {"B134", Const, 0}, + {"B14400", Const, 1}, + {"B150", Const, 0}, + {"B1500000", Const, 0}, + {"B1800", Const, 0}, + {"B19200", Const, 0}, + {"B200", Const, 0}, + {"B2000000", Const, 0}, + {"B230400", Const, 0}, + {"B2400", Const, 0}, + {"B2500000", Const, 0}, + {"B28800", Const, 1}, + {"B300", Const, 0}, + {"B3000000", Const, 0}, + {"B3500000", Const, 0}, + {"B38400", Const, 0}, + {"B4000000", Const, 0}, + {"B460800", Const, 0}, + {"B4800", Const, 0}, + {"B50", Const, 0}, + {"B500000", Const, 0}, + {"B57600", Const, 0}, + {"B576000", Const, 0}, + {"B600", Const, 0}, + {"B7200", Const, 1}, + {"B75", Const, 0}, + {"B76800", Const, 1}, + {"B921600", Const, 0}, + {"B9600", Const, 0}, + {"BASE_PROTOCOL", Const, 2}, + {"BIOCFEEDBACK", Const, 0}, + {"BIOCFLUSH", Const, 0}, + {"BIOCGBLEN", Const, 0}, + {"BIOCGDIRECTION", Const, 0}, + {"BIOCGDIRFILT", Const, 1}, + {"BIOCGDLT", Const, 0}, + {"BIOCGDLTLIST", Const, 0}, + {"BIOCGETBUFMODE", Const, 0}, + {"BIOCGETIF", Const, 0}, + {"BIOCGETZMAX", Const, 0}, + {"BIOCGFEEDBACK", Const, 1}, + {"BIOCGFILDROP", Const, 1}, + {"BIOCGHDRCMPLT", Const, 0}, + {"BIOCGRSIG", Const, 0}, + {"BIOCGRTIMEOUT", Const, 0}, + {"BIOCGSEESENT", Const, 0}, + {"BIOCGSTATS", Const, 0}, + {"BIOCGSTATSOLD", Const, 1}, + {"BIOCGTSTAMP", Const, 1}, + {"BIOCIMMEDIATE", Const, 0}, + {"BIOCLOCK", Const, 0}, + {"BIOCPROMISC", Const, 0}, + {"BIOCROTZBUF", Const, 0}, + {"BIOCSBLEN", Const, 0}, + {"BIOCSDIRECTION", Const, 0}, + {"BIOCSDIRFILT", Const, 1}, + {"BIOCSDLT", Const, 0}, + {"BIOCSETBUFMODE", Const, 0}, + {"BIOCSETF", Const, 0}, + {"BIOCSETFNR", Const, 0}, + {"BIOCSETIF", Const, 0}, + {"BIOCSETWF", Const, 0}, + {"BIOCSETZBUF", Const, 0}, + {"BIOCSFEEDBACK", Const, 1}, + {"BIOCSFILDROP", Const, 1}, + {"BIOCSHDRCMPLT", Const, 0}, + {"BIOCSRSIG", Const, 0}, + {"BIOCSRTIMEOUT", Const, 0}, + {"BIOCSSEESENT", Const, 0}, + {"BIOCSTCPF", Const, 1}, + {"BIOCSTSTAMP", Const, 1}, + {"BIOCSUDPF", Const, 1}, + {"BIOCVERSION", Const, 0}, + {"BPF_A", Const, 0}, + {"BPF_ABS", Const, 0}, + {"BPF_ADD", Const, 0}, + {"BPF_ALIGNMENT", Const, 0}, + {"BPF_ALIGNMENT32", Const, 1}, + {"BPF_ALU", Const, 0}, + {"BPF_AND", Const, 0}, + {"BPF_B", Const, 0}, + {"BPF_BUFMODE_BUFFER", Const, 0}, + {"BPF_BUFMODE_ZBUF", Const, 0}, + {"BPF_DFLTBUFSIZE", Const, 1}, + {"BPF_DIRECTION_IN", Const, 1}, + {"BPF_DIRECTION_OUT", Const, 1}, + {"BPF_DIV", Const, 0}, + {"BPF_H", Const, 0}, + {"BPF_IMM", Const, 0}, + {"BPF_IND", Const, 0}, + {"BPF_JA", Const, 0}, + {"BPF_JEQ", Const, 0}, + {"BPF_JGE", Const, 0}, + {"BPF_JGT", Const, 0}, + {"BPF_JMP", Const, 0}, + {"BPF_JSET", Const, 0}, + {"BPF_K", Const, 0}, + {"BPF_LD", Const, 0}, + {"BPF_LDX", Const, 0}, + {"BPF_LEN", Const, 0}, + {"BPF_LSH", Const, 0}, + {"BPF_MAJOR_VERSION", Const, 0}, + {"BPF_MAXBUFSIZE", Const, 0}, + {"BPF_MAXINSNS", Const, 0}, + {"BPF_MEM", Const, 0}, + {"BPF_MEMWORDS", Const, 0}, + {"BPF_MINBUFSIZE", Const, 0}, + {"BPF_MINOR_VERSION", Const, 0}, + {"BPF_MISC", Const, 0}, + {"BPF_MSH", Const, 0}, + {"BPF_MUL", Const, 0}, + {"BPF_NEG", Const, 0}, + {"BPF_OR", Const, 0}, + {"BPF_RELEASE", Const, 0}, + {"BPF_RET", Const, 0}, + {"BPF_RSH", Const, 0}, + {"BPF_ST", Const, 0}, + {"BPF_STX", Const, 0}, + {"BPF_SUB", Const, 0}, + {"BPF_TAX", Const, 0}, + {"BPF_TXA", Const, 0}, + {"BPF_T_BINTIME", Const, 1}, + {"BPF_T_BINTIME_FAST", Const, 1}, + {"BPF_T_BINTIME_MONOTONIC", Const, 1}, + {"BPF_T_BINTIME_MONOTONIC_FAST", Const, 1}, + {"BPF_T_FAST", Const, 1}, + {"BPF_T_FLAG_MASK", Const, 1}, + {"BPF_T_FORMAT_MASK", Const, 1}, + {"BPF_T_MICROTIME", Const, 1}, + {"BPF_T_MICROTIME_FAST", Const, 1}, + {"BPF_T_MICROTIME_MONOTONIC", Const, 1}, + {"BPF_T_MICROTIME_MONOTONIC_FAST", Const, 1}, + {"BPF_T_MONOTONIC", Const, 1}, + {"BPF_T_MONOTONIC_FAST", Const, 1}, + {"BPF_T_NANOTIME", Const, 1}, + {"BPF_T_NANOTIME_FAST", Const, 1}, + {"BPF_T_NANOTIME_MONOTONIC", Const, 1}, + {"BPF_T_NANOTIME_MONOTONIC_FAST", Const, 1}, + {"BPF_T_NONE", Const, 1}, + {"BPF_T_NORMAL", Const, 1}, + {"BPF_W", Const, 0}, + {"BPF_X", Const, 0}, + {"BRKINT", Const, 0}, + {"Bind", Func, 0}, + {"BindToDevice", Func, 0}, + {"BpfBuflen", Func, 0}, + {"BpfDatalink", Func, 0}, + {"BpfHdr", Type, 0}, + {"BpfHdr.Caplen", Field, 0}, + {"BpfHdr.Datalen", Field, 0}, + {"BpfHdr.Hdrlen", Field, 0}, + {"BpfHdr.Pad_cgo_0", Field, 0}, + {"BpfHdr.Tstamp", Field, 0}, + {"BpfHeadercmpl", Func, 0}, + {"BpfInsn", Type, 0}, + {"BpfInsn.Code", Field, 0}, + {"BpfInsn.Jf", Field, 0}, + {"BpfInsn.Jt", Field, 0}, + {"BpfInsn.K", Field, 0}, + {"BpfInterface", Func, 0}, + {"BpfJump", Func, 0}, + {"BpfProgram", Type, 0}, + {"BpfProgram.Insns", Field, 0}, + {"BpfProgram.Len", Field, 0}, + {"BpfProgram.Pad_cgo_0", Field, 0}, + {"BpfStat", Type, 0}, + {"BpfStat.Capt", Field, 2}, + {"BpfStat.Drop", Field, 0}, + {"BpfStat.Padding", Field, 2}, + {"BpfStat.Recv", Field, 0}, + {"BpfStats", Func, 0}, + {"BpfStmt", Func, 0}, + {"BpfTimeout", Func, 0}, + {"BpfTimeval", Type, 2}, + {"BpfTimeval.Sec", Field, 2}, + {"BpfTimeval.Usec", Field, 2}, + {"BpfVersion", Type, 0}, + {"BpfVersion.Major", Field, 0}, + {"BpfVersion.Minor", Field, 0}, + {"BpfZbuf", Type, 0}, + {"BpfZbuf.Bufa", Field, 0}, + {"BpfZbuf.Bufb", Field, 0}, + {"BpfZbuf.Buflen", Field, 0}, + {"BpfZbufHeader", Type, 0}, + {"BpfZbufHeader.Kernel_gen", Field, 0}, + {"BpfZbufHeader.Kernel_len", Field, 0}, + {"BpfZbufHeader.User_gen", Field, 0}, + {"BpfZbufHeader.X_bzh_pad", Field, 0}, + {"ByHandleFileInformation", Type, 0}, + {"ByHandleFileInformation.CreationTime", Field, 0}, + {"ByHandleFileInformation.FileAttributes", Field, 0}, + {"ByHandleFileInformation.FileIndexHigh", Field, 0}, + {"ByHandleFileInformation.FileIndexLow", Field, 0}, + {"ByHandleFileInformation.FileSizeHigh", Field, 0}, + {"ByHandleFileInformation.FileSizeLow", Field, 0}, + {"ByHandleFileInformation.LastAccessTime", Field, 0}, + {"ByHandleFileInformation.LastWriteTime", Field, 0}, + {"ByHandleFileInformation.NumberOfLinks", Field, 0}, + {"ByHandleFileInformation.VolumeSerialNumber", Field, 0}, + {"BytePtrFromString", Func, 1}, + {"ByteSliceFromString", Func, 1}, + {"CCR0_FLUSH", Const, 1}, + {"CERT_CHAIN_POLICY_AUTHENTICODE", Const, 0}, + {"CERT_CHAIN_POLICY_AUTHENTICODE_TS", Const, 0}, + {"CERT_CHAIN_POLICY_BASE", Const, 0}, + {"CERT_CHAIN_POLICY_BASIC_CONSTRAINTS", Const, 0}, + {"CERT_CHAIN_POLICY_EV", Const, 0}, + {"CERT_CHAIN_POLICY_MICROSOFT_ROOT", Const, 0}, + {"CERT_CHAIN_POLICY_NT_AUTH", Const, 0}, + {"CERT_CHAIN_POLICY_SSL", Const, 0}, + {"CERT_E_CN_NO_MATCH", Const, 0}, + {"CERT_E_EXPIRED", Const, 0}, + {"CERT_E_PURPOSE", Const, 0}, + {"CERT_E_ROLE", Const, 0}, + {"CERT_E_UNTRUSTEDROOT", Const, 0}, + {"CERT_STORE_ADD_ALWAYS", Const, 0}, + {"CERT_STORE_DEFER_CLOSE_UNTIL_LAST_FREE_FLAG", Const, 0}, + {"CERT_STORE_PROV_MEMORY", Const, 0}, + {"CERT_TRUST_HAS_EXCLUDED_NAME_CONSTRAINT", Const, 0}, + {"CERT_TRUST_HAS_NOT_DEFINED_NAME_CONSTRAINT", Const, 0}, + {"CERT_TRUST_HAS_NOT_PERMITTED_NAME_CONSTRAINT", Const, 0}, + {"CERT_TRUST_HAS_NOT_SUPPORTED_CRITICAL_EXT", Const, 0}, + {"CERT_TRUST_HAS_NOT_SUPPORTED_NAME_CONSTRAINT", Const, 0}, + {"CERT_TRUST_INVALID_BASIC_CONSTRAINTS", Const, 0}, + {"CERT_TRUST_INVALID_EXTENSION", Const, 0}, + {"CERT_TRUST_INVALID_NAME_CONSTRAINTS", Const, 0}, + {"CERT_TRUST_INVALID_POLICY_CONSTRAINTS", Const, 0}, + {"CERT_TRUST_IS_CYCLIC", Const, 0}, + {"CERT_TRUST_IS_EXPLICIT_DISTRUST", Const, 0}, + {"CERT_TRUST_IS_NOT_SIGNATURE_VALID", Const, 0}, + {"CERT_TRUST_IS_NOT_TIME_VALID", Const, 0}, + {"CERT_TRUST_IS_NOT_VALID_FOR_USAGE", Const, 0}, + {"CERT_TRUST_IS_OFFLINE_REVOCATION", Const, 0}, + {"CERT_TRUST_IS_REVOKED", Const, 0}, + {"CERT_TRUST_IS_UNTRUSTED_ROOT", Const, 0}, + {"CERT_TRUST_NO_ERROR", Const, 0}, + {"CERT_TRUST_NO_ISSUANCE_CHAIN_POLICY", Const, 0}, + {"CERT_TRUST_REVOCATION_STATUS_UNKNOWN", Const, 0}, + {"CFLUSH", Const, 1}, + {"CLOCAL", Const, 0}, + {"CLONE_CHILD_CLEARTID", Const, 2}, + {"CLONE_CHILD_SETTID", Const, 2}, + {"CLONE_CLEAR_SIGHAND", Const, 20}, + {"CLONE_CSIGNAL", Const, 3}, + {"CLONE_DETACHED", Const, 2}, + {"CLONE_FILES", Const, 2}, + {"CLONE_FS", Const, 2}, + {"CLONE_INTO_CGROUP", Const, 20}, + {"CLONE_IO", Const, 2}, + {"CLONE_NEWCGROUP", Const, 20}, + {"CLONE_NEWIPC", Const, 2}, + {"CLONE_NEWNET", Const, 2}, + {"CLONE_NEWNS", Const, 2}, + {"CLONE_NEWPID", Const, 2}, + {"CLONE_NEWTIME", Const, 20}, + {"CLONE_NEWUSER", Const, 2}, + {"CLONE_NEWUTS", Const, 2}, + {"CLONE_PARENT", Const, 2}, + {"CLONE_PARENT_SETTID", Const, 2}, + {"CLONE_PID", Const, 3}, + {"CLONE_PIDFD", Const, 20}, + {"CLONE_PTRACE", Const, 2}, + {"CLONE_SETTLS", Const, 2}, + {"CLONE_SIGHAND", Const, 2}, + {"CLONE_SYSVSEM", Const, 2}, + {"CLONE_THREAD", Const, 2}, + {"CLONE_UNTRACED", Const, 2}, + {"CLONE_VFORK", Const, 2}, + {"CLONE_VM", Const, 2}, + {"CPUID_CFLUSH", Const, 1}, + {"CREAD", Const, 0}, + {"CREATE_ALWAYS", Const, 0}, + {"CREATE_NEW", Const, 0}, + {"CREATE_NEW_PROCESS_GROUP", Const, 1}, + {"CREATE_UNICODE_ENVIRONMENT", Const, 0}, + {"CRYPT_DEFAULT_CONTAINER_OPTIONAL", Const, 0}, + {"CRYPT_DELETEKEYSET", Const, 0}, + {"CRYPT_MACHINE_KEYSET", Const, 0}, + {"CRYPT_NEWKEYSET", Const, 0}, + {"CRYPT_SILENT", Const, 0}, + {"CRYPT_VERIFYCONTEXT", Const, 0}, + {"CS5", Const, 0}, + {"CS6", Const, 0}, + {"CS7", Const, 0}, + {"CS8", Const, 0}, + {"CSIZE", Const, 0}, + {"CSTART", Const, 1}, + {"CSTATUS", Const, 1}, + {"CSTOP", Const, 1}, + {"CSTOPB", Const, 0}, + {"CSUSP", Const, 1}, + {"CTL_MAXNAME", Const, 0}, + {"CTL_NET", Const, 0}, + {"CTL_QUERY", Const, 1}, + {"CTRL_BREAK_EVENT", Const, 1}, + {"CTRL_CLOSE_EVENT", Const, 14}, + {"CTRL_C_EVENT", Const, 1}, + {"CTRL_LOGOFF_EVENT", Const, 14}, + {"CTRL_SHUTDOWN_EVENT", Const, 14}, + {"CancelIo", Func, 0}, + {"CancelIoEx", Func, 1}, + {"CertAddCertificateContextToStore", Func, 0}, + {"CertChainContext", Type, 0}, + {"CertChainContext.ChainCount", Field, 0}, + {"CertChainContext.Chains", Field, 0}, + {"CertChainContext.HasRevocationFreshnessTime", Field, 0}, + {"CertChainContext.LowerQualityChainCount", Field, 0}, + {"CertChainContext.LowerQualityChains", Field, 0}, + {"CertChainContext.RevocationFreshnessTime", Field, 0}, + {"CertChainContext.Size", Field, 0}, + {"CertChainContext.TrustStatus", Field, 0}, + {"CertChainElement", Type, 0}, + {"CertChainElement.ApplicationUsage", Field, 0}, + {"CertChainElement.CertContext", Field, 0}, + {"CertChainElement.ExtendedErrorInfo", Field, 0}, + {"CertChainElement.IssuanceUsage", Field, 0}, + {"CertChainElement.RevocationInfo", Field, 0}, + {"CertChainElement.Size", Field, 0}, + {"CertChainElement.TrustStatus", Field, 0}, + {"CertChainPara", Type, 0}, + {"CertChainPara.CacheResync", Field, 0}, + {"CertChainPara.CheckRevocationFreshnessTime", Field, 0}, + {"CertChainPara.RequestedUsage", Field, 0}, + {"CertChainPara.RequstedIssuancePolicy", Field, 0}, + {"CertChainPara.RevocationFreshnessTime", Field, 0}, + {"CertChainPara.Size", Field, 0}, + {"CertChainPara.URLRetrievalTimeout", Field, 0}, + {"CertChainPolicyPara", Type, 0}, + {"CertChainPolicyPara.ExtraPolicyPara", Field, 0}, + {"CertChainPolicyPara.Flags", Field, 0}, + {"CertChainPolicyPara.Size", Field, 0}, + {"CertChainPolicyStatus", Type, 0}, + {"CertChainPolicyStatus.ChainIndex", Field, 0}, + {"CertChainPolicyStatus.ElementIndex", Field, 0}, + {"CertChainPolicyStatus.Error", Field, 0}, + {"CertChainPolicyStatus.ExtraPolicyStatus", Field, 0}, + {"CertChainPolicyStatus.Size", Field, 0}, + {"CertCloseStore", Func, 0}, + {"CertContext", Type, 0}, + {"CertContext.CertInfo", Field, 0}, + {"CertContext.EncodedCert", Field, 0}, + {"CertContext.EncodingType", Field, 0}, + {"CertContext.Length", Field, 0}, + {"CertContext.Store", Field, 0}, + {"CertCreateCertificateContext", Func, 0}, + {"CertEnhKeyUsage", Type, 0}, + {"CertEnhKeyUsage.Length", Field, 0}, + {"CertEnhKeyUsage.UsageIdentifiers", Field, 0}, + {"CertEnumCertificatesInStore", Func, 0}, + {"CertFreeCertificateChain", Func, 0}, + {"CertFreeCertificateContext", Func, 0}, + {"CertGetCertificateChain", Func, 0}, + {"CertInfo", Type, 11}, + {"CertOpenStore", Func, 0}, + {"CertOpenSystemStore", Func, 0}, + {"CertRevocationCrlInfo", Type, 11}, + {"CertRevocationInfo", Type, 0}, + {"CertRevocationInfo.CrlInfo", Field, 0}, + {"CertRevocationInfo.FreshnessTime", Field, 0}, + {"CertRevocationInfo.HasFreshnessTime", Field, 0}, + {"CertRevocationInfo.OidSpecificInfo", Field, 0}, + {"CertRevocationInfo.RevocationOid", Field, 0}, + {"CertRevocationInfo.RevocationResult", Field, 0}, + {"CertRevocationInfo.Size", Field, 0}, + {"CertSimpleChain", Type, 0}, + {"CertSimpleChain.Elements", Field, 0}, + {"CertSimpleChain.HasRevocationFreshnessTime", Field, 0}, + {"CertSimpleChain.NumElements", Field, 0}, + {"CertSimpleChain.RevocationFreshnessTime", Field, 0}, + {"CertSimpleChain.Size", Field, 0}, + {"CertSimpleChain.TrustListInfo", Field, 0}, + {"CertSimpleChain.TrustStatus", Field, 0}, + {"CertTrustListInfo", Type, 11}, + {"CertTrustStatus", Type, 0}, + {"CertTrustStatus.ErrorStatus", Field, 0}, + {"CertTrustStatus.InfoStatus", Field, 0}, + {"CertUsageMatch", Type, 0}, + {"CertUsageMatch.Type", Field, 0}, + {"CertUsageMatch.Usage", Field, 0}, + {"CertVerifyCertificateChainPolicy", Func, 0}, + {"Chdir", Func, 0}, + {"CheckBpfVersion", Func, 0}, + {"Chflags", Func, 0}, + {"Chmod", Func, 0}, + {"Chown", Func, 0}, + {"Chroot", Func, 0}, + {"Clearenv", Func, 0}, + {"Close", Func, 0}, + {"CloseHandle", Func, 0}, + {"CloseOnExec", Func, 0}, + {"Closesocket", Func, 0}, + {"CmsgLen", Func, 0}, + {"CmsgSpace", Func, 0}, + {"Cmsghdr", Type, 0}, + {"Cmsghdr.Len", Field, 0}, + {"Cmsghdr.Level", Field, 0}, + {"Cmsghdr.Type", Field, 0}, + {"Cmsghdr.X__cmsg_data", Field, 0}, + {"CommandLineToArgv", Func, 0}, + {"ComputerName", Func, 0}, + {"Conn", Type, 9}, + {"Connect", Func, 0}, + {"ConnectEx", Func, 1}, + {"ConvertSidToStringSid", Func, 0}, + {"ConvertStringSidToSid", Func, 0}, + {"CopySid", Func, 0}, + {"Creat", Func, 0}, + {"CreateDirectory", Func, 0}, + {"CreateFile", Func, 0}, + {"CreateFileMapping", Func, 0}, + {"CreateHardLink", Func, 4}, + {"CreateIoCompletionPort", Func, 0}, + {"CreatePipe", Func, 0}, + {"CreateProcess", Func, 0}, + {"CreateProcessAsUser", Func, 10}, + {"CreateSymbolicLink", Func, 4}, + {"CreateToolhelp32Snapshot", Func, 4}, + {"Credential", Type, 0}, + {"Credential.Gid", Field, 0}, + {"Credential.Groups", Field, 0}, + {"Credential.NoSetGroups", Field, 9}, + {"Credential.Uid", Field, 0}, + {"CryptAcquireContext", Func, 0}, + {"CryptGenRandom", Func, 0}, + {"CryptReleaseContext", Func, 0}, + {"DIOCBSFLUSH", Const, 1}, + {"DIOCOSFPFLUSH", Const, 1}, + {"DLL", Type, 0}, + {"DLL.Handle", Field, 0}, + {"DLL.Name", Field, 0}, + {"DLLError", Type, 0}, + {"DLLError.Err", Field, 0}, + {"DLLError.Msg", Field, 0}, + {"DLLError.ObjName", Field, 0}, + {"DLT_A429", Const, 0}, + {"DLT_A653_ICM", Const, 0}, + {"DLT_AIRONET_HEADER", Const, 0}, + {"DLT_AOS", Const, 1}, + {"DLT_APPLE_IP_OVER_IEEE1394", Const, 0}, + {"DLT_ARCNET", Const, 0}, + {"DLT_ARCNET_LINUX", Const, 0}, + {"DLT_ATM_CLIP", Const, 0}, + {"DLT_ATM_RFC1483", Const, 0}, + {"DLT_AURORA", Const, 0}, + {"DLT_AX25", Const, 0}, + {"DLT_AX25_KISS", Const, 0}, + {"DLT_BACNET_MS_TP", Const, 0}, + {"DLT_BLUETOOTH_HCI_H4", Const, 0}, + {"DLT_BLUETOOTH_HCI_H4_WITH_PHDR", Const, 0}, + {"DLT_CAN20B", Const, 0}, + {"DLT_CAN_SOCKETCAN", Const, 1}, + {"DLT_CHAOS", Const, 0}, + {"DLT_CHDLC", Const, 0}, + {"DLT_CISCO_IOS", Const, 0}, + {"DLT_C_HDLC", Const, 0}, + {"DLT_C_HDLC_WITH_DIR", Const, 0}, + {"DLT_DBUS", Const, 1}, + {"DLT_DECT", Const, 1}, + {"DLT_DOCSIS", Const, 0}, + {"DLT_DVB_CI", Const, 1}, + {"DLT_ECONET", Const, 0}, + {"DLT_EN10MB", Const, 0}, + {"DLT_EN3MB", Const, 0}, + {"DLT_ENC", Const, 0}, + {"DLT_ERF", Const, 0}, + {"DLT_ERF_ETH", Const, 0}, + {"DLT_ERF_POS", Const, 0}, + {"DLT_FC_2", Const, 1}, + {"DLT_FC_2_WITH_FRAME_DELIMS", Const, 1}, + {"DLT_FDDI", Const, 0}, + {"DLT_FLEXRAY", Const, 0}, + {"DLT_FRELAY", Const, 0}, + {"DLT_FRELAY_WITH_DIR", Const, 0}, + {"DLT_GCOM_SERIAL", Const, 0}, + {"DLT_GCOM_T1E1", Const, 0}, + {"DLT_GPF_F", Const, 0}, + {"DLT_GPF_T", Const, 0}, + {"DLT_GPRS_LLC", Const, 0}, + {"DLT_GSMTAP_ABIS", Const, 1}, + {"DLT_GSMTAP_UM", Const, 1}, + {"DLT_HDLC", Const, 1}, + {"DLT_HHDLC", Const, 0}, + {"DLT_HIPPI", Const, 1}, + {"DLT_IBM_SN", Const, 0}, + {"DLT_IBM_SP", Const, 0}, + {"DLT_IEEE802", Const, 0}, + {"DLT_IEEE802_11", Const, 0}, + {"DLT_IEEE802_11_RADIO", Const, 0}, + {"DLT_IEEE802_11_RADIO_AVS", Const, 0}, + {"DLT_IEEE802_15_4", Const, 0}, + {"DLT_IEEE802_15_4_LINUX", Const, 0}, + {"DLT_IEEE802_15_4_NOFCS", Const, 1}, + {"DLT_IEEE802_15_4_NONASK_PHY", Const, 0}, + {"DLT_IEEE802_16_MAC_CPS", Const, 0}, + {"DLT_IEEE802_16_MAC_CPS_RADIO", Const, 0}, + {"DLT_IPFILTER", Const, 0}, + {"DLT_IPMB", Const, 0}, + {"DLT_IPMB_LINUX", Const, 0}, + {"DLT_IPNET", Const, 1}, + {"DLT_IPOIB", Const, 1}, + {"DLT_IPV4", Const, 1}, + {"DLT_IPV6", Const, 1}, + {"DLT_IP_OVER_FC", Const, 0}, + {"DLT_JUNIPER_ATM1", Const, 0}, + {"DLT_JUNIPER_ATM2", Const, 0}, + {"DLT_JUNIPER_ATM_CEMIC", Const, 1}, + {"DLT_JUNIPER_CHDLC", Const, 0}, + {"DLT_JUNIPER_ES", Const, 0}, + {"DLT_JUNIPER_ETHER", Const, 0}, + {"DLT_JUNIPER_FIBRECHANNEL", Const, 1}, + {"DLT_JUNIPER_FRELAY", Const, 0}, + {"DLT_JUNIPER_GGSN", Const, 0}, + {"DLT_JUNIPER_ISM", Const, 0}, + {"DLT_JUNIPER_MFR", Const, 0}, + {"DLT_JUNIPER_MLFR", Const, 0}, + {"DLT_JUNIPER_MLPPP", Const, 0}, + {"DLT_JUNIPER_MONITOR", Const, 0}, + {"DLT_JUNIPER_PIC_PEER", Const, 0}, + {"DLT_JUNIPER_PPP", Const, 0}, + {"DLT_JUNIPER_PPPOE", Const, 0}, + {"DLT_JUNIPER_PPPOE_ATM", Const, 0}, + {"DLT_JUNIPER_SERVICES", Const, 0}, + {"DLT_JUNIPER_SRX_E2E", Const, 1}, + {"DLT_JUNIPER_ST", Const, 0}, + {"DLT_JUNIPER_VP", Const, 0}, + {"DLT_JUNIPER_VS", Const, 1}, + {"DLT_LAPB_WITH_DIR", Const, 0}, + {"DLT_LAPD", Const, 0}, + {"DLT_LIN", Const, 0}, + {"DLT_LINUX_EVDEV", Const, 1}, + {"DLT_LINUX_IRDA", Const, 0}, + {"DLT_LINUX_LAPD", Const, 0}, + {"DLT_LINUX_PPP_WITHDIRECTION", Const, 0}, + {"DLT_LINUX_SLL", Const, 0}, + {"DLT_LOOP", Const, 0}, + {"DLT_LTALK", Const, 0}, + {"DLT_MATCHING_MAX", Const, 1}, + {"DLT_MATCHING_MIN", Const, 1}, + {"DLT_MFR", Const, 0}, + {"DLT_MOST", Const, 0}, + {"DLT_MPEG_2_TS", Const, 1}, + {"DLT_MPLS", Const, 1}, + {"DLT_MTP2", Const, 0}, + {"DLT_MTP2_WITH_PHDR", Const, 0}, + {"DLT_MTP3", Const, 0}, + {"DLT_MUX27010", Const, 1}, + {"DLT_NETANALYZER", Const, 1}, + {"DLT_NETANALYZER_TRANSPARENT", Const, 1}, + {"DLT_NFC_LLCP", Const, 1}, + {"DLT_NFLOG", Const, 1}, + {"DLT_NG40", Const, 1}, + {"DLT_NULL", Const, 0}, + {"DLT_PCI_EXP", Const, 0}, + {"DLT_PFLOG", Const, 0}, + {"DLT_PFSYNC", Const, 0}, + {"DLT_PPI", Const, 0}, + {"DLT_PPP", Const, 0}, + {"DLT_PPP_BSDOS", Const, 0}, + {"DLT_PPP_ETHER", Const, 0}, + {"DLT_PPP_PPPD", Const, 0}, + {"DLT_PPP_SERIAL", Const, 0}, + {"DLT_PPP_WITH_DIR", Const, 0}, + {"DLT_PPP_WITH_DIRECTION", Const, 0}, + {"DLT_PRISM_HEADER", Const, 0}, + {"DLT_PRONET", Const, 0}, + {"DLT_RAIF1", Const, 0}, + {"DLT_RAW", Const, 0}, + {"DLT_RAWAF_MASK", Const, 1}, + {"DLT_RIO", Const, 0}, + {"DLT_SCCP", Const, 0}, + {"DLT_SITA", Const, 0}, + {"DLT_SLIP", Const, 0}, + {"DLT_SLIP_BSDOS", Const, 0}, + {"DLT_STANAG_5066_D_PDU", Const, 1}, + {"DLT_SUNATM", Const, 0}, + {"DLT_SYMANTEC_FIREWALL", Const, 0}, + {"DLT_TZSP", Const, 0}, + {"DLT_USB", Const, 0}, + {"DLT_USB_LINUX", Const, 0}, + {"DLT_USB_LINUX_MMAPPED", Const, 1}, + {"DLT_USER0", Const, 0}, + {"DLT_USER1", Const, 0}, + {"DLT_USER10", Const, 0}, + {"DLT_USER11", Const, 0}, + {"DLT_USER12", Const, 0}, + {"DLT_USER13", Const, 0}, + {"DLT_USER14", Const, 0}, + {"DLT_USER15", Const, 0}, + {"DLT_USER2", Const, 0}, + {"DLT_USER3", Const, 0}, + {"DLT_USER4", Const, 0}, + {"DLT_USER5", Const, 0}, + {"DLT_USER6", Const, 0}, + {"DLT_USER7", Const, 0}, + {"DLT_USER8", Const, 0}, + {"DLT_USER9", Const, 0}, + {"DLT_WIHART", Const, 1}, + {"DLT_X2E_SERIAL", Const, 0}, + {"DLT_X2E_XORAYA", Const, 0}, + {"DNSMXData", Type, 0}, + {"DNSMXData.NameExchange", Field, 0}, + {"DNSMXData.Pad", Field, 0}, + {"DNSMXData.Preference", Field, 0}, + {"DNSPTRData", Type, 0}, + {"DNSPTRData.Host", Field, 0}, + {"DNSRecord", Type, 0}, + {"DNSRecord.Data", Field, 0}, + {"DNSRecord.Dw", Field, 0}, + {"DNSRecord.Length", Field, 0}, + {"DNSRecord.Name", Field, 0}, + {"DNSRecord.Next", Field, 0}, + {"DNSRecord.Reserved", Field, 0}, + {"DNSRecord.Ttl", Field, 0}, + {"DNSRecord.Type", Field, 0}, + {"DNSSRVData", Type, 0}, + {"DNSSRVData.Pad", Field, 0}, + {"DNSSRVData.Port", Field, 0}, + {"DNSSRVData.Priority", Field, 0}, + {"DNSSRVData.Target", Field, 0}, + {"DNSSRVData.Weight", Field, 0}, + {"DNSTXTData", Type, 0}, + {"DNSTXTData.StringArray", Field, 0}, + {"DNSTXTData.StringCount", Field, 0}, + {"DNS_INFO_NO_RECORDS", Const, 4}, + {"DNS_TYPE_A", Const, 0}, + {"DNS_TYPE_A6", Const, 0}, + {"DNS_TYPE_AAAA", Const, 0}, + {"DNS_TYPE_ADDRS", Const, 0}, + {"DNS_TYPE_AFSDB", Const, 0}, + {"DNS_TYPE_ALL", Const, 0}, + {"DNS_TYPE_ANY", Const, 0}, + {"DNS_TYPE_ATMA", Const, 0}, + {"DNS_TYPE_AXFR", Const, 0}, + {"DNS_TYPE_CERT", Const, 0}, + {"DNS_TYPE_CNAME", Const, 0}, + {"DNS_TYPE_DHCID", Const, 0}, + {"DNS_TYPE_DNAME", Const, 0}, + {"DNS_TYPE_DNSKEY", Const, 0}, + {"DNS_TYPE_DS", Const, 0}, + {"DNS_TYPE_EID", Const, 0}, + {"DNS_TYPE_GID", Const, 0}, + {"DNS_TYPE_GPOS", Const, 0}, + {"DNS_TYPE_HINFO", Const, 0}, + {"DNS_TYPE_ISDN", Const, 0}, + {"DNS_TYPE_IXFR", Const, 0}, + {"DNS_TYPE_KEY", Const, 0}, + {"DNS_TYPE_KX", Const, 0}, + {"DNS_TYPE_LOC", Const, 0}, + {"DNS_TYPE_MAILA", Const, 0}, + {"DNS_TYPE_MAILB", Const, 0}, + {"DNS_TYPE_MB", Const, 0}, + {"DNS_TYPE_MD", Const, 0}, + {"DNS_TYPE_MF", Const, 0}, + {"DNS_TYPE_MG", Const, 0}, + {"DNS_TYPE_MINFO", Const, 0}, + {"DNS_TYPE_MR", Const, 0}, + {"DNS_TYPE_MX", Const, 0}, + {"DNS_TYPE_NAPTR", Const, 0}, + {"DNS_TYPE_NBSTAT", Const, 0}, + {"DNS_TYPE_NIMLOC", Const, 0}, + {"DNS_TYPE_NS", Const, 0}, + {"DNS_TYPE_NSAP", Const, 0}, + {"DNS_TYPE_NSAPPTR", Const, 0}, + {"DNS_TYPE_NSEC", Const, 0}, + {"DNS_TYPE_NULL", Const, 0}, + {"DNS_TYPE_NXT", Const, 0}, + {"DNS_TYPE_OPT", Const, 0}, + {"DNS_TYPE_PTR", Const, 0}, + {"DNS_TYPE_PX", Const, 0}, + {"DNS_TYPE_RP", Const, 0}, + {"DNS_TYPE_RRSIG", Const, 0}, + {"DNS_TYPE_RT", Const, 0}, + {"DNS_TYPE_SIG", Const, 0}, + {"DNS_TYPE_SINK", Const, 0}, + {"DNS_TYPE_SOA", Const, 0}, + {"DNS_TYPE_SRV", Const, 0}, + {"DNS_TYPE_TEXT", Const, 0}, + {"DNS_TYPE_TKEY", Const, 0}, + {"DNS_TYPE_TSIG", Const, 0}, + {"DNS_TYPE_UID", Const, 0}, + {"DNS_TYPE_UINFO", Const, 0}, + {"DNS_TYPE_UNSPEC", Const, 0}, + {"DNS_TYPE_WINS", Const, 0}, + {"DNS_TYPE_WINSR", Const, 0}, + {"DNS_TYPE_WKS", Const, 0}, + {"DNS_TYPE_X25", Const, 0}, + {"DT_BLK", Const, 0}, + {"DT_CHR", Const, 0}, + {"DT_DIR", Const, 0}, + {"DT_FIFO", Const, 0}, + {"DT_LNK", Const, 0}, + {"DT_REG", Const, 0}, + {"DT_SOCK", Const, 0}, + {"DT_UNKNOWN", Const, 0}, + {"DT_WHT", Const, 0}, + {"DUPLICATE_CLOSE_SOURCE", Const, 0}, + {"DUPLICATE_SAME_ACCESS", Const, 0}, + {"DeleteFile", Func, 0}, + {"DetachLsf", Func, 0}, + {"DeviceIoControl", Func, 4}, + {"Dirent", Type, 0}, + {"Dirent.Fileno", Field, 0}, + {"Dirent.Ino", Field, 0}, + {"Dirent.Name", Field, 0}, + {"Dirent.Namlen", Field, 0}, + {"Dirent.Off", Field, 0}, + {"Dirent.Pad0", Field, 12}, + {"Dirent.Pad1", Field, 12}, + {"Dirent.Pad_cgo_0", Field, 0}, + {"Dirent.Reclen", Field, 0}, + {"Dirent.Seekoff", Field, 0}, + {"Dirent.Type", Field, 0}, + {"Dirent.X__d_padding", Field, 3}, + {"DnsNameCompare", Func, 4}, + {"DnsQuery", Func, 0}, + {"DnsRecordListFree", Func, 0}, + {"DnsSectionAdditional", Const, 4}, + {"DnsSectionAnswer", Const, 4}, + {"DnsSectionAuthority", Const, 4}, + {"DnsSectionQuestion", Const, 4}, + {"Dup", Func, 0}, + {"Dup2", Func, 0}, + {"Dup3", Func, 2}, + {"DuplicateHandle", Func, 0}, + {"E2BIG", Const, 0}, + {"EACCES", Const, 0}, + {"EADDRINUSE", Const, 0}, + {"EADDRNOTAVAIL", Const, 0}, + {"EADV", Const, 0}, + {"EAFNOSUPPORT", Const, 0}, + {"EAGAIN", Const, 0}, + {"EALREADY", Const, 0}, + {"EAUTH", Const, 0}, + {"EBADARCH", Const, 0}, + {"EBADE", Const, 0}, + {"EBADEXEC", Const, 0}, + {"EBADF", Const, 0}, + {"EBADFD", Const, 0}, + {"EBADMACHO", Const, 0}, + {"EBADMSG", Const, 0}, + {"EBADR", Const, 0}, + {"EBADRPC", Const, 0}, + {"EBADRQC", Const, 0}, + {"EBADSLT", Const, 0}, + {"EBFONT", Const, 0}, + {"EBUSY", Const, 0}, + {"ECANCELED", Const, 0}, + {"ECAPMODE", Const, 1}, + {"ECHILD", Const, 0}, + {"ECHO", Const, 0}, + {"ECHOCTL", Const, 0}, + {"ECHOE", Const, 0}, + {"ECHOK", Const, 0}, + {"ECHOKE", Const, 0}, + {"ECHONL", Const, 0}, + {"ECHOPRT", Const, 0}, + {"ECHRNG", Const, 0}, + {"ECOMM", Const, 0}, + {"ECONNABORTED", Const, 0}, + {"ECONNREFUSED", Const, 0}, + {"ECONNRESET", Const, 0}, + {"EDEADLK", Const, 0}, + {"EDEADLOCK", Const, 0}, + {"EDESTADDRREQ", Const, 0}, + {"EDEVERR", Const, 0}, + {"EDOM", Const, 0}, + {"EDOOFUS", Const, 0}, + {"EDOTDOT", Const, 0}, + {"EDQUOT", Const, 0}, + {"EEXIST", Const, 0}, + {"EFAULT", Const, 0}, + {"EFBIG", Const, 0}, + {"EFER_LMA", Const, 1}, + {"EFER_LME", Const, 1}, + {"EFER_NXE", Const, 1}, + {"EFER_SCE", Const, 1}, + {"EFTYPE", Const, 0}, + {"EHOSTDOWN", Const, 0}, + {"EHOSTUNREACH", Const, 0}, + {"EHWPOISON", Const, 0}, + {"EIDRM", Const, 0}, + {"EILSEQ", Const, 0}, + {"EINPROGRESS", Const, 0}, + {"EINTR", Const, 0}, + {"EINVAL", Const, 0}, + {"EIO", Const, 0}, + {"EIPSEC", Const, 1}, + {"EISCONN", Const, 0}, + {"EISDIR", Const, 0}, + {"EISNAM", Const, 0}, + {"EKEYEXPIRED", Const, 0}, + {"EKEYREJECTED", Const, 0}, + {"EKEYREVOKED", Const, 0}, + {"EL2HLT", Const, 0}, + {"EL2NSYNC", Const, 0}, + {"EL3HLT", Const, 0}, + {"EL3RST", Const, 0}, + {"ELAST", Const, 0}, + {"ELF_NGREG", Const, 0}, + {"ELF_PRARGSZ", Const, 0}, + {"ELIBACC", Const, 0}, + {"ELIBBAD", Const, 0}, + {"ELIBEXEC", Const, 0}, + {"ELIBMAX", Const, 0}, + {"ELIBSCN", Const, 0}, + {"ELNRNG", Const, 0}, + {"ELOOP", Const, 0}, + {"EMEDIUMTYPE", Const, 0}, + {"EMFILE", Const, 0}, + {"EMLINK", Const, 0}, + {"EMSGSIZE", Const, 0}, + {"EMT_TAGOVF", Const, 1}, + {"EMULTIHOP", Const, 0}, + {"EMUL_ENABLED", Const, 1}, + {"EMUL_LINUX", Const, 1}, + {"EMUL_LINUX32", Const, 1}, + {"EMUL_MAXID", Const, 1}, + {"EMUL_NATIVE", Const, 1}, + {"ENAMETOOLONG", Const, 0}, + {"ENAVAIL", Const, 0}, + {"ENDRUNDISC", Const, 1}, + {"ENEEDAUTH", Const, 0}, + {"ENETDOWN", Const, 0}, + {"ENETRESET", Const, 0}, + {"ENETUNREACH", Const, 0}, + {"ENFILE", Const, 0}, + {"ENOANO", Const, 0}, + {"ENOATTR", Const, 0}, + {"ENOBUFS", Const, 0}, + {"ENOCSI", Const, 0}, + {"ENODATA", Const, 0}, + {"ENODEV", Const, 0}, + {"ENOENT", Const, 0}, + {"ENOEXEC", Const, 0}, + {"ENOKEY", Const, 0}, + {"ENOLCK", Const, 0}, + {"ENOLINK", Const, 0}, + {"ENOMEDIUM", Const, 0}, + {"ENOMEM", Const, 0}, + {"ENOMSG", Const, 0}, + {"ENONET", Const, 0}, + {"ENOPKG", Const, 0}, + {"ENOPOLICY", Const, 0}, + {"ENOPROTOOPT", Const, 0}, + {"ENOSPC", Const, 0}, + {"ENOSR", Const, 0}, + {"ENOSTR", Const, 0}, + {"ENOSYS", Const, 0}, + {"ENOTBLK", Const, 0}, + {"ENOTCAPABLE", Const, 0}, + {"ENOTCONN", Const, 0}, + {"ENOTDIR", Const, 0}, + {"ENOTEMPTY", Const, 0}, + {"ENOTNAM", Const, 0}, + {"ENOTRECOVERABLE", Const, 0}, + {"ENOTSOCK", Const, 0}, + {"ENOTSUP", Const, 0}, + {"ENOTTY", Const, 0}, + {"ENOTUNIQ", Const, 0}, + {"ENXIO", Const, 0}, + {"EN_SW_CTL_INF", Const, 1}, + {"EN_SW_CTL_PREC", Const, 1}, + {"EN_SW_CTL_ROUND", Const, 1}, + {"EN_SW_DATACHAIN", Const, 1}, + {"EN_SW_DENORM", Const, 1}, + {"EN_SW_INVOP", Const, 1}, + {"EN_SW_OVERFLOW", Const, 1}, + {"EN_SW_PRECLOSS", Const, 1}, + {"EN_SW_UNDERFLOW", Const, 1}, + {"EN_SW_ZERODIV", Const, 1}, + {"EOPNOTSUPP", Const, 0}, + {"EOVERFLOW", Const, 0}, + {"EOWNERDEAD", Const, 0}, + {"EPERM", Const, 0}, + {"EPFNOSUPPORT", Const, 0}, + {"EPIPE", Const, 0}, + {"EPOLLERR", Const, 0}, + {"EPOLLET", Const, 0}, + {"EPOLLHUP", Const, 0}, + {"EPOLLIN", Const, 0}, + {"EPOLLMSG", Const, 0}, + {"EPOLLONESHOT", Const, 0}, + {"EPOLLOUT", Const, 0}, + {"EPOLLPRI", Const, 0}, + {"EPOLLRDBAND", Const, 0}, + {"EPOLLRDHUP", Const, 0}, + {"EPOLLRDNORM", Const, 0}, + {"EPOLLWRBAND", Const, 0}, + {"EPOLLWRNORM", Const, 0}, + {"EPOLL_CLOEXEC", Const, 0}, + {"EPOLL_CTL_ADD", Const, 0}, + {"EPOLL_CTL_DEL", Const, 0}, + {"EPOLL_CTL_MOD", Const, 0}, + {"EPOLL_NONBLOCK", Const, 0}, + {"EPROCLIM", Const, 0}, + {"EPROCUNAVAIL", Const, 0}, + {"EPROGMISMATCH", Const, 0}, + {"EPROGUNAVAIL", Const, 0}, + {"EPROTO", Const, 0}, + {"EPROTONOSUPPORT", Const, 0}, + {"EPROTOTYPE", Const, 0}, + {"EPWROFF", Const, 0}, + {"EQFULL", Const, 16}, + {"ERANGE", Const, 0}, + {"EREMCHG", Const, 0}, + {"EREMOTE", Const, 0}, + {"EREMOTEIO", Const, 0}, + {"ERESTART", Const, 0}, + {"ERFKILL", Const, 0}, + {"EROFS", Const, 0}, + {"ERPCMISMATCH", Const, 0}, + {"ERROR_ACCESS_DENIED", Const, 0}, + {"ERROR_ALREADY_EXISTS", Const, 0}, + {"ERROR_BROKEN_PIPE", Const, 0}, + {"ERROR_BUFFER_OVERFLOW", Const, 0}, + {"ERROR_DIR_NOT_EMPTY", Const, 8}, + {"ERROR_ENVVAR_NOT_FOUND", Const, 0}, + {"ERROR_FILE_EXISTS", Const, 0}, + {"ERROR_FILE_NOT_FOUND", Const, 0}, + {"ERROR_HANDLE_EOF", Const, 2}, + {"ERROR_INSUFFICIENT_BUFFER", Const, 0}, + {"ERROR_IO_PENDING", Const, 0}, + {"ERROR_MOD_NOT_FOUND", Const, 0}, + {"ERROR_MORE_DATA", Const, 3}, + {"ERROR_NETNAME_DELETED", Const, 3}, + {"ERROR_NOT_FOUND", Const, 1}, + {"ERROR_NO_MORE_FILES", Const, 0}, + {"ERROR_OPERATION_ABORTED", Const, 0}, + {"ERROR_PATH_NOT_FOUND", Const, 0}, + {"ERROR_PRIVILEGE_NOT_HELD", Const, 4}, + {"ERROR_PROC_NOT_FOUND", Const, 0}, + {"ESHLIBVERS", Const, 0}, + {"ESHUTDOWN", Const, 0}, + {"ESOCKTNOSUPPORT", Const, 0}, + {"ESPIPE", Const, 0}, + {"ESRCH", Const, 0}, + {"ESRMNT", Const, 0}, + {"ESTALE", Const, 0}, + {"ESTRPIPE", Const, 0}, + {"ETHERCAP_JUMBO_MTU", Const, 1}, + {"ETHERCAP_VLAN_HWTAGGING", Const, 1}, + {"ETHERCAP_VLAN_MTU", Const, 1}, + {"ETHERMIN", Const, 1}, + {"ETHERMTU", Const, 1}, + {"ETHERMTU_JUMBO", Const, 1}, + {"ETHERTYPE_8023", Const, 1}, + {"ETHERTYPE_AARP", Const, 1}, + {"ETHERTYPE_ACCTON", Const, 1}, + {"ETHERTYPE_AEONIC", Const, 1}, + {"ETHERTYPE_ALPHA", Const, 1}, + {"ETHERTYPE_AMBER", Const, 1}, + {"ETHERTYPE_AMOEBA", Const, 1}, + {"ETHERTYPE_AOE", Const, 1}, + {"ETHERTYPE_APOLLO", Const, 1}, + {"ETHERTYPE_APOLLODOMAIN", Const, 1}, + {"ETHERTYPE_APPLETALK", Const, 1}, + {"ETHERTYPE_APPLITEK", Const, 1}, + {"ETHERTYPE_ARGONAUT", Const, 1}, + {"ETHERTYPE_ARP", Const, 1}, + {"ETHERTYPE_AT", Const, 1}, + {"ETHERTYPE_ATALK", Const, 1}, + {"ETHERTYPE_ATOMIC", Const, 1}, + {"ETHERTYPE_ATT", Const, 1}, + {"ETHERTYPE_ATTSTANFORD", Const, 1}, + {"ETHERTYPE_AUTOPHON", Const, 1}, + {"ETHERTYPE_AXIS", Const, 1}, + {"ETHERTYPE_BCLOOP", Const, 1}, + {"ETHERTYPE_BOFL", Const, 1}, + {"ETHERTYPE_CABLETRON", Const, 1}, + {"ETHERTYPE_CHAOS", Const, 1}, + {"ETHERTYPE_COMDESIGN", Const, 1}, + {"ETHERTYPE_COMPUGRAPHIC", Const, 1}, + {"ETHERTYPE_COUNTERPOINT", Const, 1}, + {"ETHERTYPE_CRONUS", Const, 1}, + {"ETHERTYPE_CRONUSVLN", Const, 1}, + {"ETHERTYPE_DCA", Const, 1}, + {"ETHERTYPE_DDE", Const, 1}, + {"ETHERTYPE_DEBNI", Const, 1}, + {"ETHERTYPE_DECAM", Const, 1}, + {"ETHERTYPE_DECCUST", Const, 1}, + {"ETHERTYPE_DECDIAG", Const, 1}, + {"ETHERTYPE_DECDNS", Const, 1}, + {"ETHERTYPE_DECDTS", Const, 1}, + {"ETHERTYPE_DECEXPER", Const, 1}, + {"ETHERTYPE_DECLAST", Const, 1}, + {"ETHERTYPE_DECLTM", Const, 1}, + {"ETHERTYPE_DECMUMPS", Const, 1}, + {"ETHERTYPE_DECNETBIOS", Const, 1}, + {"ETHERTYPE_DELTACON", Const, 1}, + {"ETHERTYPE_DIDDLE", Const, 1}, + {"ETHERTYPE_DLOG1", Const, 1}, + {"ETHERTYPE_DLOG2", Const, 1}, + {"ETHERTYPE_DN", Const, 1}, + {"ETHERTYPE_DOGFIGHT", Const, 1}, + {"ETHERTYPE_DSMD", Const, 1}, + {"ETHERTYPE_ECMA", Const, 1}, + {"ETHERTYPE_ENCRYPT", Const, 1}, + {"ETHERTYPE_ES", Const, 1}, + {"ETHERTYPE_EXCELAN", Const, 1}, + {"ETHERTYPE_EXPERDATA", Const, 1}, + {"ETHERTYPE_FLIP", Const, 1}, + {"ETHERTYPE_FLOWCONTROL", Const, 1}, + {"ETHERTYPE_FRARP", Const, 1}, + {"ETHERTYPE_GENDYN", Const, 1}, + {"ETHERTYPE_HAYES", Const, 1}, + {"ETHERTYPE_HIPPI_FP", Const, 1}, + {"ETHERTYPE_HITACHI", Const, 1}, + {"ETHERTYPE_HP", Const, 1}, + {"ETHERTYPE_IEEEPUP", Const, 1}, + {"ETHERTYPE_IEEEPUPAT", Const, 1}, + {"ETHERTYPE_IMLBL", Const, 1}, + {"ETHERTYPE_IMLBLDIAG", Const, 1}, + {"ETHERTYPE_IP", Const, 1}, + {"ETHERTYPE_IPAS", Const, 1}, + {"ETHERTYPE_IPV6", Const, 1}, + {"ETHERTYPE_IPX", Const, 1}, + {"ETHERTYPE_IPXNEW", Const, 1}, + {"ETHERTYPE_KALPANA", Const, 1}, + {"ETHERTYPE_LANBRIDGE", Const, 1}, + {"ETHERTYPE_LANPROBE", Const, 1}, + {"ETHERTYPE_LAT", Const, 1}, + {"ETHERTYPE_LBACK", Const, 1}, + {"ETHERTYPE_LITTLE", Const, 1}, + {"ETHERTYPE_LLDP", Const, 1}, + {"ETHERTYPE_LOGICRAFT", Const, 1}, + {"ETHERTYPE_LOOPBACK", Const, 1}, + {"ETHERTYPE_MATRA", Const, 1}, + {"ETHERTYPE_MAX", Const, 1}, + {"ETHERTYPE_MERIT", Const, 1}, + {"ETHERTYPE_MICP", Const, 1}, + {"ETHERTYPE_MOPDL", Const, 1}, + {"ETHERTYPE_MOPRC", Const, 1}, + {"ETHERTYPE_MOTOROLA", Const, 1}, + {"ETHERTYPE_MPLS", Const, 1}, + {"ETHERTYPE_MPLS_MCAST", Const, 1}, + {"ETHERTYPE_MUMPS", Const, 1}, + {"ETHERTYPE_NBPCC", Const, 1}, + {"ETHERTYPE_NBPCLAIM", Const, 1}, + {"ETHERTYPE_NBPCLREQ", Const, 1}, + {"ETHERTYPE_NBPCLRSP", Const, 1}, + {"ETHERTYPE_NBPCREQ", Const, 1}, + {"ETHERTYPE_NBPCRSP", Const, 1}, + {"ETHERTYPE_NBPDG", Const, 1}, + {"ETHERTYPE_NBPDGB", Const, 1}, + {"ETHERTYPE_NBPDLTE", Const, 1}, + {"ETHERTYPE_NBPRAR", Const, 1}, + {"ETHERTYPE_NBPRAS", Const, 1}, + {"ETHERTYPE_NBPRST", Const, 1}, + {"ETHERTYPE_NBPSCD", Const, 1}, + {"ETHERTYPE_NBPVCD", Const, 1}, + {"ETHERTYPE_NBS", Const, 1}, + {"ETHERTYPE_NCD", Const, 1}, + {"ETHERTYPE_NESTAR", Const, 1}, + {"ETHERTYPE_NETBEUI", Const, 1}, + {"ETHERTYPE_NOVELL", Const, 1}, + {"ETHERTYPE_NS", Const, 1}, + {"ETHERTYPE_NSAT", Const, 1}, + {"ETHERTYPE_NSCOMPAT", Const, 1}, + {"ETHERTYPE_NTRAILER", Const, 1}, + {"ETHERTYPE_OS9", Const, 1}, + {"ETHERTYPE_OS9NET", Const, 1}, + {"ETHERTYPE_PACER", Const, 1}, + {"ETHERTYPE_PAE", Const, 1}, + {"ETHERTYPE_PCS", Const, 1}, + {"ETHERTYPE_PLANNING", Const, 1}, + {"ETHERTYPE_PPP", Const, 1}, + {"ETHERTYPE_PPPOE", Const, 1}, + {"ETHERTYPE_PPPOEDISC", Const, 1}, + {"ETHERTYPE_PRIMENTS", Const, 1}, + {"ETHERTYPE_PUP", Const, 1}, + {"ETHERTYPE_PUPAT", Const, 1}, + {"ETHERTYPE_QINQ", Const, 1}, + {"ETHERTYPE_RACAL", Const, 1}, + {"ETHERTYPE_RATIONAL", Const, 1}, + {"ETHERTYPE_RAWFR", Const, 1}, + {"ETHERTYPE_RCL", Const, 1}, + {"ETHERTYPE_RDP", Const, 1}, + {"ETHERTYPE_RETIX", Const, 1}, + {"ETHERTYPE_REVARP", Const, 1}, + {"ETHERTYPE_SCA", Const, 1}, + {"ETHERTYPE_SECTRA", Const, 1}, + {"ETHERTYPE_SECUREDATA", Const, 1}, + {"ETHERTYPE_SGITW", Const, 1}, + {"ETHERTYPE_SG_BOUNCE", Const, 1}, + {"ETHERTYPE_SG_DIAG", Const, 1}, + {"ETHERTYPE_SG_NETGAMES", Const, 1}, + {"ETHERTYPE_SG_RESV", Const, 1}, + {"ETHERTYPE_SIMNET", Const, 1}, + {"ETHERTYPE_SLOW", Const, 1}, + {"ETHERTYPE_SLOWPROTOCOLS", Const, 1}, + {"ETHERTYPE_SNA", Const, 1}, + {"ETHERTYPE_SNMP", Const, 1}, + {"ETHERTYPE_SONIX", Const, 1}, + {"ETHERTYPE_SPIDER", Const, 1}, + {"ETHERTYPE_SPRITE", Const, 1}, + {"ETHERTYPE_STP", Const, 1}, + {"ETHERTYPE_TALARIS", Const, 1}, + {"ETHERTYPE_TALARISMC", Const, 1}, + {"ETHERTYPE_TCPCOMP", Const, 1}, + {"ETHERTYPE_TCPSM", Const, 1}, + {"ETHERTYPE_TEC", Const, 1}, + {"ETHERTYPE_TIGAN", Const, 1}, + {"ETHERTYPE_TRAIL", Const, 1}, + {"ETHERTYPE_TRANSETHER", Const, 1}, + {"ETHERTYPE_TYMSHARE", Const, 1}, + {"ETHERTYPE_UBBST", Const, 1}, + {"ETHERTYPE_UBDEBUG", Const, 1}, + {"ETHERTYPE_UBDIAGLOOP", Const, 1}, + {"ETHERTYPE_UBDL", Const, 1}, + {"ETHERTYPE_UBNIU", Const, 1}, + {"ETHERTYPE_UBNMC", Const, 1}, + {"ETHERTYPE_VALID", Const, 1}, + {"ETHERTYPE_VARIAN", Const, 1}, + {"ETHERTYPE_VAXELN", Const, 1}, + {"ETHERTYPE_VEECO", Const, 1}, + {"ETHERTYPE_VEXP", Const, 1}, + {"ETHERTYPE_VGLAB", Const, 1}, + {"ETHERTYPE_VINES", Const, 1}, + {"ETHERTYPE_VINESECHO", Const, 1}, + {"ETHERTYPE_VINESLOOP", Const, 1}, + {"ETHERTYPE_VITAL", Const, 1}, + {"ETHERTYPE_VLAN", Const, 1}, + {"ETHERTYPE_VLTLMAN", Const, 1}, + {"ETHERTYPE_VPROD", Const, 1}, + {"ETHERTYPE_VURESERVED", Const, 1}, + {"ETHERTYPE_WATERLOO", Const, 1}, + {"ETHERTYPE_WELLFLEET", Const, 1}, + {"ETHERTYPE_X25", Const, 1}, + {"ETHERTYPE_X75", Const, 1}, + {"ETHERTYPE_XNSSM", Const, 1}, + {"ETHERTYPE_XTP", Const, 1}, + {"ETHER_ADDR_LEN", Const, 1}, + {"ETHER_ALIGN", Const, 1}, + {"ETHER_CRC_LEN", Const, 1}, + {"ETHER_CRC_POLY_BE", Const, 1}, + {"ETHER_CRC_POLY_LE", Const, 1}, + {"ETHER_HDR_LEN", Const, 1}, + {"ETHER_MAX_DIX_LEN", Const, 1}, + {"ETHER_MAX_LEN", Const, 1}, + {"ETHER_MAX_LEN_JUMBO", Const, 1}, + {"ETHER_MIN_LEN", Const, 1}, + {"ETHER_PPPOE_ENCAP_LEN", Const, 1}, + {"ETHER_TYPE_LEN", Const, 1}, + {"ETHER_VLAN_ENCAP_LEN", Const, 1}, + {"ETH_P_1588", Const, 0}, + {"ETH_P_8021Q", Const, 0}, + {"ETH_P_802_2", Const, 0}, + {"ETH_P_802_3", Const, 0}, + {"ETH_P_AARP", Const, 0}, + {"ETH_P_ALL", Const, 0}, + {"ETH_P_AOE", Const, 0}, + {"ETH_P_ARCNET", Const, 0}, + {"ETH_P_ARP", Const, 0}, + {"ETH_P_ATALK", Const, 0}, + {"ETH_P_ATMFATE", Const, 0}, + {"ETH_P_ATMMPOA", Const, 0}, + {"ETH_P_AX25", Const, 0}, + {"ETH_P_BPQ", Const, 0}, + {"ETH_P_CAIF", Const, 0}, + {"ETH_P_CAN", Const, 0}, + {"ETH_P_CONTROL", Const, 0}, + {"ETH_P_CUST", Const, 0}, + {"ETH_P_DDCMP", Const, 0}, + {"ETH_P_DEC", Const, 0}, + {"ETH_P_DIAG", Const, 0}, + {"ETH_P_DNA_DL", Const, 0}, + {"ETH_P_DNA_RC", Const, 0}, + {"ETH_P_DNA_RT", Const, 0}, + {"ETH_P_DSA", Const, 0}, + {"ETH_P_ECONET", Const, 0}, + {"ETH_P_EDSA", Const, 0}, + {"ETH_P_FCOE", Const, 0}, + {"ETH_P_FIP", Const, 0}, + {"ETH_P_HDLC", Const, 0}, + {"ETH_P_IEEE802154", Const, 0}, + {"ETH_P_IEEEPUP", Const, 0}, + {"ETH_P_IEEEPUPAT", Const, 0}, + {"ETH_P_IP", Const, 0}, + {"ETH_P_IPV6", Const, 0}, + {"ETH_P_IPX", Const, 0}, + {"ETH_P_IRDA", Const, 0}, + {"ETH_P_LAT", Const, 0}, + {"ETH_P_LINK_CTL", Const, 0}, + {"ETH_P_LOCALTALK", Const, 0}, + {"ETH_P_LOOP", Const, 0}, + {"ETH_P_MOBITEX", Const, 0}, + {"ETH_P_MPLS_MC", Const, 0}, + {"ETH_P_MPLS_UC", Const, 0}, + {"ETH_P_PAE", Const, 0}, + {"ETH_P_PAUSE", Const, 0}, + {"ETH_P_PHONET", Const, 0}, + {"ETH_P_PPPTALK", Const, 0}, + {"ETH_P_PPP_DISC", Const, 0}, + {"ETH_P_PPP_MP", Const, 0}, + {"ETH_P_PPP_SES", Const, 0}, + {"ETH_P_PUP", Const, 0}, + {"ETH_P_PUPAT", Const, 0}, + {"ETH_P_RARP", Const, 0}, + {"ETH_P_SCA", Const, 0}, + {"ETH_P_SLOW", Const, 0}, + {"ETH_P_SNAP", Const, 0}, + {"ETH_P_TEB", Const, 0}, + {"ETH_P_TIPC", Const, 0}, + {"ETH_P_TRAILER", Const, 0}, + {"ETH_P_TR_802_2", Const, 0}, + {"ETH_P_WAN_PPP", Const, 0}, + {"ETH_P_WCCP", Const, 0}, + {"ETH_P_X25", Const, 0}, + {"ETIME", Const, 0}, + {"ETIMEDOUT", Const, 0}, + {"ETOOMANYREFS", Const, 0}, + {"ETXTBSY", Const, 0}, + {"EUCLEAN", Const, 0}, + {"EUNATCH", Const, 0}, + {"EUSERS", Const, 0}, + {"EVFILT_AIO", Const, 0}, + {"EVFILT_FS", Const, 0}, + {"EVFILT_LIO", Const, 0}, + {"EVFILT_MACHPORT", Const, 0}, + {"EVFILT_PROC", Const, 0}, + {"EVFILT_READ", Const, 0}, + {"EVFILT_SIGNAL", Const, 0}, + {"EVFILT_SYSCOUNT", Const, 0}, + {"EVFILT_THREADMARKER", Const, 0}, + {"EVFILT_TIMER", Const, 0}, + {"EVFILT_USER", Const, 0}, + {"EVFILT_VM", Const, 0}, + {"EVFILT_VNODE", Const, 0}, + {"EVFILT_WRITE", Const, 0}, + {"EV_ADD", Const, 0}, + {"EV_CLEAR", Const, 0}, + {"EV_DELETE", Const, 0}, + {"EV_DISABLE", Const, 0}, + {"EV_DISPATCH", Const, 0}, + {"EV_DROP", Const, 3}, + {"EV_ENABLE", Const, 0}, + {"EV_EOF", Const, 0}, + {"EV_ERROR", Const, 0}, + {"EV_FLAG0", Const, 0}, + {"EV_FLAG1", Const, 0}, + {"EV_ONESHOT", Const, 0}, + {"EV_OOBAND", Const, 0}, + {"EV_POLL", Const, 0}, + {"EV_RECEIPT", Const, 0}, + {"EV_SYSFLAGS", Const, 0}, + {"EWINDOWS", Const, 0}, + {"EWOULDBLOCK", Const, 0}, + {"EXDEV", Const, 0}, + {"EXFULL", Const, 0}, + {"EXTA", Const, 0}, + {"EXTB", Const, 0}, + {"EXTPROC", Const, 0}, + {"Environ", Func, 0}, + {"EpollCreate", Func, 0}, + {"EpollCreate1", Func, 0}, + {"EpollCtl", Func, 0}, + {"EpollEvent", Type, 0}, + {"EpollEvent.Events", Field, 0}, + {"EpollEvent.Fd", Field, 0}, + {"EpollEvent.Pad", Field, 0}, + {"EpollEvent.PadFd", Field, 0}, + {"EpollWait", Func, 0}, + {"Errno", Type, 0}, + {"EscapeArg", Func, 0}, + {"Exchangedata", Func, 0}, + {"Exec", Func, 0}, + {"Exit", Func, 0}, + {"ExitProcess", Func, 0}, + {"FD_CLOEXEC", Const, 0}, + {"FD_SETSIZE", Const, 0}, + {"FILE_ACTION_ADDED", Const, 0}, + {"FILE_ACTION_MODIFIED", Const, 0}, + {"FILE_ACTION_REMOVED", Const, 0}, + {"FILE_ACTION_RENAMED_NEW_NAME", Const, 0}, + {"FILE_ACTION_RENAMED_OLD_NAME", Const, 0}, + {"FILE_APPEND_DATA", Const, 0}, + {"FILE_ATTRIBUTE_ARCHIVE", Const, 0}, + {"FILE_ATTRIBUTE_DIRECTORY", Const, 0}, + {"FILE_ATTRIBUTE_HIDDEN", Const, 0}, + {"FILE_ATTRIBUTE_NORMAL", Const, 0}, + {"FILE_ATTRIBUTE_READONLY", Const, 0}, + {"FILE_ATTRIBUTE_REPARSE_POINT", Const, 4}, + {"FILE_ATTRIBUTE_SYSTEM", Const, 0}, + {"FILE_BEGIN", Const, 0}, + {"FILE_CURRENT", Const, 0}, + {"FILE_END", Const, 0}, + {"FILE_FLAG_BACKUP_SEMANTICS", Const, 0}, + {"FILE_FLAG_OPEN_REPARSE_POINT", Const, 4}, + {"FILE_FLAG_OVERLAPPED", Const, 0}, + {"FILE_LIST_DIRECTORY", Const, 0}, + {"FILE_MAP_COPY", Const, 0}, + {"FILE_MAP_EXECUTE", Const, 0}, + {"FILE_MAP_READ", Const, 0}, + {"FILE_MAP_WRITE", Const, 0}, + {"FILE_NOTIFY_CHANGE_ATTRIBUTES", Const, 0}, + {"FILE_NOTIFY_CHANGE_CREATION", Const, 0}, + {"FILE_NOTIFY_CHANGE_DIR_NAME", Const, 0}, + {"FILE_NOTIFY_CHANGE_FILE_NAME", Const, 0}, + {"FILE_NOTIFY_CHANGE_LAST_ACCESS", Const, 0}, + {"FILE_NOTIFY_CHANGE_LAST_WRITE", Const, 0}, + {"FILE_NOTIFY_CHANGE_SIZE", Const, 0}, + {"FILE_SHARE_DELETE", Const, 0}, + {"FILE_SHARE_READ", Const, 0}, + {"FILE_SHARE_WRITE", Const, 0}, + {"FILE_SKIP_COMPLETION_PORT_ON_SUCCESS", Const, 2}, + {"FILE_SKIP_SET_EVENT_ON_HANDLE", Const, 2}, + {"FILE_TYPE_CHAR", Const, 0}, + {"FILE_TYPE_DISK", Const, 0}, + {"FILE_TYPE_PIPE", Const, 0}, + {"FILE_TYPE_REMOTE", Const, 0}, + {"FILE_TYPE_UNKNOWN", Const, 0}, + {"FILE_WRITE_ATTRIBUTES", Const, 0}, + {"FLUSHO", Const, 0}, + {"FORMAT_MESSAGE_ALLOCATE_BUFFER", Const, 0}, + {"FORMAT_MESSAGE_ARGUMENT_ARRAY", Const, 0}, + {"FORMAT_MESSAGE_FROM_HMODULE", Const, 0}, + {"FORMAT_MESSAGE_FROM_STRING", Const, 0}, + {"FORMAT_MESSAGE_FROM_SYSTEM", Const, 0}, + {"FORMAT_MESSAGE_IGNORE_INSERTS", Const, 0}, + {"FORMAT_MESSAGE_MAX_WIDTH_MASK", Const, 0}, + {"FSCTL_GET_REPARSE_POINT", Const, 4}, + {"F_ADDFILESIGS", Const, 0}, + {"F_ADDSIGS", Const, 0}, + {"F_ALLOCATEALL", Const, 0}, + {"F_ALLOCATECONTIG", Const, 0}, + {"F_CANCEL", Const, 0}, + {"F_CHKCLEAN", Const, 0}, + {"F_CLOSEM", Const, 1}, + {"F_DUP2FD", Const, 0}, + {"F_DUP2FD_CLOEXEC", Const, 1}, + {"F_DUPFD", Const, 0}, + {"F_DUPFD_CLOEXEC", Const, 0}, + {"F_EXLCK", Const, 0}, + {"F_FINDSIGS", Const, 16}, + {"F_FLUSH_DATA", Const, 0}, + {"F_FREEZE_FS", Const, 0}, + {"F_FSCTL", Const, 1}, + {"F_FSDIRMASK", Const, 1}, + {"F_FSIN", Const, 1}, + {"F_FSINOUT", Const, 1}, + {"F_FSOUT", Const, 1}, + {"F_FSPRIV", Const, 1}, + {"F_FSVOID", Const, 1}, + {"F_FULLFSYNC", Const, 0}, + {"F_GETCODEDIR", Const, 16}, + {"F_GETFD", Const, 0}, + {"F_GETFL", Const, 0}, + {"F_GETLEASE", Const, 0}, + {"F_GETLK", Const, 0}, + {"F_GETLK64", Const, 0}, + {"F_GETLKPID", Const, 0}, + {"F_GETNOSIGPIPE", Const, 0}, + {"F_GETOWN", Const, 0}, + {"F_GETOWN_EX", Const, 0}, + {"F_GETPATH", Const, 0}, + {"F_GETPATH_MTMINFO", Const, 0}, + {"F_GETPIPE_SZ", Const, 0}, + {"F_GETPROTECTIONCLASS", Const, 0}, + {"F_GETPROTECTIONLEVEL", Const, 16}, + {"F_GETSIG", Const, 0}, + {"F_GLOBAL_NOCACHE", Const, 0}, + {"F_LOCK", Const, 0}, + {"F_LOG2PHYS", Const, 0}, + {"F_LOG2PHYS_EXT", Const, 0}, + {"F_MARKDEPENDENCY", Const, 0}, + {"F_MAXFD", Const, 1}, + {"F_NOCACHE", Const, 0}, + {"F_NODIRECT", Const, 0}, + {"F_NOTIFY", Const, 0}, + {"F_OGETLK", Const, 0}, + {"F_OK", Const, 0}, + {"F_OSETLK", Const, 0}, + {"F_OSETLKW", Const, 0}, + {"F_PARAM_MASK", Const, 1}, + {"F_PARAM_MAX", Const, 1}, + {"F_PATHPKG_CHECK", Const, 0}, + {"F_PEOFPOSMODE", Const, 0}, + {"F_PREALLOCATE", Const, 0}, + {"F_RDADVISE", Const, 0}, + {"F_RDAHEAD", Const, 0}, + {"F_RDLCK", Const, 0}, + {"F_READAHEAD", Const, 0}, + {"F_READBOOTSTRAP", Const, 0}, + {"F_SETBACKINGSTORE", Const, 0}, + {"F_SETFD", Const, 0}, + {"F_SETFL", Const, 0}, + {"F_SETLEASE", Const, 0}, + {"F_SETLK", Const, 0}, + {"F_SETLK64", Const, 0}, + {"F_SETLKW", Const, 0}, + {"F_SETLKW64", Const, 0}, + {"F_SETLKWTIMEOUT", Const, 16}, + {"F_SETLK_REMOTE", Const, 0}, + {"F_SETNOSIGPIPE", Const, 0}, + {"F_SETOWN", Const, 0}, + {"F_SETOWN_EX", Const, 0}, + {"F_SETPIPE_SZ", Const, 0}, + {"F_SETPROTECTIONCLASS", Const, 0}, + {"F_SETSIG", Const, 0}, + {"F_SETSIZE", Const, 0}, + {"F_SHLCK", Const, 0}, + {"F_SINGLE_WRITER", Const, 16}, + {"F_TEST", Const, 0}, + {"F_THAW_FS", Const, 0}, + {"F_TLOCK", Const, 0}, + {"F_TRANSCODEKEY", Const, 16}, + {"F_ULOCK", Const, 0}, + {"F_UNLCK", Const, 0}, + {"F_UNLCKSYS", Const, 0}, + {"F_VOLPOSMODE", Const, 0}, + {"F_WRITEBOOTSTRAP", Const, 0}, + {"F_WRLCK", Const, 0}, + {"Faccessat", Func, 0}, + {"Fallocate", Func, 0}, + {"Fbootstraptransfer_t", Type, 0}, + {"Fbootstraptransfer_t.Buffer", Field, 0}, + {"Fbootstraptransfer_t.Length", Field, 0}, + {"Fbootstraptransfer_t.Offset", Field, 0}, + {"Fchdir", Func, 0}, + {"Fchflags", Func, 0}, + {"Fchmod", Func, 0}, + {"Fchmodat", Func, 0}, + {"Fchown", Func, 0}, + {"Fchownat", Func, 0}, + {"FcntlFlock", Func, 3}, + {"FdSet", Type, 0}, + {"FdSet.Bits", Field, 0}, + {"FdSet.X__fds_bits", Field, 0}, + {"Fdatasync", Func, 0}, + {"FileNotifyInformation", Type, 0}, + {"FileNotifyInformation.Action", Field, 0}, + {"FileNotifyInformation.FileName", Field, 0}, + {"FileNotifyInformation.FileNameLength", Field, 0}, + {"FileNotifyInformation.NextEntryOffset", Field, 0}, + {"Filetime", Type, 0}, + {"Filetime.HighDateTime", Field, 0}, + {"Filetime.LowDateTime", Field, 0}, + {"FindClose", Func, 0}, + {"FindFirstFile", Func, 0}, + {"FindNextFile", Func, 0}, + {"Flock", Func, 0}, + {"Flock_t", Type, 0}, + {"Flock_t.Len", Field, 0}, + {"Flock_t.Pad_cgo_0", Field, 0}, + {"Flock_t.Pad_cgo_1", Field, 3}, + {"Flock_t.Pid", Field, 0}, + {"Flock_t.Start", Field, 0}, + {"Flock_t.Sysid", Field, 0}, + {"Flock_t.Type", Field, 0}, + {"Flock_t.Whence", Field, 0}, + {"FlushBpf", Func, 0}, + {"FlushFileBuffers", Func, 0}, + {"FlushViewOfFile", Func, 0}, + {"ForkExec", Func, 0}, + {"ForkLock", Var, 0}, + {"FormatMessage", Func, 0}, + {"Fpathconf", Func, 0}, + {"FreeAddrInfoW", Func, 1}, + {"FreeEnvironmentStrings", Func, 0}, + {"FreeLibrary", Func, 0}, + {"Fsid", Type, 0}, + {"Fsid.Val", Field, 0}, + {"Fsid.X__fsid_val", Field, 2}, + {"Fsid.X__val", Field, 0}, + {"Fstat", Func, 0}, + {"Fstatat", Func, 12}, + {"Fstatfs", Func, 0}, + {"Fstore_t", Type, 0}, + {"Fstore_t.Bytesalloc", Field, 0}, + {"Fstore_t.Flags", Field, 0}, + {"Fstore_t.Length", Field, 0}, + {"Fstore_t.Offset", Field, 0}, + {"Fstore_t.Posmode", Field, 0}, + {"Fsync", Func, 0}, + {"Ftruncate", Func, 0}, + {"FullPath", Func, 4}, + {"Futimes", Func, 0}, + {"Futimesat", Func, 0}, + {"GENERIC_ALL", Const, 0}, + {"GENERIC_EXECUTE", Const, 0}, + {"GENERIC_READ", Const, 0}, + {"GENERIC_WRITE", Const, 0}, + {"GUID", Type, 1}, + {"GUID.Data1", Field, 1}, + {"GUID.Data2", Field, 1}, + {"GUID.Data3", Field, 1}, + {"GUID.Data4", Field, 1}, + {"GetAcceptExSockaddrs", Func, 0}, + {"GetAdaptersInfo", Func, 0}, + {"GetAddrInfoW", Func, 1}, + {"GetCommandLine", Func, 0}, + {"GetComputerName", Func, 0}, + {"GetConsoleMode", Func, 1}, + {"GetCurrentDirectory", Func, 0}, + {"GetCurrentProcess", Func, 0}, + {"GetEnvironmentStrings", Func, 0}, + {"GetEnvironmentVariable", Func, 0}, + {"GetExitCodeProcess", Func, 0}, + {"GetFileAttributes", Func, 0}, + {"GetFileAttributesEx", Func, 0}, + {"GetFileExInfoStandard", Const, 0}, + {"GetFileExMaxInfoLevel", Const, 0}, + {"GetFileInformationByHandle", Func, 0}, + {"GetFileType", Func, 0}, + {"GetFullPathName", Func, 0}, + {"GetHostByName", Func, 0}, + {"GetIfEntry", Func, 0}, + {"GetLastError", Func, 0}, + {"GetLengthSid", Func, 0}, + {"GetLongPathName", Func, 0}, + {"GetProcAddress", Func, 0}, + {"GetProcessTimes", Func, 0}, + {"GetProtoByName", Func, 0}, + {"GetQueuedCompletionStatus", Func, 0}, + {"GetServByName", Func, 0}, + {"GetShortPathName", Func, 0}, + {"GetStartupInfo", Func, 0}, + {"GetStdHandle", Func, 0}, + {"GetSystemTimeAsFileTime", Func, 0}, + {"GetTempPath", Func, 0}, + {"GetTimeZoneInformation", Func, 0}, + {"GetTokenInformation", Func, 0}, + {"GetUserNameEx", Func, 0}, + {"GetUserProfileDirectory", Func, 0}, + {"GetVersion", Func, 0}, + {"Getcwd", Func, 0}, + {"Getdents", Func, 0}, + {"Getdirentries", Func, 0}, + {"Getdtablesize", Func, 0}, + {"Getegid", Func, 0}, + {"Getenv", Func, 0}, + {"Geteuid", Func, 0}, + {"Getfsstat", Func, 0}, + {"Getgid", Func, 0}, + {"Getgroups", Func, 0}, + {"Getpagesize", Func, 0}, + {"Getpeername", Func, 0}, + {"Getpgid", Func, 0}, + {"Getpgrp", Func, 0}, + {"Getpid", Func, 0}, + {"Getppid", Func, 0}, + {"Getpriority", Func, 0}, + {"Getrlimit", Func, 0}, + {"Getrusage", Func, 0}, + {"Getsid", Func, 0}, + {"Getsockname", Func, 0}, + {"Getsockopt", Func, 1}, + {"GetsockoptByte", Func, 0}, + {"GetsockoptICMPv6Filter", Func, 2}, + {"GetsockoptIPMreq", Func, 0}, + {"GetsockoptIPMreqn", Func, 0}, + {"GetsockoptIPv6MTUInfo", Func, 2}, + {"GetsockoptIPv6Mreq", Func, 0}, + {"GetsockoptInet4Addr", Func, 0}, + {"GetsockoptInt", Func, 0}, + {"GetsockoptUcred", Func, 1}, + {"Gettid", Func, 0}, + {"Gettimeofday", Func, 0}, + {"Getuid", Func, 0}, + {"Getwd", Func, 0}, + {"Getxattr", Func, 1}, + {"HANDLE_FLAG_INHERIT", Const, 0}, + {"HKEY_CLASSES_ROOT", Const, 0}, + {"HKEY_CURRENT_CONFIG", Const, 0}, + {"HKEY_CURRENT_USER", Const, 0}, + {"HKEY_DYN_DATA", Const, 0}, + {"HKEY_LOCAL_MACHINE", Const, 0}, + {"HKEY_PERFORMANCE_DATA", Const, 0}, + {"HKEY_USERS", Const, 0}, + {"HUPCL", Const, 0}, + {"Handle", Type, 0}, + {"Hostent", Type, 0}, + {"Hostent.AddrList", Field, 0}, + {"Hostent.AddrType", Field, 0}, + {"Hostent.Aliases", Field, 0}, + {"Hostent.Length", Field, 0}, + {"Hostent.Name", Field, 0}, + {"ICANON", Const, 0}, + {"ICMP6_FILTER", Const, 2}, + {"ICMPV6_FILTER", Const, 2}, + {"ICMPv6Filter", Type, 2}, + {"ICMPv6Filter.Data", Field, 2}, + {"ICMPv6Filter.Filt", Field, 2}, + {"ICRNL", Const, 0}, + {"IEXTEN", Const, 0}, + {"IFAN_ARRIVAL", Const, 1}, + {"IFAN_DEPARTURE", Const, 1}, + {"IFA_ADDRESS", Const, 0}, + {"IFA_ANYCAST", Const, 0}, + {"IFA_BROADCAST", Const, 0}, + {"IFA_CACHEINFO", Const, 0}, + {"IFA_F_DADFAILED", Const, 0}, + {"IFA_F_DEPRECATED", Const, 0}, + {"IFA_F_HOMEADDRESS", Const, 0}, + {"IFA_F_NODAD", Const, 0}, + {"IFA_F_OPTIMISTIC", Const, 0}, + {"IFA_F_PERMANENT", Const, 0}, + {"IFA_F_SECONDARY", Const, 0}, + {"IFA_F_TEMPORARY", Const, 0}, + {"IFA_F_TENTATIVE", Const, 0}, + {"IFA_LABEL", Const, 0}, + {"IFA_LOCAL", Const, 0}, + {"IFA_MAX", Const, 0}, + {"IFA_MULTICAST", Const, 0}, + {"IFA_ROUTE", Const, 1}, + {"IFA_UNSPEC", Const, 0}, + {"IFF_ALLMULTI", Const, 0}, + {"IFF_ALTPHYS", Const, 0}, + {"IFF_AUTOMEDIA", Const, 0}, + {"IFF_BROADCAST", Const, 0}, + {"IFF_CANTCHANGE", Const, 0}, + {"IFF_CANTCONFIG", Const, 1}, + {"IFF_DEBUG", Const, 0}, + {"IFF_DRV_OACTIVE", Const, 0}, + {"IFF_DRV_RUNNING", Const, 0}, + {"IFF_DYING", Const, 0}, + {"IFF_DYNAMIC", Const, 0}, + {"IFF_LINK0", Const, 0}, + {"IFF_LINK1", Const, 0}, + {"IFF_LINK2", Const, 0}, + {"IFF_LOOPBACK", Const, 0}, + {"IFF_MASTER", Const, 0}, + {"IFF_MONITOR", Const, 0}, + {"IFF_MULTICAST", Const, 0}, + {"IFF_NOARP", Const, 0}, + {"IFF_NOTRAILERS", Const, 0}, + {"IFF_NO_PI", Const, 0}, + {"IFF_OACTIVE", Const, 0}, + {"IFF_ONE_QUEUE", Const, 0}, + {"IFF_POINTOPOINT", Const, 0}, + {"IFF_POINTTOPOINT", Const, 0}, + {"IFF_PORTSEL", Const, 0}, + {"IFF_PPROMISC", Const, 0}, + {"IFF_PROMISC", Const, 0}, + {"IFF_RENAMING", Const, 0}, + {"IFF_RUNNING", Const, 0}, + {"IFF_SIMPLEX", Const, 0}, + {"IFF_SLAVE", Const, 0}, + {"IFF_SMART", Const, 0}, + {"IFF_STATICARP", Const, 0}, + {"IFF_TAP", Const, 0}, + {"IFF_TUN", Const, 0}, + {"IFF_TUN_EXCL", Const, 0}, + {"IFF_UP", Const, 0}, + {"IFF_VNET_HDR", Const, 0}, + {"IFLA_ADDRESS", Const, 0}, + {"IFLA_BROADCAST", Const, 0}, + {"IFLA_COST", Const, 0}, + {"IFLA_IFALIAS", Const, 0}, + {"IFLA_IFNAME", Const, 0}, + {"IFLA_LINK", Const, 0}, + {"IFLA_LINKINFO", Const, 0}, + {"IFLA_LINKMODE", Const, 0}, + {"IFLA_MAP", Const, 0}, + {"IFLA_MASTER", Const, 0}, + {"IFLA_MAX", Const, 0}, + {"IFLA_MTU", Const, 0}, + {"IFLA_NET_NS_PID", Const, 0}, + {"IFLA_OPERSTATE", Const, 0}, + {"IFLA_PRIORITY", Const, 0}, + {"IFLA_PROTINFO", Const, 0}, + {"IFLA_QDISC", Const, 0}, + {"IFLA_STATS", Const, 0}, + {"IFLA_TXQLEN", Const, 0}, + {"IFLA_UNSPEC", Const, 0}, + {"IFLA_WEIGHT", Const, 0}, + {"IFLA_WIRELESS", Const, 0}, + {"IFNAMSIZ", Const, 0}, + {"IFT_1822", Const, 0}, + {"IFT_A12MPPSWITCH", Const, 0}, + {"IFT_AAL2", Const, 0}, + {"IFT_AAL5", Const, 0}, + {"IFT_ADSL", Const, 0}, + {"IFT_AFLANE8023", Const, 0}, + {"IFT_AFLANE8025", Const, 0}, + {"IFT_ARAP", Const, 0}, + {"IFT_ARCNET", Const, 0}, + {"IFT_ARCNETPLUS", Const, 0}, + {"IFT_ASYNC", Const, 0}, + {"IFT_ATM", Const, 0}, + {"IFT_ATMDXI", Const, 0}, + {"IFT_ATMFUNI", Const, 0}, + {"IFT_ATMIMA", Const, 0}, + {"IFT_ATMLOGICAL", Const, 0}, + {"IFT_ATMRADIO", Const, 0}, + {"IFT_ATMSUBINTERFACE", Const, 0}, + {"IFT_ATMVCIENDPT", Const, 0}, + {"IFT_ATMVIRTUAL", Const, 0}, + {"IFT_BGPPOLICYACCOUNTING", Const, 0}, + {"IFT_BLUETOOTH", Const, 1}, + {"IFT_BRIDGE", Const, 0}, + {"IFT_BSC", Const, 0}, + {"IFT_CARP", Const, 0}, + {"IFT_CCTEMUL", Const, 0}, + {"IFT_CELLULAR", Const, 0}, + {"IFT_CEPT", Const, 0}, + {"IFT_CES", Const, 0}, + {"IFT_CHANNEL", Const, 0}, + {"IFT_CNR", Const, 0}, + {"IFT_COFFEE", Const, 0}, + {"IFT_COMPOSITELINK", Const, 0}, + {"IFT_DCN", Const, 0}, + {"IFT_DIGITALPOWERLINE", Const, 0}, + {"IFT_DIGITALWRAPPEROVERHEADCHANNEL", Const, 0}, + {"IFT_DLSW", Const, 0}, + {"IFT_DOCSCABLEDOWNSTREAM", Const, 0}, + {"IFT_DOCSCABLEMACLAYER", Const, 0}, + {"IFT_DOCSCABLEUPSTREAM", Const, 0}, + {"IFT_DOCSCABLEUPSTREAMCHANNEL", Const, 1}, + {"IFT_DS0", Const, 0}, + {"IFT_DS0BUNDLE", Const, 0}, + {"IFT_DS1FDL", Const, 0}, + {"IFT_DS3", Const, 0}, + {"IFT_DTM", Const, 0}, + {"IFT_DUMMY", Const, 1}, + {"IFT_DVBASILN", Const, 0}, + {"IFT_DVBASIOUT", Const, 0}, + {"IFT_DVBRCCDOWNSTREAM", Const, 0}, + {"IFT_DVBRCCMACLAYER", Const, 0}, + {"IFT_DVBRCCUPSTREAM", Const, 0}, + {"IFT_ECONET", Const, 1}, + {"IFT_ENC", Const, 0}, + {"IFT_EON", Const, 0}, + {"IFT_EPLRS", Const, 0}, + {"IFT_ESCON", Const, 0}, + {"IFT_ETHER", Const, 0}, + {"IFT_FAITH", Const, 0}, + {"IFT_FAST", Const, 0}, + {"IFT_FASTETHER", Const, 0}, + {"IFT_FASTETHERFX", Const, 0}, + {"IFT_FDDI", Const, 0}, + {"IFT_FIBRECHANNEL", Const, 0}, + {"IFT_FRAMERELAYINTERCONNECT", Const, 0}, + {"IFT_FRAMERELAYMPI", Const, 0}, + {"IFT_FRDLCIENDPT", Const, 0}, + {"IFT_FRELAY", Const, 0}, + {"IFT_FRELAYDCE", Const, 0}, + {"IFT_FRF16MFRBUNDLE", Const, 0}, + {"IFT_FRFORWARD", Const, 0}, + {"IFT_G703AT2MB", Const, 0}, + {"IFT_G703AT64K", Const, 0}, + {"IFT_GIF", Const, 0}, + {"IFT_GIGABITETHERNET", Const, 0}, + {"IFT_GR303IDT", Const, 0}, + {"IFT_GR303RDT", Const, 0}, + {"IFT_H323GATEKEEPER", Const, 0}, + {"IFT_H323PROXY", Const, 0}, + {"IFT_HDH1822", Const, 0}, + {"IFT_HDLC", Const, 0}, + {"IFT_HDSL2", Const, 0}, + {"IFT_HIPERLAN2", Const, 0}, + {"IFT_HIPPI", Const, 0}, + {"IFT_HIPPIINTERFACE", Const, 0}, + {"IFT_HOSTPAD", Const, 0}, + {"IFT_HSSI", Const, 0}, + {"IFT_HY", Const, 0}, + {"IFT_IBM370PARCHAN", Const, 0}, + {"IFT_IDSL", Const, 0}, + {"IFT_IEEE1394", Const, 0}, + {"IFT_IEEE80211", Const, 0}, + {"IFT_IEEE80212", Const, 0}, + {"IFT_IEEE8023ADLAG", Const, 0}, + {"IFT_IFGSN", Const, 0}, + {"IFT_IMT", Const, 0}, + {"IFT_INFINIBAND", Const, 1}, + {"IFT_INTERLEAVE", Const, 0}, + {"IFT_IP", Const, 0}, + {"IFT_IPFORWARD", Const, 0}, + {"IFT_IPOVERATM", Const, 0}, + {"IFT_IPOVERCDLC", Const, 0}, + {"IFT_IPOVERCLAW", Const, 0}, + {"IFT_IPSWITCH", Const, 0}, + {"IFT_IPXIP", Const, 0}, + {"IFT_ISDN", Const, 0}, + {"IFT_ISDNBASIC", Const, 0}, + {"IFT_ISDNPRIMARY", Const, 0}, + {"IFT_ISDNS", Const, 0}, + {"IFT_ISDNU", Const, 0}, + {"IFT_ISO88022LLC", Const, 0}, + {"IFT_ISO88023", Const, 0}, + {"IFT_ISO88024", Const, 0}, + {"IFT_ISO88025", Const, 0}, + {"IFT_ISO88025CRFPINT", Const, 0}, + {"IFT_ISO88025DTR", Const, 0}, + {"IFT_ISO88025FIBER", Const, 0}, + {"IFT_ISO88026", Const, 0}, + {"IFT_ISUP", Const, 0}, + {"IFT_L2VLAN", Const, 0}, + {"IFT_L3IPVLAN", Const, 0}, + {"IFT_L3IPXVLAN", Const, 0}, + {"IFT_LAPB", Const, 0}, + {"IFT_LAPD", Const, 0}, + {"IFT_LAPF", Const, 0}, + {"IFT_LINEGROUP", Const, 1}, + {"IFT_LOCALTALK", Const, 0}, + {"IFT_LOOP", Const, 0}, + {"IFT_MEDIAMAILOVERIP", Const, 0}, + {"IFT_MFSIGLINK", Const, 0}, + {"IFT_MIOX25", Const, 0}, + {"IFT_MODEM", Const, 0}, + {"IFT_MPC", Const, 0}, + {"IFT_MPLS", Const, 0}, + {"IFT_MPLSTUNNEL", Const, 0}, + {"IFT_MSDSL", Const, 0}, + {"IFT_MVL", Const, 0}, + {"IFT_MYRINET", Const, 0}, + {"IFT_NFAS", Const, 0}, + {"IFT_NSIP", Const, 0}, + {"IFT_OPTICALCHANNEL", Const, 0}, + {"IFT_OPTICALTRANSPORT", Const, 0}, + {"IFT_OTHER", Const, 0}, + {"IFT_P10", Const, 0}, + {"IFT_P80", Const, 0}, + {"IFT_PARA", Const, 0}, + {"IFT_PDP", Const, 0}, + {"IFT_PFLOG", Const, 0}, + {"IFT_PFLOW", Const, 1}, + {"IFT_PFSYNC", Const, 0}, + {"IFT_PLC", Const, 0}, + {"IFT_PON155", Const, 1}, + {"IFT_PON622", Const, 1}, + {"IFT_POS", Const, 0}, + {"IFT_PPP", Const, 0}, + {"IFT_PPPMULTILINKBUNDLE", Const, 0}, + {"IFT_PROPATM", Const, 1}, + {"IFT_PROPBWAP2MP", Const, 0}, + {"IFT_PROPCNLS", Const, 0}, + {"IFT_PROPDOCSWIRELESSDOWNSTREAM", Const, 0}, + {"IFT_PROPDOCSWIRELESSMACLAYER", Const, 0}, + {"IFT_PROPDOCSWIRELESSUPSTREAM", Const, 0}, + {"IFT_PROPMUX", Const, 0}, + {"IFT_PROPVIRTUAL", Const, 0}, + {"IFT_PROPWIRELESSP2P", Const, 0}, + {"IFT_PTPSERIAL", Const, 0}, + {"IFT_PVC", Const, 0}, + {"IFT_Q2931", Const, 1}, + {"IFT_QLLC", Const, 0}, + {"IFT_RADIOMAC", Const, 0}, + {"IFT_RADSL", Const, 0}, + {"IFT_REACHDSL", Const, 0}, + {"IFT_RFC1483", Const, 0}, + {"IFT_RS232", Const, 0}, + {"IFT_RSRB", Const, 0}, + {"IFT_SDLC", Const, 0}, + {"IFT_SDSL", Const, 0}, + {"IFT_SHDSL", Const, 0}, + {"IFT_SIP", Const, 0}, + {"IFT_SIPSIG", Const, 1}, + {"IFT_SIPTG", Const, 1}, + {"IFT_SLIP", Const, 0}, + {"IFT_SMDSDXI", Const, 0}, + {"IFT_SMDSICIP", Const, 0}, + {"IFT_SONET", Const, 0}, + {"IFT_SONETOVERHEADCHANNEL", Const, 0}, + {"IFT_SONETPATH", Const, 0}, + {"IFT_SONETVT", Const, 0}, + {"IFT_SRP", Const, 0}, + {"IFT_SS7SIGLINK", Const, 0}, + {"IFT_STACKTOSTACK", Const, 0}, + {"IFT_STARLAN", Const, 0}, + {"IFT_STF", Const, 0}, + {"IFT_T1", Const, 0}, + {"IFT_TDLC", Const, 0}, + {"IFT_TELINK", Const, 1}, + {"IFT_TERMPAD", Const, 0}, + {"IFT_TR008", Const, 0}, + {"IFT_TRANSPHDLC", Const, 0}, + {"IFT_TUNNEL", Const, 0}, + {"IFT_ULTRA", Const, 0}, + {"IFT_USB", Const, 0}, + {"IFT_V11", Const, 0}, + {"IFT_V35", Const, 0}, + {"IFT_V36", Const, 0}, + {"IFT_V37", Const, 0}, + {"IFT_VDSL", Const, 0}, + {"IFT_VIRTUALIPADDRESS", Const, 0}, + {"IFT_VIRTUALTG", Const, 1}, + {"IFT_VOICEDID", Const, 1}, + {"IFT_VOICEEM", Const, 0}, + {"IFT_VOICEEMFGD", Const, 1}, + {"IFT_VOICEENCAP", Const, 0}, + {"IFT_VOICEFGDEANA", Const, 1}, + {"IFT_VOICEFXO", Const, 0}, + {"IFT_VOICEFXS", Const, 0}, + {"IFT_VOICEOVERATM", Const, 0}, + {"IFT_VOICEOVERCABLE", Const, 1}, + {"IFT_VOICEOVERFRAMERELAY", Const, 0}, + {"IFT_VOICEOVERIP", Const, 0}, + {"IFT_X213", Const, 0}, + {"IFT_X25", Const, 0}, + {"IFT_X25DDN", Const, 0}, + {"IFT_X25HUNTGROUP", Const, 0}, + {"IFT_X25MLP", Const, 0}, + {"IFT_X25PLE", Const, 0}, + {"IFT_XETHER", Const, 0}, + {"IGNBRK", Const, 0}, + {"IGNCR", Const, 0}, + {"IGNORE", Const, 0}, + {"IGNPAR", Const, 0}, + {"IMAXBEL", Const, 0}, + {"INFINITE", Const, 0}, + {"INLCR", Const, 0}, + {"INPCK", Const, 0}, + {"INVALID_FILE_ATTRIBUTES", Const, 0}, + {"IN_ACCESS", Const, 0}, + {"IN_ALL_EVENTS", Const, 0}, + {"IN_ATTRIB", Const, 0}, + {"IN_CLASSA_HOST", Const, 0}, + {"IN_CLASSA_MAX", Const, 0}, + {"IN_CLASSA_NET", Const, 0}, + {"IN_CLASSA_NSHIFT", Const, 0}, + {"IN_CLASSB_HOST", Const, 0}, + {"IN_CLASSB_MAX", Const, 0}, + {"IN_CLASSB_NET", Const, 0}, + {"IN_CLASSB_NSHIFT", Const, 0}, + {"IN_CLASSC_HOST", Const, 0}, + {"IN_CLASSC_NET", Const, 0}, + {"IN_CLASSC_NSHIFT", Const, 0}, + {"IN_CLASSD_HOST", Const, 0}, + {"IN_CLASSD_NET", Const, 0}, + {"IN_CLASSD_NSHIFT", Const, 0}, + {"IN_CLOEXEC", Const, 0}, + {"IN_CLOSE", Const, 0}, + {"IN_CLOSE_NOWRITE", Const, 0}, + {"IN_CLOSE_WRITE", Const, 0}, + {"IN_CREATE", Const, 0}, + {"IN_DELETE", Const, 0}, + {"IN_DELETE_SELF", Const, 0}, + {"IN_DONT_FOLLOW", Const, 0}, + {"IN_EXCL_UNLINK", Const, 0}, + {"IN_IGNORED", Const, 0}, + {"IN_ISDIR", Const, 0}, + {"IN_LINKLOCALNETNUM", Const, 0}, + {"IN_LOOPBACKNET", Const, 0}, + {"IN_MASK_ADD", Const, 0}, + {"IN_MODIFY", Const, 0}, + {"IN_MOVE", Const, 0}, + {"IN_MOVED_FROM", Const, 0}, + {"IN_MOVED_TO", Const, 0}, + {"IN_MOVE_SELF", Const, 0}, + {"IN_NONBLOCK", Const, 0}, + {"IN_ONESHOT", Const, 0}, + {"IN_ONLYDIR", Const, 0}, + {"IN_OPEN", Const, 0}, + {"IN_Q_OVERFLOW", Const, 0}, + {"IN_RFC3021_HOST", Const, 1}, + {"IN_RFC3021_MASK", Const, 1}, + {"IN_RFC3021_NET", Const, 1}, + {"IN_RFC3021_NSHIFT", Const, 1}, + {"IN_UNMOUNT", Const, 0}, + {"IOC_IN", Const, 1}, + {"IOC_INOUT", Const, 1}, + {"IOC_OUT", Const, 1}, + {"IOC_VENDOR", Const, 3}, + {"IOC_WS2", Const, 1}, + {"IO_REPARSE_TAG_SYMLINK", Const, 4}, + {"IPMreq", Type, 0}, + {"IPMreq.Interface", Field, 0}, + {"IPMreq.Multiaddr", Field, 0}, + {"IPMreqn", Type, 0}, + {"IPMreqn.Address", Field, 0}, + {"IPMreqn.Ifindex", Field, 0}, + {"IPMreqn.Multiaddr", Field, 0}, + {"IPPROTO_3PC", Const, 0}, + {"IPPROTO_ADFS", Const, 0}, + {"IPPROTO_AH", Const, 0}, + {"IPPROTO_AHIP", Const, 0}, + {"IPPROTO_APES", Const, 0}, + {"IPPROTO_ARGUS", Const, 0}, + {"IPPROTO_AX25", Const, 0}, + {"IPPROTO_BHA", Const, 0}, + {"IPPROTO_BLT", Const, 0}, + {"IPPROTO_BRSATMON", Const, 0}, + {"IPPROTO_CARP", Const, 0}, + {"IPPROTO_CFTP", Const, 0}, + {"IPPROTO_CHAOS", Const, 0}, + {"IPPROTO_CMTP", Const, 0}, + {"IPPROTO_COMP", Const, 0}, + {"IPPROTO_CPHB", Const, 0}, + {"IPPROTO_CPNX", Const, 0}, + {"IPPROTO_DCCP", Const, 0}, + {"IPPROTO_DDP", Const, 0}, + {"IPPROTO_DGP", Const, 0}, + {"IPPROTO_DIVERT", Const, 0}, + {"IPPROTO_DIVERT_INIT", Const, 3}, + {"IPPROTO_DIVERT_RESP", Const, 3}, + {"IPPROTO_DONE", Const, 0}, + {"IPPROTO_DSTOPTS", Const, 0}, + {"IPPROTO_EGP", Const, 0}, + {"IPPROTO_EMCON", Const, 0}, + {"IPPROTO_ENCAP", Const, 0}, + {"IPPROTO_EON", Const, 0}, + {"IPPROTO_ESP", Const, 0}, + {"IPPROTO_ETHERIP", Const, 0}, + {"IPPROTO_FRAGMENT", Const, 0}, + {"IPPROTO_GGP", Const, 0}, + {"IPPROTO_GMTP", Const, 0}, + {"IPPROTO_GRE", Const, 0}, + {"IPPROTO_HELLO", Const, 0}, + {"IPPROTO_HMP", Const, 0}, + {"IPPROTO_HOPOPTS", Const, 0}, + {"IPPROTO_ICMP", Const, 0}, + {"IPPROTO_ICMPV6", Const, 0}, + {"IPPROTO_IDP", Const, 0}, + {"IPPROTO_IDPR", Const, 0}, + {"IPPROTO_IDRP", Const, 0}, + {"IPPROTO_IGMP", Const, 0}, + {"IPPROTO_IGP", Const, 0}, + {"IPPROTO_IGRP", Const, 0}, + {"IPPROTO_IL", Const, 0}, + {"IPPROTO_INLSP", Const, 0}, + {"IPPROTO_INP", Const, 0}, + {"IPPROTO_IP", Const, 0}, + {"IPPROTO_IPCOMP", Const, 0}, + {"IPPROTO_IPCV", Const, 0}, + {"IPPROTO_IPEIP", Const, 0}, + {"IPPROTO_IPIP", Const, 0}, + {"IPPROTO_IPPC", Const, 0}, + {"IPPROTO_IPV4", Const, 0}, + {"IPPROTO_IPV6", Const, 0}, + {"IPPROTO_IPV6_ICMP", Const, 1}, + {"IPPROTO_IRTP", Const, 0}, + {"IPPROTO_KRYPTOLAN", Const, 0}, + {"IPPROTO_LARP", Const, 0}, + {"IPPROTO_LEAF1", Const, 0}, + {"IPPROTO_LEAF2", Const, 0}, + {"IPPROTO_MAX", Const, 0}, + {"IPPROTO_MAXID", Const, 0}, + {"IPPROTO_MEAS", Const, 0}, + {"IPPROTO_MH", Const, 1}, + {"IPPROTO_MHRP", Const, 0}, + {"IPPROTO_MICP", Const, 0}, + {"IPPROTO_MOBILE", Const, 0}, + {"IPPROTO_MPLS", Const, 1}, + {"IPPROTO_MTP", Const, 0}, + {"IPPROTO_MUX", Const, 0}, + {"IPPROTO_ND", Const, 0}, + {"IPPROTO_NHRP", Const, 0}, + {"IPPROTO_NONE", Const, 0}, + {"IPPROTO_NSP", Const, 0}, + {"IPPROTO_NVPII", Const, 0}, + {"IPPROTO_OLD_DIVERT", Const, 0}, + {"IPPROTO_OSPFIGP", Const, 0}, + {"IPPROTO_PFSYNC", Const, 0}, + {"IPPROTO_PGM", Const, 0}, + {"IPPROTO_PIGP", Const, 0}, + {"IPPROTO_PIM", Const, 0}, + {"IPPROTO_PRM", Const, 0}, + {"IPPROTO_PUP", Const, 0}, + {"IPPROTO_PVP", Const, 0}, + {"IPPROTO_RAW", Const, 0}, + {"IPPROTO_RCCMON", Const, 0}, + {"IPPROTO_RDP", Const, 0}, + {"IPPROTO_ROUTING", Const, 0}, + {"IPPROTO_RSVP", Const, 0}, + {"IPPROTO_RVD", Const, 0}, + {"IPPROTO_SATEXPAK", Const, 0}, + {"IPPROTO_SATMON", Const, 0}, + {"IPPROTO_SCCSP", Const, 0}, + {"IPPROTO_SCTP", Const, 0}, + {"IPPROTO_SDRP", Const, 0}, + {"IPPROTO_SEND", Const, 1}, + {"IPPROTO_SEP", Const, 0}, + {"IPPROTO_SKIP", Const, 0}, + {"IPPROTO_SPACER", Const, 0}, + {"IPPROTO_SRPC", Const, 0}, + {"IPPROTO_ST", Const, 0}, + {"IPPROTO_SVMTP", Const, 0}, + {"IPPROTO_SWIPE", Const, 0}, + {"IPPROTO_TCF", Const, 0}, + {"IPPROTO_TCP", Const, 0}, + {"IPPROTO_TLSP", Const, 0}, + {"IPPROTO_TP", Const, 0}, + {"IPPROTO_TPXX", Const, 0}, + {"IPPROTO_TRUNK1", Const, 0}, + {"IPPROTO_TRUNK2", Const, 0}, + {"IPPROTO_TTP", Const, 0}, + {"IPPROTO_UDP", Const, 0}, + {"IPPROTO_UDPLITE", Const, 0}, + {"IPPROTO_VINES", Const, 0}, + {"IPPROTO_VISA", Const, 0}, + {"IPPROTO_VMTP", Const, 0}, + {"IPPROTO_VRRP", Const, 1}, + {"IPPROTO_WBEXPAK", Const, 0}, + {"IPPROTO_WBMON", Const, 0}, + {"IPPROTO_WSN", Const, 0}, + {"IPPROTO_XNET", Const, 0}, + {"IPPROTO_XTP", Const, 0}, + {"IPV6_2292DSTOPTS", Const, 0}, + {"IPV6_2292HOPLIMIT", Const, 0}, + {"IPV6_2292HOPOPTS", Const, 0}, + {"IPV6_2292NEXTHOP", Const, 0}, + {"IPV6_2292PKTINFO", Const, 0}, + {"IPV6_2292PKTOPTIONS", Const, 0}, + {"IPV6_2292RTHDR", Const, 0}, + {"IPV6_ADDRFORM", Const, 0}, + {"IPV6_ADD_MEMBERSHIP", Const, 0}, + {"IPV6_AUTHHDR", Const, 0}, + {"IPV6_AUTH_LEVEL", Const, 1}, + {"IPV6_AUTOFLOWLABEL", Const, 0}, + {"IPV6_BINDANY", Const, 0}, + {"IPV6_BINDV6ONLY", Const, 0}, + {"IPV6_BOUND_IF", Const, 0}, + {"IPV6_CHECKSUM", Const, 0}, + {"IPV6_DEFAULT_MULTICAST_HOPS", Const, 0}, + {"IPV6_DEFAULT_MULTICAST_LOOP", Const, 0}, + {"IPV6_DEFHLIM", Const, 0}, + {"IPV6_DONTFRAG", Const, 0}, + {"IPV6_DROP_MEMBERSHIP", Const, 0}, + {"IPV6_DSTOPTS", Const, 0}, + {"IPV6_ESP_NETWORK_LEVEL", Const, 1}, + {"IPV6_ESP_TRANS_LEVEL", Const, 1}, + {"IPV6_FAITH", Const, 0}, + {"IPV6_FLOWINFO_MASK", Const, 0}, + {"IPV6_FLOWLABEL_MASK", Const, 0}, + {"IPV6_FRAGTTL", Const, 0}, + {"IPV6_FW_ADD", Const, 0}, + {"IPV6_FW_DEL", Const, 0}, + {"IPV6_FW_FLUSH", Const, 0}, + {"IPV6_FW_GET", Const, 0}, + {"IPV6_FW_ZERO", Const, 0}, + {"IPV6_HLIMDEC", Const, 0}, + {"IPV6_HOPLIMIT", Const, 0}, + {"IPV6_HOPOPTS", Const, 0}, + {"IPV6_IPCOMP_LEVEL", Const, 1}, + {"IPV6_IPSEC_POLICY", Const, 0}, + {"IPV6_JOIN_ANYCAST", Const, 0}, + {"IPV6_JOIN_GROUP", Const, 0}, + {"IPV6_LEAVE_ANYCAST", Const, 0}, + {"IPV6_LEAVE_GROUP", Const, 0}, + {"IPV6_MAXHLIM", Const, 0}, + {"IPV6_MAXOPTHDR", Const, 0}, + {"IPV6_MAXPACKET", Const, 0}, + {"IPV6_MAX_GROUP_SRC_FILTER", Const, 0}, + {"IPV6_MAX_MEMBERSHIPS", Const, 0}, + {"IPV6_MAX_SOCK_SRC_FILTER", Const, 0}, + {"IPV6_MIN_MEMBERSHIPS", Const, 0}, + {"IPV6_MMTU", Const, 0}, + {"IPV6_MSFILTER", Const, 0}, + {"IPV6_MTU", Const, 0}, + {"IPV6_MTU_DISCOVER", Const, 0}, + {"IPV6_MULTICAST_HOPS", Const, 0}, + {"IPV6_MULTICAST_IF", Const, 0}, + {"IPV6_MULTICAST_LOOP", Const, 0}, + {"IPV6_NEXTHOP", Const, 0}, + {"IPV6_OPTIONS", Const, 1}, + {"IPV6_PATHMTU", Const, 0}, + {"IPV6_PIPEX", Const, 1}, + {"IPV6_PKTINFO", Const, 0}, + {"IPV6_PMTUDISC_DO", Const, 0}, + {"IPV6_PMTUDISC_DONT", Const, 0}, + {"IPV6_PMTUDISC_PROBE", Const, 0}, + {"IPV6_PMTUDISC_WANT", Const, 0}, + {"IPV6_PORTRANGE", Const, 0}, + {"IPV6_PORTRANGE_DEFAULT", Const, 0}, + {"IPV6_PORTRANGE_HIGH", Const, 0}, + {"IPV6_PORTRANGE_LOW", Const, 0}, + {"IPV6_PREFER_TEMPADDR", Const, 0}, + {"IPV6_RECVDSTOPTS", Const, 0}, + {"IPV6_RECVDSTPORT", Const, 3}, + {"IPV6_RECVERR", Const, 0}, + {"IPV6_RECVHOPLIMIT", Const, 0}, + {"IPV6_RECVHOPOPTS", Const, 0}, + {"IPV6_RECVPATHMTU", Const, 0}, + {"IPV6_RECVPKTINFO", Const, 0}, + {"IPV6_RECVRTHDR", Const, 0}, + {"IPV6_RECVTCLASS", Const, 0}, + {"IPV6_ROUTER_ALERT", Const, 0}, + {"IPV6_RTABLE", Const, 1}, + {"IPV6_RTHDR", Const, 0}, + {"IPV6_RTHDRDSTOPTS", Const, 0}, + {"IPV6_RTHDR_LOOSE", Const, 0}, + {"IPV6_RTHDR_STRICT", Const, 0}, + {"IPV6_RTHDR_TYPE_0", Const, 0}, + {"IPV6_RXDSTOPTS", Const, 0}, + {"IPV6_RXHOPOPTS", Const, 0}, + {"IPV6_SOCKOPT_RESERVED1", Const, 0}, + {"IPV6_TCLASS", Const, 0}, + {"IPV6_UNICAST_HOPS", Const, 0}, + {"IPV6_USE_MIN_MTU", Const, 0}, + {"IPV6_V6ONLY", Const, 0}, + {"IPV6_VERSION", Const, 0}, + {"IPV6_VERSION_MASK", Const, 0}, + {"IPV6_XFRM_POLICY", Const, 0}, + {"IP_ADD_MEMBERSHIP", Const, 0}, + {"IP_ADD_SOURCE_MEMBERSHIP", Const, 0}, + {"IP_AUTH_LEVEL", Const, 1}, + {"IP_BINDANY", Const, 0}, + {"IP_BLOCK_SOURCE", Const, 0}, + {"IP_BOUND_IF", Const, 0}, + {"IP_DEFAULT_MULTICAST_LOOP", Const, 0}, + {"IP_DEFAULT_MULTICAST_TTL", Const, 0}, + {"IP_DF", Const, 0}, + {"IP_DIVERTFL", Const, 3}, + {"IP_DONTFRAG", Const, 0}, + {"IP_DROP_MEMBERSHIP", Const, 0}, + {"IP_DROP_SOURCE_MEMBERSHIP", Const, 0}, + {"IP_DUMMYNET3", Const, 0}, + {"IP_DUMMYNET_CONFIGURE", Const, 0}, + {"IP_DUMMYNET_DEL", Const, 0}, + {"IP_DUMMYNET_FLUSH", Const, 0}, + {"IP_DUMMYNET_GET", Const, 0}, + {"IP_EF", Const, 1}, + {"IP_ERRORMTU", Const, 1}, + {"IP_ESP_NETWORK_LEVEL", Const, 1}, + {"IP_ESP_TRANS_LEVEL", Const, 1}, + {"IP_FAITH", Const, 0}, + {"IP_FREEBIND", Const, 0}, + {"IP_FW3", Const, 0}, + {"IP_FW_ADD", Const, 0}, + {"IP_FW_DEL", Const, 0}, + {"IP_FW_FLUSH", Const, 0}, + {"IP_FW_GET", Const, 0}, + {"IP_FW_NAT_CFG", Const, 0}, + {"IP_FW_NAT_DEL", Const, 0}, + {"IP_FW_NAT_GET_CONFIG", Const, 0}, + {"IP_FW_NAT_GET_LOG", Const, 0}, + {"IP_FW_RESETLOG", Const, 0}, + {"IP_FW_TABLE_ADD", Const, 0}, + {"IP_FW_TABLE_DEL", Const, 0}, + {"IP_FW_TABLE_FLUSH", Const, 0}, + {"IP_FW_TABLE_GETSIZE", Const, 0}, + {"IP_FW_TABLE_LIST", Const, 0}, + {"IP_FW_ZERO", Const, 0}, + {"IP_HDRINCL", Const, 0}, + {"IP_IPCOMP_LEVEL", Const, 1}, + {"IP_IPSECFLOWINFO", Const, 1}, + {"IP_IPSEC_LOCAL_AUTH", Const, 1}, + {"IP_IPSEC_LOCAL_CRED", Const, 1}, + {"IP_IPSEC_LOCAL_ID", Const, 1}, + {"IP_IPSEC_POLICY", Const, 0}, + {"IP_IPSEC_REMOTE_AUTH", Const, 1}, + {"IP_IPSEC_REMOTE_CRED", Const, 1}, + {"IP_IPSEC_REMOTE_ID", Const, 1}, + {"IP_MAXPACKET", Const, 0}, + {"IP_MAX_GROUP_SRC_FILTER", Const, 0}, + {"IP_MAX_MEMBERSHIPS", Const, 0}, + {"IP_MAX_SOCK_MUTE_FILTER", Const, 0}, + {"IP_MAX_SOCK_SRC_FILTER", Const, 0}, + {"IP_MAX_SOURCE_FILTER", Const, 0}, + {"IP_MF", Const, 0}, + {"IP_MINFRAGSIZE", Const, 1}, + {"IP_MINTTL", Const, 0}, + {"IP_MIN_MEMBERSHIPS", Const, 0}, + {"IP_MSFILTER", Const, 0}, + {"IP_MSS", Const, 0}, + {"IP_MTU", Const, 0}, + {"IP_MTU_DISCOVER", Const, 0}, + {"IP_MULTICAST_IF", Const, 0}, + {"IP_MULTICAST_IFINDEX", Const, 0}, + {"IP_MULTICAST_LOOP", Const, 0}, + {"IP_MULTICAST_TTL", Const, 0}, + {"IP_MULTICAST_VIF", Const, 0}, + {"IP_NAT__XXX", Const, 0}, + {"IP_OFFMASK", Const, 0}, + {"IP_OLD_FW_ADD", Const, 0}, + {"IP_OLD_FW_DEL", Const, 0}, + {"IP_OLD_FW_FLUSH", Const, 0}, + {"IP_OLD_FW_GET", Const, 0}, + {"IP_OLD_FW_RESETLOG", Const, 0}, + {"IP_OLD_FW_ZERO", Const, 0}, + {"IP_ONESBCAST", Const, 0}, + {"IP_OPTIONS", Const, 0}, + {"IP_ORIGDSTADDR", Const, 0}, + {"IP_PASSSEC", Const, 0}, + {"IP_PIPEX", Const, 1}, + {"IP_PKTINFO", Const, 0}, + {"IP_PKTOPTIONS", Const, 0}, + {"IP_PMTUDISC", Const, 0}, + {"IP_PMTUDISC_DO", Const, 0}, + {"IP_PMTUDISC_DONT", Const, 0}, + {"IP_PMTUDISC_PROBE", Const, 0}, + {"IP_PMTUDISC_WANT", Const, 0}, + {"IP_PORTRANGE", Const, 0}, + {"IP_PORTRANGE_DEFAULT", Const, 0}, + {"IP_PORTRANGE_HIGH", Const, 0}, + {"IP_PORTRANGE_LOW", Const, 0}, + {"IP_RECVDSTADDR", Const, 0}, + {"IP_RECVDSTPORT", Const, 1}, + {"IP_RECVERR", Const, 0}, + {"IP_RECVIF", Const, 0}, + {"IP_RECVOPTS", Const, 0}, + {"IP_RECVORIGDSTADDR", Const, 0}, + {"IP_RECVPKTINFO", Const, 0}, + {"IP_RECVRETOPTS", Const, 0}, + {"IP_RECVRTABLE", Const, 1}, + {"IP_RECVTOS", Const, 0}, + {"IP_RECVTTL", Const, 0}, + {"IP_RETOPTS", Const, 0}, + {"IP_RF", Const, 0}, + {"IP_ROUTER_ALERT", Const, 0}, + {"IP_RSVP_OFF", Const, 0}, + {"IP_RSVP_ON", Const, 0}, + {"IP_RSVP_VIF_OFF", Const, 0}, + {"IP_RSVP_VIF_ON", Const, 0}, + {"IP_RTABLE", Const, 1}, + {"IP_SENDSRCADDR", Const, 0}, + {"IP_STRIPHDR", Const, 0}, + {"IP_TOS", Const, 0}, + {"IP_TRAFFIC_MGT_BACKGROUND", Const, 0}, + {"IP_TRANSPARENT", Const, 0}, + {"IP_TTL", Const, 0}, + {"IP_UNBLOCK_SOURCE", Const, 0}, + {"IP_XFRM_POLICY", Const, 0}, + {"IPv6MTUInfo", Type, 2}, + {"IPv6MTUInfo.Addr", Field, 2}, + {"IPv6MTUInfo.Mtu", Field, 2}, + {"IPv6Mreq", Type, 0}, + {"IPv6Mreq.Interface", Field, 0}, + {"IPv6Mreq.Multiaddr", Field, 0}, + {"ISIG", Const, 0}, + {"ISTRIP", Const, 0}, + {"IUCLC", Const, 0}, + {"IUTF8", Const, 0}, + {"IXANY", Const, 0}, + {"IXOFF", Const, 0}, + {"IXON", Const, 0}, + {"IfAddrmsg", Type, 0}, + {"IfAddrmsg.Family", Field, 0}, + {"IfAddrmsg.Flags", Field, 0}, + {"IfAddrmsg.Index", Field, 0}, + {"IfAddrmsg.Prefixlen", Field, 0}, + {"IfAddrmsg.Scope", Field, 0}, + {"IfAnnounceMsghdr", Type, 1}, + {"IfAnnounceMsghdr.Hdrlen", Field, 2}, + {"IfAnnounceMsghdr.Index", Field, 1}, + {"IfAnnounceMsghdr.Msglen", Field, 1}, + {"IfAnnounceMsghdr.Name", Field, 1}, + {"IfAnnounceMsghdr.Type", Field, 1}, + {"IfAnnounceMsghdr.Version", Field, 1}, + {"IfAnnounceMsghdr.What", Field, 1}, + {"IfData", Type, 0}, + {"IfData.Addrlen", Field, 0}, + {"IfData.Baudrate", Field, 0}, + {"IfData.Capabilities", Field, 2}, + {"IfData.Collisions", Field, 0}, + {"IfData.Datalen", Field, 0}, + {"IfData.Epoch", Field, 0}, + {"IfData.Hdrlen", Field, 0}, + {"IfData.Hwassist", Field, 0}, + {"IfData.Ibytes", Field, 0}, + {"IfData.Ierrors", Field, 0}, + {"IfData.Imcasts", Field, 0}, + {"IfData.Ipackets", Field, 0}, + {"IfData.Iqdrops", Field, 0}, + {"IfData.Lastchange", Field, 0}, + {"IfData.Link_state", Field, 0}, + {"IfData.Mclpool", Field, 2}, + {"IfData.Metric", Field, 0}, + {"IfData.Mtu", Field, 0}, + {"IfData.Noproto", Field, 0}, + {"IfData.Obytes", Field, 0}, + {"IfData.Oerrors", Field, 0}, + {"IfData.Omcasts", Field, 0}, + {"IfData.Opackets", Field, 0}, + {"IfData.Pad", Field, 2}, + {"IfData.Pad_cgo_0", Field, 2}, + {"IfData.Pad_cgo_1", Field, 2}, + {"IfData.Physical", Field, 0}, + {"IfData.Recvquota", Field, 0}, + {"IfData.Recvtiming", Field, 0}, + {"IfData.Reserved1", Field, 0}, + {"IfData.Reserved2", Field, 0}, + {"IfData.Spare_char1", Field, 0}, + {"IfData.Spare_char2", Field, 0}, + {"IfData.Type", Field, 0}, + {"IfData.Typelen", Field, 0}, + {"IfData.Unused1", Field, 0}, + {"IfData.Unused2", Field, 0}, + {"IfData.Xmitquota", Field, 0}, + {"IfData.Xmittiming", Field, 0}, + {"IfInfomsg", Type, 0}, + {"IfInfomsg.Change", Field, 0}, + {"IfInfomsg.Family", Field, 0}, + {"IfInfomsg.Flags", Field, 0}, + {"IfInfomsg.Index", Field, 0}, + {"IfInfomsg.Type", Field, 0}, + {"IfInfomsg.X__ifi_pad", Field, 0}, + {"IfMsghdr", Type, 0}, + {"IfMsghdr.Addrs", Field, 0}, + {"IfMsghdr.Data", Field, 0}, + {"IfMsghdr.Flags", Field, 0}, + {"IfMsghdr.Hdrlen", Field, 2}, + {"IfMsghdr.Index", Field, 0}, + {"IfMsghdr.Msglen", Field, 0}, + {"IfMsghdr.Pad1", Field, 2}, + {"IfMsghdr.Pad2", Field, 2}, + {"IfMsghdr.Pad_cgo_0", Field, 0}, + {"IfMsghdr.Pad_cgo_1", Field, 2}, + {"IfMsghdr.Tableid", Field, 2}, + {"IfMsghdr.Type", Field, 0}, + {"IfMsghdr.Version", Field, 0}, + {"IfMsghdr.Xflags", Field, 2}, + {"IfaMsghdr", Type, 0}, + {"IfaMsghdr.Addrs", Field, 0}, + {"IfaMsghdr.Flags", Field, 0}, + {"IfaMsghdr.Hdrlen", Field, 2}, + {"IfaMsghdr.Index", Field, 0}, + {"IfaMsghdr.Metric", Field, 0}, + {"IfaMsghdr.Msglen", Field, 0}, + {"IfaMsghdr.Pad1", Field, 2}, + {"IfaMsghdr.Pad2", Field, 2}, + {"IfaMsghdr.Pad_cgo_0", Field, 0}, + {"IfaMsghdr.Tableid", Field, 2}, + {"IfaMsghdr.Type", Field, 0}, + {"IfaMsghdr.Version", Field, 0}, + {"IfmaMsghdr", Type, 0}, + {"IfmaMsghdr.Addrs", Field, 0}, + {"IfmaMsghdr.Flags", Field, 0}, + {"IfmaMsghdr.Index", Field, 0}, + {"IfmaMsghdr.Msglen", Field, 0}, + {"IfmaMsghdr.Pad_cgo_0", Field, 0}, + {"IfmaMsghdr.Type", Field, 0}, + {"IfmaMsghdr.Version", Field, 0}, + {"IfmaMsghdr2", Type, 0}, + {"IfmaMsghdr2.Addrs", Field, 0}, + {"IfmaMsghdr2.Flags", Field, 0}, + {"IfmaMsghdr2.Index", Field, 0}, + {"IfmaMsghdr2.Msglen", Field, 0}, + {"IfmaMsghdr2.Pad_cgo_0", Field, 0}, + {"IfmaMsghdr2.Refcount", Field, 0}, + {"IfmaMsghdr2.Type", Field, 0}, + {"IfmaMsghdr2.Version", Field, 0}, + {"ImplementsGetwd", Const, 0}, + {"Inet4Pktinfo", Type, 0}, + {"Inet4Pktinfo.Addr", Field, 0}, + {"Inet4Pktinfo.Ifindex", Field, 0}, + {"Inet4Pktinfo.Spec_dst", Field, 0}, + {"Inet6Pktinfo", Type, 0}, + {"Inet6Pktinfo.Addr", Field, 0}, + {"Inet6Pktinfo.Ifindex", Field, 0}, + {"InotifyAddWatch", Func, 0}, + {"InotifyEvent", Type, 0}, + {"InotifyEvent.Cookie", Field, 0}, + {"InotifyEvent.Len", Field, 0}, + {"InotifyEvent.Mask", Field, 0}, + {"InotifyEvent.Name", Field, 0}, + {"InotifyEvent.Wd", Field, 0}, + {"InotifyInit", Func, 0}, + {"InotifyInit1", Func, 0}, + {"InotifyRmWatch", Func, 0}, + {"InterfaceAddrMessage", Type, 0}, + {"InterfaceAddrMessage.Data", Field, 0}, + {"InterfaceAddrMessage.Header", Field, 0}, + {"InterfaceAnnounceMessage", Type, 1}, + {"InterfaceAnnounceMessage.Header", Field, 1}, + {"InterfaceInfo", Type, 0}, + {"InterfaceInfo.Address", Field, 0}, + {"InterfaceInfo.BroadcastAddress", Field, 0}, + {"InterfaceInfo.Flags", Field, 0}, + {"InterfaceInfo.Netmask", Field, 0}, + {"InterfaceMessage", Type, 0}, + {"InterfaceMessage.Data", Field, 0}, + {"InterfaceMessage.Header", Field, 0}, + {"InterfaceMulticastAddrMessage", Type, 0}, + {"InterfaceMulticastAddrMessage.Data", Field, 0}, + {"InterfaceMulticastAddrMessage.Header", Field, 0}, + {"InvalidHandle", Const, 0}, + {"Ioperm", Func, 0}, + {"Iopl", Func, 0}, + {"Iovec", Type, 0}, + {"Iovec.Base", Field, 0}, + {"Iovec.Len", Field, 0}, + {"IpAdapterInfo", Type, 0}, + {"IpAdapterInfo.AdapterName", Field, 0}, + {"IpAdapterInfo.Address", Field, 0}, + {"IpAdapterInfo.AddressLength", Field, 0}, + {"IpAdapterInfo.ComboIndex", Field, 0}, + {"IpAdapterInfo.CurrentIpAddress", Field, 0}, + {"IpAdapterInfo.Description", Field, 0}, + {"IpAdapterInfo.DhcpEnabled", Field, 0}, + {"IpAdapterInfo.DhcpServer", Field, 0}, + {"IpAdapterInfo.GatewayList", Field, 0}, + {"IpAdapterInfo.HaveWins", Field, 0}, + {"IpAdapterInfo.Index", Field, 0}, + {"IpAdapterInfo.IpAddressList", Field, 0}, + {"IpAdapterInfo.LeaseExpires", Field, 0}, + {"IpAdapterInfo.LeaseObtained", Field, 0}, + {"IpAdapterInfo.Next", Field, 0}, + {"IpAdapterInfo.PrimaryWinsServer", Field, 0}, + {"IpAdapterInfo.SecondaryWinsServer", Field, 0}, + {"IpAdapterInfo.Type", Field, 0}, + {"IpAddrString", Type, 0}, + {"IpAddrString.Context", Field, 0}, + {"IpAddrString.IpAddress", Field, 0}, + {"IpAddrString.IpMask", Field, 0}, + {"IpAddrString.Next", Field, 0}, + {"IpAddressString", Type, 0}, + {"IpAddressString.String", Field, 0}, + {"IpMaskString", Type, 0}, + {"IpMaskString.String", Field, 2}, + {"Issetugid", Func, 0}, + {"KEY_ALL_ACCESS", Const, 0}, + {"KEY_CREATE_LINK", Const, 0}, + {"KEY_CREATE_SUB_KEY", Const, 0}, + {"KEY_ENUMERATE_SUB_KEYS", Const, 0}, + {"KEY_EXECUTE", Const, 0}, + {"KEY_NOTIFY", Const, 0}, + {"KEY_QUERY_VALUE", Const, 0}, + {"KEY_READ", Const, 0}, + {"KEY_SET_VALUE", Const, 0}, + {"KEY_WOW64_32KEY", Const, 0}, + {"KEY_WOW64_64KEY", Const, 0}, + {"KEY_WRITE", Const, 0}, + {"Kevent", Func, 0}, + {"Kevent_t", Type, 0}, + {"Kevent_t.Data", Field, 0}, + {"Kevent_t.Fflags", Field, 0}, + {"Kevent_t.Filter", Field, 0}, + {"Kevent_t.Flags", Field, 0}, + {"Kevent_t.Ident", Field, 0}, + {"Kevent_t.Pad_cgo_0", Field, 2}, + {"Kevent_t.Udata", Field, 0}, + {"Kill", Func, 0}, + {"Klogctl", Func, 0}, + {"Kqueue", Func, 0}, + {"LANG_ENGLISH", Const, 0}, + {"LAYERED_PROTOCOL", Const, 2}, + {"LCNT_OVERLOAD_FLUSH", Const, 1}, + {"LINUX_REBOOT_CMD_CAD_OFF", Const, 0}, + {"LINUX_REBOOT_CMD_CAD_ON", Const, 0}, + {"LINUX_REBOOT_CMD_HALT", Const, 0}, + {"LINUX_REBOOT_CMD_KEXEC", Const, 0}, + {"LINUX_REBOOT_CMD_POWER_OFF", Const, 0}, + {"LINUX_REBOOT_CMD_RESTART", Const, 0}, + {"LINUX_REBOOT_CMD_RESTART2", Const, 0}, + {"LINUX_REBOOT_CMD_SW_SUSPEND", Const, 0}, + {"LINUX_REBOOT_MAGIC1", Const, 0}, + {"LINUX_REBOOT_MAGIC2", Const, 0}, + {"LOCK_EX", Const, 0}, + {"LOCK_NB", Const, 0}, + {"LOCK_SH", Const, 0}, + {"LOCK_UN", Const, 0}, + {"LazyDLL", Type, 0}, + {"LazyDLL.Name", Field, 0}, + {"LazyProc", Type, 0}, + {"LazyProc.Name", Field, 0}, + {"Lchown", Func, 0}, + {"Linger", Type, 0}, + {"Linger.Linger", Field, 0}, + {"Linger.Onoff", Field, 0}, + {"Link", Func, 0}, + {"Listen", Func, 0}, + {"Listxattr", Func, 1}, + {"LoadCancelIoEx", Func, 1}, + {"LoadConnectEx", Func, 1}, + {"LoadCreateSymbolicLink", Func, 4}, + {"LoadDLL", Func, 0}, + {"LoadGetAddrInfo", Func, 1}, + {"LoadLibrary", Func, 0}, + {"LoadSetFileCompletionNotificationModes", Func, 2}, + {"LocalFree", Func, 0}, + {"Log2phys_t", Type, 0}, + {"Log2phys_t.Contigbytes", Field, 0}, + {"Log2phys_t.Devoffset", Field, 0}, + {"Log2phys_t.Flags", Field, 0}, + {"LookupAccountName", Func, 0}, + {"LookupAccountSid", Func, 0}, + {"LookupSID", Func, 0}, + {"LsfJump", Func, 0}, + {"LsfSocket", Func, 0}, + {"LsfStmt", Func, 0}, + {"Lstat", Func, 0}, + {"MADV_AUTOSYNC", Const, 1}, + {"MADV_CAN_REUSE", Const, 0}, + {"MADV_CORE", Const, 1}, + {"MADV_DOFORK", Const, 0}, + {"MADV_DONTFORK", Const, 0}, + {"MADV_DONTNEED", Const, 0}, + {"MADV_FREE", Const, 0}, + {"MADV_FREE_REUSABLE", Const, 0}, + {"MADV_FREE_REUSE", Const, 0}, + {"MADV_HUGEPAGE", Const, 0}, + {"MADV_HWPOISON", Const, 0}, + {"MADV_MERGEABLE", Const, 0}, + {"MADV_NOCORE", Const, 1}, + {"MADV_NOHUGEPAGE", Const, 0}, + {"MADV_NORMAL", Const, 0}, + {"MADV_NOSYNC", Const, 1}, + {"MADV_PROTECT", Const, 1}, + {"MADV_RANDOM", Const, 0}, + {"MADV_REMOVE", Const, 0}, + {"MADV_SEQUENTIAL", Const, 0}, + {"MADV_SPACEAVAIL", Const, 3}, + {"MADV_UNMERGEABLE", Const, 0}, + {"MADV_WILLNEED", Const, 0}, + {"MADV_ZERO_WIRED_PAGES", Const, 0}, + {"MAP_32BIT", Const, 0}, + {"MAP_ALIGNED_SUPER", Const, 3}, + {"MAP_ALIGNMENT_16MB", Const, 3}, + {"MAP_ALIGNMENT_1TB", Const, 3}, + {"MAP_ALIGNMENT_256TB", Const, 3}, + {"MAP_ALIGNMENT_4GB", Const, 3}, + {"MAP_ALIGNMENT_64KB", Const, 3}, + {"MAP_ALIGNMENT_64PB", Const, 3}, + {"MAP_ALIGNMENT_MASK", Const, 3}, + {"MAP_ALIGNMENT_SHIFT", Const, 3}, + {"MAP_ANON", Const, 0}, + {"MAP_ANONYMOUS", Const, 0}, + {"MAP_COPY", Const, 0}, + {"MAP_DENYWRITE", Const, 0}, + {"MAP_EXECUTABLE", Const, 0}, + {"MAP_FILE", Const, 0}, + {"MAP_FIXED", Const, 0}, + {"MAP_FLAGMASK", Const, 3}, + {"MAP_GROWSDOWN", Const, 0}, + {"MAP_HASSEMAPHORE", Const, 0}, + {"MAP_HUGETLB", Const, 0}, + {"MAP_INHERIT", Const, 3}, + {"MAP_INHERIT_COPY", Const, 3}, + {"MAP_INHERIT_DEFAULT", Const, 3}, + {"MAP_INHERIT_DONATE_COPY", Const, 3}, + {"MAP_INHERIT_NONE", Const, 3}, + {"MAP_INHERIT_SHARE", Const, 3}, + {"MAP_JIT", Const, 0}, + {"MAP_LOCKED", Const, 0}, + {"MAP_NOCACHE", Const, 0}, + {"MAP_NOCORE", Const, 1}, + {"MAP_NOEXTEND", Const, 0}, + {"MAP_NONBLOCK", Const, 0}, + {"MAP_NORESERVE", Const, 0}, + {"MAP_NOSYNC", Const, 1}, + {"MAP_POPULATE", Const, 0}, + {"MAP_PREFAULT_READ", Const, 1}, + {"MAP_PRIVATE", Const, 0}, + {"MAP_RENAME", Const, 0}, + {"MAP_RESERVED0080", Const, 0}, + {"MAP_RESERVED0100", Const, 1}, + {"MAP_SHARED", Const, 0}, + {"MAP_STACK", Const, 0}, + {"MAP_TRYFIXED", Const, 3}, + {"MAP_TYPE", Const, 0}, + {"MAP_WIRED", Const, 3}, + {"MAXIMUM_REPARSE_DATA_BUFFER_SIZE", Const, 4}, + {"MAXLEN_IFDESCR", Const, 0}, + {"MAXLEN_PHYSADDR", Const, 0}, + {"MAX_ADAPTER_ADDRESS_LENGTH", Const, 0}, + {"MAX_ADAPTER_DESCRIPTION_LENGTH", Const, 0}, + {"MAX_ADAPTER_NAME_LENGTH", Const, 0}, + {"MAX_COMPUTERNAME_LENGTH", Const, 0}, + {"MAX_INTERFACE_NAME_LEN", Const, 0}, + {"MAX_LONG_PATH", Const, 0}, + {"MAX_PATH", Const, 0}, + {"MAX_PROTOCOL_CHAIN", Const, 2}, + {"MCL_CURRENT", Const, 0}, + {"MCL_FUTURE", Const, 0}, + {"MNT_DETACH", Const, 0}, + {"MNT_EXPIRE", Const, 0}, + {"MNT_FORCE", Const, 0}, + {"MSG_BCAST", Const, 1}, + {"MSG_CMSG_CLOEXEC", Const, 0}, + {"MSG_COMPAT", Const, 0}, + {"MSG_CONFIRM", Const, 0}, + {"MSG_CONTROLMBUF", Const, 1}, + {"MSG_CTRUNC", Const, 0}, + {"MSG_DONTROUTE", Const, 0}, + {"MSG_DONTWAIT", Const, 0}, + {"MSG_EOF", Const, 0}, + {"MSG_EOR", Const, 0}, + {"MSG_ERRQUEUE", Const, 0}, + {"MSG_FASTOPEN", Const, 1}, + {"MSG_FIN", Const, 0}, + {"MSG_FLUSH", Const, 0}, + {"MSG_HAVEMORE", Const, 0}, + {"MSG_HOLD", Const, 0}, + {"MSG_IOVUSRSPACE", Const, 1}, + {"MSG_LENUSRSPACE", Const, 1}, + {"MSG_MCAST", Const, 1}, + {"MSG_MORE", Const, 0}, + {"MSG_NAMEMBUF", Const, 1}, + {"MSG_NBIO", Const, 0}, + {"MSG_NEEDSA", Const, 0}, + {"MSG_NOSIGNAL", Const, 0}, + {"MSG_NOTIFICATION", Const, 0}, + {"MSG_OOB", Const, 0}, + {"MSG_PEEK", Const, 0}, + {"MSG_PROXY", Const, 0}, + {"MSG_RCVMORE", Const, 0}, + {"MSG_RST", Const, 0}, + {"MSG_SEND", Const, 0}, + {"MSG_SYN", Const, 0}, + {"MSG_TRUNC", Const, 0}, + {"MSG_TRYHARD", Const, 0}, + {"MSG_USERFLAGS", Const, 1}, + {"MSG_WAITALL", Const, 0}, + {"MSG_WAITFORONE", Const, 0}, + {"MSG_WAITSTREAM", Const, 0}, + {"MS_ACTIVE", Const, 0}, + {"MS_ASYNC", Const, 0}, + {"MS_BIND", Const, 0}, + {"MS_DEACTIVATE", Const, 0}, + {"MS_DIRSYNC", Const, 0}, + {"MS_INVALIDATE", Const, 0}, + {"MS_I_VERSION", Const, 0}, + {"MS_KERNMOUNT", Const, 0}, + {"MS_KILLPAGES", Const, 0}, + {"MS_MANDLOCK", Const, 0}, + {"MS_MGC_MSK", Const, 0}, + {"MS_MGC_VAL", Const, 0}, + {"MS_MOVE", Const, 0}, + {"MS_NOATIME", Const, 0}, + {"MS_NODEV", Const, 0}, + {"MS_NODIRATIME", Const, 0}, + {"MS_NOEXEC", Const, 0}, + {"MS_NOSUID", Const, 0}, + {"MS_NOUSER", Const, 0}, + {"MS_POSIXACL", Const, 0}, + {"MS_PRIVATE", Const, 0}, + {"MS_RDONLY", Const, 0}, + {"MS_REC", Const, 0}, + {"MS_RELATIME", Const, 0}, + {"MS_REMOUNT", Const, 0}, + {"MS_RMT_MASK", Const, 0}, + {"MS_SHARED", Const, 0}, + {"MS_SILENT", Const, 0}, + {"MS_SLAVE", Const, 0}, + {"MS_STRICTATIME", Const, 0}, + {"MS_SYNC", Const, 0}, + {"MS_SYNCHRONOUS", Const, 0}, + {"MS_UNBINDABLE", Const, 0}, + {"Madvise", Func, 0}, + {"MapViewOfFile", Func, 0}, + {"MaxTokenInfoClass", Const, 0}, + {"Mclpool", Type, 2}, + {"Mclpool.Alive", Field, 2}, + {"Mclpool.Cwm", Field, 2}, + {"Mclpool.Grown", Field, 2}, + {"Mclpool.Hwm", Field, 2}, + {"Mclpool.Lwm", Field, 2}, + {"MibIfRow", Type, 0}, + {"MibIfRow.AdminStatus", Field, 0}, + {"MibIfRow.Descr", Field, 0}, + {"MibIfRow.DescrLen", Field, 0}, + {"MibIfRow.InDiscards", Field, 0}, + {"MibIfRow.InErrors", Field, 0}, + {"MibIfRow.InNUcastPkts", Field, 0}, + {"MibIfRow.InOctets", Field, 0}, + {"MibIfRow.InUcastPkts", Field, 0}, + {"MibIfRow.InUnknownProtos", Field, 0}, + {"MibIfRow.Index", Field, 0}, + {"MibIfRow.LastChange", Field, 0}, + {"MibIfRow.Mtu", Field, 0}, + {"MibIfRow.Name", Field, 0}, + {"MibIfRow.OperStatus", Field, 0}, + {"MibIfRow.OutDiscards", Field, 0}, + {"MibIfRow.OutErrors", Field, 0}, + {"MibIfRow.OutNUcastPkts", Field, 0}, + {"MibIfRow.OutOctets", Field, 0}, + {"MibIfRow.OutQLen", Field, 0}, + {"MibIfRow.OutUcastPkts", Field, 0}, + {"MibIfRow.PhysAddr", Field, 0}, + {"MibIfRow.PhysAddrLen", Field, 0}, + {"MibIfRow.Speed", Field, 0}, + {"MibIfRow.Type", Field, 0}, + {"Mkdir", Func, 0}, + {"Mkdirat", Func, 0}, + {"Mkfifo", Func, 0}, + {"Mknod", Func, 0}, + {"Mknodat", Func, 0}, + {"Mlock", Func, 0}, + {"Mlockall", Func, 0}, + {"Mmap", Func, 0}, + {"Mount", Func, 0}, + {"MoveFile", Func, 0}, + {"Mprotect", Func, 0}, + {"Msghdr", Type, 0}, + {"Msghdr.Control", Field, 0}, + {"Msghdr.Controllen", Field, 0}, + {"Msghdr.Flags", Field, 0}, + {"Msghdr.Iov", Field, 0}, + {"Msghdr.Iovlen", Field, 0}, + {"Msghdr.Name", Field, 0}, + {"Msghdr.Namelen", Field, 0}, + {"Msghdr.Pad_cgo_0", Field, 0}, + {"Msghdr.Pad_cgo_1", Field, 0}, + {"Munlock", Func, 0}, + {"Munlockall", Func, 0}, + {"Munmap", Func, 0}, + {"MustLoadDLL", Func, 0}, + {"NAME_MAX", Const, 0}, + {"NETLINK_ADD_MEMBERSHIP", Const, 0}, + {"NETLINK_AUDIT", Const, 0}, + {"NETLINK_BROADCAST_ERROR", Const, 0}, + {"NETLINK_CONNECTOR", Const, 0}, + {"NETLINK_DNRTMSG", Const, 0}, + {"NETLINK_DROP_MEMBERSHIP", Const, 0}, + {"NETLINK_ECRYPTFS", Const, 0}, + {"NETLINK_FIB_LOOKUP", Const, 0}, + {"NETLINK_FIREWALL", Const, 0}, + {"NETLINK_GENERIC", Const, 0}, + {"NETLINK_INET_DIAG", Const, 0}, + {"NETLINK_IP6_FW", Const, 0}, + {"NETLINK_ISCSI", Const, 0}, + {"NETLINK_KOBJECT_UEVENT", Const, 0}, + {"NETLINK_NETFILTER", Const, 0}, + {"NETLINK_NFLOG", Const, 0}, + {"NETLINK_NO_ENOBUFS", Const, 0}, + {"NETLINK_PKTINFO", Const, 0}, + {"NETLINK_RDMA", Const, 0}, + {"NETLINK_ROUTE", Const, 0}, + {"NETLINK_SCSITRANSPORT", Const, 0}, + {"NETLINK_SELINUX", Const, 0}, + {"NETLINK_UNUSED", Const, 0}, + {"NETLINK_USERSOCK", Const, 0}, + {"NETLINK_XFRM", Const, 0}, + {"NET_RT_DUMP", Const, 0}, + {"NET_RT_DUMP2", Const, 0}, + {"NET_RT_FLAGS", Const, 0}, + {"NET_RT_IFLIST", Const, 0}, + {"NET_RT_IFLIST2", Const, 0}, + {"NET_RT_IFLISTL", Const, 1}, + {"NET_RT_IFMALIST", Const, 0}, + {"NET_RT_MAXID", Const, 0}, + {"NET_RT_OIFLIST", Const, 1}, + {"NET_RT_OOIFLIST", Const, 1}, + {"NET_RT_STAT", Const, 0}, + {"NET_RT_STATS", Const, 1}, + {"NET_RT_TABLE", Const, 1}, + {"NET_RT_TRASH", Const, 0}, + {"NLA_ALIGNTO", Const, 0}, + {"NLA_F_NESTED", Const, 0}, + {"NLA_F_NET_BYTEORDER", Const, 0}, + {"NLA_HDRLEN", Const, 0}, + {"NLMSG_ALIGNTO", Const, 0}, + {"NLMSG_DONE", Const, 0}, + {"NLMSG_ERROR", Const, 0}, + {"NLMSG_HDRLEN", Const, 0}, + {"NLMSG_MIN_TYPE", Const, 0}, + {"NLMSG_NOOP", Const, 0}, + {"NLMSG_OVERRUN", Const, 0}, + {"NLM_F_ACK", Const, 0}, + {"NLM_F_APPEND", Const, 0}, + {"NLM_F_ATOMIC", Const, 0}, + {"NLM_F_CREATE", Const, 0}, + {"NLM_F_DUMP", Const, 0}, + {"NLM_F_ECHO", Const, 0}, + {"NLM_F_EXCL", Const, 0}, + {"NLM_F_MATCH", Const, 0}, + {"NLM_F_MULTI", Const, 0}, + {"NLM_F_REPLACE", Const, 0}, + {"NLM_F_REQUEST", Const, 0}, + {"NLM_F_ROOT", Const, 0}, + {"NOFLSH", Const, 0}, + {"NOTE_ABSOLUTE", Const, 0}, + {"NOTE_ATTRIB", Const, 0}, + {"NOTE_BACKGROUND", Const, 16}, + {"NOTE_CHILD", Const, 0}, + {"NOTE_CRITICAL", Const, 16}, + {"NOTE_DELETE", Const, 0}, + {"NOTE_EOF", Const, 1}, + {"NOTE_EXEC", Const, 0}, + {"NOTE_EXIT", Const, 0}, + {"NOTE_EXITSTATUS", Const, 0}, + {"NOTE_EXIT_CSERROR", Const, 16}, + {"NOTE_EXIT_DECRYPTFAIL", Const, 16}, + {"NOTE_EXIT_DETAIL", Const, 16}, + {"NOTE_EXIT_DETAIL_MASK", Const, 16}, + {"NOTE_EXIT_MEMORY", Const, 16}, + {"NOTE_EXIT_REPARENTED", Const, 16}, + {"NOTE_EXTEND", Const, 0}, + {"NOTE_FFAND", Const, 0}, + {"NOTE_FFCOPY", Const, 0}, + {"NOTE_FFCTRLMASK", Const, 0}, + {"NOTE_FFLAGSMASK", Const, 0}, + {"NOTE_FFNOP", Const, 0}, + {"NOTE_FFOR", Const, 0}, + {"NOTE_FORK", Const, 0}, + {"NOTE_LEEWAY", Const, 16}, + {"NOTE_LINK", Const, 0}, + {"NOTE_LOWAT", Const, 0}, + {"NOTE_NONE", Const, 0}, + {"NOTE_NSECONDS", Const, 0}, + {"NOTE_PCTRLMASK", Const, 0}, + {"NOTE_PDATAMASK", Const, 0}, + {"NOTE_REAP", Const, 0}, + {"NOTE_RENAME", Const, 0}, + {"NOTE_RESOURCEEND", Const, 0}, + {"NOTE_REVOKE", Const, 0}, + {"NOTE_SECONDS", Const, 0}, + {"NOTE_SIGNAL", Const, 0}, + {"NOTE_TRACK", Const, 0}, + {"NOTE_TRACKERR", Const, 0}, + {"NOTE_TRIGGER", Const, 0}, + {"NOTE_TRUNCATE", Const, 1}, + {"NOTE_USECONDS", Const, 0}, + {"NOTE_VM_ERROR", Const, 0}, + {"NOTE_VM_PRESSURE", Const, 0}, + {"NOTE_VM_PRESSURE_SUDDEN_TERMINATE", Const, 0}, + {"NOTE_VM_PRESSURE_TERMINATE", Const, 0}, + {"NOTE_WRITE", Const, 0}, + {"NameCanonical", Const, 0}, + {"NameCanonicalEx", Const, 0}, + {"NameDisplay", Const, 0}, + {"NameDnsDomain", Const, 0}, + {"NameFullyQualifiedDN", Const, 0}, + {"NameSamCompatible", Const, 0}, + {"NameServicePrincipal", Const, 0}, + {"NameUniqueId", Const, 0}, + {"NameUnknown", Const, 0}, + {"NameUserPrincipal", Const, 0}, + {"Nanosleep", Func, 0}, + {"NetApiBufferFree", Func, 0}, + {"NetGetJoinInformation", Func, 2}, + {"NetSetupDomainName", Const, 2}, + {"NetSetupUnjoined", Const, 2}, + {"NetSetupUnknownStatus", Const, 2}, + {"NetSetupWorkgroupName", Const, 2}, + {"NetUserGetInfo", Func, 0}, + {"NetlinkMessage", Type, 0}, + {"NetlinkMessage.Data", Field, 0}, + {"NetlinkMessage.Header", Field, 0}, + {"NetlinkRIB", Func, 0}, + {"NetlinkRouteAttr", Type, 0}, + {"NetlinkRouteAttr.Attr", Field, 0}, + {"NetlinkRouteAttr.Value", Field, 0}, + {"NetlinkRouteRequest", Type, 0}, + {"NetlinkRouteRequest.Data", Field, 0}, + {"NetlinkRouteRequest.Header", Field, 0}, + {"NewCallback", Func, 0}, + {"NewCallbackCDecl", Func, 3}, + {"NewLazyDLL", Func, 0}, + {"NlAttr", Type, 0}, + {"NlAttr.Len", Field, 0}, + {"NlAttr.Type", Field, 0}, + {"NlMsgerr", Type, 0}, + {"NlMsgerr.Error", Field, 0}, + {"NlMsgerr.Msg", Field, 0}, + {"NlMsghdr", Type, 0}, + {"NlMsghdr.Flags", Field, 0}, + {"NlMsghdr.Len", Field, 0}, + {"NlMsghdr.Pid", Field, 0}, + {"NlMsghdr.Seq", Field, 0}, + {"NlMsghdr.Type", Field, 0}, + {"NsecToFiletime", Func, 0}, + {"NsecToTimespec", Func, 0}, + {"NsecToTimeval", Func, 0}, + {"Ntohs", Func, 0}, + {"OCRNL", Const, 0}, + {"OFDEL", Const, 0}, + {"OFILL", Const, 0}, + {"OFIOGETBMAP", Const, 1}, + {"OID_PKIX_KP_SERVER_AUTH", Var, 0}, + {"OID_SERVER_GATED_CRYPTO", Var, 0}, + {"OID_SGC_NETSCAPE", Var, 0}, + {"OLCUC", Const, 0}, + {"ONLCR", Const, 0}, + {"ONLRET", Const, 0}, + {"ONOCR", Const, 0}, + {"ONOEOT", Const, 1}, + {"OPEN_ALWAYS", Const, 0}, + {"OPEN_EXISTING", Const, 0}, + {"OPOST", Const, 0}, + {"O_ACCMODE", Const, 0}, + {"O_ALERT", Const, 0}, + {"O_ALT_IO", Const, 1}, + {"O_APPEND", Const, 0}, + {"O_ASYNC", Const, 0}, + {"O_CLOEXEC", Const, 0}, + {"O_CREAT", Const, 0}, + {"O_DIRECT", Const, 0}, + {"O_DIRECTORY", Const, 0}, + {"O_DP_GETRAWENCRYPTED", Const, 16}, + {"O_DSYNC", Const, 0}, + {"O_EVTONLY", Const, 0}, + {"O_EXCL", Const, 0}, + {"O_EXEC", Const, 0}, + {"O_EXLOCK", Const, 0}, + {"O_FSYNC", Const, 0}, + {"O_LARGEFILE", Const, 0}, + {"O_NDELAY", Const, 0}, + {"O_NOATIME", Const, 0}, + {"O_NOCTTY", Const, 0}, + {"O_NOFOLLOW", Const, 0}, + {"O_NONBLOCK", Const, 0}, + {"O_NOSIGPIPE", Const, 1}, + {"O_POPUP", Const, 0}, + {"O_RDONLY", Const, 0}, + {"O_RDWR", Const, 0}, + {"O_RSYNC", Const, 0}, + {"O_SHLOCK", Const, 0}, + {"O_SYMLINK", Const, 0}, + {"O_SYNC", Const, 0}, + {"O_TRUNC", Const, 0}, + {"O_TTY_INIT", Const, 0}, + {"O_WRONLY", Const, 0}, + {"Open", Func, 0}, + {"OpenCurrentProcessToken", Func, 0}, + {"OpenProcess", Func, 0}, + {"OpenProcessToken", Func, 0}, + {"Openat", Func, 0}, + {"Overlapped", Type, 0}, + {"Overlapped.HEvent", Field, 0}, + {"Overlapped.Internal", Field, 0}, + {"Overlapped.InternalHigh", Field, 0}, + {"Overlapped.Offset", Field, 0}, + {"Overlapped.OffsetHigh", Field, 0}, + {"PACKET_ADD_MEMBERSHIP", Const, 0}, + {"PACKET_BROADCAST", Const, 0}, + {"PACKET_DROP_MEMBERSHIP", Const, 0}, + {"PACKET_FASTROUTE", Const, 0}, + {"PACKET_HOST", Const, 0}, + {"PACKET_LOOPBACK", Const, 0}, + {"PACKET_MR_ALLMULTI", Const, 0}, + {"PACKET_MR_MULTICAST", Const, 0}, + {"PACKET_MR_PROMISC", Const, 0}, + {"PACKET_MULTICAST", Const, 0}, + {"PACKET_OTHERHOST", Const, 0}, + {"PACKET_OUTGOING", Const, 0}, + {"PACKET_RECV_OUTPUT", Const, 0}, + {"PACKET_RX_RING", Const, 0}, + {"PACKET_STATISTICS", Const, 0}, + {"PAGE_EXECUTE_READ", Const, 0}, + {"PAGE_EXECUTE_READWRITE", Const, 0}, + {"PAGE_EXECUTE_WRITECOPY", Const, 0}, + {"PAGE_READONLY", Const, 0}, + {"PAGE_READWRITE", Const, 0}, + {"PAGE_WRITECOPY", Const, 0}, + {"PARENB", Const, 0}, + {"PARMRK", Const, 0}, + {"PARODD", Const, 0}, + {"PENDIN", Const, 0}, + {"PFL_HIDDEN", Const, 2}, + {"PFL_MATCHES_PROTOCOL_ZERO", Const, 2}, + {"PFL_MULTIPLE_PROTO_ENTRIES", Const, 2}, + {"PFL_NETWORKDIRECT_PROVIDER", Const, 2}, + {"PFL_RECOMMENDED_PROTO_ENTRY", Const, 2}, + {"PF_FLUSH", Const, 1}, + {"PKCS_7_ASN_ENCODING", Const, 0}, + {"PMC5_PIPELINE_FLUSH", Const, 1}, + {"PRIO_PGRP", Const, 2}, + {"PRIO_PROCESS", Const, 2}, + {"PRIO_USER", Const, 2}, + {"PRI_IOFLUSH", Const, 1}, + {"PROCESS_QUERY_INFORMATION", Const, 0}, + {"PROCESS_TERMINATE", Const, 2}, + {"PROT_EXEC", Const, 0}, + {"PROT_GROWSDOWN", Const, 0}, + {"PROT_GROWSUP", Const, 0}, + {"PROT_NONE", Const, 0}, + {"PROT_READ", Const, 0}, + {"PROT_WRITE", Const, 0}, + {"PROV_DH_SCHANNEL", Const, 0}, + {"PROV_DSS", Const, 0}, + {"PROV_DSS_DH", Const, 0}, + {"PROV_EC_ECDSA_FULL", Const, 0}, + {"PROV_EC_ECDSA_SIG", Const, 0}, + {"PROV_EC_ECNRA_FULL", Const, 0}, + {"PROV_EC_ECNRA_SIG", Const, 0}, + {"PROV_FORTEZZA", Const, 0}, + {"PROV_INTEL_SEC", Const, 0}, + {"PROV_MS_EXCHANGE", Const, 0}, + {"PROV_REPLACE_OWF", Const, 0}, + {"PROV_RNG", Const, 0}, + {"PROV_RSA_AES", Const, 0}, + {"PROV_RSA_FULL", Const, 0}, + {"PROV_RSA_SCHANNEL", Const, 0}, + {"PROV_RSA_SIG", Const, 0}, + {"PROV_SPYRUS_LYNKS", Const, 0}, + {"PROV_SSL", Const, 0}, + {"PR_CAPBSET_DROP", Const, 0}, + {"PR_CAPBSET_READ", Const, 0}, + {"PR_CLEAR_SECCOMP_FILTER", Const, 0}, + {"PR_ENDIAN_BIG", Const, 0}, + {"PR_ENDIAN_LITTLE", Const, 0}, + {"PR_ENDIAN_PPC_LITTLE", Const, 0}, + {"PR_FPEMU_NOPRINT", Const, 0}, + {"PR_FPEMU_SIGFPE", Const, 0}, + {"PR_FP_EXC_ASYNC", Const, 0}, + {"PR_FP_EXC_DISABLED", Const, 0}, + {"PR_FP_EXC_DIV", Const, 0}, + {"PR_FP_EXC_INV", Const, 0}, + {"PR_FP_EXC_NONRECOV", Const, 0}, + {"PR_FP_EXC_OVF", Const, 0}, + {"PR_FP_EXC_PRECISE", Const, 0}, + {"PR_FP_EXC_RES", Const, 0}, + {"PR_FP_EXC_SW_ENABLE", Const, 0}, + {"PR_FP_EXC_UND", Const, 0}, + {"PR_GET_DUMPABLE", Const, 0}, + {"PR_GET_ENDIAN", Const, 0}, + {"PR_GET_FPEMU", Const, 0}, + {"PR_GET_FPEXC", Const, 0}, + {"PR_GET_KEEPCAPS", Const, 0}, + {"PR_GET_NAME", Const, 0}, + {"PR_GET_PDEATHSIG", Const, 0}, + {"PR_GET_SECCOMP", Const, 0}, + {"PR_GET_SECCOMP_FILTER", Const, 0}, + {"PR_GET_SECUREBITS", Const, 0}, + {"PR_GET_TIMERSLACK", Const, 0}, + {"PR_GET_TIMING", Const, 0}, + {"PR_GET_TSC", Const, 0}, + {"PR_GET_UNALIGN", Const, 0}, + {"PR_MCE_KILL", Const, 0}, + {"PR_MCE_KILL_CLEAR", Const, 0}, + {"PR_MCE_KILL_DEFAULT", Const, 0}, + {"PR_MCE_KILL_EARLY", Const, 0}, + {"PR_MCE_KILL_GET", Const, 0}, + {"PR_MCE_KILL_LATE", Const, 0}, + {"PR_MCE_KILL_SET", Const, 0}, + {"PR_SECCOMP_FILTER_EVENT", Const, 0}, + {"PR_SECCOMP_FILTER_SYSCALL", Const, 0}, + {"PR_SET_DUMPABLE", Const, 0}, + {"PR_SET_ENDIAN", Const, 0}, + {"PR_SET_FPEMU", Const, 0}, + {"PR_SET_FPEXC", Const, 0}, + {"PR_SET_KEEPCAPS", Const, 0}, + {"PR_SET_NAME", Const, 0}, + {"PR_SET_PDEATHSIG", Const, 0}, + {"PR_SET_PTRACER", Const, 0}, + {"PR_SET_SECCOMP", Const, 0}, + {"PR_SET_SECCOMP_FILTER", Const, 0}, + {"PR_SET_SECUREBITS", Const, 0}, + {"PR_SET_TIMERSLACK", Const, 0}, + {"PR_SET_TIMING", Const, 0}, + {"PR_SET_TSC", Const, 0}, + {"PR_SET_UNALIGN", Const, 0}, + {"PR_TASK_PERF_EVENTS_DISABLE", Const, 0}, + {"PR_TASK_PERF_EVENTS_ENABLE", Const, 0}, + {"PR_TIMING_STATISTICAL", Const, 0}, + {"PR_TIMING_TIMESTAMP", Const, 0}, + {"PR_TSC_ENABLE", Const, 0}, + {"PR_TSC_SIGSEGV", Const, 0}, + {"PR_UNALIGN_NOPRINT", Const, 0}, + {"PR_UNALIGN_SIGBUS", Const, 0}, + {"PTRACE_ARCH_PRCTL", Const, 0}, + {"PTRACE_ATTACH", Const, 0}, + {"PTRACE_CONT", Const, 0}, + {"PTRACE_DETACH", Const, 0}, + {"PTRACE_EVENT_CLONE", Const, 0}, + {"PTRACE_EVENT_EXEC", Const, 0}, + {"PTRACE_EVENT_EXIT", Const, 0}, + {"PTRACE_EVENT_FORK", Const, 0}, + {"PTRACE_EVENT_VFORK", Const, 0}, + {"PTRACE_EVENT_VFORK_DONE", Const, 0}, + {"PTRACE_GETCRUNCHREGS", Const, 0}, + {"PTRACE_GETEVENTMSG", Const, 0}, + {"PTRACE_GETFPREGS", Const, 0}, + {"PTRACE_GETFPXREGS", Const, 0}, + {"PTRACE_GETHBPREGS", Const, 0}, + {"PTRACE_GETREGS", Const, 0}, + {"PTRACE_GETREGSET", Const, 0}, + {"PTRACE_GETSIGINFO", Const, 0}, + {"PTRACE_GETVFPREGS", Const, 0}, + {"PTRACE_GETWMMXREGS", Const, 0}, + {"PTRACE_GET_THREAD_AREA", Const, 0}, + {"PTRACE_KILL", Const, 0}, + {"PTRACE_OLDSETOPTIONS", Const, 0}, + {"PTRACE_O_MASK", Const, 0}, + {"PTRACE_O_TRACECLONE", Const, 0}, + {"PTRACE_O_TRACEEXEC", Const, 0}, + {"PTRACE_O_TRACEEXIT", Const, 0}, + {"PTRACE_O_TRACEFORK", Const, 0}, + {"PTRACE_O_TRACESYSGOOD", Const, 0}, + {"PTRACE_O_TRACEVFORK", Const, 0}, + {"PTRACE_O_TRACEVFORKDONE", Const, 0}, + {"PTRACE_PEEKDATA", Const, 0}, + {"PTRACE_PEEKTEXT", Const, 0}, + {"PTRACE_PEEKUSR", Const, 0}, + {"PTRACE_POKEDATA", Const, 0}, + {"PTRACE_POKETEXT", Const, 0}, + {"PTRACE_POKEUSR", Const, 0}, + {"PTRACE_SETCRUNCHREGS", Const, 0}, + {"PTRACE_SETFPREGS", Const, 0}, + {"PTRACE_SETFPXREGS", Const, 0}, + {"PTRACE_SETHBPREGS", Const, 0}, + {"PTRACE_SETOPTIONS", Const, 0}, + {"PTRACE_SETREGS", Const, 0}, + {"PTRACE_SETREGSET", Const, 0}, + {"PTRACE_SETSIGINFO", Const, 0}, + {"PTRACE_SETVFPREGS", Const, 0}, + {"PTRACE_SETWMMXREGS", Const, 0}, + {"PTRACE_SET_SYSCALL", Const, 0}, + {"PTRACE_SET_THREAD_AREA", Const, 0}, + {"PTRACE_SINGLEBLOCK", Const, 0}, + {"PTRACE_SINGLESTEP", Const, 0}, + {"PTRACE_SYSCALL", Const, 0}, + {"PTRACE_SYSEMU", Const, 0}, + {"PTRACE_SYSEMU_SINGLESTEP", Const, 0}, + {"PTRACE_TRACEME", Const, 0}, + {"PT_ATTACH", Const, 0}, + {"PT_ATTACHEXC", Const, 0}, + {"PT_CONTINUE", Const, 0}, + {"PT_DATA_ADDR", Const, 0}, + {"PT_DENY_ATTACH", Const, 0}, + {"PT_DETACH", Const, 0}, + {"PT_FIRSTMACH", Const, 0}, + {"PT_FORCEQUOTA", Const, 0}, + {"PT_KILL", Const, 0}, + {"PT_MASK", Const, 1}, + {"PT_READ_D", Const, 0}, + {"PT_READ_I", Const, 0}, + {"PT_READ_U", Const, 0}, + {"PT_SIGEXC", Const, 0}, + {"PT_STEP", Const, 0}, + {"PT_TEXT_ADDR", Const, 0}, + {"PT_TEXT_END_ADDR", Const, 0}, + {"PT_THUPDATE", Const, 0}, + {"PT_TRACE_ME", Const, 0}, + {"PT_WRITE_D", Const, 0}, + {"PT_WRITE_I", Const, 0}, + {"PT_WRITE_U", Const, 0}, + {"ParseDirent", Func, 0}, + {"ParseNetlinkMessage", Func, 0}, + {"ParseNetlinkRouteAttr", Func, 0}, + {"ParseRoutingMessage", Func, 0}, + {"ParseRoutingSockaddr", Func, 0}, + {"ParseSocketControlMessage", Func, 0}, + {"ParseUnixCredentials", Func, 0}, + {"ParseUnixRights", Func, 0}, + {"PathMax", Const, 0}, + {"Pathconf", Func, 0}, + {"Pause", Func, 0}, + {"Pipe", Func, 0}, + {"Pipe2", Func, 1}, + {"PivotRoot", Func, 0}, + {"Pointer", Type, 11}, + {"PostQueuedCompletionStatus", Func, 0}, + {"Pread", Func, 0}, + {"Proc", Type, 0}, + {"Proc.Dll", Field, 0}, + {"Proc.Name", Field, 0}, + {"ProcAttr", Type, 0}, + {"ProcAttr.Dir", Field, 0}, + {"ProcAttr.Env", Field, 0}, + {"ProcAttr.Files", Field, 0}, + {"ProcAttr.Sys", Field, 0}, + {"Process32First", Func, 4}, + {"Process32Next", Func, 4}, + {"ProcessEntry32", Type, 4}, + {"ProcessEntry32.DefaultHeapID", Field, 4}, + {"ProcessEntry32.ExeFile", Field, 4}, + {"ProcessEntry32.Flags", Field, 4}, + {"ProcessEntry32.ModuleID", Field, 4}, + {"ProcessEntry32.ParentProcessID", Field, 4}, + {"ProcessEntry32.PriClassBase", Field, 4}, + {"ProcessEntry32.ProcessID", Field, 4}, + {"ProcessEntry32.Size", Field, 4}, + {"ProcessEntry32.Threads", Field, 4}, + {"ProcessEntry32.Usage", Field, 4}, + {"ProcessInformation", Type, 0}, + {"ProcessInformation.Process", Field, 0}, + {"ProcessInformation.ProcessId", Field, 0}, + {"ProcessInformation.Thread", Field, 0}, + {"ProcessInformation.ThreadId", Field, 0}, + {"Protoent", Type, 0}, + {"Protoent.Aliases", Field, 0}, + {"Protoent.Name", Field, 0}, + {"Protoent.Proto", Field, 0}, + {"PtraceAttach", Func, 0}, + {"PtraceCont", Func, 0}, + {"PtraceDetach", Func, 0}, + {"PtraceGetEventMsg", Func, 0}, + {"PtraceGetRegs", Func, 0}, + {"PtracePeekData", Func, 0}, + {"PtracePeekText", Func, 0}, + {"PtracePokeData", Func, 0}, + {"PtracePokeText", Func, 0}, + {"PtraceRegs", Type, 0}, + {"PtraceRegs.Cs", Field, 0}, + {"PtraceRegs.Ds", Field, 0}, + {"PtraceRegs.Eax", Field, 0}, + {"PtraceRegs.Ebp", Field, 0}, + {"PtraceRegs.Ebx", Field, 0}, + {"PtraceRegs.Ecx", Field, 0}, + {"PtraceRegs.Edi", Field, 0}, + {"PtraceRegs.Edx", Field, 0}, + {"PtraceRegs.Eflags", Field, 0}, + {"PtraceRegs.Eip", Field, 0}, + {"PtraceRegs.Es", Field, 0}, + {"PtraceRegs.Esi", Field, 0}, + {"PtraceRegs.Esp", Field, 0}, + {"PtraceRegs.Fs", Field, 0}, + {"PtraceRegs.Fs_base", Field, 0}, + {"PtraceRegs.Gs", Field, 0}, + {"PtraceRegs.Gs_base", Field, 0}, + {"PtraceRegs.Orig_eax", Field, 0}, + {"PtraceRegs.Orig_rax", Field, 0}, + {"PtraceRegs.R10", Field, 0}, + {"PtraceRegs.R11", Field, 0}, + {"PtraceRegs.R12", Field, 0}, + {"PtraceRegs.R13", Field, 0}, + {"PtraceRegs.R14", Field, 0}, + {"PtraceRegs.R15", Field, 0}, + {"PtraceRegs.R8", Field, 0}, + {"PtraceRegs.R9", Field, 0}, + {"PtraceRegs.Rax", Field, 0}, + {"PtraceRegs.Rbp", Field, 0}, + {"PtraceRegs.Rbx", Field, 0}, + {"PtraceRegs.Rcx", Field, 0}, + {"PtraceRegs.Rdi", Field, 0}, + {"PtraceRegs.Rdx", Field, 0}, + {"PtraceRegs.Rip", Field, 0}, + {"PtraceRegs.Rsi", Field, 0}, + {"PtraceRegs.Rsp", Field, 0}, + {"PtraceRegs.Ss", Field, 0}, + {"PtraceRegs.Uregs", Field, 0}, + {"PtraceRegs.Xcs", Field, 0}, + {"PtraceRegs.Xds", Field, 0}, + {"PtraceRegs.Xes", Field, 0}, + {"PtraceRegs.Xfs", Field, 0}, + {"PtraceRegs.Xgs", Field, 0}, + {"PtraceRegs.Xss", Field, 0}, + {"PtraceSetOptions", Func, 0}, + {"PtraceSetRegs", Func, 0}, + {"PtraceSingleStep", Func, 0}, + {"PtraceSyscall", Func, 1}, + {"Pwrite", Func, 0}, + {"REG_BINARY", Const, 0}, + {"REG_DWORD", Const, 0}, + {"REG_DWORD_BIG_ENDIAN", Const, 0}, + {"REG_DWORD_LITTLE_ENDIAN", Const, 0}, + {"REG_EXPAND_SZ", Const, 0}, + {"REG_FULL_RESOURCE_DESCRIPTOR", Const, 0}, + {"REG_LINK", Const, 0}, + {"REG_MULTI_SZ", Const, 0}, + {"REG_NONE", Const, 0}, + {"REG_QWORD", Const, 0}, + {"REG_QWORD_LITTLE_ENDIAN", Const, 0}, + {"REG_RESOURCE_LIST", Const, 0}, + {"REG_RESOURCE_REQUIREMENTS_LIST", Const, 0}, + {"REG_SZ", Const, 0}, + {"RLIMIT_AS", Const, 0}, + {"RLIMIT_CORE", Const, 0}, + {"RLIMIT_CPU", Const, 0}, + {"RLIMIT_CPU_USAGE_MONITOR", Const, 16}, + {"RLIMIT_DATA", Const, 0}, + {"RLIMIT_FSIZE", Const, 0}, + {"RLIMIT_NOFILE", Const, 0}, + {"RLIMIT_STACK", Const, 0}, + {"RLIM_INFINITY", Const, 0}, + {"RTAX_ADVMSS", Const, 0}, + {"RTAX_AUTHOR", Const, 0}, + {"RTAX_BRD", Const, 0}, + {"RTAX_CWND", Const, 0}, + {"RTAX_DST", Const, 0}, + {"RTAX_FEATURES", Const, 0}, + {"RTAX_FEATURE_ALLFRAG", Const, 0}, + {"RTAX_FEATURE_ECN", Const, 0}, + {"RTAX_FEATURE_SACK", Const, 0}, + {"RTAX_FEATURE_TIMESTAMP", Const, 0}, + {"RTAX_GATEWAY", Const, 0}, + {"RTAX_GENMASK", Const, 0}, + {"RTAX_HOPLIMIT", Const, 0}, + {"RTAX_IFA", Const, 0}, + {"RTAX_IFP", Const, 0}, + {"RTAX_INITCWND", Const, 0}, + {"RTAX_INITRWND", Const, 0}, + {"RTAX_LABEL", Const, 1}, + {"RTAX_LOCK", Const, 0}, + {"RTAX_MAX", Const, 0}, + {"RTAX_MTU", Const, 0}, + {"RTAX_NETMASK", Const, 0}, + {"RTAX_REORDERING", Const, 0}, + {"RTAX_RTO_MIN", Const, 0}, + {"RTAX_RTT", Const, 0}, + {"RTAX_RTTVAR", Const, 0}, + {"RTAX_SRC", Const, 1}, + {"RTAX_SRCMASK", Const, 1}, + {"RTAX_SSTHRESH", Const, 0}, + {"RTAX_TAG", Const, 1}, + {"RTAX_UNSPEC", Const, 0}, + {"RTAX_WINDOW", Const, 0}, + {"RTA_ALIGNTO", Const, 0}, + {"RTA_AUTHOR", Const, 0}, + {"RTA_BRD", Const, 0}, + {"RTA_CACHEINFO", Const, 0}, + {"RTA_DST", Const, 0}, + {"RTA_FLOW", Const, 0}, + {"RTA_GATEWAY", Const, 0}, + {"RTA_GENMASK", Const, 0}, + {"RTA_IFA", Const, 0}, + {"RTA_IFP", Const, 0}, + {"RTA_IIF", Const, 0}, + {"RTA_LABEL", Const, 1}, + {"RTA_MAX", Const, 0}, + {"RTA_METRICS", Const, 0}, + {"RTA_MULTIPATH", Const, 0}, + {"RTA_NETMASK", Const, 0}, + {"RTA_OIF", Const, 0}, + {"RTA_PREFSRC", Const, 0}, + {"RTA_PRIORITY", Const, 0}, + {"RTA_SRC", Const, 0}, + {"RTA_SRCMASK", Const, 1}, + {"RTA_TABLE", Const, 0}, + {"RTA_TAG", Const, 1}, + {"RTA_UNSPEC", Const, 0}, + {"RTCF_DIRECTSRC", Const, 0}, + {"RTCF_DOREDIRECT", Const, 0}, + {"RTCF_LOG", Const, 0}, + {"RTCF_MASQ", Const, 0}, + {"RTCF_NAT", Const, 0}, + {"RTCF_VALVE", Const, 0}, + {"RTF_ADDRCLASSMASK", Const, 0}, + {"RTF_ADDRCONF", Const, 0}, + {"RTF_ALLONLINK", Const, 0}, + {"RTF_ANNOUNCE", Const, 1}, + {"RTF_BLACKHOLE", Const, 0}, + {"RTF_BROADCAST", Const, 0}, + {"RTF_CACHE", Const, 0}, + {"RTF_CLONED", Const, 1}, + {"RTF_CLONING", Const, 0}, + {"RTF_CONDEMNED", Const, 0}, + {"RTF_DEFAULT", Const, 0}, + {"RTF_DELCLONE", Const, 0}, + {"RTF_DONE", Const, 0}, + {"RTF_DYNAMIC", Const, 0}, + {"RTF_FLOW", Const, 0}, + {"RTF_FMASK", Const, 0}, + {"RTF_GATEWAY", Const, 0}, + {"RTF_GWFLAG_COMPAT", Const, 3}, + {"RTF_HOST", Const, 0}, + {"RTF_IFREF", Const, 0}, + {"RTF_IFSCOPE", Const, 0}, + {"RTF_INTERFACE", Const, 0}, + {"RTF_IRTT", Const, 0}, + {"RTF_LINKRT", Const, 0}, + {"RTF_LLDATA", Const, 0}, + {"RTF_LLINFO", Const, 0}, + {"RTF_LOCAL", Const, 0}, + {"RTF_MASK", Const, 1}, + {"RTF_MODIFIED", Const, 0}, + {"RTF_MPATH", Const, 1}, + {"RTF_MPLS", Const, 1}, + {"RTF_MSS", Const, 0}, + {"RTF_MTU", Const, 0}, + {"RTF_MULTICAST", Const, 0}, + {"RTF_NAT", Const, 0}, + {"RTF_NOFORWARD", Const, 0}, + {"RTF_NONEXTHOP", Const, 0}, + {"RTF_NOPMTUDISC", Const, 0}, + {"RTF_PERMANENT_ARP", Const, 1}, + {"RTF_PINNED", Const, 0}, + {"RTF_POLICY", Const, 0}, + {"RTF_PRCLONING", Const, 0}, + {"RTF_PROTO1", Const, 0}, + {"RTF_PROTO2", Const, 0}, + {"RTF_PROTO3", Const, 0}, + {"RTF_PROXY", Const, 16}, + {"RTF_REINSTATE", Const, 0}, + {"RTF_REJECT", Const, 0}, + {"RTF_RNH_LOCKED", Const, 0}, + {"RTF_ROUTER", Const, 16}, + {"RTF_SOURCE", Const, 1}, + {"RTF_SRC", Const, 1}, + {"RTF_STATIC", Const, 0}, + {"RTF_STICKY", Const, 0}, + {"RTF_THROW", Const, 0}, + {"RTF_TUNNEL", Const, 1}, + {"RTF_UP", Const, 0}, + {"RTF_USETRAILERS", Const, 1}, + {"RTF_WASCLONED", Const, 0}, + {"RTF_WINDOW", Const, 0}, + {"RTF_XRESOLVE", Const, 0}, + {"RTM_ADD", Const, 0}, + {"RTM_BASE", Const, 0}, + {"RTM_CHANGE", Const, 0}, + {"RTM_CHGADDR", Const, 1}, + {"RTM_DELACTION", Const, 0}, + {"RTM_DELADDR", Const, 0}, + {"RTM_DELADDRLABEL", Const, 0}, + {"RTM_DELETE", Const, 0}, + {"RTM_DELLINK", Const, 0}, + {"RTM_DELMADDR", Const, 0}, + {"RTM_DELNEIGH", Const, 0}, + {"RTM_DELQDISC", Const, 0}, + {"RTM_DELROUTE", Const, 0}, + {"RTM_DELRULE", Const, 0}, + {"RTM_DELTCLASS", Const, 0}, + {"RTM_DELTFILTER", Const, 0}, + {"RTM_DESYNC", Const, 1}, + {"RTM_F_CLONED", Const, 0}, + {"RTM_F_EQUALIZE", Const, 0}, + {"RTM_F_NOTIFY", Const, 0}, + {"RTM_F_PREFIX", Const, 0}, + {"RTM_GET", Const, 0}, + {"RTM_GET2", Const, 0}, + {"RTM_GETACTION", Const, 0}, + {"RTM_GETADDR", Const, 0}, + {"RTM_GETADDRLABEL", Const, 0}, + {"RTM_GETANYCAST", Const, 0}, + {"RTM_GETDCB", Const, 0}, + {"RTM_GETLINK", Const, 0}, + {"RTM_GETMULTICAST", Const, 0}, + {"RTM_GETNEIGH", Const, 0}, + {"RTM_GETNEIGHTBL", Const, 0}, + {"RTM_GETQDISC", Const, 0}, + {"RTM_GETROUTE", Const, 0}, + {"RTM_GETRULE", Const, 0}, + {"RTM_GETTCLASS", Const, 0}, + {"RTM_GETTFILTER", Const, 0}, + {"RTM_IEEE80211", Const, 0}, + {"RTM_IFANNOUNCE", Const, 0}, + {"RTM_IFINFO", Const, 0}, + {"RTM_IFINFO2", Const, 0}, + {"RTM_LLINFO_UPD", Const, 1}, + {"RTM_LOCK", Const, 0}, + {"RTM_LOSING", Const, 0}, + {"RTM_MAX", Const, 0}, + {"RTM_MAXSIZE", Const, 1}, + {"RTM_MISS", Const, 0}, + {"RTM_NEWACTION", Const, 0}, + {"RTM_NEWADDR", Const, 0}, + {"RTM_NEWADDRLABEL", Const, 0}, + {"RTM_NEWLINK", Const, 0}, + {"RTM_NEWMADDR", Const, 0}, + {"RTM_NEWMADDR2", Const, 0}, + {"RTM_NEWNDUSEROPT", Const, 0}, + {"RTM_NEWNEIGH", Const, 0}, + {"RTM_NEWNEIGHTBL", Const, 0}, + {"RTM_NEWPREFIX", Const, 0}, + {"RTM_NEWQDISC", Const, 0}, + {"RTM_NEWROUTE", Const, 0}, + {"RTM_NEWRULE", Const, 0}, + {"RTM_NEWTCLASS", Const, 0}, + {"RTM_NEWTFILTER", Const, 0}, + {"RTM_NR_FAMILIES", Const, 0}, + {"RTM_NR_MSGTYPES", Const, 0}, + {"RTM_OIFINFO", Const, 1}, + {"RTM_OLDADD", Const, 0}, + {"RTM_OLDDEL", Const, 0}, + {"RTM_OOIFINFO", Const, 1}, + {"RTM_REDIRECT", Const, 0}, + {"RTM_RESOLVE", Const, 0}, + {"RTM_RTTUNIT", Const, 0}, + {"RTM_SETDCB", Const, 0}, + {"RTM_SETGATE", Const, 1}, + {"RTM_SETLINK", Const, 0}, + {"RTM_SETNEIGHTBL", Const, 0}, + {"RTM_VERSION", Const, 0}, + {"RTNH_ALIGNTO", Const, 0}, + {"RTNH_F_DEAD", Const, 0}, + {"RTNH_F_ONLINK", Const, 0}, + {"RTNH_F_PERVASIVE", Const, 0}, + {"RTNLGRP_IPV4_IFADDR", Const, 1}, + {"RTNLGRP_IPV4_MROUTE", Const, 1}, + {"RTNLGRP_IPV4_ROUTE", Const, 1}, + {"RTNLGRP_IPV4_RULE", Const, 1}, + {"RTNLGRP_IPV6_IFADDR", Const, 1}, + {"RTNLGRP_IPV6_IFINFO", Const, 1}, + {"RTNLGRP_IPV6_MROUTE", Const, 1}, + {"RTNLGRP_IPV6_PREFIX", Const, 1}, + {"RTNLGRP_IPV6_ROUTE", Const, 1}, + {"RTNLGRP_IPV6_RULE", Const, 1}, + {"RTNLGRP_LINK", Const, 1}, + {"RTNLGRP_ND_USEROPT", Const, 1}, + {"RTNLGRP_NEIGH", Const, 1}, + {"RTNLGRP_NONE", Const, 1}, + {"RTNLGRP_NOTIFY", Const, 1}, + {"RTNLGRP_TC", Const, 1}, + {"RTN_ANYCAST", Const, 0}, + {"RTN_BLACKHOLE", Const, 0}, + {"RTN_BROADCAST", Const, 0}, + {"RTN_LOCAL", Const, 0}, + {"RTN_MAX", Const, 0}, + {"RTN_MULTICAST", Const, 0}, + {"RTN_NAT", Const, 0}, + {"RTN_PROHIBIT", Const, 0}, + {"RTN_THROW", Const, 0}, + {"RTN_UNICAST", Const, 0}, + {"RTN_UNREACHABLE", Const, 0}, + {"RTN_UNSPEC", Const, 0}, + {"RTN_XRESOLVE", Const, 0}, + {"RTPROT_BIRD", Const, 0}, + {"RTPROT_BOOT", Const, 0}, + {"RTPROT_DHCP", Const, 0}, + {"RTPROT_DNROUTED", Const, 0}, + {"RTPROT_GATED", Const, 0}, + {"RTPROT_KERNEL", Const, 0}, + {"RTPROT_MRT", Const, 0}, + {"RTPROT_NTK", Const, 0}, + {"RTPROT_RA", Const, 0}, + {"RTPROT_REDIRECT", Const, 0}, + {"RTPROT_STATIC", Const, 0}, + {"RTPROT_UNSPEC", Const, 0}, + {"RTPROT_XORP", Const, 0}, + {"RTPROT_ZEBRA", Const, 0}, + {"RTV_EXPIRE", Const, 0}, + {"RTV_HOPCOUNT", Const, 0}, + {"RTV_MTU", Const, 0}, + {"RTV_RPIPE", Const, 0}, + {"RTV_RTT", Const, 0}, + {"RTV_RTTVAR", Const, 0}, + {"RTV_SPIPE", Const, 0}, + {"RTV_SSTHRESH", Const, 0}, + {"RTV_WEIGHT", Const, 0}, + {"RT_CACHING_CONTEXT", Const, 1}, + {"RT_CLASS_DEFAULT", Const, 0}, + {"RT_CLASS_LOCAL", Const, 0}, + {"RT_CLASS_MAIN", Const, 0}, + {"RT_CLASS_MAX", Const, 0}, + {"RT_CLASS_UNSPEC", Const, 0}, + {"RT_DEFAULT_FIB", Const, 1}, + {"RT_NORTREF", Const, 1}, + {"RT_SCOPE_HOST", Const, 0}, + {"RT_SCOPE_LINK", Const, 0}, + {"RT_SCOPE_NOWHERE", Const, 0}, + {"RT_SCOPE_SITE", Const, 0}, + {"RT_SCOPE_UNIVERSE", Const, 0}, + {"RT_TABLEID_MAX", Const, 1}, + {"RT_TABLE_COMPAT", Const, 0}, + {"RT_TABLE_DEFAULT", Const, 0}, + {"RT_TABLE_LOCAL", Const, 0}, + {"RT_TABLE_MAIN", Const, 0}, + {"RT_TABLE_MAX", Const, 0}, + {"RT_TABLE_UNSPEC", Const, 0}, + {"RUSAGE_CHILDREN", Const, 0}, + {"RUSAGE_SELF", Const, 0}, + {"RUSAGE_THREAD", Const, 0}, + {"Radvisory_t", Type, 0}, + {"Radvisory_t.Count", Field, 0}, + {"Radvisory_t.Offset", Field, 0}, + {"Radvisory_t.Pad_cgo_0", Field, 0}, + {"RawConn", Type, 9}, + {"RawSockaddr", Type, 0}, + {"RawSockaddr.Data", Field, 0}, + {"RawSockaddr.Family", Field, 0}, + {"RawSockaddr.Len", Field, 0}, + {"RawSockaddrAny", Type, 0}, + {"RawSockaddrAny.Addr", Field, 0}, + {"RawSockaddrAny.Pad", Field, 0}, + {"RawSockaddrDatalink", Type, 0}, + {"RawSockaddrDatalink.Alen", Field, 0}, + {"RawSockaddrDatalink.Data", Field, 0}, + {"RawSockaddrDatalink.Family", Field, 0}, + {"RawSockaddrDatalink.Index", Field, 0}, + {"RawSockaddrDatalink.Len", Field, 0}, + {"RawSockaddrDatalink.Nlen", Field, 0}, + {"RawSockaddrDatalink.Pad_cgo_0", Field, 2}, + {"RawSockaddrDatalink.Slen", Field, 0}, + {"RawSockaddrDatalink.Type", Field, 0}, + {"RawSockaddrInet4", Type, 0}, + {"RawSockaddrInet4.Addr", Field, 0}, + {"RawSockaddrInet4.Family", Field, 0}, + {"RawSockaddrInet4.Len", Field, 0}, + {"RawSockaddrInet4.Port", Field, 0}, + {"RawSockaddrInet4.Zero", Field, 0}, + {"RawSockaddrInet6", Type, 0}, + {"RawSockaddrInet6.Addr", Field, 0}, + {"RawSockaddrInet6.Family", Field, 0}, + {"RawSockaddrInet6.Flowinfo", Field, 0}, + {"RawSockaddrInet6.Len", Field, 0}, + {"RawSockaddrInet6.Port", Field, 0}, + {"RawSockaddrInet6.Scope_id", Field, 0}, + {"RawSockaddrLinklayer", Type, 0}, + {"RawSockaddrLinklayer.Addr", Field, 0}, + {"RawSockaddrLinklayer.Family", Field, 0}, + {"RawSockaddrLinklayer.Halen", Field, 0}, + {"RawSockaddrLinklayer.Hatype", Field, 0}, + {"RawSockaddrLinklayer.Ifindex", Field, 0}, + {"RawSockaddrLinklayer.Pkttype", Field, 0}, + {"RawSockaddrLinklayer.Protocol", Field, 0}, + {"RawSockaddrNetlink", Type, 0}, + {"RawSockaddrNetlink.Family", Field, 0}, + {"RawSockaddrNetlink.Groups", Field, 0}, + {"RawSockaddrNetlink.Pad", Field, 0}, + {"RawSockaddrNetlink.Pid", Field, 0}, + {"RawSockaddrUnix", Type, 0}, + {"RawSockaddrUnix.Family", Field, 0}, + {"RawSockaddrUnix.Len", Field, 0}, + {"RawSockaddrUnix.Pad_cgo_0", Field, 2}, + {"RawSockaddrUnix.Path", Field, 0}, + {"RawSyscall", Func, 0}, + {"RawSyscall6", Func, 0}, + {"Read", Func, 0}, + {"ReadConsole", Func, 1}, + {"ReadDirectoryChanges", Func, 0}, + {"ReadDirent", Func, 0}, + {"ReadFile", Func, 0}, + {"Readlink", Func, 0}, + {"Reboot", Func, 0}, + {"Recvfrom", Func, 0}, + {"Recvmsg", Func, 0}, + {"RegCloseKey", Func, 0}, + {"RegEnumKeyEx", Func, 0}, + {"RegOpenKeyEx", Func, 0}, + {"RegQueryInfoKey", Func, 0}, + {"RegQueryValueEx", Func, 0}, + {"RemoveDirectory", Func, 0}, + {"Removexattr", Func, 1}, + {"Rename", Func, 0}, + {"Renameat", Func, 0}, + {"Revoke", Func, 0}, + {"Rlimit", Type, 0}, + {"Rlimit.Cur", Field, 0}, + {"Rlimit.Max", Field, 0}, + {"Rmdir", Func, 0}, + {"RouteMessage", Type, 0}, + {"RouteMessage.Data", Field, 0}, + {"RouteMessage.Header", Field, 0}, + {"RouteRIB", Func, 0}, + {"RoutingMessage", Type, 0}, + {"RtAttr", Type, 0}, + {"RtAttr.Len", Field, 0}, + {"RtAttr.Type", Field, 0}, + {"RtGenmsg", Type, 0}, + {"RtGenmsg.Family", Field, 0}, + {"RtMetrics", Type, 0}, + {"RtMetrics.Expire", Field, 0}, + {"RtMetrics.Filler", Field, 0}, + {"RtMetrics.Hopcount", Field, 0}, + {"RtMetrics.Locks", Field, 0}, + {"RtMetrics.Mtu", Field, 0}, + {"RtMetrics.Pad", Field, 3}, + {"RtMetrics.Pksent", Field, 0}, + {"RtMetrics.Recvpipe", Field, 0}, + {"RtMetrics.Refcnt", Field, 2}, + {"RtMetrics.Rtt", Field, 0}, + {"RtMetrics.Rttvar", Field, 0}, + {"RtMetrics.Sendpipe", Field, 0}, + {"RtMetrics.Ssthresh", Field, 0}, + {"RtMetrics.Weight", Field, 0}, + {"RtMsg", Type, 0}, + {"RtMsg.Dst_len", Field, 0}, + {"RtMsg.Family", Field, 0}, + {"RtMsg.Flags", Field, 0}, + {"RtMsg.Protocol", Field, 0}, + {"RtMsg.Scope", Field, 0}, + {"RtMsg.Src_len", Field, 0}, + {"RtMsg.Table", Field, 0}, + {"RtMsg.Tos", Field, 0}, + {"RtMsg.Type", Field, 0}, + {"RtMsghdr", Type, 0}, + {"RtMsghdr.Addrs", Field, 0}, + {"RtMsghdr.Errno", Field, 0}, + {"RtMsghdr.Flags", Field, 0}, + {"RtMsghdr.Fmask", Field, 0}, + {"RtMsghdr.Hdrlen", Field, 2}, + {"RtMsghdr.Index", Field, 0}, + {"RtMsghdr.Inits", Field, 0}, + {"RtMsghdr.Mpls", Field, 2}, + {"RtMsghdr.Msglen", Field, 0}, + {"RtMsghdr.Pad_cgo_0", Field, 0}, + {"RtMsghdr.Pad_cgo_1", Field, 2}, + {"RtMsghdr.Pid", Field, 0}, + {"RtMsghdr.Priority", Field, 2}, + {"RtMsghdr.Rmx", Field, 0}, + {"RtMsghdr.Seq", Field, 0}, + {"RtMsghdr.Tableid", Field, 2}, + {"RtMsghdr.Type", Field, 0}, + {"RtMsghdr.Use", Field, 0}, + {"RtMsghdr.Version", Field, 0}, + {"RtNexthop", Type, 0}, + {"RtNexthop.Flags", Field, 0}, + {"RtNexthop.Hops", Field, 0}, + {"RtNexthop.Ifindex", Field, 0}, + {"RtNexthop.Len", Field, 0}, + {"Rusage", Type, 0}, + {"Rusage.CreationTime", Field, 0}, + {"Rusage.ExitTime", Field, 0}, + {"Rusage.Idrss", Field, 0}, + {"Rusage.Inblock", Field, 0}, + {"Rusage.Isrss", Field, 0}, + {"Rusage.Ixrss", Field, 0}, + {"Rusage.KernelTime", Field, 0}, + {"Rusage.Majflt", Field, 0}, + {"Rusage.Maxrss", Field, 0}, + {"Rusage.Minflt", Field, 0}, + {"Rusage.Msgrcv", Field, 0}, + {"Rusage.Msgsnd", Field, 0}, + {"Rusage.Nivcsw", Field, 0}, + {"Rusage.Nsignals", Field, 0}, + {"Rusage.Nswap", Field, 0}, + {"Rusage.Nvcsw", Field, 0}, + {"Rusage.Oublock", Field, 0}, + {"Rusage.Stime", Field, 0}, + {"Rusage.UserTime", Field, 0}, + {"Rusage.Utime", Field, 0}, + {"SCM_BINTIME", Const, 0}, + {"SCM_CREDENTIALS", Const, 0}, + {"SCM_CREDS", Const, 0}, + {"SCM_RIGHTS", Const, 0}, + {"SCM_TIMESTAMP", Const, 0}, + {"SCM_TIMESTAMPING", Const, 0}, + {"SCM_TIMESTAMPNS", Const, 0}, + {"SCM_TIMESTAMP_MONOTONIC", Const, 0}, + {"SHUT_RD", Const, 0}, + {"SHUT_RDWR", Const, 0}, + {"SHUT_WR", Const, 0}, + {"SID", Type, 0}, + {"SIDAndAttributes", Type, 0}, + {"SIDAndAttributes.Attributes", Field, 0}, + {"SIDAndAttributes.Sid", Field, 0}, + {"SIGABRT", Const, 0}, + {"SIGALRM", Const, 0}, + {"SIGBUS", Const, 0}, + {"SIGCHLD", Const, 0}, + {"SIGCLD", Const, 0}, + {"SIGCONT", Const, 0}, + {"SIGEMT", Const, 0}, + {"SIGFPE", Const, 0}, + {"SIGHUP", Const, 0}, + {"SIGILL", Const, 0}, + {"SIGINFO", Const, 0}, + {"SIGINT", Const, 0}, + {"SIGIO", Const, 0}, + {"SIGIOT", Const, 0}, + {"SIGKILL", Const, 0}, + {"SIGLIBRT", Const, 1}, + {"SIGLWP", Const, 0}, + {"SIGPIPE", Const, 0}, + {"SIGPOLL", Const, 0}, + {"SIGPROF", Const, 0}, + {"SIGPWR", Const, 0}, + {"SIGQUIT", Const, 0}, + {"SIGSEGV", Const, 0}, + {"SIGSTKFLT", Const, 0}, + {"SIGSTOP", Const, 0}, + {"SIGSYS", Const, 0}, + {"SIGTERM", Const, 0}, + {"SIGTHR", Const, 0}, + {"SIGTRAP", Const, 0}, + {"SIGTSTP", Const, 0}, + {"SIGTTIN", Const, 0}, + {"SIGTTOU", Const, 0}, + {"SIGUNUSED", Const, 0}, + {"SIGURG", Const, 0}, + {"SIGUSR1", Const, 0}, + {"SIGUSR2", Const, 0}, + {"SIGVTALRM", Const, 0}, + {"SIGWINCH", Const, 0}, + {"SIGXCPU", Const, 0}, + {"SIGXFSZ", Const, 0}, + {"SIOCADDDLCI", Const, 0}, + {"SIOCADDMULTI", Const, 0}, + {"SIOCADDRT", Const, 0}, + {"SIOCAIFADDR", Const, 0}, + {"SIOCAIFGROUP", Const, 0}, + {"SIOCALIFADDR", Const, 0}, + {"SIOCARPIPLL", Const, 0}, + {"SIOCATMARK", Const, 0}, + {"SIOCAUTOADDR", Const, 0}, + {"SIOCAUTONETMASK", Const, 0}, + {"SIOCBRDGADD", Const, 1}, + {"SIOCBRDGADDS", Const, 1}, + {"SIOCBRDGARL", Const, 1}, + {"SIOCBRDGDADDR", Const, 1}, + {"SIOCBRDGDEL", Const, 1}, + {"SIOCBRDGDELS", Const, 1}, + {"SIOCBRDGFLUSH", Const, 1}, + {"SIOCBRDGFRL", Const, 1}, + {"SIOCBRDGGCACHE", Const, 1}, + {"SIOCBRDGGFD", Const, 1}, + {"SIOCBRDGGHT", Const, 1}, + {"SIOCBRDGGIFFLGS", Const, 1}, + {"SIOCBRDGGMA", Const, 1}, + {"SIOCBRDGGPARAM", Const, 1}, + {"SIOCBRDGGPRI", Const, 1}, + {"SIOCBRDGGRL", Const, 1}, + {"SIOCBRDGGSIFS", Const, 1}, + {"SIOCBRDGGTO", Const, 1}, + {"SIOCBRDGIFS", Const, 1}, + {"SIOCBRDGRTS", Const, 1}, + {"SIOCBRDGSADDR", Const, 1}, + {"SIOCBRDGSCACHE", Const, 1}, + {"SIOCBRDGSFD", Const, 1}, + {"SIOCBRDGSHT", Const, 1}, + {"SIOCBRDGSIFCOST", Const, 1}, + {"SIOCBRDGSIFFLGS", Const, 1}, + {"SIOCBRDGSIFPRIO", Const, 1}, + {"SIOCBRDGSMA", Const, 1}, + {"SIOCBRDGSPRI", Const, 1}, + {"SIOCBRDGSPROTO", Const, 1}, + {"SIOCBRDGSTO", Const, 1}, + {"SIOCBRDGSTXHC", Const, 1}, + {"SIOCDARP", Const, 0}, + {"SIOCDELDLCI", Const, 0}, + {"SIOCDELMULTI", Const, 0}, + {"SIOCDELRT", Const, 0}, + {"SIOCDEVPRIVATE", Const, 0}, + {"SIOCDIFADDR", Const, 0}, + {"SIOCDIFGROUP", Const, 0}, + {"SIOCDIFPHYADDR", Const, 0}, + {"SIOCDLIFADDR", Const, 0}, + {"SIOCDRARP", Const, 0}, + {"SIOCGARP", Const, 0}, + {"SIOCGDRVSPEC", Const, 0}, + {"SIOCGETKALIVE", Const, 1}, + {"SIOCGETLABEL", Const, 1}, + {"SIOCGETPFLOW", Const, 1}, + {"SIOCGETPFSYNC", Const, 1}, + {"SIOCGETSGCNT", Const, 0}, + {"SIOCGETVIFCNT", Const, 0}, + {"SIOCGETVLAN", Const, 0}, + {"SIOCGHIWAT", Const, 0}, + {"SIOCGIFADDR", Const, 0}, + {"SIOCGIFADDRPREF", Const, 1}, + {"SIOCGIFALIAS", Const, 1}, + {"SIOCGIFALTMTU", Const, 0}, + {"SIOCGIFASYNCMAP", Const, 0}, + {"SIOCGIFBOND", Const, 0}, + {"SIOCGIFBR", Const, 0}, + {"SIOCGIFBRDADDR", Const, 0}, + {"SIOCGIFCAP", Const, 0}, + {"SIOCGIFCONF", Const, 0}, + {"SIOCGIFCOUNT", Const, 0}, + {"SIOCGIFDATA", Const, 1}, + {"SIOCGIFDESCR", Const, 0}, + {"SIOCGIFDEVMTU", Const, 0}, + {"SIOCGIFDLT", Const, 1}, + {"SIOCGIFDSTADDR", Const, 0}, + {"SIOCGIFENCAP", Const, 0}, + {"SIOCGIFFIB", Const, 1}, + {"SIOCGIFFLAGS", Const, 0}, + {"SIOCGIFGATTR", Const, 1}, + {"SIOCGIFGENERIC", Const, 0}, + {"SIOCGIFGMEMB", Const, 0}, + {"SIOCGIFGROUP", Const, 0}, + {"SIOCGIFHARDMTU", Const, 3}, + {"SIOCGIFHWADDR", Const, 0}, + {"SIOCGIFINDEX", Const, 0}, + {"SIOCGIFKPI", Const, 0}, + {"SIOCGIFMAC", Const, 0}, + {"SIOCGIFMAP", Const, 0}, + {"SIOCGIFMEDIA", Const, 0}, + {"SIOCGIFMEM", Const, 0}, + {"SIOCGIFMETRIC", Const, 0}, + {"SIOCGIFMTU", Const, 0}, + {"SIOCGIFNAME", Const, 0}, + {"SIOCGIFNETMASK", Const, 0}, + {"SIOCGIFPDSTADDR", Const, 0}, + {"SIOCGIFPFLAGS", Const, 0}, + {"SIOCGIFPHYS", Const, 0}, + {"SIOCGIFPRIORITY", Const, 1}, + {"SIOCGIFPSRCADDR", Const, 0}, + {"SIOCGIFRDOMAIN", Const, 1}, + {"SIOCGIFRTLABEL", Const, 1}, + {"SIOCGIFSLAVE", Const, 0}, + {"SIOCGIFSTATUS", Const, 0}, + {"SIOCGIFTIMESLOT", Const, 1}, + {"SIOCGIFTXQLEN", Const, 0}, + {"SIOCGIFVLAN", Const, 0}, + {"SIOCGIFWAKEFLAGS", Const, 0}, + {"SIOCGIFXFLAGS", Const, 1}, + {"SIOCGLIFADDR", Const, 0}, + {"SIOCGLIFPHYADDR", Const, 0}, + {"SIOCGLIFPHYRTABLE", Const, 1}, + {"SIOCGLIFPHYTTL", Const, 3}, + {"SIOCGLINKSTR", Const, 1}, + {"SIOCGLOWAT", Const, 0}, + {"SIOCGPGRP", Const, 0}, + {"SIOCGPRIVATE_0", Const, 0}, + {"SIOCGPRIVATE_1", Const, 0}, + {"SIOCGRARP", Const, 0}, + {"SIOCGSPPPPARAMS", Const, 3}, + {"SIOCGSTAMP", Const, 0}, + {"SIOCGSTAMPNS", Const, 0}, + {"SIOCGVH", Const, 1}, + {"SIOCGVNETID", Const, 3}, + {"SIOCIFCREATE", Const, 0}, + {"SIOCIFCREATE2", Const, 0}, + {"SIOCIFDESTROY", Const, 0}, + {"SIOCIFGCLONERS", Const, 0}, + {"SIOCINITIFADDR", Const, 1}, + {"SIOCPROTOPRIVATE", Const, 0}, + {"SIOCRSLVMULTI", Const, 0}, + {"SIOCRTMSG", Const, 0}, + {"SIOCSARP", Const, 0}, + {"SIOCSDRVSPEC", Const, 0}, + {"SIOCSETKALIVE", Const, 1}, + {"SIOCSETLABEL", Const, 1}, + {"SIOCSETPFLOW", Const, 1}, + {"SIOCSETPFSYNC", Const, 1}, + {"SIOCSETVLAN", Const, 0}, + {"SIOCSHIWAT", Const, 0}, + {"SIOCSIFADDR", Const, 0}, + {"SIOCSIFADDRPREF", Const, 1}, + {"SIOCSIFALTMTU", Const, 0}, + {"SIOCSIFASYNCMAP", Const, 0}, + {"SIOCSIFBOND", Const, 0}, + {"SIOCSIFBR", Const, 0}, + {"SIOCSIFBRDADDR", Const, 0}, + {"SIOCSIFCAP", Const, 0}, + {"SIOCSIFDESCR", Const, 0}, + {"SIOCSIFDSTADDR", Const, 0}, + {"SIOCSIFENCAP", Const, 0}, + {"SIOCSIFFIB", Const, 1}, + {"SIOCSIFFLAGS", Const, 0}, + {"SIOCSIFGATTR", Const, 1}, + {"SIOCSIFGENERIC", Const, 0}, + {"SIOCSIFHWADDR", Const, 0}, + {"SIOCSIFHWBROADCAST", Const, 0}, + {"SIOCSIFKPI", Const, 0}, + {"SIOCSIFLINK", Const, 0}, + {"SIOCSIFLLADDR", Const, 0}, + {"SIOCSIFMAC", Const, 0}, + {"SIOCSIFMAP", Const, 0}, + {"SIOCSIFMEDIA", Const, 0}, + {"SIOCSIFMEM", Const, 0}, + {"SIOCSIFMETRIC", Const, 0}, + {"SIOCSIFMTU", Const, 0}, + {"SIOCSIFNAME", Const, 0}, + {"SIOCSIFNETMASK", Const, 0}, + {"SIOCSIFPFLAGS", Const, 0}, + {"SIOCSIFPHYADDR", Const, 0}, + {"SIOCSIFPHYS", Const, 0}, + {"SIOCSIFPRIORITY", Const, 1}, + {"SIOCSIFRDOMAIN", Const, 1}, + {"SIOCSIFRTLABEL", Const, 1}, + {"SIOCSIFRVNET", Const, 0}, + {"SIOCSIFSLAVE", Const, 0}, + {"SIOCSIFTIMESLOT", Const, 1}, + {"SIOCSIFTXQLEN", Const, 0}, + {"SIOCSIFVLAN", Const, 0}, + {"SIOCSIFVNET", Const, 0}, + {"SIOCSIFXFLAGS", Const, 1}, + {"SIOCSLIFPHYADDR", Const, 0}, + {"SIOCSLIFPHYRTABLE", Const, 1}, + {"SIOCSLIFPHYTTL", Const, 3}, + {"SIOCSLINKSTR", Const, 1}, + {"SIOCSLOWAT", Const, 0}, + {"SIOCSPGRP", Const, 0}, + {"SIOCSRARP", Const, 0}, + {"SIOCSSPPPPARAMS", Const, 3}, + {"SIOCSVH", Const, 1}, + {"SIOCSVNETID", Const, 3}, + {"SIOCZIFDATA", Const, 1}, + {"SIO_GET_EXTENSION_FUNCTION_POINTER", Const, 1}, + {"SIO_GET_INTERFACE_LIST", Const, 0}, + {"SIO_KEEPALIVE_VALS", Const, 3}, + {"SIO_UDP_CONNRESET", Const, 4}, + {"SOCK_CLOEXEC", Const, 0}, + {"SOCK_DCCP", Const, 0}, + {"SOCK_DGRAM", Const, 0}, + {"SOCK_FLAGS_MASK", Const, 1}, + {"SOCK_MAXADDRLEN", Const, 0}, + {"SOCK_NONBLOCK", Const, 0}, + {"SOCK_NOSIGPIPE", Const, 1}, + {"SOCK_PACKET", Const, 0}, + {"SOCK_RAW", Const, 0}, + {"SOCK_RDM", Const, 0}, + {"SOCK_SEQPACKET", Const, 0}, + {"SOCK_STREAM", Const, 0}, + {"SOL_AAL", Const, 0}, + {"SOL_ATM", Const, 0}, + {"SOL_DECNET", Const, 0}, + {"SOL_ICMPV6", Const, 0}, + {"SOL_IP", Const, 0}, + {"SOL_IPV6", Const, 0}, + {"SOL_IRDA", Const, 0}, + {"SOL_PACKET", Const, 0}, + {"SOL_RAW", Const, 0}, + {"SOL_SOCKET", Const, 0}, + {"SOL_TCP", Const, 0}, + {"SOL_X25", Const, 0}, + {"SOMAXCONN", Const, 0}, + {"SO_ACCEPTCONN", Const, 0}, + {"SO_ACCEPTFILTER", Const, 0}, + {"SO_ATTACH_FILTER", Const, 0}, + {"SO_BINDANY", Const, 1}, + {"SO_BINDTODEVICE", Const, 0}, + {"SO_BINTIME", Const, 0}, + {"SO_BROADCAST", Const, 0}, + {"SO_BSDCOMPAT", Const, 0}, + {"SO_DEBUG", Const, 0}, + {"SO_DETACH_FILTER", Const, 0}, + {"SO_DOMAIN", Const, 0}, + {"SO_DONTROUTE", Const, 0}, + {"SO_DONTTRUNC", Const, 0}, + {"SO_ERROR", Const, 0}, + {"SO_KEEPALIVE", Const, 0}, + {"SO_LABEL", Const, 0}, + {"SO_LINGER", Const, 0}, + {"SO_LINGER_SEC", Const, 0}, + {"SO_LISTENINCQLEN", Const, 0}, + {"SO_LISTENQLEN", Const, 0}, + {"SO_LISTENQLIMIT", Const, 0}, + {"SO_MARK", Const, 0}, + {"SO_NETPROC", Const, 1}, + {"SO_NKE", Const, 0}, + {"SO_NOADDRERR", Const, 0}, + {"SO_NOHEADER", Const, 1}, + {"SO_NOSIGPIPE", Const, 0}, + {"SO_NOTIFYCONFLICT", Const, 0}, + {"SO_NO_CHECK", Const, 0}, + {"SO_NO_DDP", Const, 0}, + {"SO_NO_OFFLOAD", Const, 0}, + {"SO_NP_EXTENSIONS", Const, 0}, + {"SO_NREAD", Const, 0}, + {"SO_NUMRCVPKT", Const, 16}, + {"SO_NWRITE", Const, 0}, + {"SO_OOBINLINE", Const, 0}, + {"SO_OVERFLOWED", Const, 1}, + {"SO_PASSCRED", Const, 0}, + {"SO_PASSSEC", Const, 0}, + {"SO_PEERCRED", Const, 0}, + {"SO_PEERLABEL", Const, 0}, + {"SO_PEERNAME", Const, 0}, + {"SO_PEERSEC", Const, 0}, + {"SO_PRIORITY", Const, 0}, + {"SO_PROTOCOL", Const, 0}, + {"SO_PROTOTYPE", Const, 1}, + {"SO_RANDOMPORT", Const, 0}, + {"SO_RCVBUF", Const, 0}, + {"SO_RCVBUFFORCE", Const, 0}, + {"SO_RCVLOWAT", Const, 0}, + {"SO_RCVTIMEO", Const, 0}, + {"SO_RESTRICTIONS", Const, 0}, + {"SO_RESTRICT_DENYIN", Const, 0}, + {"SO_RESTRICT_DENYOUT", Const, 0}, + {"SO_RESTRICT_DENYSET", Const, 0}, + {"SO_REUSEADDR", Const, 0}, + {"SO_REUSEPORT", Const, 0}, + {"SO_REUSESHAREUID", Const, 0}, + {"SO_RTABLE", Const, 1}, + {"SO_RXQ_OVFL", Const, 0}, + {"SO_SECURITY_AUTHENTICATION", Const, 0}, + {"SO_SECURITY_ENCRYPTION_NETWORK", Const, 0}, + {"SO_SECURITY_ENCRYPTION_TRANSPORT", Const, 0}, + {"SO_SETFIB", Const, 0}, + {"SO_SNDBUF", Const, 0}, + {"SO_SNDBUFFORCE", Const, 0}, + {"SO_SNDLOWAT", Const, 0}, + {"SO_SNDTIMEO", Const, 0}, + {"SO_SPLICE", Const, 1}, + {"SO_TIMESTAMP", Const, 0}, + {"SO_TIMESTAMPING", Const, 0}, + {"SO_TIMESTAMPNS", Const, 0}, + {"SO_TIMESTAMP_MONOTONIC", Const, 0}, + {"SO_TYPE", Const, 0}, + {"SO_UPCALLCLOSEWAIT", Const, 0}, + {"SO_UPDATE_ACCEPT_CONTEXT", Const, 0}, + {"SO_UPDATE_CONNECT_CONTEXT", Const, 1}, + {"SO_USELOOPBACK", Const, 0}, + {"SO_USER_COOKIE", Const, 1}, + {"SO_VENDOR", Const, 3}, + {"SO_WANTMORE", Const, 0}, + {"SO_WANTOOBFLAG", Const, 0}, + {"SSLExtraCertChainPolicyPara", Type, 0}, + {"SSLExtraCertChainPolicyPara.AuthType", Field, 0}, + {"SSLExtraCertChainPolicyPara.Checks", Field, 0}, + {"SSLExtraCertChainPolicyPara.ServerName", Field, 0}, + {"SSLExtraCertChainPolicyPara.Size", Field, 0}, + {"STANDARD_RIGHTS_ALL", Const, 0}, + {"STANDARD_RIGHTS_EXECUTE", Const, 0}, + {"STANDARD_RIGHTS_READ", Const, 0}, + {"STANDARD_RIGHTS_REQUIRED", Const, 0}, + {"STANDARD_RIGHTS_WRITE", Const, 0}, + {"STARTF_USESHOWWINDOW", Const, 0}, + {"STARTF_USESTDHANDLES", Const, 0}, + {"STD_ERROR_HANDLE", Const, 0}, + {"STD_INPUT_HANDLE", Const, 0}, + {"STD_OUTPUT_HANDLE", Const, 0}, + {"SUBLANG_ENGLISH_US", Const, 0}, + {"SW_FORCEMINIMIZE", Const, 0}, + {"SW_HIDE", Const, 0}, + {"SW_MAXIMIZE", Const, 0}, + {"SW_MINIMIZE", Const, 0}, + {"SW_NORMAL", Const, 0}, + {"SW_RESTORE", Const, 0}, + {"SW_SHOW", Const, 0}, + {"SW_SHOWDEFAULT", Const, 0}, + {"SW_SHOWMAXIMIZED", Const, 0}, + {"SW_SHOWMINIMIZED", Const, 0}, + {"SW_SHOWMINNOACTIVE", Const, 0}, + {"SW_SHOWNA", Const, 0}, + {"SW_SHOWNOACTIVATE", Const, 0}, + {"SW_SHOWNORMAL", Const, 0}, + {"SYMBOLIC_LINK_FLAG_DIRECTORY", Const, 4}, + {"SYNCHRONIZE", Const, 0}, + {"SYSCTL_VERSION", Const, 1}, + {"SYSCTL_VERS_0", Const, 1}, + {"SYSCTL_VERS_1", Const, 1}, + {"SYSCTL_VERS_MASK", Const, 1}, + {"SYS_ABORT2", Const, 0}, + {"SYS_ACCEPT", Const, 0}, + {"SYS_ACCEPT4", Const, 0}, + {"SYS_ACCEPT_NOCANCEL", Const, 0}, + {"SYS_ACCESS", Const, 0}, + {"SYS_ACCESS_EXTENDED", Const, 0}, + {"SYS_ACCT", Const, 0}, + {"SYS_ADD_KEY", Const, 0}, + {"SYS_ADD_PROFIL", Const, 0}, + {"SYS_ADJFREQ", Const, 1}, + {"SYS_ADJTIME", Const, 0}, + {"SYS_ADJTIMEX", Const, 0}, + {"SYS_AFS_SYSCALL", Const, 0}, + {"SYS_AIO_CANCEL", Const, 0}, + {"SYS_AIO_ERROR", Const, 0}, + {"SYS_AIO_FSYNC", Const, 0}, + {"SYS_AIO_MLOCK", Const, 14}, + {"SYS_AIO_READ", Const, 0}, + {"SYS_AIO_RETURN", Const, 0}, + {"SYS_AIO_SUSPEND", Const, 0}, + {"SYS_AIO_SUSPEND_NOCANCEL", Const, 0}, + {"SYS_AIO_WAITCOMPLETE", Const, 14}, + {"SYS_AIO_WRITE", Const, 0}, + {"SYS_ALARM", Const, 0}, + {"SYS_ARCH_PRCTL", Const, 0}, + {"SYS_ARM_FADVISE64_64", Const, 0}, + {"SYS_ARM_SYNC_FILE_RANGE", Const, 0}, + {"SYS_ATGETMSG", Const, 0}, + {"SYS_ATPGETREQ", Const, 0}, + {"SYS_ATPGETRSP", Const, 0}, + {"SYS_ATPSNDREQ", Const, 0}, + {"SYS_ATPSNDRSP", Const, 0}, + {"SYS_ATPUTMSG", Const, 0}, + {"SYS_ATSOCKET", Const, 0}, + {"SYS_AUDIT", Const, 0}, + {"SYS_AUDITCTL", Const, 0}, + {"SYS_AUDITON", Const, 0}, + {"SYS_AUDIT_SESSION_JOIN", Const, 0}, + {"SYS_AUDIT_SESSION_PORT", Const, 0}, + {"SYS_AUDIT_SESSION_SELF", Const, 0}, + {"SYS_BDFLUSH", Const, 0}, + {"SYS_BIND", Const, 0}, + {"SYS_BINDAT", Const, 3}, + {"SYS_BREAK", Const, 0}, + {"SYS_BRK", Const, 0}, + {"SYS_BSDTHREAD_CREATE", Const, 0}, + {"SYS_BSDTHREAD_REGISTER", Const, 0}, + {"SYS_BSDTHREAD_TERMINATE", Const, 0}, + {"SYS_CAPGET", Const, 0}, + {"SYS_CAPSET", Const, 0}, + {"SYS_CAP_ENTER", Const, 0}, + {"SYS_CAP_FCNTLS_GET", Const, 1}, + {"SYS_CAP_FCNTLS_LIMIT", Const, 1}, + {"SYS_CAP_GETMODE", Const, 0}, + {"SYS_CAP_GETRIGHTS", Const, 0}, + {"SYS_CAP_IOCTLS_GET", Const, 1}, + {"SYS_CAP_IOCTLS_LIMIT", Const, 1}, + {"SYS_CAP_NEW", Const, 0}, + {"SYS_CAP_RIGHTS_GET", Const, 1}, + {"SYS_CAP_RIGHTS_LIMIT", Const, 1}, + {"SYS_CHDIR", Const, 0}, + {"SYS_CHFLAGS", Const, 0}, + {"SYS_CHFLAGSAT", Const, 3}, + {"SYS_CHMOD", Const, 0}, + {"SYS_CHMOD_EXTENDED", Const, 0}, + {"SYS_CHOWN", Const, 0}, + {"SYS_CHOWN32", Const, 0}, + {"SYS_CHROOT", Const, 0}, + {"SYS_CHUD", Const, 0}, + {"SYS_CLOCK_ADJTIME", Const, 0}, + {"SYS_CLOCK_GETCPUCLOCKID2", Const, 1}, + {"SYS_CLOCK_GETRES", Const, 0}, + {"SYS_CLOCK_GETTIME", Const, 0}, + {"SYS_CLOCK_NANOSLEEP", Const, 0}, + {"SYS_CLOCK_SETTIME", Const, 0}, + {"SYS_CLONE", Const, 0}, + {"SYS_CLOSE", Const, 0}, + {"SYS_CLOSEFROM", Const, 0}, + {"SYS_CLOSE_NOCANCEL", Const, 0}, + {"SYS_CONNECT", Const, 0}, + {"SYS_CONNECTAT", Const, 3}, + {"SYS_CONNECT_NOCANCEL", Const, 0}, + {"SYS_COPYFILE", Const, 0}, + {"SYS_CPUSET", Const, 0}, + {"SYS_CPUSET_GETAFFINITY", Const, 0}, + {"SYS_CPUSET_GETID", Const, 0}, + {"SYS_CPUSET_SETAFFINITY", Const, 0}, + {"SYS_CPUSET_SETID", Const, 0}, + {"SYS_CREAT", Const, 0}, + {"SYS_CREATE_MODULE", Const, 0}, + {"SYS_CSOPS", Const, 0}, + {"SYS_CSOPS_AUDITTOKEN", Const, 16}, + {"SYS_DELETE", Const, 0}, + {"SYS_DELETE_MODULE", Const, 0}, + {"SYS_DUP", Const, 0}, + {"SYS_DUP2", Const, 0}, + {"SYS_DUP3", Const, 0}, + {"SYS_EACCESS", Const, 0}, + {"SYS_EPOLL_CREATE", Const, 0}, + {"SYS_EPOLL_CREATE1", Const, 0}, + {"SYS_EPOLL_CTL", Const, 0}, + {"SYS_EPOLL_CTL_OLD", Const, 0}, + {"SYS_EPOLL_PWAIT", Const, 0}, + {"SYS_EPOLL_WAIT", Const, 0}, + {"SYS_EPOLL_WAIT_OLD", Const, 0}, + {"SYS_EVENTFD", Const, 0}, + {"SYS_EVENTFD2", Const, 0}, + {"SYS_EXCHANGEDATA", Const, 0}, + {"SYS_EXECVE", Const, 0}, + {"SYS_EXIT", Const, 0}, + {"SYS_EXIT_GROUP", Const, 0}, + {"SYS_EXTATTRCTL", Const, 0}, + {"SYS_EXTATTR_DELETE_FD", Const, 0}, + {"SYS_EXTATTR_DELETE_FILE", Const, 0}, + {"SYS_EXTATTR_DELETE_LINK", Const, 0}, + {"SYS_EXTATTR_GET_FD", Const, 0}, + {"SYS_EXTATTR_GET_FILE", Const, 0}, + {"SYS_EXTATTR_GET_LINK", Const, 0}, + {"SYS_EXTATTR_LIST_FD", Const, 0}, + {"SYS_EXTATTR_LIST_FILE", Const, 0}, + {"SYS_EXTATTR_LIST_LINK", Const, 0}, + {"SYS_EXTATTR_SET_FD", Const, 0}, + {"SYS_EXTATTR_SET_FILE", Const, 0}, + {"SYS_EXTATTR_SET_LINK", Const, 0}, + {"SYS_FACCESSAT", Const, 0}, + {"SYS_FADVISE64", Const, 0}, + {"SYS_FADVISE64_64", Const, 0}, + {"SYS_FALLOCATE", Const, 0}, + {"SYS_FANOTIFY_INIT", Const, 0}, + {"SYS_FANOTIFY_MARK", Const, 0}, + {"SYS_FCHDIR", Const, 0}, + {"SYS_FCHFLAGS", Const, 0}, + {"SYS_FCHMOD", Const, 0}, + {"SYS_FCHMODAT", Const, 0}, + {"SYS_FCHMOD_EXTENDED", Const, 0}, + {"SYS_FCHOWN", Const, 0}, + {"SYS_FCHOWN32", Const, 0}, + {"SYS_FCHOWNAT", Const, 0}, + {"SYS_FCHROOT", Const, 1}, + {"SYS_FCNTL", Const, 0}, + {"SYS_FCNTL64", Const, 0}, + {"SYS_FCNTL_NOCANCEL", Const, 0}, + {"SYS_FDATASYNC", Const, 0}, + {"SYS_FEXECVE", Const, 0}, + {"SYS_FFCLOCK_GETCOUNTER", Const, 0}, + {"SYS_FFCLOCK_GETESTIMATE", Const, 0}, + {"SYS_FFCLOCK_SETESTIMATE", Const, 0}, + {"SYS_FFSCTL", Const, 0}, + {"SYS_FGETATTRLIST", Const, 0}, + {"SYS_FGETXATTR", Const, 0}, + {"SYS_FHOPEN", Const, 0}, + {"SYS_FHSTAT", Const, 0}, + {"SYS_FHSTATFS", Const, 0}, + {"SYS_FILEPORT_MAKEFD", Const, 0}, + {"SYS_FILEPORT_MAKEPORT", Const, 0}, + {"SYS_FKTRACE", Const, 1}, + {"SYS_FLISTXATTR", Const, 0}, + {"SYS_FLOCK", Const, 0}, + {"SYS_FORK", Const, 0}, + {"SYS_FPATHCONF", Const, 0}, + {"SYS_FREEBSD6_FTRUNCATE", Const, 0}, + {"SYS_FREEBSD6_LSEEK", Const, 0}, + {"SYS_FREEBSD6_MMAP", Const, 0}, + {"SYS_FREEBSD6_PREAD", Const, 0}, + {"SYS_FREEBSD6_PWRITE", Const, 0}, + {"SYS_FREEBSD6_TRUNCATE", Const, 0}, + {"SYS_FREMOVEXATTR", Const, 0}, + {"SYS_FSCTL", Const, 0}, + {"SYS_FSETATTRLIST", Const, 0}, + {"SYS_FSETXATTR", Const, 0}, + {"SYS_FSGETPATH", Const, 0}, + {"SYS_FSTAT", Const, 0}, + {"SYS_FSTAT64", Const, 0}, + {"SYS_FSTAT64_EXTENDED", Const, 0}, + {"SYS_FSTATAT", Const, 0}, + {"SYS_FSTATAT64", Const, 0}, + {"SYS_FSTATFS", Const, 0}, + {"SYS_FSTATFS64", Const, 0}, + {"SYS_FSTATV", Const, 0}, + {"SYS_FSTATVFS1", Const, 1}, + {"SYS_FSTAT_EXTENDED", Const, 0}, + {"SYS_FSYNC", Const, 0}, + {"SYS_FSYNC_NOCANCEL", Const, 0}, + {"SYS_FSYNC_RANGE", Const, 1}, + {"SYS_FTIME", Const, 0}, + {"SYS_FTRUNCATE", Const, 0}, + {"SYS_FTRUNCATE64", Const, 0}, + {"SYS_FUTEX", Const, 0}, + {"SYS_FUTIMENS", Const, 1}, + {"SYS_FUTIMES", Const, 0}, + {"SYS_FUTIMESAT", Const, 0}, + {"SYS_GETATTRLIST", Const, 0}, + {"SYS_GETAUDIT", Const, 0}, + {"SYS_GETAUDIT_ADDR", Const, 0}, + {"SYS_GETAUID", Const, 0}, + {"SYS_GETCONTEXT", Const, 0}, + {"SYS_GETCPU", Const, 0}, + {"SYS_GETCWD", Const, 0}, + {"SYS_GETDENTS", Const, 0}, + {"SYS_GETDENTS64", Const, 0}, + {"SYS_GETDIRENTRIES", Const, 0}, + {"SYS_GETDIRENTRIES64", Const, 0}, + {"SYS_GETDIRENTRIESATTR", Const, 0}, + {"SYS_GETDTABLECOUNT", Const, 1}, + {"SYS_GETDTABLESIZE", Const, 0}, + {"SYS_GETEGID", Const, 0}, + {"SYS_GETEGID32", Const, 0}, + {"SYS_GETEUID", Const, 0}, + {"SYS_GETEUID32", Const, 0}, + {"SYS_GETFH", Const, 0}, + {"SYS_GETFSSTAT", Const, 0}, + {"SYS_GETFSSTAT64", Const, 0}, + {"SYS_GETGID", Const, 0}, + {"SYS_GETGID32", Const, 0}, + {"SYS_GETGROUPS", Const, 0}, + {"SYS_GETGROUPS32", Const, 0}, + {"SYS_GETHOSTUUID", Const, 0}, + {"SYS_GETITIMER", Const, 0}, + {"SYS_GETLCID", Const, 0}, + {"SYS_GETLOGIN", Const, 0}, + {"SYS_GETLOGINCLASS", Const, 0}, + {"SYS_GETPEERNAME", Const, 0}, + {"SYS_GETPGID", Const, 0}, + {"SYS_GETPGRP", Const, 0}, + {"SYS_GETPID", Const, 0}, + {"SYS_GETPMSG", Const, 0}, + {"SYS_GETPPID", Const, 0}, + {"SYS_GETPRIORITY", Const, 0}, + {"SYS_GETRESGID", Const, 0}, + {"SYS_GETRESGID32", Const, 0}, + {"SYS_GETRESUID", Const, 0}, + {"SYS_GETRESUID32", Const, 0}, + {"SYS_GETRLIMIT", Const, 0}, + {"SYS_GETRTABLE", Const, 1}, + {"SYS_GETRUSAGE", Const, 0}, + {"SYS_GETSGROUPS", Const, 0}, + {"SYS_GETSID", Const, 0}, + {"SYS_GETSOCKNAME", Const, 0}, + {"SYS_GETSOCKOPT", Const, 0}, + {"SYS_GETTHRID", Const, 1}, + {"SYS_GETTID", Const, 0}, + {"SYS_GETTIMEOFDAY", Const, 0}, + {"SYS_GETUID", Const, 0}, + {"SYS_GETUID32", Const, 0}, + {"SYS_GETVFSSTAT", Const, 1}, + {"SYS_GETWGROUPS", Const, 0}, + {"SYS_GETXATTR", Const, 0}, + {"SYS_GET_KERNEL_SYMS", Const, 0}, + {"SYS_GET_MEMPOLICY", Const, 0}, + {"SYS_GET_ROBUST_LIST", Const, 0}, + {"SYS_GET_THREAD_AREA", Const, 0}, + {"SYS_GSSD_SYSCALL", Const, 14}, + {"SYS_GTTY", Const, 0}, + {"SYS_IDENTITYSVC", Const, 0}, + {"SYS_IDLE", Const, 0}, + {"SYS_INITGROUPS", Const, 0}, + {"SYS_INIT_MODULE", Const, 0}, + {"SYS_INOTIFY_ADD_WATCH", Const, 0}, + {"SYS_INOTIFY_INIT", Const, 0}, + {"SYS_INOTIFY_INIT1", Const, 0}, + {"SYS_INOTIFY_RM_WATCH", Const, 0}, + {"SYS_IOCTL", Const, 0}, + {"SYS_IOPERM", Const, 0}, + {"SYS_IOPL", Const, 0}, + {"SYS_IOPOLICYSYS", Const, 0}, + {"SYS_IOPRIO_GET", Const, 0}, + {"SYS_IOPRIO_SET", Const, 0}, + {"SYS_IO_CANCEL", Const, 0}, + {"SYS_IO_DESTROY", Const, 0}, + {"SYS_IO_GETEVENTS", Const, 0}, + {"SYS_IO_SETUP", Const, 0}, + {"SYS_IO_SUBMIT", Const, 0}, + {"SYS_IPC", Const, 0}, + {"SYS_ISSETUGID", Const, 0}, + {"SYS_JAIL", Const, 0}, + {"SYS_JAIL_ATTACH", Const, 0}, + {"SYS_JAIL_GET", Const, 0}, + {"SYS_JAIL_REMOVE", Const, 0}, + {"SYS_JAIL_SET", Const, 0}, + {"SYS_KAS_INFO", Const, 16}, + {"SYS_KDEBUG_TRACE", Const, 0}, + {"SYS_KENV", Const, 0}, + {"SYS_KEVENT", Const, 0}, + {"SYS_KEVENT64", Const, 0}, + {"SYS_KEXEC_LOAD", Const, 0}, + {"SYS_KEYCTL", Const, 0}, + {"SYS_KILL", Const, 0}, + {"SYS_KLDFIND", Const, 0}, + {"SYS_KLDFIRSTMOD", Const, 0}, + {"SYS_KLDLOAD", Const, 0}, + {"SYS_KLDNEXT", Const, 0}, + {"SYS_KLDSTAT", Const, 0}, + {"SYS_KLDSYM", Const, 0}, + {"SYS_KLDUNLOAD", Const, 0}, + {"SYS_KLDUNLOADF", Const, 0}, + {"SYS_KMQ_NOTIFY", Const, 14}, + {"SYS_KMQ_OPEN", Const, 14}, + {"SYS_KMQ_SETATTR", Const, 14}, + {"SYS_KMQ_TIMEDRECEIVE", Const, 14}, + {"SYS_KMQ_TIMEDSEND", Const, 14}, + {"SYS_KMQ_UNLINK", Const, 14}, + {"SYS_KQUEUE", Const, 0}, + {"SYS_KQUEUE1", Const, 1}, + {"SYS_KSEM_CLOSE", Const, 14}, + {"SYS_KSEM_DESTROY", Const, 14}, + {"SYS_KSEM_GETVALUE", Const, 14}, + {"SYS_KSEM_INIT", Const, 14}, + {"SYS_KSEM_OPEN", Const, 14}, + {"SYS_KSEM_POST", Const, 14}, + {"SYS_KSEM_TIMEDWAIT", Const, 14}, + {"SYS_KSEM_TRYWAIT", Const, 14}, + {"SYS_KSEM_UNLINK", Const, 14}, + {"SYS_KSEM_WAIT", Const, 14}, + {"SYS_KTIMER_CREATE", Const, 0}, + {"SYS_KTIMER_DELETE", Const, 0}, + {"SYS_KTIMER_GETOVERRUN", Const, 0}, + {"SYS_KTIMER_GETTIME", Const, 0}, + {"SYS_KTIMER_SETTIME", Const, 0}, + {"SYS_KTRACE", Const, 0}, + {"SYS_LCHFLAGS", Const, 0}, + {"SYS_LCHMOD", Const, 0}, + {"SYS_LCHOWN", Const, 0}, + {"SYS_LCHOWN32", Const, 0}, + {"SYS_LEDGER", Const, 16}, + {"SYS_LGETFH", Const, 0}, + {"SYS_LGETXATTR", Const, 0}, + {"SYS_LINK", Const, 0}, + {"SYS_LINKAT", Const, 0}, + {"SYS_LIO_LISTIO", Const, 0}, + {"SYS_LISTEN", Const, 0}, + {"SYS_LISTXATTR", Const, 0}, + {"SYS_LLISTXATTR", Const, 0}, + {"SYS_LOCK", Const, 0}, + {"SYS_LOOKUP_DCOOKIE", Const, 0}, + {"SYS_LPATHCONF", Const, 0}, + {"SYS_LREMOVEXATTR", Const, 0}, + {"SYS_LSEEK", Const, 0}, + {"SYS_LSETXATTR", Const, 0}, + {"SYS_LSTAT", Const, 0}, + {"SYS_LSTAT64", Const, 0}, + {"SYS_LSTAT64_EXTENDED", Const, 0}, + {"SYS_LSTATV", Const, 0}, + {"SYS_LSTAT_EXTENDED", Const, 0}, + {"SYS_LUTIMES", Const, 0}, + {"SYS_MAC_SYSCALL", Const, 0}, + {"SYS_MADVISE", Const, 0}, + {"SYS_MADVISE1", Const, 0}, + {"SYS_MAXSYSCALL", Const, 0}, + {"SYS_MBIND", Const, 0}, + {"SYS_MIGRATE_PAGES", Const, 0}, + {"SYS_MINCORE", Const, 0}, + {"SYS_MINHERIT", Const, 0}, + {"SYS_MKCOMPLEX", Const, 0}, + {"SYS_MKDIR", Const, 0}, + {"SYS_MKDIRAT", Const, 0}, + {"SYS_MKDIR_EXTENDED", Const, 0}, + {"SYS_MKFIFO", Const, 0}, + {"SYS_MKFIFOAT", Const, 0}, + {"SYS_MKFIFO_EXTENDED", Const, 0}, + {"SYS_MKNOD", Const, 0}, + {"SYS_MKNODAT", Const, 0}, + {"SYS_MLOCK", Const, 0}, + {"SYS_MLOCKALL", Const, 0}, + {"SYS_MMAP", Const, 0}, + {"SYS_MMAP2", Const, 0}, + {"SYS_MODCTL", Const, 1}, + {"SYS_MODFIND", Const, 0}, + {"SYS_MODFNEXT", Const, 0}, + {"SYS_MODIFY_LDT", Const, 0}, + {"SYS_MODNEXT", Const, 0}, + {"SYS_MODSTAT", Const, 0}, + {"SYS_MODWATCH", Const, 0}, + {"SYS_MOUNT", Const, 0}, + {"SYS_MOVE_PAGES", Const, 0}, + {"SYS_MPROTECT", Const, 0}, + {"SYS_MPX", Const, 0}, + {"SYS_MQUERY", Const, 1}, + {"SYS_MQ_GETSETATTR", Const, 0}, + {"SYS_MQ_NOTIFY", Const, 0}, + {"SYS_MQ_OPEN", Const, 0}, + {"SYS_MQ_TIMEDRECEIVE", Const, 0}, + {"SYS_MQ_TIMEDSEND", Const, 0}, + {"SYS_MQ_UNLINK", Const, 0}, + {"SYS_MREMAP", Const, 0}, + {"SYS_MSGCTL", Const, 0}, + {"SYS_MSGGET", Const, 0}, + {"SYS_MSGRCV", Const, 0}, + {"SYS_MSGRCV_NOCANCEL", Const, 0}, + {"SYS_MSGSND", Const, 0}, + {"SYS_MSGSND_NOCANCEL", Const, 0}, + {"SYS_MSGSYS", Const, 0}, + {"SYS_MSYNC", Const, 0}, + {"SYS_MSYNC_NOCANCEL", Const, 0}, + {"SYS_MUNLOCK", Const, 0}, + {"SYS_MUNLOCKALL", Const, 0}, + {"SYS_MUNMAP", Const, 0}, + {"SYS_NAME_TO_HANDLE_AT", Const, 0}, + {"SYS_NANOSLEEP", Const, 0}, + {"SYS_NEWFSTATAT", Const, 0}, + {"SYS_NFSCLNT", Const, 0}, + {"SYS_NFSSERVCTL", Const, 0}, + {"SYS_NFSSVC", Const, 0}, + {"SYS_NFSTAT", Const, 0}, + {"SYS_NICE", Const, 0}, + {"SYS_NLM_SYSCALL", Const, 14}, + {"SYS_NLSTAT", Const, 0}, + {"SYS_NMOUNT", Const, 0}, + {"SYS_NSTAT", Const, 0}, + {"SYS_NTP_ADJTIME", Const, 0}, + {"SYS_NTP_GETTIME", Const, 0}, + {"SYS_NUMA_GETAFFINITY", Const, 14}, + {"SYS_NUMA_SETAFFINITY", Const, 14}, + {"SYS_OABI_SYSCALL_BASE", Const, 0}, + {"SYS_OBREAK", Const, 0}, + {"SYS_OLDFSTAT", Const, 0}, + {"SYS_OLDLSTAT", Const, 0}, + {"SYS_OLDOLDUNAME", Const, 0}, + {"SYS_OLDSTAT", Const, 0}, + {"SYS_OLDUNAME", Const, 0}, + {"SYS_OPEN", Const, 0}, + {"SYS_OPENAT", Const, 0}, + {"SYS_OPENBSD_POLL", Const, 0}, + {"SYS_OPEN_BY_HANDLE_AT", Const, 0}, + {"SYS_OPEN_DPROTECTED_NP", Const, 16}, + {"SYS_OPEN_EXTENDED", Const, 0}, + {"SYS_OPEN_NOCANCEL", Const, 0}, + {"SYS_OVADVISE", Const, 0}, + {"SYS_PACCEPT", Const, 1}, + {"SYS_PATHCONF", Const, 0}, + {"SYS_PAUSE", Const, 0}, + {"SYS_PCICONFIG_IOBASE", Const, 0}, + {"SYS_PCICONFIG_READ", Const, 0}, + {"SYS_PCICONFIG_WRITE", Const, 0}, + {"SYS_PDFORK", Const, 0}, + {"SYS_PDGETPID", Const, 0}, + {"SYS_PDKILL", Const, 0}, + {"SYS_PERF_EVENT_OPEN", Const, 0}, + {"SYS_PERSONALITY", Const, 0}, + {"SYS_PID_HIBERNATE", Const, 0}, + {"SYS_PID_RESUME", Const, 0}, + {"SYS_PID_SHUTDOWN_SOCKETS", Const, 0}, + {"SYS_PID_SUSPEND", Const, 0}, + {"SYS_PIPE", Const, 0}, + {"SYS_PIPE2", Const, 0}, + {"SYS_PIVOT_ROOT", Const, 0}, + {"SYS_PMC_CONTROL", Const, 1}, + {"SYS_PMC_GET_INFO", Const, 1}, + {"SYS_POLL", Const, 0}, + {"SYS_POLLTS", Const, 1}, + {"SYS_POLL_NOCANCEL", Const, 0}, + {"SYS_POSIX_FADVISE", Const, 0}, + {"SYS_POSIX_FALLOCATE", Const, 0}, + {"SYS_POSIX_OPENPT", Const, 0}, + {"SYS_POSIX_SPAWN", Const, 0}, + {"SYS_PPOLL", Const, 0}, + {"SYS_PRCTL", Const, 0}, + {"SYS_PREAD", Const, 0}, + {"SYS_PREAD64", Const, 0}, + {"SYS_PREADV", Const, 0}, + {"SYS_PREAD_NOCANCEL", Const, 0}, + {"SYS_PRLIMIT64", Const, 0}, + {"SYS_PROCCTL", Const, 3}, + {"SYS_PROCESS_POLICY", Const, 0}, + {"SYS_PROCESS_VM_READV", Const, 0}, + {"SYS_PROCESS_VM_WRITEV", Const, 0}, + {"SYS_PROC_INFO", Const, 0}, + {"SYS_PROF", Const, 0}, + {"SYS_PROFIL", Const, 0}, + {"SYS_PSELECT", Const, 0}, + {"SYS_PSELECT6", Const, 0}, + {"SYS_PSET_ASSIGN", Const, 1}, + {"SYS_PSET_CREATE", Const, 1}, + {"SYS_PSET_DESTROY", Const, 1}, + {"SYS_PSYNCH_CVBROAD", Const, 0}, + {"SYS_PSYNCH_CVCLRPREPOST", Const, 0}, + {"SYS_PSYNCH_CVSIGNAL", Const, 0}, + {"SYS_PSYNCH_CVWAIT", Const, 0}, + {"SYS_PSYNCH_MUTEXDROP", Const, 0}, + {"SYS_PSYNCH_MUTEXWAIT", Const, 0}, + {"SYS_PSYNCH_RW_DOWNGRADE", Const, 0}, + {"SYS_PSYNCH_RW_LONGRDLOCK", Const, 0}, + {"SYS_PSYNCH_RW_RDLOCK", Const, 0}, + {"SYS_PSYNCH_RW_UNLOCK", Const, 0}, + {"SYS_PSYNCH_RW_UNLOCK2", Const, 0}, + {"SYS_PSYNCH_RW_UPGRADE", Const, 0}, + {"SYS_PSYNCH_RW_WRLOCK", Const, 0}, + {"SYS_PSYNCH_RW_YIELDWRLOCK", Const, 0}, + {"SYS_PTRACE", Const, 0}, + {"SYS_PUTPMSG", Const, 0}, + {"SYS_PWRITE", Const, 0}, + {"SYS_PWRITE64", Const, 0}, + {"SYS_PWRITEV", Const, 0}, + {"SYS_PWRITE_NOCANCEL", Const, 0}, + {"SYS_QUERY_MODULE", Const, 0}, + {"SYS_QUOTACTL", Const, 0}, + {"SYS_RASCTL", Const, 1}, + {"SYS_RCTL_ADD_RULE", Const, 0}, + {"SYS_RCTL_GET_LIMITS", Const, 0}, + {"SYS_RCTL_GET_RACCT", Const, 0}, + {"SYS_RCTL_GET_RULES", Const, 0}, + {"SYS_RCTL_REMOVE_RULE", Const, 0}, + {"SYS_READ", Const, 0}, + {"SYS_READAHEAD", Const, 0}, + {"SYS_READDIR", Const, 0}, + {"SYS_READLINK", Const, 0}, + {"SYS_READLINKAT", Const, 0}, + {"SYS_READV", Const, 0}, + {"SYS_READV_NOCANCEL", Const, 0}, + {"SYS_READ_NOCANCEL", Const, 0}, + {"SYS_REBOOT", Const, 0}, + {"SYS_RECV", Const, 0}, + {"SYS_RECVFROM", Const, 0}, + {"SYS_RECVFROM_NOCANCEL", Const, 0}, + {"SYS_RECVMMSG", Const, 0}, + {"SYS_RECVMSG", Const, 0}, + {"SYS_RECVMSG_NOCANCEL", Const, 0}, + {"SYS_REMAP_FILE_PAGES", Const, 0}, + {"SYS_REMOVEXATTR", Const, 0}, + {"SYS_RENAME", Const, 0}, + {"SYS_RENAMEAT", Const, 0}, + {"SYS_REQUEST_KEY", Const, 0}, + {"SYS_RESTART_SYSCALL", Const, 0}, + {"SYS_REVOKE", Const, 0}, + {"SYS_RFORK", Const, 0}, + {"SYS_RMDIR", Const, 0}, + {"SYS_RTPRIO", Const, 0}, + {"SYS_RTPRIO_THREAD", Const, 0}, + {"SYS_RT_SIGACTION", Const, 0}, + {"SYS_RT_SIGPENDING", Const, 0}, + {"SYS_RT_SIGPROCMASK", Const, 0}, + {"SYS_RT_SIGQUEUEINFO", Const, 0}, + {"SYS_RT_SIGRETURN", Const, 0}, + {"SYS_RT_SIGSUSPEND", Const, 0}, + {"SYS_RT_SIGTIMEDWAIT", Const, 0}, + {"SYS_RT_TGSIGQUEUEINFO", Const, 0}, + {"SYS_SBRK", Const, 0}, + {"SYS_SCHED_GETAFFINITY", Const, 0}, + {"SYS_SCHED_GETPARAM", Const, 0}, + {"SYS_SCHED_GETSCHEDULER", Const, 0}, + {"SYS_SCHED_GET_PRIORITY_MAX", Const, 0}, + {"SYS_SCHED_GET_PRIORITY_MIN", Const, 0}, + {"SYS_SCHED_RR_GET_INTERVAL", Const, 0}, + {"SYS_SCHED_SETAFFINITY", Const, 0}, + {"SYS_SCHED_SETPARAM", Const, 0}, + {"SYS_SCHED_SETSCHEDULER", Const, 0}, + {"SYS_SCHED_YIELD", Const, 0}, + {"SYS_SCTP_GENERIC_RECVMSG", Const, 0}, + {"SYS_SCTP_GENERIC_SENDMSG", Const, 0}, + {"SYS_SCTP_GENERIC_SENDMSG_IOV", Const, 0}, + {"SYS_SCTP_PEELOFF", Const, 0}, + {"SYS_SEARCHFS", Const, 0}, + {"SYS_SECURITY", Const, 0}, + {"SYS_SELECT", Const, 0}, + {"SYS_SELECT_NOCANCEL", Const, 0}, + {"SYS_SEMCONFIG", Const, 1}, + {"SYS_SEMCTL", Const, 0}, + {"SYS_SEMGET", Const, 0}, + {"SYS_SEMOP", Const, 0}, + {"SYS_SEMSYS", Const, 0}, + {"SYS_SEMTIMEDOP", Const, 0}, + {"SYS_SEM_CLOSE", Const, 0}, + {"SYS_SEM_DESTROY", Const, 0}, + {"SYS_SEM_GETVALUE", Const, 0}, + {"SYS_SEM_INIT", Const, 0}, + {"SYS_SEM_OPEN", Const, 0}, + {"SYS_SEM_POST", Const, 0}, + {"SYS_SEM_TRYWAIT", Const, 0}, + {"SYS_SEM_UNLINK", Const, 0}, + {"SYS_SEM_WAIT", Const, 0}, + {"SYS_SEM_WAIT_NOCANCEL", Const, 0}, + {"SYS_SEND", Const, 0}, + {"SYS_SENDFILE", Const, 0}, + {"SYS_SENDFILE64", Const, 0}, + {"SYS_SENDMMSG", Const, 0}, + {"SYS_SENDMSG", Const, 0}, + {"SYS_SENDMSG_NOCANCEL", Const, 0}, + {"SYS_SENDTO", Const, 0}, + {"SYS_SENDTO_NOCANCEL", Const, 0}, + {"SYS_SETATTRLIST", Const, 0}, + {"SYS_SETAUDIT", Const, 0}, + {"SYS_SETAUDIT_ADDR", Const, 0}, + {"SYS_SETAUID", Const, 0}, + {"SYS_SETCONTEXT", Const, 0}, + {"SYS_SETDOMAINNAME", Const, 0}, + {"SYS_SETEGID", Const, 0}, + {"SYS_SETEUID", Const, 0}, + {"SYS_SETFIB", Const, 0}, + {"SYS_SETFSGID", Const, 0}, + {"SYS_SETFSGID32", Const, 0}, + {"SYS_SETFSUID", Const, 0}, + {"SYS_SETFSUID32", Const, 0}, + {"SYS_SETGID", Const, 0}, + {"SYS_SETGID32", Const, 0}, + {"SYS_SETGROUPS", Const, 0}, + {"SYS_SETGROUPS32", Const, 0}, + {"SYS_SETHOSTNAME", Const, 0}, + {"SYS_SETITIMER", Const, 0}, + {"SYS_SETLCID", Const, 0}, + {"SYS_SETLOGIN", Const, 0}, + {"SYS_SETLOGINCLASS", Const, 0}, + {"SYS_SETNS", Const, 0}, + {"SYS_SETPGID", Const, 0}, + {"SYS_SETPRIORITY", Const, 0}, + {"SYS_SETPRIVEXEC", Const, 0}, + {"SYS_SETREGID", Const, 0}, + {"SYS_SETREGID32", Const, 0}, + {"SYS_SETRESGID", Const, 0}, + {"SYS_SETRESGID32", Const, 0}, + {"SYS_SETRESUID", Const, 0}, + {"SYS_SETRESUID32", Const, 0}, + {"SYS_SETREUID", Const, 0}, + {"SYS_SETREUID32", Const, 0}, + {"SYS_SETRLIMIT", Const, 0}, + {"SYS_SETRTABLE", Const, 1}, + {"SYS_SETSGROUPS", Const, 0}, + {"SYS_SETSID", Const, 0}, + {"SYS_SETSOCKOPT", Const, 0}, + {"SYS_SETTID", Const, 0}, + {"SYS_SETTID_WITH_PID", Const, 0}, + {"SYS_SETTIMEOFDAY", Const, 0}, + {"SYS_SETUID", Const, 0}, + {"SYS_SETUID32", Const, 0}, + {"SYS_SETWGROUPS", Const, 0}, + {"SYS_SETXATTR", Const, 0}, + {"SYS_SET_MEMPOLICY", Const, 0}, + {"SYS_SET_ROBUST_LIST", Const, 0}, + {"SYS_SET_THREAD_AREA", Const, 0}, + {"SYS_SET_TID_ADDRESS", Const, 0}, + {"SYS_SGETMASK", Const, 0}, + {"SYS_SHARED_REGION_CHECK_NP", Const, 0}, + {"SYS_SHARED_REGION_MAP_AND_SLIDE_NP", Const, 0}, + {"SYS_SHMAT", Const, 0}, + {"SYS_SHMCTL", Const, 0}, + {"SYS_SHMDT", Const, 0}, + {"SYS_SHMGET", Const, 0}, + {"SYS_SHMSYS", Const, 0}, + {"SYS_SHM_OPEN", Const, 0}, + {"SYS_SHM_UNLINK", Const, 0}, + {"SYS_SHUTDOWN", Const, 0}, + {"SYS_SIGACTION", Const, 0}, + {"SYS_SIGALTSTACK", Const, 0}, + {"SYS_SIGNAL", Const, 0}, + {"SYS_SIGNALFD", Const, 0}, + {"SYS_SIGNALFD4", Const, 0}, + {"SYS_SIGPENDING", Const, 0}, + {"SYS_SIGPROCMASK", Const, 0}, + {"SYS_SIGQUEUE", Const, 0}, + {"SYS_SIGQUEUEINFO", Const, 1}, + {"SYS_SIGRETURN", Const, 0}, + {"SYS_SIGSUSPEND", Const, 0}, + {"SYS_SIGSUSPEND_NOCANCEL", Const, 0}, + {"SYS_SIGTIMEDWAIT", Const, 0}, + {"SYS_SIGWAIT", Const, 0}, + {"SYS_SIGWAITINFO", Const, 0}, + {"SYS_SOCKET", Const, 0}, + {"SYS_SOCKETCALL", Const, 0}, + {"SYS_SOCKETPAIR", Const, 0}, + {"SYS_SPLICE", Const, 0}, + {"SYS_SSETMASK", Const, 0}, + {"SYS_SSTK", Const, 0}, + {"SYS_STACK_SNAPSHOT", Const, 0}, + {"SYS_STAT", Const, 0}, + {"SYS_STAT64", Const, 0}, + {"SYS_STAT64_EXTENDED", Const, 0}, + {"SYS_STATFS", Const, 0}, + {"SYS_STATFS64", Const, 0}, + {"SYS_STATV", Const, 0}, + {"SYS_STATVFS1", Const, 1}, + {"SYS_STAT_EXTENDED", Const, 0}, + {"SYS_STIME", Const, 0}, + {"SYS_STTY", Const, 0}, + {"SYS_SWAPCONTEXT", Const, 0}, + {"SYS_SWAPCTL", Const, 1}, + {"SYS_SWAPOFF", Const, 0}, + {"SYS_SWAPON", Const, 0}, + {"SYS_SYMLINK", Const, 0}, + {"SYS_SYMLINKAT", Const, 0}, + {"SYS_SYNC", Const, 0}, + {"SYS_SYNCFS", Const, 0}, + {"SYS_SYNC_FILE_RANGE", Const, 0}, + {"SYS_SYSARCH", Const, 0}, + {"SYS_SYSCALL", Const, 0}, + {"SYS_SYSCALL_BASE", Const, 0}, + {"SYS_SYSFS", Const, 0}, + {"SYS_SYSINFO", Const, 0}, + {"SYS_SYSLOG", Const, 0}, + {"SYS_TEE", Const, 0}, + {"SYS_TGKILL", Const, 0}, + {"SYS_THREAD_SELFID", Const, 0}, + {"SYS_THR_CREATE", Const, 0}, + {"SYS_THR_EXIT", Const, 0}, + {"SYS_THR_KILL", Const, 0}, + {"SYS_THR_KILL2", Const, 0}, + {"SYS_THR_NEW", Const, 0}, + {"SYS_THR_SELF", Const, 0}, + {"SYS_THR_SET_NAME", Const, 0}, + {"SYS_THR_SUSPEND", Const, 0}, + {"SYS_THR_WAKE", Const, 0}, + {"SYS_TIME", Const, 0}, + {"SYS_TIMERFD_CREATE", Const, 0}, + {"SYS_TIMERFD_GETTIME", Const, 0}, + {"SYS_TIMERFD_SETTIME", Const, 0}, + {"SYS_TIMER_CREATE", Const, 0}, + {"SYS_TIMER_DELETE", Const, 0}, + {"SYS_TIMER_GETOVERRUN", Const, 0}, + {"SYS_TIMER_GETTIME", Const, 0}, + {"SYS_TIMER_SETTIME", Const, 0}, + {"SYS_TIMES", Const, 0}, + {"SYS_TKILL", Const, 0}, + {"SYS_TRUNCATE", Const, 0}, + {"SYS_TRUNCATE64", Const, 0}, + {"SYS_TUXCALL", Const, 0}, + {"SYS_UGETRLIMIT", Const, 0}, + {"SYS_ULIMIT", Const, 0}, + {"SYS_UMASK", Const, 0}, + {"SYS_UMASK_EXTENDED", Const, 0}, + {"SYS_UMOUNT", Const, 0}, + {"SYS_UMOUNT2", Const, 0}, + {"SYS_UNAME", Const, 0}, + {"SYS_UNDELETE", Const, 0}, + {"SYS_UNLINK", Const, 0}, + {"SYS_UNLINKAT", Const, 0}, + {"SYS_UNMOUNT", Const, 0}, + {"SYS_UNSHARE", Const, 0}, + {"SYS_USELIB", Const, 0}, + {"SYS_USTAT", Const, 0}, + {"SYS_UTIME", Const, 0}, + {"SYS_UTIMENSAT", Const, 0}, + {"SYS_UTIMES", Const, 0}, + {"SYS_UTRACE", Const, 0}, + {"SYS_UUIDGEN", Const, 0}, + {"SYS_VADVISE", Const, 1}, + {"SYS_VFORK", Const, 0}, + {"SYS_VHANGUP", Const, 0}, + {"SYS_VM86", Const, 0}, + {"SYS_VM86OLD", Const, 0}, + {"SYS_VMSPLICE", Const, 0}, + {"SYS_VM_PRESSURE_MONITOR", Const, 0}, + {"SYS_VSERVER", Const, 0}, + {"SYS_WAIT4", Const, 0}, + {"SYS_WAIT4_NOCANCEL", Const, 0}, + {"SYS_WAIT6", Const, 1}, + {"SYS_WAITEVENT", Const, 0}, + {"SYS_WAITID", Const, 0}, + {"SYS_WAITID_NOCANCEL", Const, 0}, + {"SYS_WAITPID", Const, 0}, + {"SYS_WATCHEVENT", Const, 0}, + {"SYS_WORKQ_KERNRETURN", Const, 0}, + {"SYS_WORKQ_OPEN", Const, 0}, + {"SYS_WRITE", Const, 0}, + {"SYS_WRITEV", Const, 0}, + {"SYS_WRITEV_NOCANCEL", Const, 0}, + {"SYS_WRITE_NOCANCEL", Const, 0}, + {"SYS_YIELD", Const, 0}, + {"SYS__LLSEEK", Const, 0}, + {"SYS__LWP_CONTINUE", Const, 1}, + {"SYS__LWP_CREATE", Const, 1}, + {"SYS__LWP_CTL", Const, 1}, + {"SYS__LWP_DETACH", Const, 1}, + {"SYS__LWP_EXIT", Const, 1}, + {"SYS__LWP_GETNAME", Const, 1}, + {"SYS__LWP_GETPRIVATE", Const, 1}, + {"SYS__LWP_KILL", Const, 1}, + {"SYS__LWP_PARK", Const, 1}, + {"SYS__LWP_SELF", Const, 1}, + {"SYS__LWP_SETNAME", Const, 1}, + {"SYS__LWP_SETPRIVATE", Const, 1}, + {"SYS__LWP_SUSPEND", Const, 1}, + {"SYS__LWP_UNPARK", Const, 1}, + {"SYS__LWP_UNPARK_ALL", Const, 1}, + {"SYS__LWP_WAIT", Const, 1}, + {"SYS__LWP_WAKEUP", Const, 1}, + {"SYS__NEWSELECT", Const, 0}, + {"SYS__PSET_BIND", Const, 1}, + {"SYS__SCHED_GETAFFINITY", Const, 1}, + {"SYS__SCHED_GETPARAM", Const, 1}, + {"SYS__SCHED_SETAFFINITY", Const, 1}, + {"SYS__SCHED_SETPARAM", Const, 1}, + {"SYS__SYSCTL", Const, 0}, + {"SYS__UMTX_LOCK", Const, 0}, + {"SYS__UMTX_OP", Const, 0}, + {"SYS__UMTX_UNLOCK", Const, 0}, + {"SYS___ACL_ACLCHECK_FD", Const, 0}, + {"SYS___ACL_ACLCHECK_FILE", Const, 0}, + {"SYS___ACL_ACLCHECK_LINK", Const, 0}, + {"SYS___ACL_DELETE_FD", Const, 0}, + {"SYS___ACL_DELETE_FILE", Const, 0}, + {"SYS___ACL_DELETE_LINK", Const, 0}, + {"SYS___ACL_GET_FD", Const, 0}, + {"SYS___ACL_GET_FILE", Const, 0}, + {"SYS___ACL_GET_LINK", Const, 0}, + {"SYS___ACL_SET_FD", Const, 0}, + {"SYS___ACL_SET_FILE", Const, 0}, + {"SYS___ACL_SET_LINK", Const, 0}, + {"SYS___CAP_RIGHTS_GET", Const, 14}, + {"SYS___CLONE", Const, 1}, + {"SYS___DISABLE_THREADSIGNAL", Const, 0}, + {"SYS___GETCWD", Const, 0}, + {"SYS___GETLOGIN", Const, 1}, + {"SYS___GET_TCB", Const, 1}, + {"SYS___MAC_EXECVE", Const, 0}, + {"SYS___MAC_GETFSSTAT", Const, 0}, + {"SYS___MAC_GET_FD", Const, 0}, + {"SYS___MAC_GET_FILE", Const, 0}, + {"SYS___MAC_GET_LCID", Const, 0}, + {"SYS___MAC_GET_LCTX", Const, 0}, + {"SYS___MAC_GET_LINK", Const, 0}, + {"SYS___MAC_GET_MOUNT", Const, 0}, + {"SYS___MAC_GET_PID", Const, 0}, + {"SYS___MAC_GET_PROC", Const, 0}, + {"SYS___MAC_MOUNT", Const, 0}, + {"SYS___MAC_SET_FD", Const, 0}, + {"SYS___MAC_SET_FILE", Const, 0}, + {"SYS___MAC_SET_LCTX", Const, 0}, + {"SYS___MAC_SET_LINK", Const, 0}, + {"SYS___MAC_SET_PROC", Const, 0}, + {"SYS___MAC_SYSCALL", Const, 0}, + {"SYS___OLD_SEMWAIT_SIGNAL", Const, 0}, + {"SYS___OLD_SEMWAIT_SIGNAL_NOCANCEL", Const, 0}, + {"SYS___POSIX_CHOWN", Const, 1}, + {"SYS___POSIX_FCHOWN", Const, 1}, + {"SYS___POSIX_LCHOWN", Const, 1}, + {"SYS___POSIX_RENAME", Const, 1}, + {"SYS___PTHREAD_CANCELED", Const, 0}, + {"SYS___PTHREAD_CHDIR", Const, 0}, + {"SYS___PTHREAD_FCHDIR", Const, 0}, + {"SYS___PTHREAD_KILL", Const, 0}, + {"SYS___PTHREAD_MARKCANCEL", Const, 0}, + {"SYS___PTHREAD_SIGMASK", Const, 0}, + {"SYS___QUOTACTL", Const, 1}, + {"SYS___SEMCTL", Const, 1}, + {"SYS___SEMWAIT_SIGNAL", Const, 0}, + {"SYS___SEMWAIT_SIGNAL_NOCANCEL", Const, 0}, + {"SYS___SETLOGIN", Const, 1}, + {"SYS___SETUGID", Const, 0}, + {"SYS___SET_TCB", Const, 1}, + {"SYS___SIGACTION_SIGTRAMP", Const, 1}, + {"SYS___SIGTIMEDWAIT", Const, 1}, + {"SYS___SIGWAIT", Const, 0}, + {"SYS___SIGWAIT_NOCANCEL", Const, 0}, + {"SYS___SYSCTL", Const, 0}, + {"SYS___TFORK", Const, 1}, + {"SYS___THREXIT", Const, 1}, + {"SYS___THRSIGDIVERT", Const, 1}, + {"SYS___THRSLEEP", Const, 1}, + {"SYS___THRWAKEUP", Const, 1}, + {"S_ARCH1", Const, 1}, + {"S_ARCH2", Const, 1}, + {"S_BLKSIZE", Const, 0}, + {"S_IEXEC", Const, 0}, + {"S_IFBLK", Const, 0}, + {"S_IFCHR", Const, 0}, + {"S_IFDIR", Const, 0}, + {"S_IFIFO", Const, 0}, + {"S_IFLNK", Const, 0}, + {"S_IFMT", Const, 0}, + {"S_IFREG", Const, 0}, + {"S_IFSOCK", Const, 0}, + {"S_IFWHT", Const, 0}, + {"S_IREAD", Const, 0}, + {"S_IRGRP", Const, 0}, + {"S_IROTH", Const, 0}, + {"S_IRUSR", Const, 0}, + {"S_IRWXG", Const, 0}, + {"S_IRWXO", Const, 0}, + {"S_IRWXU", Const, 0}, + {"S_ISGID", Const, 0}, + {"S_ISTXT", Const, 0}, + {"S_ISUID", Const, 0}, + {"S_ISVTX", Const, 0}, + {"S_IWGRP", Const, 0}, + {"S_IWOTH", Const, 0}, + {"S_IWRITE", Const, 0}, + {"S_IWUSR", Const, 0}, + {"S_IXGRP", Const, 0}, + {"S_IXOTH", Const, 0}, + {"S_IXUSR", Const, 0}, + {"S_LOGIN_SET", Const, 1}, + {"SecurityAttributes", Type, 0}, + {"SecurityAttributes.InheritHandle", Field, 0}, + {"SecurityAttributes.Length", Field, 0}, + {"SecurityAttributes.SecurityDescriptor", Field, 0}, + {"Seek", Func, 0}, + {"Select", Func, 0}, + {"Sendfile", Func, 0}, + {"Sendmsg", Func, 0}, + {"SendmsgN", Func, 3}, + {"Sendto", Func, 0}, + {"Servent", Type, 0}, + {"Servent.Aliases", Field, 0}, + {"Servent.Name", Field, 0}, + {"Servent.Port", Field, 0}, + {"Servent.Proto", Field, 0}, + {"SetBpf", Func, 0}, + {"SetBpfBuflen", Func, 0}, + {"SetBpfDatalink", Func, 0}, + {"SetBpfHeadercmpl", Func, 0}, + {"SetBpfImmediate", Func, 0}, + {"SetBpfInterface", Func, 0}, + {"SetBpfPromisc", Func, 0}, + {"SetBpfTimeout", Func, 0}, + {"SetCurrentDirectory", Func, 0}, + {"SetEndOfFile", Func, 0}, + {"SetEnvironmentVariable", Func, 0}, + {"SetFileAttributes", Func, 0}, + {"SetFileCompletionNotificationModes", Func, 2}, + {"SetFilePointer", Func, 0}, + {"SetFileTime", Func, 0}, + {"SetHandleInformation", Func, 0}, + {"SetKevent", Func, 0}, + {"SetLsfPromisc", Func, 0}, + {"SetNonblock", Func, 0}, + {"Setdomainname", Func, 0}, + {"Setegid", Func, 0}, + {"Setenv", Func, 0}, + {"Seteuid", Func, 0}, + {"Setfsgid", Func, 0}, + {"Setfsuid", Func, 0}, + {"Setgid", Func, 0}, + {"Setgroups", Func, 0}, + {"Sethostname", Func, 0}, + {"Setlogin", Func, 0}, + {"Setpgid", Func, 0}, + {"Setpriority", Func, 0}, + {"Setprivexec", Func, 0}, + {"Setregid", Func, 0}, + {"Setresgid", Func, 0}, + {"Setresuid", Func, 0}, + {"Setreuid", Func, 0}, + {"Setrlimit", Func, 0}, + {"Setsid", Func, 0}, + {"Setsockopt", Func, 0}, + {"SetsockoptByte", Func, 0}, + {"SetsockoptICMPv6Filter", Func, 2}, + {"SetsockoptIPMreq", Func, 0}, + {"SetsockoptIPMreqn", Func, 0}, + {"SetsockoptIPv6Mreq", Func, 0}, + {"SetsockoptInet4Addr", Func, 0}, + {"SetsockoptInt", Func, 0}, + {"SetsockoptLinger", Func, 0}, + {"SetsockoptString", Func, 0}, + {"SetsockoptTimeval", Func, 0}, + {"Settimeofday", Func, 0}, + {"Setuid", Func, 0}, + {"Setxattr", Func, 1}, + {"Shutdown", Func, 0}, + {"SidTypeAlias", Const, 0}, + {"SidTypeComputer", Const, 0}, + {"SidTypeDeletedAccount", Const, 0}, + {"SidTypeDomain", Const, 0}, + {"SidTypeGroup", Const, 0}, + {"SidTypeInvalid", Const, 0}, + {"SidTypeLabel", Const, 0}, + {"SidTypeUnknown", Const, 0}, + {"SidTypeUser", Const, 0}, + {"SidTypeWellKnownGroup", Const, 0}, + {"Signal", Type, 0}, + {"SizeofBpfHdr", Const, 0}, + {"SizeofBpfInsn", Const, 0}, + {"SizeofBpfProgram", Const, 0}, + {"SizeofBpfStat", Const, 0}, + {"SizeofBpfVersion", Const, 0}, + {"SizeofBpfZbuf", Const, 0}, + {"SizeofBpfZbufHeader", Const, 0}, + {"SizeofCmsghdr", Const, 0}, + {"SizeofICMPv6Filter", Const, 2}, + {"SizeofIPMreq", Const, 0}, + {"SizeofIPMreqn", Const, 0}, + {"SizeofIPv6MTUInfo", Const, 2}, + {"SizeofIPv6Mreq", Const, 0}, + {"SizeofIfAddrmsg", Const, 0}, + {"SizeofIfAnnounceMsghdr", Const, 1}, + {"SizeofIfData", Const, 0}, + {"SizeofIfInfomsg", Const, 0}, + {"SizeofIfMsghdr", Const, 0}, + {"SizeofIfaMsghdr", Const, 0}, + {"SizeofIfmaMsghdr", Const, 0}, + {"SizeofIfmaMsghdr2", Const, 0}, + {"SizeofInet4Pktinfo", Const, 0}, + {"SizeofInet6Pktinfo", Const, 0}, + {"SizeofInotifyEvent", Const, 0}, + {"SizeofLinger", Const, 0}, + {"SizeofMsghdr", Const, 0}, + {"SizeofNlAttr", Const, 0}, + {"SizeofNlMsgerr", Const, 0}, + {"SizeofNlMsghdr", Const, 0}, + {"SizeofRtAttr", Const, 0}, + {"SizeofRtGenmsg", Const, 0}, + {"SizeofRtMetrics", Const, 0}, + {"SizeofRtMsg", Const, 0}, + {"SizeofRtMsghdr", Const, 0}, + {"SizeofRtNexthop", Const, 0}, + {"SizeofSockFilter", Const, 0}, + {"SizeofSockFprog", Const, 0}, + {"SizeofSockaddrAny", Const, 0}, + {"SizeofSockaddrDatalink", Const, 0}, + {"SizeofSockaddrInet4", Const, 0}, + {"SizeofSockaddrInet6", Const, 0}, + {"SizeofSockaddrLinklayer", Const, 0}, + {"SizeofSockaddrNetlink", Const, 0}, + {"SizeofSockaddrUnix", Const, 0}, + {"SizeofTCPInfo", Const, 1}, + {"SizeofUcred", Const, 0}, + {"SlicePtrFromStrings", Func, 1}, + {"SockFilter", Type, 0}, + {"SockFilter.Code", Field, 0}, + {"SockFilter.Jf", Field, 0}, + {"SockFilter.Jt", Field, 0}, + {"SockFilter.K", Field, 0}, + {"SockFprog", Type, 0}, + {"SockFprog.Filter", Field, 0}, + {"SockFprog.Len", Field, 0}, + {"SockFprog.Pad_cgo_0", Field, 0}, + {"Sockaddr", Type, 0}, + {"SockaddrDatalink", Type, 0}, + {"SockaddrDatalink.Alen", Field, 0}, + {"SockaddrDatalink.Data", Field, 0}, + {"SockaddrDatalink.Family", Field, 0}, + {"SockaddrDatalink.Index", Field, 0}, + {"SockaddrDatalink.Len", Field, 0}, + {"SockaddrDatalink.Nlen", Field, 0}, + {"SockaddrDatalink.Slen", Field, 0}, + {"SockaddrDatalink.Type", Field, 0}, + {"SockaddrGen", Type, 0}, + {"SockaddrInet4", Type, 0}, + {"SockaddrInet4.Addr", Field, 0}, + {"SockaddrInet4.Port", Field, 0}, + {"SockaddrInet6", Type, 0}, + {"SockaddrInet6.Addr", Field, 0}, + {"SockaddrInet6.Port", Field, 0}, + {"SockaddrInet6.ZoneId", Field, 0}, + {"SockaddrLinklayer", Type, 0}, + {"SockaddrLinklayer.Addr", Field, 0}, + {"SockaddrLinklayer.Halen", Field, 0}, + {"SockaddrLinklayer.Hatype", Field, 0}, + {"SockaddrLinklayer.Ifindex", Field, 0}, + {"SockaddrLinklayer.Pkttype", Field, 0}, + {"SockaddrLinklayer.Protocol", Field, 0}, + {"SockaddrNetlink", Type, 0}, + {"SockaddrNetlink.Family", Field, 0}, + {"SockaddrNetlink.Groups", Field, 0}, + {"SockaddrNetlink.Pad", Field, 0}, + {"SockaddrNetlink.Pid", Field, 0}, + {"SockaddrUnix", Type, 0}, + {"SockaddrUnix.Name", Field, 0}, + {"Socket", Func, 0}, + {"SocketControlMessage", Type, 0}, + {"SocketControlMessage.Data", Field, 0}, + {"SocketControlMessage.Header", Field, 0}, + {"SocketDisableIPv6", Var, 0}, + {"Socketpair", Func, 0}, + {"Splice", Func, 0}, + {"StartProcess", Func, 0}, + {"StartupInfo", Type, 0}, + {"StartupInfo.Cb", Field, 0}, + {"StartupInfo.Desktop", Field, 0}, + {"StartupInfo.FillAttribute", Field, 0}, + {"StartupInfo.Flags", Field, 0}, + {"StartupInfo.ShowWindow", Field, 0}, + {"StartupInfo.StdErr", Field, 0}, + {"StartupInfo.StdInput", Field, 0}, + {"StartupInfo.StdOutput", Field, 0}, + {"StartupInfo.Title", Field, 0}, + {"StartupInfo.X", Field, 0}, + {"StartupInfo.XCountChars", Field, 0}, + {"StartupInfo.XSize", Field, 0}, + {"StartupInfo.Y", Field, 0}, + {"StartupInfo.YCountChars", Field, 0}, + {"StartupInfo.YSize", Field, 0}, + {"Stat", Func, 0}, + {"Stat_t", Type, 0}, + {"Stat_t.Atim", Field, 0}, + {"Stat_t.Atim_ext", Field, 12}, + {"Stat_t.Atimespec", Field, 0}, + {"Stat_t.Birthtimespec", Field, 0}, + {"Stat_t.Blksize", Field, 0}, + {"Stat_t.Blocks", Field, 0}, + {"Stat_t.Btim_ext", Field, 12}, + {"Stat_t.Ctim", Field, 0}, + {"Stat_t.Ctim_ext", Field, 12}, + {"Stat_t.Ctimespec", Field, 0}, + {"Stat_t.Dev", Field, 0}, + {"Stat_t.Flags", Field, 0}, + {"Stat_t.Gen", Field, 0}, + {"Stat_t.Gid", Field, 0}, + {"Stat_t.Ino", Field, 0}, + {"Stat_t.Lspare", Field, 0}, + {"Stat_t.Lspare0", Field, 2}, + {"Stat_t.Lspare1", Field, 2}, + {"Stat_t.Mode", Field, 0}, + {"Stat_t.Mtim", Field, 0}, + {"Stat_t.Mtim_ext", Field, 12}, + {"Stat_t.Mtimespec", Field, 0}, + {"Stat_t.Nlink", Field, 0}, + {"Stat_t.Pad_cgo_0", Field, 0}, + {"Stat_t.Pad_cgo_1", Field, 0}, + {"Stat_t.Pad_cgo_2", Field, 0}, + {"Stat_t.Padding0", Field, 12}, + {"Stat_t.Padding1", Field, 12}, + {"Stat_t.Qspare", Field, 0}, + {"Stat_t.Rdev", Field, 0}, + {"Stat_t.Size", Field, 0}, + {"Stat_t.Spare", Field, 2}, + {"Stat_t.Uid", Field, 0}, + {"Stat_t.X__pad0", Field, 0}, + {"Stat_t.X__pad1", Field, 0}, + {"Stat_t.X__pad2", Field, 0}, + {"Stat_t.X__st_birthtim", Field, 2}, + {"Stat_t.X__st_ino", Field, 0}, + {"Stat_t.X__unused", Field, 0}, + {"Statfs", Func, 0}, + {"Statfs_t", Type, 0}, + {"Statfs_t.Asyncreads", Field, 0}, + {"Statfs_t.Asyncwrites", Field, 0}, + {"Statfs_t.Bavail", Field, 0}, + {"Statfs_t.Bfree", Field, 0}, + {"Statfs_t.Blocks", Field, 0}, + {"Statfs_t.Bsize", Field, 0}, + {"Statfs_t.Charspare", Field, 0}, + {"Statfs_t.F_asyncreads", Field, 2}, + {"Statfs_t.F_asyncwrites", Field, 2}, + {"Statfs_t.F_bavail", Field, 2}, + {"Statfs_t.F_bfree", Field, 2}, + {"Statfs_t.F_blocks", Field, 2}, + {"Statfs_t.F_bsize", Field, 2}, + {"Statfs_t.F_ctime", Field, 2}, + {"Statfs_t.F_favail", Field, 2}, + {"Statfs_t.F_ffree", Field, 2}, + {"Statfs_t.F_files", Field, 2}, + {"Statfs_t.F_flags", Field, 2}, + {"Statfs_t.F_fsid", Field, 2}, + {"Statfs_t.F_fstypename", Field, 2}, + {"Statfs_t.F_iosize", Field, 2}, + {"Statfs_t.F_mntfromname", Field, 2}, + {"Statfs_t.F_mntfromspec", Field, 3}, + {"Statfs_t.F_mntonname", Field, 2}, + {"Statfs_t.F_namemax", Field, 2}, + {"Statfs_t.F_owner", Field, 2}, + {"Statfs_t.F_spare", Field, 2}, + {"Statfs_t.F_syncreads", Field, 2}, + {"Statfs_t.F_syncwrites", Field, 2}, + {"Statfs_t.Ffree", Field, 0}, + {"Statfs_t.Files", Field, 0}, + {"Statfs_t.Flags", Field, 0}, + {"Statfs_t.Frsize", Field, 0}, + {"Statfs_t.Fsid", Field, 0}, + {"Statfs_t.Fssubtype", Field, 0}, + {"Statfs_t.Fstypename", Field, 0}, + {"Statfs_t.Iosize", Field, 0}, + {"Statfs_t.Mntfromname", Field, 0}, + {"Statfs_t.Mntonname", Field, 0}, + {"Statfs_t.Mount_info", Field, 2}, + {"Statfs_t.Namelen", Field, 0}, + {"Statfs_t.Namemax", Field, 0}, + {"Statfs_t.Owner", Field, 0}, + {"Statfs_t.Pad_cgo_0", Field, 0}, + {"Statfs_t.Pad_cgo_1", Field, 2}, + {"Statfs_t.Reserved", Field, 0}, + {"Statfs_t.Spare", Field, 0}, + {"Statfs_t.Syncreads", Field, 0}, + {"Statfs_t.Syncwrites", Field, 0}, + {"Statfs_t.Type", Field, 0}, + {"Statfs_t.Version", Field, 0}, + {"Stderr", Var, 0}, + {"Stdin", Var, 0}, + {"Stdout", Var, 0}, + {"StringBytePtr", Func, 0}, + {"StringByteSlice", Func, 0}, + {"StringSlicePtr", Func, 0}, + {"StringToSid", Func, 0}, + {"StringToUTF16", Func, 0}, + {"StringToUTF16Ptr", Func, 0}, + {"Symlink", Func, 0}, + {"Sync", Func, 0}, + {"SyncFileRange", Func, 0}, + {"SysProcAttr", Type, 0}, + {"SysProcAttr.AdditionalInheritedHandles", Field, 17}, + {"SysProcAttr.AmbientCaps", Field, 9}, + {"SysProcAttr.CgroupFD", Field, 20}, + {"SysProcAttr.Chroot", Field, 0}, + {"SysProcAttr.Cloneflags", Field, 2}, + {"SysProcAttr.CmdLine", Field, 0}, + {"SysProcAttr.CreationFlags", Field, 1}, + {"SysProcAttr.Credential", Field, 0}, + {"SysProcAttr.Ctty", Field, 1}, + {"SysProcAttr.Foreground", Field, 5}, + {"SysProcAttr.GidMappings", Field, 4}, + {"SysProcAttr.GidMappingsEnableSetgroups", Field, 5}, + {"SysProcAttr.HideWindow", Field, 0}, + {"SysProcAttr.Jail", Field, 21}, + {"SysProcAttr.NoInheritHandles", Field, 16}, + {"SysProcAttr.Noctty", Field, 0}, + {"SysProcAttr.ParentProcess", Field, 17}, + {"SysProcAttr.Pdeathsig", Field, 0}, + {"SysProcAttr.Pgid", Field, 5}, + {"SysProcAttr.PidFD", Field, 22}, + {"SysProcAttr.ProcessAttributes", Field, 13}, + {"SysProcAttr.Ptrace", Field, 0}, + {"SysProcAttr.Setctty", Field, 0}, + {"SysProcAttr.Setpgid", Field, 0}, + {"SysProcAttr.Setsid", Field, 0}, + {"SysProcAttr.ThreadAttributes", Field, 13}, + {"SysProcAttr.Token", Field, 10}, + {"SysProcAttr.UidMappings", Field, 4}, + {"SysProcAttr.Unshareflags", Field, 7}, + {"SysProcAttr.UseCgroupFD", Field, 20}, + {"SysProcIDMap", Type, 4}, + {"SysProcIDMap.ContainerID", Field, 4}, + {"SysProcIDMap.HostID", Field, 4}, + {"SysProcIDMap.Size", Field, 4}, + {"Syscall", Func, 0}, + {"Syscall12", Func, 0}, + {"Syscall15", Func, 0}, + {"Syscall18", Func, 12}, + {"Syscall6", Func, 0}, + {"Syscall9", Func, 0}, + {"SyscallN", Func, 18}, + {"Sysctl", Func, 0}, + {"SysctlUint32", Func, 0}, + {"Sysctlnode", Type, 2}, + {"Sysctlnode.Flags", Field, 2}, + {"Sysctlnode.Name", Field, 2}, + {"Sysctlnode.Num", Field, 2}, + {"Sysctlnode.Un", Field, 2}, + {"Sysctlnode.Ver", Field, 2}, + {"Sysctlnode.X__rsvd", Field, 2}, + {"Sysctlnode.X_sysctl_desc", Field, 2}, + {"Sysctlnode.X_sysctl_func", Field, 2}, + {"Sysctlnode.X_sysctl_parent", Field, 2}, + {"Sysctlnode.X_sysctl_size", Field, 2}, + {"Sysinfo", Func, 0}, + {"Sysinfo_t", Type, 0}, + {"Sysinfo_t.Bufferram", Field, 0}, + {"Sysinfo_t.Freehigh", Field, 0}, + {"Sysinfo_t.Freeram", Field, 0}, + {"Sysinfo_t.Freeswap", Field, 0}, + {"Sysinfo_t.Loads", Field, 0}, + {"Sysinfo_t.Pad", Field, 0}, + {"Sysinfo_t.Pad_cgo_0", Field, 0}, + {"Sysinfo_t.Pad_cgo_1", Field, 0}, + {"Sysinfo_t.Procs", Field, 0}, + {"Sysinfo_t.Sharedram", Field, 0}, + {"Sysinfo_t.Totalhigh", Field, 0}, + {"Sysinfo_t.Totalram", Field, 0}, + {"Sysinfo_t.Totalswap", Field, 0}, + {"Sysinfo_t.Unit", Field, 0}, + {"Sysinfo_t.Uptime", Field, 0}, + {"Sysinfo_t.X_f", Field, 0}, + {"Systemtime", Type, 0}, + {"Systemtime.Day", Field, 0}, + {"Systemtime.DayOfWeek", Field, 0}, + {"Systemtime.Hour", Field, 0}, + {"Systemtime.Milliseconds", Field, 0}, + {"Systemtime.Minute", Field, 0}, + {"Systemtime.Month", Field, 0}, + {"Systemtime.Second", Field, 0}, + {"Systemtime.Year", Field, 0}, + {"TCGETS", Const, 0}, + {"TCIFLUSH", Const, 1}, + {"TCIOFLUSH", Const, 1}, + {"TCOFLUSH", Const, 1}, + {"TCPInfo", Type, 1}, + {"TCPInfo.Advmss", Field, 1}, + {"TCPInfo.Ato", Field, 1}, + {"TCPInfo.Backoff", Field, 1}, + {"TCPInfo.Ca_state", Field, 1}, + {"TCPInfo.Fackets", Field, 1}, + {"TCPInfo.Last_ack_recv", Field, 1}, + {"TCPInfo.Last_ack_sent", Field, 1}, + {"TCPInfo.Last_data_recv", Field, 1}, + {"TCPInfo.Last_data_sent", Field, 1}, + {"TCPInfo.Lost", Field, 1}, + {"TCPInfo.Options", Field, 1}, + {"TCPInfo.Pad_cgo_0", Field, 1}, + {"TCPInfo.Pmtu", Field, 1}, + {"TCPInfo.Probes", Field, 1}, + {"TCPInfo.Rcv_mss", Field, 1}, + {"TCPInfo.Rcv_rtt", Field, 1}, + {"TCPInfo.Rcv_space", Field, 1}, + {"TCPInfo.Rcv_ssthresh", Field, 1}, + {"TCPInfo.Reordering", Field, 1}, + {"TCPInfo.Retrans", Field, 1}, + {"TCPInfo.Retransmits", Field, 1}, + {"TCPInfo.Rto", Field, 1}, + {"TCPInfo.Rtt", Field, 1}, + {"TCPInfo.Rttvar", Field, 1}, + {"TCPInfo.Sacked", Field, 1}, + {"TCPInfo.Snd_cwnd", Field, 1}, + {"TCPInfo.Snd_mss", Field, 1}, + {"TCPInfo.Snd_ssthresh", Field, 1}, + {"TCPInfo.State", Field, 1}, + {"TCPInfo.Total_retrans", Field, 1}, + {"TCPInfo.Unacked", Field, 1}, + {"TCPKeepalive", Type, 3}, + {"TCPKeepalive.Interval", Field, 3}, + {"TCPKeepalive.OnOff", Field, 3}, + {"TCPKeepalive.Time", Field, 3}, + {"TCP_CA_NAME_MAX", Const, 0}, + {"TCP_CONGCTL", Const, 1}, + {"TCP_CONGESTION", Const, 0}, + {"TCP_CONNECTIONTIMEOUT", Const, 0}, + {"TCP_CORK", Const, 0}, + {"TCP_DEFER_ACCEPT", Const, 0}, + {"TCP_ENABLE_ECN", Const, 16}, + {"TCP_INFO", Const, 0}, + {"TCP_KEEPALIVE", Const, 0}, + {"TCP_KEEPCNT", Const, 0}, + {"TCP_KEEPIDLE", Const, 0}, + {"TCP_KEEPINIT", Const, 1}, + {"TCP_KEEPINTVL", Const, 0}, + {"TCP_LINGER2", Const, 0}, + {"TCP_MAXBURST", Const, 0}, + {"TCP_MAXHLEN", Const, 0}, + {"TCP_MAXOLEN", Const, 0}, + {"TCP_MAXSEG", Const, 0}, + {"TCP_MAXWIN", Const, 0}, + {"TCP_MAX_SACK", Const, 0}, + {"TCP_MAX_WINSHIFT", Const, 0}, + {"TCP_MD5SIG", Const, 0}, + {"TCP_MD5SIG_MAXKEYLEN", Const, 0}, + {"TCP_MINMSS", Const, 0}, + {"TCP_MINMSSOVERLOAD", Const, 0}, + {"TCP_MSS", Const, 0}, + {"TCP_NODELAY", Const, 0}, + {"TCP_NOOPT", Const, 0}, + {"TCP_NOPUSH", Const, 0}, + {"TCP_NOTSENT_LOWAT", Const, 16}, + {"TCP_NSTATES", Const, 1}, + {"TCP_QUICKACK", Const, 0}, + {"TCP_RXT_CONNDROPTIME", Const, 0}, + {"TCP_RXT_FINDROP", Const, 0}, + {"TCP_SACK_ENABLE", Const, 1}, + {"TCP_SENDMOREACKS", Const, 16}, + {"TCP_SYNCNT", Const, 0}, + {"TCP_VENDOR", Const, 3}, + {"TCP_WINDOW_CLAMP", Const, 0}, + {"TCSAFLUSH", Const, 1}, + {"TCSETS", Const, 0}, + {"TF_DISCONNECT", Const, 0}, + {"TF_REUSE_SOCKET", Const, 0}, + {"TF_USE_DEFAULT_WORKER", Const, 0}, + {"TF_USE_KERNEL_APC", Const, 0}, + {"TF_USE_SYSTEM_THREAD", Const, 0}, + {"TF_WRITE_BEHIND", Const, 0}, + {"TH32CS_INHERIT", Const, 4}, + {"TH32CS_SNAPALL", Const, 4}, + {"TH32CS_SNAPHEAPLIST", Const, 4}, + {"TH32CS_SNAPMODULE", Const, 4}, + {"TH32CS_SNAPMODULE32", Const, 4}, + {"TH32CS_SNAPPROCESS", Const, 4}, + {"TH32CS_SNAPTHREAD", Const, 4}, + {"TIME_ZONE_ID_DAYLIGHT", Const, 0}, + {"TIME_ZONE_ID_STANDARD", Const, 0}, + {"TIME_ZONE_ID_UNKNOWN", Const, 0}, + {"TIOCCBRK", Const, 0}, + {"TIOCCDTR", Const, 0}, + {"TIOCCONS", Const, 0}, + {"TIOCDCDTIMESTAMP", Const, 0}, + {"TIOCDRAIN", Const, 0}, + {"TIOCDSIMICROCODE", Const, 0}, + {"TIOCEXCL", Const, 0}, + {"TIOCEXT", Const, 0}, + {"TIOCFLAG_CDTRCTS", Const, 1}, + {"TIOCFLAG_CLOCAL", Const, 1}, + {"TIOCFLAG_CRTSCTS", Const, 1}, + {"TIOCFLAG_MDMBUF", Const, 1}, + {"TIOCFLAG_PPS", Const, 1}, + {"TIOCFLAG_SOFTCAR", Const, 1}, + {"TIOCFLUSH", Const, 0}, + {"TIOCGDEV", Const, 0}, + {"TIOCGDRAINWAIT", Const, 0}, + {"TIOCGETA", Const, 0}, + {"TIOCGETD", Const, 0}, + {"TIOCGFLAGS", Const, 1}, + {"TIOCGICOUNT", Const, 0}, + {"TIOCGLCKTRMIOS", Const, 0}, + {"TIOCGLINED", Const, 1}, + {"TIOCGPGRP", Const, 0}, + {"TIOCGPTN", Const, 0}, + {"TIOCGQSIZE", Const, 1}, + {"TIOCGRANTPT", Const, 1}, + {"TIOCGRS485", Const, 0}, + {"TIOCGSERIAL", Const, 0}, + {"TIOCGSID", Const, 0}, + {"TIOCGSIZE", Const, 1}, + {"TIOCGSOFTCAR", Const, 0}, + {"TIOCGTSTAMP", Const, 1}, + {"TIOCGWINSZ", Const, 0}, + {"TIOCINQ", Const, 0}, + {"TIOCIXOFF", Const, 0}, + {"TIOCIXON", Const, 0}, + {"TIOCLINUX", Const, 0}, + {"TIOCMBIC", Const, 0}, + {"TIOCMBIS", Const, 0}, + {"TIOCMGDTRWAIT", Const, 0}, + {"TIOCMGET", Const, 0}, + {"TIOCMIWAIT", Const, 0}, + {"TIOCMODG", Const, 0}, + {"TIOCMODS", Const, 0}, + {"TIOCMSDTRWAIT", Const, 0}, + {"TIOCMSET", Const, 0}, + {"TIOCM_CAR", Const, 0}, + {"TIOCM_CD", Const, 0}, + {"TIOCM_CTS", Const, 0}, + {"TIOCM_DCD", Const, 0}, + {"TIOCM_DSR", Const, 0}, + {"TIOCM_DTR", Const, 0}, + {"TIOCM_LE", Const, 0}, + {"TIOCM_RI", Const, 0}, + {"TIOCM_RNG", Const, 0}, + {"TIOCM_RTS", Const, 0}, + {"TIOCM_SR", Const, 0}, + {"TIOCM_ST", Const, 0}, + {"TIOCNOTTY", Const, 0}, + {"TIOCNXCL", Const, 0}, + {"TIOCOUTQ", Const, 0}, + {"TIOCPKT", Const, 0}, + {"TIOCPKT_DATA", Const, 0}, + {"TIOCPKT_DOSTOP", Const, 0}, + {"TIOCPKT_FLUSHREAD", Const, 0}, + {"TIOCPKT_FLUSHWRITE", Const, 0}, + {"TIOCPKT_IOCTL", Const, 0}, + {"TIOCPKT_NOSTOP", Const, 0}, + {"TIOCPKT_START", Const, 0}, + {"TIOCPKT_STOP", Const, 0}, + {"TIOCPTMASTER", Const, 0}, + {"TIOCPTMGET", Const, 1}, + {"TIOCPTSNAME", Const, 1}, + {"TIOCPTYGNAME", Const, 0}, + {"TIOCPTYGRANT", Const, 0}, + {"TIOCPTYUNLK", Const, 0}, + {"TIOCRCVFRAME", Const, 1}, + {"TIOCREMOTE", Const, 0}, + {"TIOCSBRK", Const, 0}, + {"TIOCSCONS", Const, 0}, + {"TIOCSCTTY", Const, 0}, + {"TIOCSDRAINWAIT", Const, 0}, + {"TIOCSDTR", Const, 0}, + {"TIOCSERCONFIG", Const, 0}, + {"TIOCSERGETLSR", Const, 0}, + {"TIOCSERGETMULTI", Const, 0}, + {"TIOCSERGSTRUCT", Const, 0}, + {"TIOCSERGWILD", Const, 0}, + {"TIOCSERSETMULTI", Const, 0}, + {"TIOCSERSWILD", Const, 0}, + {"TIOCSER_TEMT", Const, 0}, + {"TIOCSETA", Const, 0}, + {"TIOCSETAF", Const, 0}, + {"TIOCSETAW", Const, 0}, + {"TIOCSETD", Const, 0}, + {"TIOCSFLAGS", Const, 1}, + {"TIOCSIG", Const, 0}, + {"TIOCSLCKTRMIOS", Const, 0}, + {"TIOCSLINED", Const, 1}, + {"TIOCSPGRP", Const, 0}, + {"TIOCSPTLCK", Const, 0}, + {"TIOCSQSIZE", Const, 1}, + {"TIOCSRS485", Const, 0}, + {"TIOCSSERIAL", Const, 0}, + {"TIOCSSIZE", Const, 1}, + {"TIOCSSOFTCAR", Const, 0}, + {"TIOCSTART", Const, 0}, + {"TIOCSTAT", Const, 0}, + {"TIOCSTI", Const, 0}, + {"TIOCSTOP", Const, 0}, + {"TIOCSTSTAMP", Const, 1}, + {"TIOCSWINSZ", Const, 0}, + {"TIOCTIMESTAMP", Const, 0}, + {"TIOCUCNTL", Const, 0}, + {"TIOCVHANGUP", Const, 0}, + {"TIOCXMTFRAME", Const, 1}, + {"TOKEN_ADJUST_DEFAULT", Const, 0}, + {"TOKEN_ADJUST_GROUPS", Const, 0}, + {"TOKEN_ADJUST_PRIVILEGES", Const, 0}, + {"TOKEN_ADJUST_SESSIONID", Const, 11}, + {"TOKEN_ALL_ACCESS", Const, 0}, + {"TOKEN_ASSIGN_PRIMARY", Const, 0}, + {"TOKEN_DUPLICATE", Const, 0}, + {"TOKEN_EXECUTE", Const, 0}, + {"TOKEN_IMPERSONATE", Const, 0}, + {"TOKEN_QUERY", Const, 0}, + {"TOKEN_QUERY_SOURCE", Const, 0}, + {"TOKEN_READ", Const, 0}, + {"TOKEN_WRITE", Const, 0}, + {"TOSTOP", Const, 0}, + {"TRUNCATE_EXISTING", Const, 0}, + {"TUNATTACHFILTER", Const, 0}, + {"TUNDETACHFILTER", Const, 0}, + {"TUNGETFEATURES", Const, 0}, + {"TUNGETIFF", Const, 0}, + {"TUNGETSNDBUF", Const, 0}, + {"TUNGETVNETHDRSZ", Const, 0}, + {"TUNSETDEBUG", Const, 0}, + {"TUNSETGROUP", Const, 0}, + {"TUNSETIFF", Const, 0}, + {"TUNSETLINK", Const, 0}, + {"TUNSETNOCSUM", Const, 0}, + {"TUNSETOFFLOAD", Const, 0}, + {"TUNSETOWNER", Const, 0}, + {"TUNSETPERSIST", Const, 0}, + {"TUNSETSNDBUF", Const, 0}, + {"TUNSETTXFILTER", Const, 0}, + {"TUNSETVNETHDRSZ", Const, 0}, + {"Tee", Func, 0}, + {"TerminateProcess", Func, 0}, + {"Termios", Type, 0}, + {"Termios.Cc", Field, 0}, + {"Termios.Cflag", Field, 0}, + {"Termios.Iflag", Field, 0}, + {"Termios.Ispeed", Field, 0}, + {"Termios.Lflag", Field, 0}, + {"Termios.Line", Field, 0}, + {"Termios.Oflag", Field, 0}, + {"Termios.Ospeed", Field, 0}, + {"Termios.Pad_cgo_0", Field, 0}, + {"Tgkill", Func, 0}, + {"Time", Func, 0}, + {"Time_t", Type, 0}, + {"Times", Func, 0}, + {"Timespec", Type, 0}, + {"Timespec.Nsec", Field, 0}, + {"Timespec.Pad_cgo_0", Field, 2}, + {"Timespec.Sec", Field, 0}, + {"TimespecToNsec", Func, 0}, + {"Timeval", Type, 0}, + {"Timeval.Pad_cgo_0", Field, 0}, + {"Timeval.Sec", Field, 0}, + {"Timeval.Usec", Field, 0}, + {"Timeval32", Type, 0}, + {"Timeval32.Sec", Field, 0}, + {"Timeval32.Usec", Field, 0}, + {"TimevalToNsec", Func, 0}, + {"Timex", Type, 0}, + {"Timex.Calcnt", Field, 0}, + {"Timex.Constant", Field, 0}, + {"Timex.Errcnt", Field, 0}, + {"Timex.Esterror", Field, 0}, + {"Timex.Freq", Field, 0}, + {"Timex.Jitcnt", Field, 0}, + {"Timex.Jitter", Field, 0}, + {"Timex.Maxerror", Field, 0}, + {"Timex.Modes", Field, 0}, + {"Timex.Offset", Field, 0}, + {"Timex.Pad_cgo_0", Field, 0}, + {"Timex.Pad_cgo_1", Field, 0}, + {"Timex.Pad_cgo_2", Field, 0}, + {"Timex.Pad_cgo_3", Field, 0}, + {"Timex.Ppsfreq", Field, 0}, + {"Timex.Precision", Field, 0}, + {"Timex.Shift", Field, 0}, + {"Timex.Stabil", Field, 0}, + {"Timex.Status", Field, 0}, + {"Timex.Stbcnt", Field, 0}, + {"Timex.Tai", Field, 0}, + {"Timex.Tick", Field, 0}, + {"Timex.Time", Field, 0}, + {"Timex.Tolerance", Field, 0}, + {"Timezoneinformation", Type, 0}, + {"Timezoneinformation.Bias", Field, 0}, + {"Timezoneinformation.DaylightBias", Field, 0}, + {"Timezoneinformation.DaylightDate", Field, 0}, + {"Timezoneinformation.DaylightName", Field, 0}, + {"Timezoneinformation.StandardBias", Field, 0}, + {"Timezoneinformation.StandardDate", Field, 0}, + {"Timezoneinformation.StandardName", Field, 0}, + {"Tms", Type, 0}, + {"Tms.Cstime", Field, 0}, + {"Tms.Cutime", Field, 0}, + {"Tms.Stime", Field, 0}, + {"Tms.Utime", Field, 0}, + {"Token", Type, 0}, + {"TokenAccessInformation", Const, 0}, + {"TokenAuditPolicy", Const, 0}, + {"TokenDefaultDacl", Const, 0}, + {"TokenElevation", Const, 0}, + {"TokenElevationType", Const, 0}, + {"TokenGroups", Const, 0}, + {"TokenGroupsAndPrivileges", Const, 0}, + {"TokenHasRestrictions", Const, 0}, + {"TokenImpersonationLevel", Const, 0}, + {"TokenIntegrityLevel", Const, 0}, + {"TokenLinkedToken", Const, 0}, + {"TokenLogonSid", Const, 0}, + {"TokenMandatoryPolicy", Const, 0}, + {"TokenOrigin", Const, 0}, + {"TokenOwner", Const, 0}, + {"TokenPrimaryGroup", Const, 0}, + {"TokenPrivileges", Const, 0}, + {"TokenRestrictedSids", Const, 0}, + {"TokenSandBoxInert", Const, 0}, + {"TokenSessionId", Const, 0}, + {"TokenSessionReference", Const, 0}, + {"TokenSource", Const, 0}, + {"TokenStatistics", Const, 0}, + {"TokenType", Const, 0}, + {"TokenUIAccess", Const, 0}, + {"TokenUser", Const, 0}, + {"TokenVirtualizationAllowed", Const, 0}, + {"TokenVirtualizationEnabled", Const, 0}, + {"Tokenprimarygroup", Type, 0}, + {"Tokenprimarygroup.PrimaryGroup", Field, 0}, + {"Tokenuser", Type, 0}, + {"Tokenuser.User", Field, 0}, + {"TranslateAccountName", Func, 0}, + {"TranslateName", Func, 0}, + {"TransmitFile", Func, 0}, + {"TransmitFileBuffers", Type, 0}, + {"TransmitFileBuffers.Head", Field, 0}, + {"TransmitFileBuffers.HeadLength", Field, 0}, + {"TransmitFileBuffers.Tail", Field, 0}, + {"TransmitFileBuffers.TailLength", Field, 0}, + {"Truncate", Func, 0}, + {"UNIX_PATH_MAX", Const, 12}, + {"USAGE_MATCH_TYPE_AND", Const, 0}, + {"USAGE_MATCH_TYPE_OR", Const, 0}, + {"UTF16FromString", Func, 1}, + {"UTF16PtrFromString", Func, 1}, + {"UTF16ToString", Func, 0}, + {"Ucred", Type, 0}, + {"Ucred.Gid", Field, 0}, + {"Ucred.Pid", Field, 0}, + {"Ucred.Uid", Field, 0}, + {"Umask", Func, 0}, + {"Uname", Func, 0}, + {"Undelete", Func, 0}, + {"UnixCredentials", Func, 0}, + {"UnixRights", Func, 0}, + {"Unlink", Func, 0}, + {"Unlinkat", Func, 0}, + {"UnmapViewOfFile", Func, 0}, + {"Unmount", Func, 0}, + {"Unsetenv", Func, 4}, + {"Unshare", Func, 0}, + {"UserInfo10", Type, 0}, + {"UserInfo10.Comment", Field, 0}, + {"UserInfo10.FullName", Field, 0}, + {"UserInfo10.Name", Field, 0}, + {"UserInfo10.UsrComment", Field, 0}, + {"Ustat", Func, 0}, + {"Ustat_t", Type, 0}, + {"Ustat_t.Fname", Field, 0}, + {"Ustat_t.Fpack", Field, 0}, + {"Ustat_t.Pad_cgo_0", Field, 0}, + {"Ustat_t.Pad_cgo_1", Field, 0}, + {"Ustat_t.Tfree", Field, 0}, + {"Ustat_t.Tinode", Field, 0}, + {"Utimbuf", Type, 0}, + {"Utimbuf.Actime", Field, 0}, + {"Utimbuf.Modtime", Field, 0}, + {"Utime", Func, 0}, + {"Utimes", Func, 0}, + {"UtimesNano", Func, 1}, + {"Utsname", Type, 0}, + {"Utsname.Domainname", Field, 0}, + {"Utsname.Machine", Field, 0}, + {"Utsname.Nodename", Field, 0}, + {"Utsname.Release", Field, 0}, + {"Utsname.Sysname", Field, 0}, + {"Utsname.Version", Field, 0}, + {"VDISCARD", Const, 0}, + {"VDSUSP", Const, 1}, + {"VEOF", Const, 0}, + {"VEOL", Const, 0}, + {"VEOL2", Const, 0}, + {"VERASE", Const, 0}, + {"VERASE2", Const, 1}, + {"VINTR", Const, 0}, + {"VKILL", Const, 0}, + {"VLNEXT", Const, 0}, + {"VMIN", Const, 0}, + {"VQUIT", Const, 0}, + {"VREPRINT", Const, 0}, + {"VSTART", Const, 0}, + {"VSTATUS", Const, 1}, + {"VSTOP", Const, 0}, + {"VSUSP", Const, 0}, + {"VSWTC", Const, 0}, + {"VT0", Const, 1}, + {"VT1", Const, 1}, + {"VTDLY", Const, 1}, + {"VTIME", Const, 0}, + {"VWERASE", Const, 0}, + {"VirtualLock", Func, 0}, + {"VirtualUnlock", Func, 0}, + {"WAIT_ABANDONED", Const, 0}, + {"WAIT_FAILED", Const, 0}, + {"WAIT_OBJECT_0", Const, 0}, + {"WAIT_TIMEOUT", Const, 0}, + {"WALL", Const, 0}, + {"WALLSIG", Const, 1}, + {"WALTSIG", Const, 1}, + {"WCLONE", Const, 0}, + {"WCONTINUED", Const, 0}, + {"WCOREFLAG", Const, 0}, + {"WEXITED", Const, 0}, + {"WLINUXCLONE", Const, 0}, + {"WNOHANG", Const, 0}, + {"WNOTHREAD", Const, 0}, + {"WNOWAIT", Const, 0}, + {"WNOZOMBIE", Const, 1}, + {"WOPTSCHECKED", Const, 1}, + {"WORDSIZE", Const, 0}, + {"WSABuf", Type, 0}, + {"WSABuf.Buf", Field, 0}, + {"WSABuf.Len", Field, 0}, + {"WSACleanup", Func, 0}, + {"WSADESCRIPTION_LEN", Const, 0}, + {"WSAData", Type, 0}, + {"WSAData.Description", Field, 0}, + {"WSAData.HighVersion", Field, 0}, + {"WSAData.MaxSockets", Field, 0}, + {"WSAData.MaxUdpDg", Field, 0}, + {"WSAData.SystemStatus", Field, 0}, + {"WSAData.VendorInfo", Field, 0}, + {"WSAData.Version", Field, 0}, + {"WSAEACCES", Const, 2}, + {"WSAECONNABORTED", Const, 9}, + {"WSAECONNRESET", Const, 3}, + {"WSAENOPROTOOPT", Const, 23}, + {"WSAEnumProtocols", Func, 2}, + {"WSAID_CONNECTEX", Var, 1}, + {"WSAIoctl", Func, 0}, + {"WSAPROTOCOL_LEN", Const, 2}, + {"WSAProtocolChain", Type, 2}, + {"WSAProtocolChain.ChainEntries", Field, 2}, + {"WSAProtocolChain.ChainLen", Field, 2}, + {"WSAProtocolInfo", Type, 2}, + {"WSAProtocolInfo.AddressFamily", Field, 2}, + {"WSAProtocolInfo.CatalogEntryId", Field, 2}, + {"WSAProtocolInfo.MaxSockAddr", Field, 2}, + {"WSAProtocolInfo.MessageSize", Field, 2}, + {"WSAProtocolInfo.MinSockAddr", Field, 2}, + {"WSAProtocolInfo.NetworkByteOrder", Field, 2}, + {"WSAProtocolInfo.Protocol", Field, 2}, + {"WSAProtocolInfo.ProtocolChain", Field, 2}, + {"WSAProtocolInfo.ProtocolMaxOffset", Field, 2}, + {"WSAProtocolInfo.ProtocolName", Field, 2}, + {"WSAProtocolInfo.ProviderFlags", Field, 2}, + {"WSAProtocolInfo.ProviderId", Field, 2}, + {"WSAProtocolInfo.ProviderReserved", Field, 2}, + {"WSAProtocolInfo.SecurityScheme", Field, 2}, + {"WSAProtocolInfo.ServiceFlags1", Field, 2}, + {"WSAProtocolInfo.ServiceFlags2", Field, 2}, + {"WSAProtocolInfo.ServiceFlags3", Field, 2}, + {"WSAProtocolInfo.ServiceFlags4", Field, 2}, + {"WSAProtocolInfo.SocketType", Field, 2}, + {"WSAProtocolInfo.Version", Field, 2}, + {"WSARecv", Func, 0}, + {"WSARecvFrom", Func, 0}, + {"WSASYS_STATUS_LEN", Const, 0}, + {"WSASend", Func, 0}, + {"WSASendTo", Func, 0}, + {"WSASendto", Func, 0}, + {"WSAStartup", Func, 0}, + {"WSTOPPED", Const, 0}, + {"WTRAPPED", Const, 1}, + {"WUNTRACED", Const, 0}, + {"Wait4", Func, 0}, + {"WaitForSingleObject", Func, 0}, + {"WaitStatus", Type, 0}, + {"WaitStatus.ExitCode", Field, 0}, + {"Win32FileAttributeData", Type, 0}, + {"Win32FileAttributeData.CreationTime", Field, 0}, + {"Win32FileAttributeData.FileAttributes", Field, 0}, + {"Win32FileAttributeData.FileSizeHigh", Field, 0}, + {"Win32FileAttributeData.FileSizeLow", Field, 0}, + {"Win32FileAttributeData.LastAccessTime", Field, 0}, + {"Win32FileAttributeData.LastWriteTime", Field, 0}, + {"Win32finddata", Type, 0}, + {"Win32finddata.AlternateFileName", Field, 0}, + {"Win32finddata.CreationTime", Field, 0}, + {"Win32finddata.FileAttributes", Field, 0}, + {"Win32finddata.FileName", Field, 0}, + {"Win32finddata.FileSizeHigh", Field, 0}, + {"Win32finddata.FileSizeLow", Field, 0}, + {"Win32finddata.LastAccessTime", Field, 0}, + {"Win32finddata.LastWriteTime", Field, 0}, + {"Win32finddata.Reserved0", Field, 0}, + {"Win32finddata.Reserved1", Field, 0}, + {"Write", Func, 0}, + {"WriteConsole", Func, 1}, + {"WriteFile", Func, 0}, + {"X509_ASN_ENCODING", Const, 0}, + {"XCASE", Const, 0}, + {"XP1_CONNECTIONLESS", Const, 2}, + {"XP1_CONNECT_DATA", Const, 2}, + {"XP1_DISCONNECT_DATA", Const, 2}, + {"XP1_EXPEDITED_DATA", Const, 2}, + {"XP1_GRACEFUL_CLOSE", Const, 2}, + {"XP1_GUARANTEED_DELIVERY", Const, 2}, + {"XP1_GUARANTEED_ORDER", Const, 2}, + {"XP1_IFS_HANDLES", Const, 2}, + {"XP1_MESSAGE_ORIENTED", Const, 2}, + {"XP1_MULTIPOINT_CONTROL_PLANE", Const, 2}, + {"XP1_MULTIPOINT_DATA_PLANE", Const, 2}, + {"XP1_PARTIAL_MESSAGE", Const, 2}, + {"XP1_PSEUDO_STREAM", Const, 2}, + {"XP1_QOS_SUPPORTED", Const, 2}, + {"XP1_SAN_SUPPORT_SDP", Const, 2}, + {"XP1_SUPPORT_BROADCAST", Const, 2}, + {"XP1_SUPPORT_MULTIPOINT", Const, 2}, + {"XP1_UNI_RECV", Const, 2}, + {"XP1_UNI_SEND", Const, 2}, + }, + "syscall/js": { + {"CopyBytesToGo", Func, 0}, + {"CopyBytesToJS", Func, 0}, + {"Error", Type, 0}, + {"Func", Type, 0}, + {"FuncOf", Func, 0}, + {"Global", Func, 0}, + {"Null", Func, 0}, + {"Type", Type, 0}, + {"TypeBoolean", Const, 0}, + {"TypeFunction", Const, 0}, + {"TypeNull", Const, 0}, + {"TypeNumber", Const, 0}, + {"TypeObject", Const, 0}, + {"TypeString", Const, 0}, + {"TypeSymbol", Const, 0}, + {"TypeUndefined", Const, 0}, + {"Undefined", Func, 0}, + {"Value", Type, 0}, + {"ValueError", Type, 0}, + {"ValueOf", Func, 0}, + }, + "testing": { + {"(*B).Cleanup", Method, 14}, + {"(*B).Elapsed", Method, 20}, + {"(*B).Error", Method, 0}, + {"(*B).Errorf", Method, 0}, + {"(*B).Fail", Method, 0}, + {"(*B).FailNow", Method, 0}, + {"(*B).Failed", Method, 0}, + {"(*B).Fatal", Method, 0}, + {"(*B).Fatalf", Method, 0}, + {"(*B).Helper", Method, 9}, + {"(*B).Log", Method, 0}, + {"(*B).Logf", Method, 0}, + {"(*B).Name", Method, 8}, + {"(*B).ReportAllocs", Method, 1}, + {"(*B).ReportMetric", Method, 13}, + {"(*B).ResetTimer", Method, 0}, + {"(*B).Run", Method, 7}, + {"(*B).RunParallel", Method, 3}, + {"(*B).SetBytes", Method, 0}, + {"(*B).SetParallelism", Method, 3}, + {"(*B).Setenv", Method, 17}, + {"(*B).Skip", Method, 1}, + {"(*B).SkipNow", Method, 1}, + {"(*B).Skipf", Method, 1}, + {"(*B).Skipped", Method, 1}, + {"(*B).StartTimer", Method, 0}, + {"(*B).StopTimer", Method, 0}, + {"(*B).TempDir", Method, 15}, + {"(*F).Add", Method, 18}, + {"(*F).Cleanup", Method, 18}, + {"(*F).Error", Method, 18}, + {"(*F).Errorf", Method, 18}, + {"(*F).Fail", Method, 18}, + {"(*F).FailNow", Method, 18}, + {"(*F).Failed", Method, 18}, + {"(*F).Fatal", Method, 18}, + {"(*F).Fatalf", Method, 18}, + {"(*F).Fuzz", Method, 18}, + {"(*F).Helper", Method, 18}, + {"(*F).Log", Method, 18}, + {"(*F).Logf", Method, 18}, + {"(*F).Name", Method, 18}, + {"(*F).Setenv", Method, 18}, + {"(*F).Skip", Method, 18}, + {"(*F).SkipNow", Method, 18}, + {"(*F).Skipf", Method, 18}, + {"(*F).Skipped", Method, 18}, + {"(*F).TempDir", Method, 18}, + {"(*M).Run", Method, 4}, + {"(*PB).Next", Method, 3}, + {"(*T).Cleanup", Method, 14}, + {"(*T).Deadline", Method, 15}, + {"(*T).Error", Method, 0}, + {"(*T).Errorf", Method, 0}, + {"(*T).Fail", Method, 0}, + {"(*T).FailNow", Method, 0}, + {"(*T).Failed", Method, 0}, + {"(*T).Fatal", Method, 0}, + {"(*T).Fatalf", Method, 0}, + {"(*T).Helper", Method, 9}, + {"(*T).Log", Method, 0}, + {"(*T).Logf", Method, 0}, + {"(*T).Name", Method, 8}, + {"(*T).Parallel", Method, 0}, + {"(*T).Run", Method, 7}, + {"(*T).Setenv", Method, 17}, + {"(*T).Skip", Method, 1}, + {"(*T).SkipNow", Method, 1}, + {"(*T).Skipf", Method, 1}, + {"(*T).Skipped", Method, 1}, + {"(*T).TempDir", Method, 15}, + {"(BenchmarkResult).AllocedBytesPerOp", Method, 1}, + {"(BenchmarkResult).AllocsPerOp", Method, 1}, + {"(BenchmarkResult).MemString", Method, 1}, + {"(BenchmarkResult).NsPerOp", Method, 0}, + {"(BenchmarkResult).String", Method, 0}, + {"AllocsPerRun", Func, 1}, + {"B", Type, 0}, + {"B.N", Field, 0}, + {"Benchmark", Func, 0}, + {"BenchmarkResult", Type, 0}, + {"BenchmarkResult.Bytes", Field, 0}, + {"BenchmarkResult.Extra", Field, 13}, + {"BenchmarkResult.MemAllocs", Field, 1}, + {"BenchmarkResult.MemBytes", Field, 1}, + {"BenchmarkResult.N", Field, 0}, + {"BenchmarkResult.T", Field, 0}, + {"Cover", Type, 2}, + {"Cover.Blocks", Field, 2}, + {"Cover.Counters", Field, 2}, + {"Cover.CoveredPackages", Field, 2}, + {"Cover.Mode", Field, 2}, + {"CoverBlock", Type, 2}, + {"CoverBlock.Col0", Field, 2}, + {"CoverBlock.Col1", Field, 2}, + {"CoverBlock.Line0", Field, 2}, + {"CoverBlock.Line1", Field, 2}, + {"CoverBlock.Stmts", Field, 2}, + {"CoverMode", Func, 8}, + {"Coverage", Func, 4}, + {"F", Type, 18}, + {"Init", Func, 13}, + {"InternalBenchmark", Type, 0}, + {"InternalBenchmark.F", Field, 0}, + {"InternalBenchmark.Name", Field, 0}, + {"InternalExample", Type, 0}, + {"InternalExample.F", Field, 0}, + {"InternalExample.Name", Field, 0}, + {"InternalExample.Output", Field, 0}, + {"InternalExample.Unordered", Field, 7}, + {"InternalFuzzTarget", Type, 18}, + {"InternalFuzzTarget.Fn", Field, 18}, + {"InternalFuzzTarget.Name", Field, 18}, + {"InternalTest", Type, 0}, + {"InternalTest.F", Field, 0}, + {"InternalTest.Name", Field, 0}, + {"M", Type, 4}, + {"Main", Func, 0}, + {"MainStart", Func, 4}, + {"PB", Type, 3}, + {"RegisterCover", Func, 2}, + {"RunBenchmarks", Func, 0}, + {"RunExamples", Func, 0}, + {"RunTests", Func, 0}, + {"Short", Func, 0}, + {"T", Type, 0}, + {"TB", Type, 2}, + {"Testing", Func, 21}, + {"Verbose", Func, 1}, + }, + "testing/fstest": { + {"(MapFS).Glob", Method, 16}, + {"(MapFS).Open", Method, 16}, + {"(MapFS).ReadDir", Method, 16}, + {"(MapFS).ReadFile", Method, 16}, + {"(MapFS).Stat", Method, 16}, + {"(MapFS).Sub", Method, 16}, + {"MapFS", Type, 16}, + {"MapFile", Type, 16}, + {"MapFile.Data", Field, 16}, + {"MapFile.ModTime", Field, 16}, + {"MapFile.Mode", Field, 16}, + {"MapFile.Sys", Field, 16}, + {"TestFS", Func, 16}, + }, + "testing/iotest": { + {"DataErrReader", Func, 0}, + {"ErrReader", Func, 16}, + {"ErrTimeout", Var, 0}, + {"HalfReader", Func, 0}, + {"NewReadLogger", Func, 0}, + {"NewWriteLogger", Func, 0}, + {"OneByteReader", Func, 0}, + {"TestReader", Func, 16}, + {"TimeoutReader", Func, 0}, + {"TruncateWriter", Func, 0}, + }, + "testing/quick": { + {"(*CheckEqualError).Error", Method, 0}, + {"(*CheckError).Error", Method, 0}, + {"(SetupError).Error", Method, 0}, + {"Check", Func, 0}, + {"CheckEqual", Func, 0}, + {"CheckEqualError", Type, 0}, + {"CheckEqualError.CheckError", Field, 0}, + {"CheckEqualError.Out1", Field, 0}, + {"CheckEqualError.Out2", Field, 0}, + {"CheckError", Type, 0}, + {"CheckError.Count", Field, 0}, + {"CheckError.In", Field, 0}, + {"Config", Type, 0}, + {"Config.MaxCount", Field, 0}, + {"Config.MaxCountScale", Field, 0}, + {"Config.Rand", Field, 0}, + {"Config.Values", Field, 0}, + {"Generator", Type, 0}, + {"SetupError", Type, 0}, + {"Value", Func, 0}, + }, + "testing/slogtest": { + {"Run", Func, 22}, + {"TestHandler", Func, 21}, + }, + "text/scanner": { + {"(*Position).IsValid", Method, 0}, + {"(*Scanner).Init", Method, 0}, + {"(*Scanner).IsValid", Method, 0}, + {"(*Scanner).Next", Method, 0}, + {"(*Scanner).Peek", Method, 0}, + {"(*Scanner).Pos", Method, 0}, + {"(*Scanner).Scan", Method, 0}, + {"(*Scanner).TokenText", Method, 0}, + {"(Position).String", Method, 0}, + {"(Scanner).String", Method, 0}, + {"Char", Const, 0}, + {"Comment", Const, 0}, + {"EOF", Const, 0}, + {"Float", Const, 0}, + {"GoTokens", Const, 0}, + {"GoWhitespace", Const, 0}, + {"Ident", Const, 0}, + {"Int", Const, 0}, + {"Position", Type, 0}, + {"Position.Column", Field, 0}, + {"Position.Filename", Field, 0}, + {"Position.Line", Field, 0}, + {"Position.Offset", Field, 0}, + {"RawString", Const, 0}, + {"ScanChars", Const, 0}, + {"ScanComments", Const, 0}, + {"ScanFloats", Const, 0}, + {"ScanIdents", Const, 0}, + {"ScanInts", Const, 0}, + {"ScanRawStrings", Const, 0}, + {"ScanStrings", Const, 0}, + {"Scanner", Type, 0}, + {"Scanner.Error", Field, 0}, + {"Scanner.ErrorCount", Field, 0}, + {"Scanner.IsIdentRune", Field, 4}, + {"Scanner.Mode", Field, 0}, + {"Scanner.Position", Field, 0}, + {"Scanner.Whitespace", Field, 0}, + {"SkipComments", Const, 0}, + {"String", Const, 0}, + {"TokenString", Func, 0}, + }, + "text/tabwriter": { + {"(*Writer).Flush", Method, 0}, + {"(*Writer).Init", Method, 0}, + {"(*Writer).Write", Method, 0}, + {"AlignRight", Const, 0}, + {"Debug", Const, 0}, + {"DiscardEmptyColumns", Const, 0}, + {"Escape", Const, 0}, + {"FilterHTML", Const, 0}, + {"NewWriter", Func, 0}, + {"StripEscape", Const, 0}, + {"TabIndent", Const, 0}, + {"Writer", Type, 0}, + }, + "text/template": { + {"(*Template).AddParseTree", Method, 0}, + {"(*Template).Clone", Method, 0}, + {"(*Template).DefinedTemplates", Method, 5}, + {"(*Template).Delims", Method, 0}, + {"(*Template).Execute", Method, 0}, + {"(*Template).ExecuteTemplate", Method, 0}, + {"(*Template).Funcs", Method, 0}, + {"(*Template).Lookup", Method, 0}, + {"(*Template).Name", Method, 0}, + {"(*Template).New", Method, 0}, + {"(*Template).Option", Method, 5}, + {"(*Template).Parse", Method, 0}, + {"(*Template).ParseFS", Method, 16}, + {"(*Template).ParseFiles", Method, 0}, + {"(*Template).ParseGlob", Method, 0}, + {"(*Template).Templates", Method, 0}, + {"(ExecError).Error", Method, 6}, + {"(ExecError).Unwrap", Method, 13}, + {"(Template).Copy", Method, 2}, + {"(Template).ErrorContext", Method, 1}, + {"ExecError", Type, 6}, + {"ExecError.Err", Field, 6}, + {"ExecError.Name", Field, 6}, + {"FuncMap", Type, 0}, + {"HTMLEscape", Func, 0}, + {"HTMLEscapeString", Func, 0}, + {"HTMLEscaper", Func, 0}, + {"IsTrue", Func, 6}, + {"JSEscape", Func, 0}, + {"JSEscapeString", Func, 0}, + {"JSEscaper", Func, 0}, + {"Must", Func, 0}, + {"New", Func, 0}, + {"ParseFS", Func, 16}, + {"ParseFiles", Func, 0}, + {"ParseGlob", Func, 0}, + {"Template", Type, 0}, + {"Template.Tree", Field, 0}, + {"URLQueryEscaper", Func, 0}, + }, + "text/template/parse": { + {"(*ActionNode).Copy", Method, 0}, + {"(*ActionNode).String", Method, 0}, + {"(*BoolNode).Copy", Method, 0}, + {"(*BoolNode).String", Method, 0}, + {"(*BranchNode).Copy", Method, 4}, + {"(*BranchNode).String", Method, 0}, + {"(*BreakNode).Copy", Method, 18}, + {"(*BreakNode).String", Method, 18}, + {"(*ChainNode).Add", Method, 1}, + {"(*ChainNode).Copy", Method, 1}, + {"(*ChainNode).String", Method, 1}, + {"(*CommandNode).Copy", Method, 0}, + {"(*CommandNode).String", Method, 0}, + {"(*CommentNode).Copy", Method, 16}, + {"(*CommentNode).String", Method, 16}, + {"(*ContinueNode).Copy", Method, 18}, + {"(*ContinueNode).String", Method, 18}, + {"(*DotNode).Copy", Method, 0}, + {"(*DotNode).String", Method, 0}, + {"(*DotNode).Type", Method, 0}, + {"(*FieldNode).Copy", Method, 0}, + {"(*FieldNode).String", Method, 0}, + {"(*IdentifierNode).Copy", Method, 0}, + {"(*IdentifierNode).SetPos", Method, 1}, + {"(*IdentifierNode).SetTree", Method, 4}, + {"(*IdentifierNode).String", Method, 0}, + {"(*IfNode).Copy", Method, 0}, + {"(*IfNode).String", Method, 0}, + {"(*ListNode).Copy", Method, 0}, + {"(*ListNode).CopyList", Method, 0}, + {"(*ListNode).String", Method, 0}, + {"(*NilNode).Copy", Method, 1}, + {"(*NilNode).String", Method, 1}, + {"(*NilNode).Type", Method, 1}, + {"(*NumberNode).Copy", Method, 0}, + {"(*NumberNode).String", Method, 0}, + {"(*PipeNode).Copy", Method, 0}, + {"(*PipeNode).CopyPipe", Method, 0}, + {"(*PipeNode).String", Method, 0}, + {"(*RangeNode).Copy", Method, 0}, + {"(*RangeNode).String", Method, 0}, + {"(*StringNode).Copy", Method, 0}, + {"(*StringNode).String", Method, 0}, + {"(*TemplateNode).Copy", Method, 0}, + {"(*TemplateNode).String", Method, 0}, + {"(*TextNode).Copy", Method, 0}, + {"(*TextNode).String", Method, 0}, + {"(*Tree).Copy", Method, 2}, + {"(*Tree).ErrorContext", Method, 1}, + {"(*Tree).Parse", Method, 0}, + {"(*VariableNode).Copy", Method, 0}, + {"(*VariableNode).String", Method, 0}, + {"(*WithNode).Copy", Method, 0}, + {"(*WithNode).String", Method, 0}, + {"(ActionNode).Position", Method, 1}, + {"(ActionNode).Type", Method, 0}, + {"(BoolNode).Position", Method, 1}, + {"(BoolNode).Type", Method, 0}, + {"(BranchNode).Position", Method, 1}, + {"(BranchNode).Type", Method, 0}, + {"(BreakNode).Position", Method, 18}, + {"(BreakNode).Type", Method, 18}, + {"(ChainNode).Position", Method, 1}, + {"(ChainNode).Type", Method, 1}, + {"(CommandNode).Position", Method, 1}, + {"(CommandNode).Type", Method, 0}, + {"(CommentNode).Position", Method, 16}, + {"(CommentNode).Type", Method, 16}, + {"(ContinueNode).Position", Method, 18}, + {"(ContinueNode).Type", Method, 18}, + {"(DotNode).Position", Method, 1}, + {"(FieldNode).Position", Method, 1}, + {"(FieldNode).Type", Method, 0}, + {"(IdentifierNode).Position", Method, 1}, + {"(IdentifierNode).Type", Method, 0}, + {"(IfNode).Position", Method, 1}, + {"(IfNode).Type", Method, 0}, + {"(ListNode).Position", Method, 1}, + {"(ListNode).Type", Method, 0}, + {"(NilNode).Position", Method, 1}, + {"(NodeType).Type", Method, 0}, + {"(NumberNode).Position", Method, 1}, + {"(NumberNode).Type", Method, 0}, + {"(PipeNode).Position", Method, 1}, + {"(PipeNode).Type", Method, 0}, + {"(Pos).Position", Method, 1}, + {"(RangeNode).Position", Method, 1}, + {"(RangeNode).Type", Method, 0}, + {"(StringNode).Position", Method, 1}, + {"(StringNode).Type", Method, 0}, + {"(TemplateNode).Position", Method, 1}, + {"(TemplateNode).Type", Method, 0}, + {"(TextNode).Position", Method, 1}, + {"(TextNode).Type", Method, 0}, + {"(VariableNode).Position", Method, 1}, + {"(VariableNode).Type", Method, 0}, + {"(WithNode).Position", Method, 1}, + {"(WithNode).Type", Method, 0}, + {"ActionNode", Type, 0}, + {"ActionNode.Line", Field, 0}, + {"ActionNode.NodeType", Field, 0}, + {"ActionNode.Pipe", Field, 0}, + {"ActionNode.Pos", Field, 1}, + {"BoolNode", Type, 0}, + {"BoolNode.NodeType", Field, 0}, + {"BoolNode.Pos", Field, 1}, + {"BoolNode.True", Field, 0}, + {"BranchNode", Type, 0}, + {"BranchNode.ElseList", Field, 0}, + {"BranchNode.Line", Field, 0}, + {"BranchNode.List", Field, 0}, + {"BranchNode.NodeType", Field, 0}, + {"BranchNode.Pipe", Field, 0}, + {"BranchNode.Pos", Field, 1}, + {"BreakNode", Type, 18}, + {"BreakNode.Line", Field, 18}, + {"BreakNode.NodeType", Field, 18}, + {"BreakNode.Pos", Field, 18}, + {"ChainNode", Type, 1}, + {"ChainNode.Field", Field, 1}, + {"ChainNode.Node", Field, 1}, + {"ChainNode.NodeType", Field, 1}, + {"ChainNode.Pos", Field, 1}, + {"CommandNode", Type, 0}, + {"CommandNode.Args", Field, 0}, + {"CommandNode.NodeType", Field, 0}, + {"CommandNode.Pos", Field, 1}, + {"CommentNode", Type, 16}, + {"CommentNode.NodeType", Field, 16}, + {"CommentNode.Pos", Field, 16}, + {"CommentNode.Text", Field, 16}, + {"ContinueNode", Type, 18}, + {"ContinueNode.Line", Field, 18}, + {"ContinueNode.NodeType", Field, 18}, + {"ContinueNode.Pos", Field, 18}, + {"DotNode", Type, 0}, + {"DotNode.NodeType", Field, 4}, + {"DotNode.Pos", Field, 1}, + {"FieldNode", Type, 0}, + {"FieldNode.Ident", Field, 0}, + {"FieldNode.NodeType", Field, 0}, + {"FieldNode.Pos", Field, 1}, + {"IdentifierNode", Type, 0}, + {"IdentifierNode.Ident", Field, 0}, + {"IdentifierNode.NodeType", Field, 0}, + {"IdentifierNode.Pos", Field, 1}, + {"IfNode", Type, 0}, + {"IfNode.BranchNode", Field, 0}, + {"IsEmptyTree", Func, 0}, + {"ListNode", Type, 0}, + {"ListNode.NodeType", Field, 0}, + {"ListNode.Nodes", Field, 0}, + {"ListNode.Pos", Field, 1}, + {"Mode", Type, 16}, + {"New", Func, 0}, + {"NewIdentifier", Func, 0}, + {"NilNode", Type, 1}, + {"NilNode.NodeType", Field, 4}, + {"NilNode.Pos", Field, 1}, + {"Node", Type, 0}, + {"NodeAction", Const, 0}, + {"NodeBool", Const, 0}, + {"NodeBreak", Const, 18}, + {"NodeChain", Const, 1}, + {"NodeCommand", Const, 0}, + {"NodeComment", Const, 16}, + {"NodeContinue", Const, 18}, + {"NodeDot", Const, 0}, + {"NodeField", Const, 0}, + {"NodeIdentifier", Const, 0}, + {"NodeIf", Const, 0}, + {"NodeList", Const, 0}, + {"NodeNil", Const, 1}, + {"NodeNumber", Const, 0}, + {"NodePipe", Const, 0}, + {"NodeRange", Const, 0}, + {"NodeString", Const, 0}, + {"NodeTemplate", Const, 0}, + {"NodeText", Const, 0}, + {"NodeType", Type, 0}, + {"NodeVariable", Const, 0}, + {"NodeWith", Const, 0}, + {"NumberNode", Type, 0}, + {"NumberNode.Complex128", Field, 0}, + {"NumberNode.Float64", Field, 0}, + {"NumberNode.Int64", Field, 0}, + {"NumberNode.IsComplex", Field, 0}, + {"NumberNode.IsFloat", Field, 0}, + {"NumberNode.IsInt", Field, 0}, + {"NumberNode.IsUint", Field, 0}, + {"NumberNode.NodeType", Field, 0}, + {"NumberNode.Pos", Field, 1}, + {"NumberNode.Text", Field, 0}, + {"NumberNode.Uint64", Field, 0}, + {"Parse", Func, 0}, + {"ParseComments", Const, 16}, + {"PipeNode", Type, 0}, + {"PipeNode.Cmds", Field, 0}, + {"PipeNode.Decl", Field, 0}, + {"PipeNode.IsAssign", Field, 11}, + {"PipeNode.Line", Field, 0}, + {"PipeNode.NodeType", Field, 0}, + {"PipeNode.Pos", Field, 1}, + {"Pos", Type, 1}, + {"RangeNode", Type, 0}, + {"RangeNode.BranchNode", Field, 0}, + {"SkipFuncCheck", Const, 17}, + {"StringNode", Type, 0}, + {"StringNode.NodeType", Field, 0}, + {"StringNode.Pos", Field, 1}, + {"StringNode.Quoted", Field, 0}, + {"StringNode.Text", Field, 0}, + {"TemplateNode", Type, 0}, + {"TemplateNode.Line", Field, 0}, + {"TemplateNode.Name", Field, 0}, + {"TemplateNode.NodeType", Field, 0}, + {"TemplateNode.Pipe", Field, 0}, + {"TemplateNode.Pos", Field, 1}, + {"TextNode", Type, 0}, + {"TextNode.NodeType", Field, 0}, + {"TextNode.Pos", Field, 1}, + {"TextNode.Text", Field, 0}, + {"Tree", Type, 0}, + {"Tree.Mode", Field, 16}, + {"Tree.Name", Field, 0}, + {"Tree.ParseName", Field, 1}, + {"Tree.Root", Field, 0}, + {"VariableNode", Type, 0}, + {"VariableNode.Ident", Field, 0}, + {"VariableNode.NodeType", Field, 0}, + {"VariableNode.Pos", Field, 1}, + {"WithNode", Type, 0}, + {"WithNode.BranchNode", Field, 0}, + }, + "time": { + {"(*Location).String", Method, 0}, + {"(*ParseError).Error", Method, 0}, + {"(*Ticker).Reset", Method, 15}, + {"(*Ticker).Stop", Method, 0}, + {"(*Time).GobDecode", Method, 0}, + {"(*Time).UnmarshalBinary", Method, 2}, + {"(*Time).UnmarshalJSON", Method, 0}, + {"(*Time).UnmarshalText", Method, 2}, + {"(*Timer).Reset", Method, 1}, + {"(*Timer).Stop", Method, 0}, + {"(Duration).Abs", Method, 19}, + {"(Duration).Hours", Method, 0}, + {"(Duration).Microseconds", Method, 13}, + {"(Duration).Milliseconds", Method, 13}, + {"(Duration).Minutes", Method, 0}, + {"(Duration).Nanoseconds", Method, 0}, + {"(Duration).Round", Method, 9}, + {"(Duration).Seconds", Method, 0}, + {"(Duration).String", Method, 0}, + {"(Duration).Truncate", Method, 9}, + {"(Month).String", Method, 0}, + {"(Time).Add", Method, 0}, + {"(Time).AddDate", Method, 0}, + {"(Time).After", Method, 0}, + {"(Time).AppendFormat", Method, 5}, + {"(Time).Before", Method, 0}, + {"(Time).Clock", Method, 0}, + {"(Time).Compare", Method, 20}, + {"(Time).Date", Method, 0}, + {"(Time).Day", Method, 0}, + {"(Time).Equal", Method, 0}, + {"(Time).Format", Method, 0}, + {"(Time).GoString", Method, 17}, + {"(Time).GobEncode", Method, 0}, + {"(Time).Hour", Method, 0}, + {"(Time).ISOWeek", Method, 0}, + {"(Time).In", Method, 0}, + {"(Time).IsDST", Method, 17}, + {"(Time).IsZero", Method, 0}, + {"(Time).Local", Method, 0}, + {"(Time).Location", Method, 0}, + {"(Time).MarshalBinary", Method, 2}, + {"(Time).MarshalJSON", Method, 0}, + {"(Time).MarshalText", Method, 2}, + {"(Time).Minute", Method, 0}, + {"(Time).Month", Method, 0}, + {"(Time).Nanosecond", Method, 0}, + {"(Time).Round", Method, 1}, + {"(Time).Second", Method, 0}, + {"(Time).String", Method, 0}, + {"(Time).Sub", Method, 0}, + {"(Time).Truncate", Method, 1}, + {"(Time).UTC", Method, 0}, + {"(Time).Unix", Method, 0}, + {"(Time).UnixMicro", Method, 17}, + {"(Time).UnixMilli", Method, 17}, + {"(Time).UnixNano", Method, 0}, + {"(Time).Weekday", Method, 0}, + {"(Time).Year", Method, 0}, + {"(Time).YearDay", Method, 1}, + {"(Time).Zone", Method, 0}, + {"(Time).ZoneBounds", Method, 19}, + {"(Weekday).String", Method, 0}, + {"ANSIC", Const, 0}, + {"After", Func, 0}, + {"AfterFunc", Func, 0}, + {"April", Const, 0}, + {"August", Const, 0}, + {"Date", Func, 0}, + {"DateOnly", Const, 20}, + {"DateTime", Const, 20}, + {"December", Const, 0}, + {"Duration", Type, 0}, + {"February", Const, 0}, + {"FixedZone", Func, 0}, + {"Friday", Const, 0}, + {"Hour", Const, 0}, + {"January", Const, 0}, + {"July", Const, 0}, + {"June", Const, 0}, + {"Kitchen", Const, 0}, + {"Layout", Const, 17}, + {"LoadLocation", Func, 0}, + {"LoadLocationFromTZData", Func, 10}, + {"Local", Var, 0}, + {"Location", Type, 0}, + {"March", Const, 0}, + {"May", Const, 0}, + {"Microsecond", Const, 0}, + {"Millisecond", Const, 0}, + {"Minute", Const, 0}, + {"Monday", Const, 0}, + {"Month", Type, 0}, + {"Nanosecond", Const, 0}, + {"NewTicker", Func, 0}, + {"NewTimer", Func, 0}, + {"November", Const, 0}, + {"Now", Func, 0}, + {"October", Const, 0}, + {"Parse", Func, 0}, + {"ParseDuration", Func, 0}, + {"ParseError", Type, 0}, + {"ParseError.Layout", Field, 0}, + {"ParseError.LayoutElem", Field, 0}, + {"ParseError.Message", Field, 0}, + {"ParseError.Value", Field, 0}, + {"ParseError.ValueElem", Field, 0}, + {"ParseInLocation", Func, 1}, + {"RFC1123", Const, 0}, + {"RFC1123Z", Const, 0}, + {"RFC3339", Const, 0}, + {"RFC3339Nano", Const, 0}, + {"RFC822", Const, 0}, + {"RFC822Z", Const, 0}, + {"RFC850", Const, 0}, + {"RubyDate", Const, 0}, + {"Saturday", Const, 0}, + {"Second", Const, 0}, + {"September", Const, 0}, + {"Since", Func, 0}, + {"Sleep", Func, 0}, + {"Stamp", Const, 0}, + {"StampMicro", Const, 0}, + {"StampMilli", Const, 0}, + {"StampNano", Const, 0}, + {"Sunday", Const, 0}, + {"Thursday", Const, 0}, + {"Tick", Func, 0}, + {"Ticker", Type, 0}, + {"Ticker.C", Field, 0}, + {"Time", Type, 0}, + {"TimeOnly", Const, 20}, + {"Timer", Type, 0}, + {"Timer.C", Field, 0}, + {"Tuesday", Const, 0}, + {"UTC", Var, 0}, + {"Unix", Func, 0}, + {"UnixDate", Const, 0}, + {"UnixMicro", Func, 17}, + {"UnixMilli", Func, 17}, + {"Until", Func, 8}, + {"Wednesday", Const, 0}, + {"Weekday", Type, 0}, + }, + "unicode": { + {"(SpecialCase).ToLower", Method, 0}, + {"(SpecialCase).ToTitle", Method, 0}, + {"(SpecialCase).ToUpper", Method, 0}, + {"ASCII_Hex_Digit", Var, 0}, + {"Adlam", Var, 7}, + {"Ahom", Var, 5}, + {"Anatolian_Hieroglyphs", Var, 5}, + {"Arabic", Var, 0}, + {"Armenian", Var, 0}, + {"Avestan", Var, 0}, + {"AzeriCase", Var, 0}, + {"Balinese", Var, 0}, + {"Bamum", Var, 0}, + {"Bassa_Vah", Var, 4}, + {"Batak", Var, 0}, + {"Bengali", Var, 0}, + {"Bhaiksuki", Var, 7}, + {"Bidi_Control", Var, 0}, + {"Bopomofo", Var, 0}, + {"Brahmi", Var, 0}, + {"Braille", Var, 0}, + {"Buginese", Var, 0}, + {"Buhid", Var, 0}, + {"C", Var, 0}, + {"Canadian_Aboriginal", Var, 0}, + {"Carian", Var, 0}, + {"CaseRange", Type, 0}, + {"CaseRange.Delta", Field, 0}, + {"CaseRange.Hi", Field, 0}, + {"CaseRange.Lo", Field, 0}, + {"CaseRanges", Var, 0}, + {"Categories", Var, 0}, + {"Caucasian_Albanian", Var, 4}, + {"Cc", Var, 0}, + {"Cf", Var, 0}, + {"Chakma", Var, 1}, + {"Cham", Var, 0}, + {"Cherokee", Var, 0}, + {"Chorasmian", Var, 16}, + {"Co", Var, 0}, + {"Common", Var, 0}, + {"Coptic", Var, 0}, + {"Cs", Var, 0}, + {"Cuneiform", Var, 0}, + {"Cypriot", Var, 0}, + {"Cypro_Minoan", Var, 21}, + {"Cyrillic", Var, 0}, + {"Dash", Var, 0}, + {"Deprecated", Var, 0}, + {"Deseret", Var, 0}, + {"Devanagari", Var, 0}, + {"Diacritic", Var, 0}, + {"Digit", Var, 0}, + {"Dives_Akuru", Var, 16}, + {"Dogra", Var, 13}, + {"Duployan", Var, 4}, + {"Egyptian_Hieroglyphs", Var, 0}, + {"Elbasan", Var, 4}, + {"Elymaic", Var, 14}, + {"Ethiopic", Var, 0}, + {"Extender", Var, 0}, + {"FoldCategory", Var, 0}, + {"FoldScript", Var, 0}, + {"Georgian", Var, 0}, + {"Glagolitic", Var, 0}, + {"Gothic", Var, 0}, + {"Grantha", Var, 4}, + {"GraphicRanges", Var, 0}, + {"Greek", Var, 0}, + {"Gujarati", Var, 0}, + {"Gunjala_Gondi", Var, 13}, + {"Gurmukhi", Var, 0}, + {"Han", Var, 0}, + {"Hangul", Var, 0}, + {"Hanifi_Rohingya", Var, 13}, + {"Hanunoo", Var, 0}, + {"Hatran", Var, 5}, + {"Hebrew", Var, 0}, + {"Hex_Digit", Var, 0}, + {"Hiragana", Var, 0}, + {"Hyphen", Var, 0}, + {"IDS_Binary_Operator", Var, 0}, + {"IDS_Trinary_Operator", Var, 0}, + {"Ideographic", Var, 0}, + {"Imperial_Aramaic", Var, 0}, + {"In", Func, 2}, + {"Inherited", Var, 0}, + {"Inscriptional_Pahlavi", Var, 0}, + {"Inscriptional_Parthian", Var, 0}, + {"Is", Func, 0}, + {"IsControl", Func, 0}, + {"IsDigit", Func, 0}, + {"IsGraphic", Func, 0}, + {"IsLetter", Func, 0}, + {"IsLower", Func, 0}, + {"IsMark", Func, 0}, + {"IsNumber", Func, 0}, + {"IsOneOf", Func, 0}, + {"IsPrint", Func, 0}, + {"IsPunct", Func, 0}, + {"IsSpace", Func, 0}, + {"IsSymbol", Func, 0}, + {"IsTitle", Func, 0}, + {"IsUpper", Func, 0}, + {"Javanese", Var, 0}, + {"Join_Control", Var, 0}, + {"Kaithi", Var, 0}, + {"Kannada", Var, 0}, + {"Katakana", Var, 0}, + {"Kawi", Var, 21}, + {"Kayah_Li", Var, 0}, + {"Kharoshthi", Var, 0}, + {"Khitan_Small_Script", Var, 16}, + {"Khmer", Var, 0}, + {"Khojki", Var, 4}, + {"Khudawadi", Var, 4}, + {"L", Var, 0}, + {"Lao", Var, 0}, + {"Latin", Var, 0}, + {"Lepcha", Var, 0}, + {"Letter", Var, 0}, + {"Limbu", Var, 0}, + {"Linear_A", Var, 4}, + {"Linear_B", Var, 0}, + {"Lisu", Var, 0}, + {"Ll", Var, 0}, + {"Lm", Var, 0}, + {"Lo", Var, 0}, + {"Logical_Order_Exception", Var, 0}, + {"Lower", Var, 0}, + {"LowerCase", Const, 0}, + {"Lt", Var, 0}, + {"Lu", Var, 0}, + {"Lycian", Var, 0}, + {"Lydian", Var, 0}, + {"M", Var, 0}, + {"Mahajani", Var, 4}, + {"Makasar", Var, 13}, + {"Malayalam", Var, 0}, + {"Mandaic", Var, 0}, + {"Manichaean", Var, 4}, + {"Marchen", Var, 7}, + {"Mark", Var, 0}, + {"Masaram_Gondi", Var, 10}, + {"MaxASCII", Const, 0}, + {"MaxCase", Const, 0}, + {"MaxLatin1", Const, 0}, + {"MaxRune", Const, 0}, + {"Mc", Var, 0}, + {"Me", Var, 0}, + {"Medefaidrin", Var, 13}, + {"Meetei_Mayek", Var, 0}, + {"Mende_Kikakui", Var, 4}, + {"Meroitic_Cursive", Var, 1}, + {"Meroitic_Hieroglyphs", Var, 1}, + {"Miao", Var, 1}, + {"Mn", Var, 0}, + {"Modi", Var, 4}, + {"Mongolian", Var, 0}, + {"Mro", Var, 4}, + {"Multani", Var, 5}, + {"Myanmar", Var, 0}, + {"N", Var, 0}, + {"Nabataean", Var, 4}, + {"Nag_Mundari", Var, 21}, + {"Nandinagari", Var, 14}, + {"Nd", Var, 0}, + {"New_Tai_Lue", Var, 0}, + {"Newa", Var, 7}, + {"Nko", Var, 0}, + {"Nl", Var, 0}, + {"No", Var, 0}, + {"Noncharacter_Code_Point", Var, 0}, + {"Number", Var, 0}, + {"Nushu", Var, 10}, + {"Nyiakeng_Puachue_Hmong", Var, 14}, + {"Ogham", Var, 0}, + {"Ol_Chiki", Var, 0}, + {"Old_Hungarian", Var, 5}, + {"Old_Italic", Var, 0}, + {"Old_North_Arabian", Var, 4}, + {"Old_Permic", Var, 4}, + {"Old_Persian", Var, 0}, + {"Old_Sogdian", Var, 13}, + {"Old_South_Arabian", Var, 0}, + {"Old_Turkic", Var, 0}, + {"Old_Uyghur", Var, 21}, + {"Oriya", Var, 0}, + {"Osage", Var, 7}, + {"Osmanya", Var, 0}, + {"Other", Var, 0}, + {"Other_Alphabetic", Var, 0}, + {"Other_Default_Ignorable_Code_Point", Var, 0}, + {"Other_Grapheme_Extend", Var, 0}, + {"Other_ID_Continue", Var, 0}, + {"Other_ID_Start", Var, 0}, + {"Other_Lowercase", Var, 0}, + {"Other_Math", Var, 0}, + {"Other_Uppercase", Var, 0}, + {"P", Var, 0}, + {"Pahawh_Hmong", Var, 4}, + {"Palmyrene", Var, 4}, + {"Pattern_Syntax", Var, 0}, + {"Pattern_White_Space", Var, 0}, + {"Pau_Cin_Hau", Var, 4}, + {"Pc", Var, 0}, + {"Pd", Var, 0}, + {"Pe", Var, 0}, + {"Pf", Var, 0}, + {"Phags_Pa", Var, 0}, + {"Phoenician", Var, 0}, + {"Pi", Var, 0}, + {"Po", Var, 0}, + {"Prepended_Concatenation_Mark", Var, 7}, + {"PrintRanges", Var, 0}, + {"Properties", Var, 0}, + {"Ps", Var, 0}, + {"Psalter_Pahlavi", Var, 4}, + {"Punct", Var, 0}, + {"Quotation_Mark", Var, 0}, + {"Radical", Var, 0}, + {"Range16", Type, 0}, + {"Range16.Hi", Field, 0}, + {"Range16.Lo", Field, 0}, + {"Range16.Stride", Field, 0}, + {"Range32", Type, 0}, + {"Range32.Hi", Field, 0}, + {"Range32.Lo", Field, 0}, + {"Range32.Stride", Field, 0}, + {"RangeTable", Type, 0}, + {"RangeTable.LatinOffset", Field, 1}, + {"RangeTable.R16", Field, 0}, + {"RangeTable.R32", Field, 0}, + {"Regional_Indicator", Var, 10}, + {"Rejang", Var, 0}, + {"ReplacementChar", Const, 0}, + {"Runic", Var, 0}, + {"S", Var, 0}, + {"STerm", Var, 0}, + {"Samaritan", Var, 0}, + {"Saurashtra", Var, 0}, + {"Sc", Var, 0}, + {"Scripts", Var, 0}, + {"Sentence_Terminal", Var, 7}, + {"Sharada", Var, 1}, + {"Shavian", Var, 0}, + {"Siddham", Var, 4}, + {"SignWriting", Var, 5}, + {"SimpleFold", Func, 0}, + {"Sinhala", Var, 0}, + {"Sk", Var, 0}, + {"Sm", Var, 0}, + {"So", Var, 0}, + {"Soft_Dotted", Var, 0}, + {"Sogdian", Var, 13}, + {"Sora_Sompeng", Var, 1}, + {"Soyombo", Var, 10}, + {"Space", Var, 0}, + {"SpecialCase", Type, 0}, + {"Sundanese", Var, 0}, + {"Syloti_Nagri", Var, 0}, + {"Symbol", Var, 0}, + {"Syriac", Var, 0}, + {"Tagalog", Var, 0}, + {"Tagbanwa", Var, 0}, + {"Tai_Le", Var, 0}, + {"Tai_Tham", Var, 0}, + {"Tai_Viet", Var, 0}, + {"Takri", Var, 1}, + {"Tamil", Var, 0}, + {"Tangsa", Var, 21}, + {"Tangut", Var, 7}, + {"Telugu", Var, 0}, + {"Terminal_Punctuation", Var, 0}, + {"Thaana", Var, 0}, + {"Thai", Var, 0}, + {"Tibetan", Var, 0}, + {"Tifinagh", Var, 0}, + {"Tirhuta", Var, 4}, + {"Title", Var, 0}, + {"TitleCase", Const, 0}, + {"To", Func, 0}, + {"ToLower", Func, 0}, + {"ToTitle", Func, 0}, + {"ToUpper", Func, 0}, + {"Toto", Var, 21}, + {"TurkishCase", Var, 0}, + {"Ugaritic", Var, 0}, + {"Unified_Ideograph", Var, 0}, + {"Upper", Var, 0}, + {"UpperCase", Const, 0}, + {"UpperLower", Const, 0}, + {"Vai", Var, 0}, + {"Variation_Selector", Var, 0}, + {"Version", Const, 0}, + {"Vithkuqi", Var, 21}, + {"Wancho", Var, 14}, + {"Warang_Citi", Var, 4}, + {"White_Space", Var, 0}, + {"Yezidi", Var, 16}, + {"Yi", Var, 0}, + {"Z", Var, 0}, + {"Zanabazar_Square", Var, 10}, + {"Zl", Var, 0}, + {"Zp", Var, 0}, + {"Zs", Var, 0}, + }, + "unicode/utf16": { + {"AppendRune", Func, 20}, + {"Decode", Func, 0}, + {"DecodeRune", Func, 0}, + {"Encode", Func, 0}, + {"EncodeRune", Func, 0}, + {"IsSurrogate", Func, 0}, + {"RuneLen", Func, 23}, + }, + "unicode/utf8": { + {"AppendRune", Func, 18}, + {"DecodeLastRune", Func, 0}, + {"DecodeLastRuneInString", Func, 0}, + {"DecodeRune", Func, 0}, + {"DecodeRuneInString", Func, 0}, + {"EncodeRune", Func, 0}, + {"FullRune", Func, 0}, + {"FullRuneInString", Func, 0}, + {"MaxRune", Const, 0}, + {"RuneCount", Func, 0}, + {"RuneCountInString", Func, 0}, + {"RuneError", Const, 0}, + {"RuneLen", Func, 0}, + {"RuneSelf", Const, 0}, + {"RuneStart", Func, 0}, + {"UTFMax", Const, 0}, + {"Valid", Func, 0}, + {"ValidRune", Func, 1}, + {"ValidString", Func, 0}, + }, + "unique": { + {"(Handle).Value", Method, 23}, + {"Handle", Type, 23}, + {"Make", Func, 23}, + }, + "unsafe": { + {"Add", Func, 0}, + {"Alignof", Func, 0}, + {"Offsetof", Func, 0}, + {"Pointer", Type, 0}, + {"Sizeof", Func, 0}, + {"Slice", Func, 0}, + {"SliceData", Func, 0}, + {"String", Func, 0}, + {"StringData", Func, 0}, + }, +} diff --git a/vendor/golang.org/x/tools/internal/stdlib/stdlib.go b/vendor/golang.org/x/tools/internal/stdlib/stdlib.go new file mode 100644 index 0000000..9890401 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/stdlib/stdlib.go @@ -0,0 +1,97 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:generate go run generate.go + +// Package stdlib provides a table of all exported symbols in the +// standard library, along with the version at which they first +// appeared. +package stdlib + +import ( + "fmt" + "strings" +) + +type Symbol struct { + Name string + Kind Kind + Version Version // Go version that first included the symbol +} + +// A Kind indicates the kind of a symbol: +// function, variable, constant, type, and so on. +type Kind int8 + +const ( + Invalid Kind = iota // Example name: + Type // "Buffer" + Func // "Println" + Var // "EOF" + Const // "Pi" + Field // "Point.X" + Method // "(*Buffer).Grow" +) + +func (kind Kind) String() string { + return [...]string{ + Invalid: "invalid", + Type: "type", + Func: "func", + Var: "var", + Const: "const", + Field: "field", + Method: "method", + }[kind] +} + +// A Version represents a version of Go of the form "go1.%d". +type Version int8 + +// String returns a version string of the form "go1.23", without allocating. +func (v Version) String() string { return versions[v] } + +var versions [30]string // (increase constant as needed) + +func init() { + for i := range versions { + versions[i] = fmt.Sprintf("go1.%d", i) + } +} + +// HasPackage reports whether the specified package path is part of +// the standard library's public API. +func HasPackage(path string) bool { + _, ok := PackageSymbols[path] + return ok +} + +// SplitField splits the field symbol name into type and field +// components. It must be called only on Field symbols. +// +// Example: "File.Package" -> ("File", "Package") +func (sym *Symbol) SplitField() (typename, name string) { + if sym.Kind != Field { + panic("not a field") + } + typename, name, _ = strings.Cut(sym.Name, ".") + return +} + +// SplitMethod splits the method symbol name into pointer, receiver, +// and method components. It must be called only on Method symbols. +// +// Example: "(*Buffer).Grow" -> (true, "Buffer", "Grow") +func (sym *Symbol) SplitMethod() (ptr bool, recv, name string) { + if sym.Kind != Method { + panic("not a method") + } + recv, name, _ = strings.Cut(sym.Name, ".") + recv = recv[len("(") : len(recv)-len(")")] + ptr = recv[0] == '*' + if ptr { + recv = recv[len("*"):] + } + return +} diff --git a/vendor/golang.org/x/tools/internal/typeparams/common.go b/vendor/golang.org/x/tools/internal/typeparams/common.go index d0d0649..0b84acc 100644 --- a/vendor/golang.org/x/tools/internal/typeparams/common.go +++ b/vendor/golang.org/x/tools/internal/typeparams/common.go @@ -2,20 +2,10 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// Package typeparams contains common utilities for writing tools that interact -// with generic Go code, as introduced with Go 1.18. -// -// Many of the types and functions in this package are proxies for the new APIs -// introduced in the standard library with Go 1.18. For example, the -// typeparams.Union type is an alias for go/types.Union, and the ForTypeSpec -// function returns the value of the go/ast.TypeSpec.TypeParams field. At Go -// versions older than 1.18 these helpers are implemented as stubs, allowing -// users of this package to write code that handles generic constructs inline, -// even if the Go version being used to compile does not support generics. -// -// Additionally, this package contains common utilities for working with the -// new generic constructs, to supplement the standard library APIs. Notably, -// the StructuralTerms API computes a minimal representation of the structural +// Package typeparams contains common utilities for writing tools that +// interact with generic Go code, as introduced with Go 1.18. It +// supplements the standard library APIs. Notably, the StructuralTerms +// API computes a minimal representation of the structural // restrictions on a type parameter. // // An external version of these APIs is available in the @@ -23,7 +13,6 @@ package typeparams import ( - "fmt" "go/ast" "go/token" "go/types" @@ -42,7 +31,7 @@ func UnpackIndexExpr(n ast.Node) (x ast.Expr, lbrack token.Pos, indices []ast.Ex switch e := n.(type) { case *ast.IndexExpr: return e.X, e.Lbrack, []ast.Expr{e.Index}, e.Rbrack - case *IndexListExpr: + case *ast.IndexListExpr: return e.X, e.Lbrack, e.Indices, e.Rbrack } return nil, token.NoPos, nil, token.NoPos @@ -63,7 +52,7 @@ func PackIndexExpr(x ast.Expr, lbrack token.Pos, indices []ast.Expr, rbrack toke Rbrack: rbrack, } default: - return &IndexListExpr{ + return &ast.IndexListExpr{ X: x, Lbrack: lbrack, Indices: indices, @@ -72,73 +61,17 @@ func PackIndexExpr(x ast.Expr, lbrack token.Pos, indices []ast.Expr, rbrack toke } } -// IsTypeParam reports whether t is a type parameter. +// IsTypeParam reports whether t is a type parameter (or an alias of one). func IsTypeParam(t types.Type) bool { - _, ok := t.(*TypeParam) + _, ok := types.Unalias(t).(*types.TypeParam) return ok } -// OriginMethod returns the origin method associated with the method fn. -// For methods on a non-generic receiver base type, this is just -// fn. However, for methods with a generic receiver, OriginMethod returns the -// corresponding method in the method set of the origin type. -// -// As a special case, if fn is not a method (has no receiver), OriginMethod -// returns fn. -func OriginMethod(fn *types.Func) *types.Func { - recv := fn.Type().(*types.Signature).Recv() - if recv == nil { - return fn - } - base := recv.Type() - p, isPtr := base.(*types.Pointer) - if isPtr { - base = p.Elem() - } - named, isNamed := base.(*types.Named) - if !isNamed { - // Receiver is a *types.Interface. - return fn - } - if ForNamed(named).Len() == 0 { - // Receiver base has no type parameters, so we can avoid the lookup below. - return fn - } - orig := NamedTypeOrigin(named) - gfn, _, _ := types.LookupFieldOrMethod(orig, true, fn.Pkg(), fn.Name()) - - // This is a fix for a gopls crash (#60628) due to a go/types bug (#60634). In: - // package p - // type T *int - // func (*T) f() {} - // LookupFieldOrMethod(T, true, p, f)=nil, but NewMethodSet(*T)={(*T).f}. - // Here we make them consistent by force. - // (The go/types bug is general, but this workaround is reached only - // for generic T thanks to the early return above.) - if gfn == nil { - mset := types.NewMethodSet(types.NewPointer(orig)) - for i := 0; i < mset.Len(); i++ { - m := mset.At(i) - if m.Obj().Id() == fn.Id() { - gfn = m.Obj() - break - } - } - } - - // In golang/go#61196, we observe another crash, this time inexplicable. - if gfn == nil { - panic(fmt.Sprintf("missing origin method for %s.%s; named == origin: %t, named.NumMethods(): %d, origin.NumMethods(): %d", named, fn, named == orig, named.NumMethods(), orig.NumMethods())) - } - - return gfn.(*types.Func) -} - // GenericAssignableTo is a generalization of types.AssignableTo that // implements the following rule for uninstantiated generic types: // // If V and T are generic named types, then V is considered assignable to T if, -// for every possible instantation of V[A_1, ..., A_N], the instantiation +// for every possible instantiation of V[A_1, ..., A_N], the instantiation // T[A_1, ..., A_N] is valid and V[A_1, ..., A_N] implements T[A_1, ..., A_N]. // // If T has structural constraints, they must be satisfied by V. @@ -157,7 +90,10 @@ func OriginMethod(fn *types.Func) *types.Func { // // In this case, GenericAssignableTo reports that instantiations of Container // are assignable to the corresponding instantiation of Interface. -func GenericAssignableTo(ctxt *Context, V, T types.Type) bool { +func GenericAssignableTo(ctxt *types.Context, V, T types.Type) bool { + V = types.Unalias(V) + T = types.Unalias(T) + // If V and T are not both named, or do not have matching non-empty type // parameter lists, fall back on types.AssignableTo. @@ -167,9 +103,9 @@ func GenericAssignableTo(ctxt *Context, V, T types.Type) bool { return types.AssignableTo(V, T) } - vtparams := ForNamed(VN) - ttparams := ForNamed(TN) - if vtparams.Len() == 0 || vtparams.Len() != ttparams.Len() || NamedTypeArgs(VN).Len() != 0 || NamedTypeArgs(TN).Len() != 0 { + vtparams := VN.TypeParams() + ttparams := TN.TypeParams() + if vtparams.Len() == 0 || vtparams.Len() != ttparams.Len() || VN.TypeArgs().Len() != 0 || TN.TypeArgs().Len() != 0 { return types.AssignableTo(V, T) } @@ -182,7 +118,7 @@ func GenericAssignableTo(ctxt *Context, V, T types.Type) bool { // Minor optimization: ensure we share a context across the two // instantiations below. if ctxt == nil { - ctxt = NewContext() + ctxt = types.NewContext() } var targs []types.Type @@ -190,12 +126,12 @@ func GenericAssignableTo(ctxt *Context, V, T types.Type) bool { targs = append(targs, vtparams.At(i)) } - vinst, err := Instantiate(ctxt, V, targs, true) + vinst, err := types.Instantiate(ctxt, V, targs, true) if err != nil { panic("type parameters should satisfy their own constraints") } - tinst, err := Instantiate(ctxt, T, targs, true) + tinst, err := types.Instantiate(ctxt, T, targs, true) if err != nil { return false } diff --git a/vendor/golang.org/x/tools/internal/typeparams/coretype.go b/vendor/golang.org/x/tools/internal/typeparams/coretype.go index 993135e..6e83c6f 100644 --- a/vendor/golang.org/x/tools/internal/typeparams/coretype.go +++ b/vendor/golang.org/x/tools/internal/typeparams/coretype.go @@ -5,6 +5,7 @@ package typeparams import ( + "fmt" "go/types" ) @@ -17,7 +18,7 @@ func CoreType(T types.Type) types.Type { return U // for non-interface types, } - terms, err := _NormalTerms(U) + terms, err := NormalTerms(U) if len(terms) == 0 || err != nil { // len(terms) -> empty type set of interface. // err != nil => U is invalid, exceeds complexity bounds, or has an empty type set. @@ -63,7 +64,7 @@ func CoreType(T types.Type) types.Type { return ch } -// _NormalTerms returns a slice of terms representing the normalized structural +// NormalTerms returns a slice of terms representing the normalized structural // type restrictions of a type, if any. // // For all types other than *types.TypeParam, *types.Interface, and @@ -81,42 +82,69 @@ func CoreType(T types.Type) types.Type { // restrictions may be arbitrarily complex. For example, consider the // following: // -// type A interface{ ~string|~[]byte } +// type A interface{ ~string|~[]byte } // -// type B interface{ int|string } +// type B interface{ int|string } // -// type C interface { ~string|~int } +// type C interface { ~string|~int } // -// type T[P interface{ A|B; C }] int +// type T[P interface{ A|B; C }] int // // In this example, the structural type restriction of P is ~string|int: A|B // expands to ~string|~[]byte|int|string, which reduces to ~string|~[]byte|int, // which when intersected with C (~string|~int) yields ~string|int. // -// _NormalTerms computes these expansions and reductions, producing a +// NormalTerms computes these expansions and reductions, producing a // "normalized" form of the embeddings. A structural restriction is normalized // if it is a single union containing no interface terms, and is minimal in the // sense that removing any term changes the set of types satisfying the // constraint. It is left as a proof for the reader that, modulo sorting, there // is exactly one such normalized form. // -// Because the minimal representation always takes this form, _NormalTerms +// Because the minimal representation always takes this form, NormalTerms // returns a slice of tilde terms corresponding to the terms of the union in // the normalized structural restriction. An error is returned if the type is // invalid, exceeds complexity bounds, or has an empty type set. In the latter -// case, _NormalTerms returns ErrEmptyTypeSet. +// case, NormalTerms returns ErrEmptyTypeSet. // -// _NormalTerms makes no guarantees about the order of terms, except that it +// NormalTerms makes no guarantees about the order of terms, except that it // is deterministic. -func _NormalTerms(typ types.Type) ([]*Term, error) { - switch typ := typ.(type) { - case *TypeParam: +func NormalTerms(typ types.Type) ([]*types.Term, error) { + switch typ := typ.Underlying().(type) { + case *types.TypeParam: return StructuralTerms(typ) - case *Union: + case *types.Union: return UnionTermSet(typ) case *types.Interface: return InterfaceTermSet(typ) default: - return []*Term{NewTerm(false, typ)}, nil + return []*types.Term{types.NewTerm(false, typ)}, nil } } + +// Deref returns the type of the variable pointed to by t, +// if t's core type is a pointer; otherwise it returns t. +// +// Do not assume that Deref(T)==T implies T is not a pointer: +// consider "type T *T", for example. +// +// TODO(adonovan): ideally this would live in typesinternal, but that +// creates an import cycle. Move there when we melt this package down. +func Deref(t types.Type) types.Type { + if ptr, ok := CoreType(t).(*types.Pointer); ok { + return ptr.Elem() + } + return t +} + +// MustDeref returns the type of the variable pointed to by t. +// It panics if t's core type is not a pointer. +// +// TODO(adonovan): ideally this would live in typesinternal, but that +// creates an import cycle. Move there when we melt this package down. +func MustDeref(t types.Type) types.Type { + if ptr, ok := CoreType(t).(*types.Pointer); ok { + return ptr.Elem() + } + panic(fmt.Sprintf("%v is not a pointer", t)) +} diff --git a/vendor/golang.org/x/tools/internal/typeparams/enabled_go117.go b/vendor/golang.org/x/tools/internal/typeparams/enabled_go117.go deleted file mode 100644 index 1821239..0000000 --- a/vendor/golang.org/x/tools/internal/typeparams/enabled_go117.go +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright 2021 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !go1.18 -// +build !go1.18 - -package typeparams - -// Enabled reports whether type parameters are enabled in the current build -// environment. -const Enabled = false diff --git a/vendor/golang.org/x/tools/internal/typeparams/enabled_go118.go b/vendor/golang.org/x/tools/internal/typeparams/enabled_go118.go deleted file mode 100644 index d671488..0000000 --- a/vendor/golang.org/x/tools/internal/typeparams/enabled_go118.go +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright 2021 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build go1.18 -// +build go1.18 - -package typeparams - -// Note: this constant is in a separate file as this is the only acceptable -// diff between the <1.18 API of this package and the 1.18 API. - -// Enabled reports whether type parameters are enabled in the current build -// environment. -const Enabled = true diff --git a/vendor/golang.org/x/tools/internal/typeparams/free.go b/vendor/golang.org/x/tools/internal/typeparams/free.go new file mode 100644 index 0000000..3581082 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/typeparams/free.go @@ -0,0 +1,118 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package typeparams + +import ( + "go/types" +) + +// Free is a memoization of the set of free type parameters within a +// type. It makes a sequence of calls to [Free.Has] for overlapping +// types more efficient. The zero value is ready for use. +// +// NOTE: Adapted from go/types/infer.go. If it is later exported, factor. +type Free struct { + seen map[types.Type]bool +} + +// Has reports whether the specified type has a free type parameter. +func (w *Free) Has(typ types.Type) (res bool) { + // detect cycles + if x, ok := w.seen[typ]; ok { + return x + } + if w.seen == nil { + w.seen = make(map[types.Type]bool) + } + w.seen[typ] = false + defer func() { + w.seen[typ] = res + }() + + switch t := typ.(type) { + case nil, *types.Basic: // TODO(gri) should nil be handled here? + break + + case *types.Alias: + return w.Has(types.Unalias(t)) + + case *types.Array: + return w.Has(t.Elem()) + + case *types.Slice: + return w.Has(t.Elem()) + + case *types.Struct: + for i, n := 0, t.NumFields(); i < n; i++ { + if w.Has(t.Field(i).Type()) { + return true + } + } + + case *types.Pointer: + return w.Has(t.Elem()) + + case *types.Tuple: + n := t.Len() + for i := 0; i < n; i++ { + if w.Has(t.At(i).Type()) { + return true + } + } + + case *types.Signature: + // t.tparams may not be nil if we are looking at a signature + // of a generic function type (or an interface method) that is + // part of the type we're testing. We don't care about these type + // parameters. + // Similarly, the receiver of a method may declare (rather than + // use) type parameters, we don't care about those either. + // Thus, we only need to look at the input and result parameters. + return w.Has(t.Params()) || w.Has(t.Results()) + + case *types.Interface: + for i, n := 0, t.NumMethods(); i < n; i++ { + if w.Has(t.Method(i).Type()) { + return true + } + } + terms, err := InterfaceTermSet(t) + if err != nil { + return false // ill typed + } + for _, term := range terms { + if w.Has(term.Type()) { + return true + } + } + + case *types.Map: + return w.Has(t.Key()) || w.Has(t.Elem()) + + case *types.Chan: + return w.Has(t.Elem()) + + case *types.Named: + args := t.TypeArgs() + // TODO(taking): this does not match go/types/infer.go. Check with rfindley. + if params := t.TypeParams(); params.Len() > args.Len() { + return true + } + for i, n := 0, args.Len(); i < n; i++ { + if w.Has(args.At(i)) { + return true + } + } + return w.Has(t.Underlying()) // recurse for types local to parameterized functions + + case *types.TypeParam: + return true + + default: + panic(t) // unreachable + } + + return false +} diff --git a/vendor/golang.org/x/tools/internal/typeparams/normalize.go b/vendor/golang.org/x/tools/internal/typeparams/normalize.go index 9c631b6..93c80fd 100644 --- a/vendor/golang.org/x/tools/internal/typeparams/normalize.go +++ b/vendor/golang.org/x/tools/internal/typeparams/normalize.go @@ -60,7 +60,7 @@ var ErrEmptyTypeSet = errors.New("empty type set") // // StructuralTerms makes no guarantees about the order of terms, except that it // is deterministic. -func StructuralTerms(tparam *TypeParam) ([]*Term, error) { +func StructuralTerms(tparam *types.TypeParam) ([]*types.Term, error) { constraint := tparam.Constraint() if constraint == nil { return nil, fmt.Errorf("%s has nil constraint", tparam) @@ -78,7 +78,7 @@ func StructuralTerms(tparam *TypeParam) ([]*Term, error) { // // See the documentation of StructuralTerms for more information on // normalization. -func InterfaceTermSet(iface *types.Interface) ([]*Term, error) { +func InterfaceTermSet(iface *types.Interface) ([]*types.Term, error) { return computeTermSet(iface) } @@ -88,11 +88,11 @@ func InterfaceTermSet(iface *types.Interface) ([]*Term, error) { // // See the documentation of StructuralTerms for more information on // normalization. -func UnionTermSet(union *Union) ([]*Term, error) { +func UnionTermSet(union *types.Union) ([]*types.Term, error) { return computeTermSet(union) } -func computeTermSet(typ types.Type) ([]*Term, error) { +func computeTermSet(typ types.Type) ([]*types.Term, error) { tset, err := computeTermSetInternal(typ, make(map[types.Type]*termSet), 0) if err != nil { return nil, err @@ -103,9 +103,9 @@ func computeTermSet(typ types.Type) ([]*Term, error) { if tset.terms.isAll() { return nil, nil } - var terms []*Term + var terms []*types.Term for _, term := range tset.terms { - terms = append(terms, NewTerm(term.tilde, term.typ)) + terms = append(terms, types.NewTerm(term.tilde, term.typ)) } return terms, nil } @@ -162,7 +162,7 @@ func computeTermSetInternal(t types.Type, seen map[types.Type]*termSet, depth in tset.terms = allTermlist for i := 0; i < u.NumEmbeddeds(); i++ { embedded := u.EmbeddedType(i) - if _, ok := embedded.Underlying().(*TypeParam); ok { + if _, ok := embedded.Underlying().(*types.TypeParam); ok { return nil, fmt.Errorf("invalid embedded type %T", embedded) } tset2, err := computeTermSetInternal(embedded, seen, depth+1) @@ -171,7 +171,7 @@ func computeTermSetInternal(t types.Type, seen map[types.Type]*termSet, depth in } tset.terms = tset.terms.intersect(tset2.terms) } - case *Union: + case *types.Union: // The term set of a union is the union of term sets of its terms. tset.terms = nil for i := 0; i < u.Len(); i++ { @@ -184,7 +184,7 @@ func computeTermSetInternal(t types.Type, seen map[types.Type]*termSet, depth in return nil, err } terms = tset2.terms - case *TypeParam, *Union: + case *types.TypeParam, *types.Union: // A stand-alone type parameter or union is not permitted as union // term. return nil, fmt.Errorf("invalid union term %T", t) @@ -199,7 +199,7 @@ func computeTermSetInternal(t types.Type, seen map[types.Type]*termSet, depth in return nil, fmt.Errorf("exceeded max term count %d", maxTermCount) } } - case *TypeParam: + case *types.TypeParam: panic("unreachable") default: // For all other types, the term set is just a single non-tilde term diff --git a/vendor/golang.org/x/tools/internal/typeparams/termlist.go b/vendor/golang.org/x/tools/internal/typeparams/termlist.go index 933106a..cbd12f8 100644 --- a/vendor/golang.org/x/tools/internal/typeparams/termlist.go +++ b/vendor/golang.org/x/tools/internal/typeparams/termlist.go @@ -30,7 +30,7 @@ func (xl termlist) String() string { var buf bytes.Buffer for i, x := range xl { if i > 0 { - buf.WriteString(" ∪ ") + buf.WriteString(" | ") } buf.WriteString(x.String()) } diff --git a/vendor/golang.org/x/tools/internal/typeparams/typeparams_go117.go b/vendor/golang.org/x/tools/internal/typeparams/typeparams_go117.go deleted file mode 100644 index 7ed86e1..0000000 --- a/vendor/golang.org/x/tools/internal/typeparams/typeparams_go117.go +++ /dev/null @@ -1,197 +0,0 @@ -// Copyright 2021 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !go1.18 -// +build !go1.18 - -package typeparams - -import ( - "go/ast" - "go/token" - "go/types" -) - -func unsupported() { - panic("type parameters are unsupported at this go version") -} - -// IndexListExpr is a placeholder type, as type parameters are not supported at -// this Go version. Its methods panic on use. -type IndexListExpr struct { - ast.Expr - X ast.Expr // expression - Lbrack token.Pos // position of "[" - Indices []ast.Expr // index expressions - Rbrack token.Pos // position of "]" -} - -// ForTypeSpec returns an empty field list, as type parameters on not supported -// at this Go version. -func ForTypeSpec(*ast.TypeSpec) *ast.FieldList { - return nil -} - -// ForFuncType returns an empty field list, as type parameters are not -// supported at this Go version. -func ForFuncType(*ast.FuncType) *ast.FieldList { - return nil -} - -// TypeParam is a placeholder type, as type parameters are not supported at -// this Go version. Its methods panic on use. -type TypeParam struct{ types.Type } - -func (*TypeParam) Index() int { unsupported(); return 0 } -func (*TypeParam) Constraint() types.Type { unsupported(); return nil } -func (*TypeParam) Obj() *types.TypeName { unsupported(); return nil } - -// TypeParamList is a placeholder for an empty type parameter list. -type TypeParamList struct{} - -func (*TypeParamList) Len() int { return 0 } -func (*TypeParamList) At(int) *TypeParam { unsupported(); return nil } - -// TypeList is a placeholder for an empty type list. -type TypeList struct{} - -func (*TypeList) Len() int { return 0 } -func (*TypeList) At(int) types.Type { unsupported(); return nil } - -// NewTypeParam is unsupported at this Go version, and panics. -func NewTypeParam(name *types.TypeName, constraint types.Type) *TypeParam { - unsupported() - return nil -} - -// SetTypeParamConstraint is unsupported at this Go version, and panics. -func SetTypeParamConstraint(tparam *TypeParam, constraint types.Type) { - unsupported() -} - -// NewSignatureType calls types.NewSignature, panicking if recvTypeParams or -// typeParams is non-empty. -func NewSignatureType(recv *types.Var, recvTypeParams, typeParams []*TypeParam, params, results *types.Tuple, variadic bool) *types.Signature { - if len(recvTypeParams) != 0 || len(typeParams) != 0 { - panic("signatures cannot have type parameters at this Go version") - } - return types.NewSignature(recv, params, results, variadic) -} - -// ForSignature returns an empty slice. -func ForSignature(*types.Signature) *TypeParamList { - return nil -} - -// RecvTypeParams returns a nil slice. -func RecvTypeParams(sig *types.Signature) *TypeParamList { - return nil -} - -// IsComparable returns false, as no interfaces are type-restricted at this Go -// version. -func IsComparable(*types.Interface) bool { - return false -} - -// IsMethodSet returns true, as no interfaces are type-restricted at this Go -// version. -func IsMethodSet(*types.Interface) bool { - return true -} - -// IsImplicit returns false, as no interfaces are implicit at this Go version. -func IsImplicit(*types.Interface) bool { - return false -} - -// MarkImplicit does nothing, because this Go version does not have implicit -// interfaces. -func MarkImplicit(*types.Interface) {} - -// ForNamed returns an empty type parameter list, as type parameters are not -// supported at this Go version. -func ForNamed(*types.Named) *TypeParamList { - return nil -} - -// SetForNamed panics if tparams is non-empty. -func SetForNamed(_ *types.Named, tparams []*TypeParam) { - if len(tparams) > 0 { - unsupported() - } -} - -// NamedTypeArgs returns nil. -func NamedTypeArgs(*types.Named) *TypeList { - return nil -} - -// NamedTypeOrigin is the identity method at this Go version. -func NamedTypeOrigin(named *types.Named) *types.Named { - return named -} - -// Term holds information about a structural type restriction. -type Term struct { - tilde bool - typ types.Type -} - -func (m *Term) Tilde() bool { return m.tilde } -func (m *Term) Type() types.Type { return m.typ } -func (m *Term) String() string { - pre := "" - if m.tilde { - pre = "~" - } - return pre + m.typ.String() -} - -// NewTerm is unsupported at this Go version, and panics. -func NewTerm(tilde bool, typ types.Type) *Term { - return &Term{tilde, typ} -} - -// Union is a placeholder type, as type parameters are not supported at this Go -// version. Its methods panic on use. -type Union struct{ types.Type } - -func (*Union) Len() int { return 0 } -func (*Union) Term(i int) *Term { unsupported(); return nil } - -// NewUnion is unsupported at this Go version, and panics. -func NewUnion(terms []*Term) *Union { - unsupported() - return nil -} - -// InitInstanceInfo is a noop at this Go version. -func InitInstanceInfo(*types.Info) {} - -// Instance is a placeholder type, as type parameters are not supported at this -// Go version. -type Instance struct { - TypeArgs *TypeList - Type types.Type -} - -// GetInstances returns a nil map, as type parameters are not supported at this -// Go version. -func GetInstances(info *types.Info) map[*ast.Ident]Instance { return nil } - -// Context is a placeholder type, as type parameters are not supported at -// this Go version. -type Context struct{} - -// NewContext returns a placeholder Context instance. -func NewContext() *Context { - return &Context{} -} - -// Instantiate is unsupported on this Go version, and panics. -func Instantiate(ctxt *Context, typ types.Type, targs []types.Type, validate bool) (types.Type, error) { - unsupported() - return nil, nil -} diff --git a/vendor/golang.org/x/tools/internal/typeparams/typeparams_go118.go b/vendor/golang.org/x/tools/internal/typeparams/typeparams_go118.go deleted file mode 100644 index cf301af..0000000 --- a/vendor/golang.org/x/tools/internal/typeparams/typeparams_go118.go +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright 2021 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build go1.18 -// +build go1.18 - -package typeparams - -import ( - "go/ast" - "go/types" -) - -// IndexListExpr is an alias for ast.IndexListExpr. -type IndexListExpr = ast.IndexListExpr - -// ForTypeSpec returns n.TypeParams. -func ForTypeSpec(n *ast.TypeSpec) *ast.FieldList { - if n == nil { - return nil - } - return n.TypeParams -} - -// ForFuncType returns n.TypeParams. -func ForFuncType(n *ast.FuncType) *ast.FieldList { - if n == nil { - return nil - } - return n.TypeParams -} - -// TypeParam is an alias for types.TypeParam -type TypeParam = types.TypeParam - -// TypeParamList is an alias for types.TypeParamList -type TypeParamList = types.TypeParamList - -// TypeList is an alias for types.TypeList -type TypeList = types.TypeList - -// NewTypeParam calls types.NewTypeParam. -func NewTypeParam(name *types.TypeName, constraint types.Type) *TypeParam { - return types.NewTypeParam(name, constraint) -} - -// SetTypeParamConstraint calls tparam.SetConstraint(constraint). -func SetTypeParamConstraint(tparam *TypeParam, constraint types.Type) { - tparam.SetConstraint(constraint) -} - -// NewSignatureType calls types.NewSignatureType. -func NewSignatureType(recv *types.Var, recvTypeParams, typeParams []*TypeParam, params, results *types.Tuple, variadic bool) *types.Signature { - return types.NewSignatureType(recv, recvTypeParams, typeParams, params, results, variadic) -} - -// ForSignature returns sig.TypeParams() -func ForSignature(sig *types.Signature) *TypeParamList { - return sig.TypeParams() -} - -// RecvTypeParams returns sig.RecvTypeParams(). -func RecvTypeParams(sig *types.Signature) *TypeParamList { - return sig.RecvTypeParams() -} - -// IsComparable calls iface.IsComparable(). -func IsComparable(iface *types.Interface) bool { - return iface.IsComparable() -} - -// IsMethodSet calls iface.IsMethodSet(). -func IsMethodSet(iface *types.Interface) bool { - return iface.IsMethodSet() -} - -// IsImplicit calls iface.IsImplicit(). -func IsImplicit(iface *types.Interface) bool { - return iface.IsImplicit() -} - -// MarkImplicit calls iface.MarkImplicit(). -func MarkImplicit(iface *types.Interface) { - iface.MarkImplicit() -} - -// ForNamed extracts the (possibly empty) type parameter object list from -// named. -func ForNamed(named *types.Named) *TypeParamList { - return named.TypeParams() -} - -// SetForNamed sets the type params tparams on n. Each tparam must be of -// dynamic type *types.TypeParam. -func SetForNamed(n *types.Named, tparams []*TypeParam) { - n.SetTypeParams(tparams) -} - -// NamedTypeArgs returns named.TypeArgs(). -func NamedTypeArgs(named *types.Named) *TypeList { - return named.TypeArgs() -} - -// NamedTypeOrigin returns named.Orig(). -func NamedTypeOrigin(named *types.Named) *types.Named { - return named.Origin() -} - -// Term is an alias for types.Term. -type Term = types.Term - -// NewTerm calls types.NewTerm. -func NewTerm(tilde bool, typ types.Type) *Term { - return types.NewTerm(tilde, typ) -} - -// Union is an alias for types.Union -type Union = types.Union - -// NewUnion calls types.NewUnion. -func NewUnion(terms []*Term) *Union { - return types.NewUnion(terms) -} - -// InitInstanceInfo initializes info to record information about type and -// function instances. -func InitInstanceInfo(info *types.Info) { - info.Instances = make(map[*ast.Ident]types.Instance) -} - -// Instance is an alias for types.Instance. -type Instance = types.Instance - -// GetInstances returns info.Instances. -func GetInstances(info *types.Info) map[*ast.Ident]Instance { - return info.Instances -} - -// Context is an alias for types.Context. -type Context = types.Context - -// NewContext calls types.NewContext. -func NewContext() *Context { - return types.NewContext() -} - -// Instantiate calls types.Instantiate. -func Instantiate(ctxt *Context, typ types.Type, targs []types.Type, validate bool) (types.Type, error) { - return types.Instantiate(ctxt, typ, targs, validate) -} diff --git a/vendor/golang.org/x/tools/internal/typeparams/typeterm.go b/vendor/golang.org/x/tools/internal/typeparams/typeterm.go index 7ddee28..7350bb7 100644 --- a/vendor/golang.org/x/tools/internal/typeparams/typeterm.go +++ b/vendor/golang.org/x/tools/internal/typeparams/typeterm.go @@ -10,11 +10,10 @@ import "go/types" // A term describes elementary type sets: // -// ∅: (*term)(nil) == ∅ // set of no types (empty set) -// 𝓤: &term{} == 𝓤 // set of all types (𝓤niverse) -// T: &term{false, T} == {T} // set of type T -// ~t: &term{true, t} == {t' | under(t') == t} // set of types with underlying type t -// +// ∅: (*term)(nil) == ∅ // set of no types (empty set) +// 𝓤: &term{} == 𝓤 // set of all types (𝓤niverse) +// T: &term{false, T} == {T} // set of type T +// ~t: &term{true, t} == {t' | under(t') == t} // set of types with underlying type t type term struct { tilde bool // valid if typ != nil typ types.Type diff --git a/vendor/golang.org/x/tools/internal/typesinternal/element.go b/vendor/golang.org/x/tools/internal/typesinternal/element.go new file mode 100644 index 0000000..4957f02 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/typesinternal/element.go @@ -0,0 +1,133 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package typesinternal + +import ( + "fmt" + "go/types" + + "golang.org/x/tools/go/types/typeutil" +) + +// ForEachElement calls f for type T and each type reachable from its +// type through reflection. It does this by recursively stripping off +// type constructors; in addition, for each named type N, the type *N +// is added to the result as it may have additional methods. +// +// The caller must provide an initially empty set used to de-duplicate +// identical types, potentially across multiple calls to ForEachElement. +// (Its final value holds all the elements seen, matching the arguments +// passed to f.) +// +// TODO(adonovan): share/harmonize with go/callgraph/rta. +func ForEachElement(rtypes *typeutil.Map, msets *typeutil.MethodSetCache, T types.Type, f func(types.Type)) { + var visit func(T types.Type, skip bool) + visit = func(T types.Type, skip bool) { + if !skip { + if seen, _ := rtypes.Set(T, true).(bool); seen { + return // de-dup + } + + f(T) // notify caller of new element type + } + + // Recursion over signatures of each method. + tmset := msets.MethodSet(T) + for i := 0; i < tmset.Len(); i++ { + sig := tmset.At(i).Type().(*types.Signature) + // It is tempting to call visit(sig, false) + // but, as noted in golang.org/cl/65450043, + // the Signature.Recv field is ignored by + // types.Identical and typeutil.Map, which + // is confusing at best. + // + // More importantly, the true signature rtype + // reachable from a method using reflection + // has no receiver but an extra ordinary parameter. + // For the Read method of io.Reader we want: + // func(Reader, []byte) (int, error) + // but here sig is: + // func([]byte) (int, error) + // with .Recv = Reader (though it is hard to + // notice because it doesn't affect Signature.String + // or types.Identical). + // + // TODO(adonovan): construct and visit the correct + // non-method signature with an extra parameter + // (though since unnamed func types have no methods + // there is essentially no actual demand for this). + // + // TODO(adonovan): document whether or not it is + // safe to skip non-exported methods (as RTA does). + visit(sig.Params(), true) // skip the Tuple + visit(sig.Results(), true) // skip the Tuple + } + + switch T := T.(type) { + case *types.Alias: + visit(types.Unalias(T), skip) // emulates the pre-Alias behavior + + case *types.Basic: + // nop + + case *types.Interface: + // nop---handled by recursion over method set. + + case *types.Pointer: + visit(T.Elem(), false) + + case *types.Slice: + visit(T.Elem(), false) + + case *types.Chan: + visit(T.Elem(), false) + + case *types.Map: + visit(T.Key(), false) + visit(T.Elem(), false) + + case *types.Signature: + if T.Recv() != nil { + panic(fmt.Sprintf("Signature %s has Recv %s", T, T.Recv())) + } + visit(T.Params(), true) // skip the Tuple + visit(T.Results(), true) // skip the Tuple + + case *types.Named: + // A pointer-to-named type can be derived from a named + // type via reflection. It may have methods too. + visit(types.NewPointer(T), false) + + // Consider 'type T struct{S}' where S has methods. + // Reflection provides no way to get from T to struct{S}, + // only to S, so the method set of struct{S} is unwanted, + // so set 'skip' flag during recursion. + visit(T.Underlying(), true) // skip the unnamed type + + case *types.Array: + visit(T.Elem(), false) + + case *types.Struct: + for i, n := 0, T.NumFields(); i < n; i++ { + // TODO(adonovan): document whether or not + // it is safe to skip non-exported fields. + visit(T.Field(i).Type(), false) + } + + case *types.Tuple: + for i, n := 0, T.Len(); i < n; i++ { + visit(T.At(i).Type(), false) + } + + case *types.TypeParam, *types.Union: + // forEachReachable must not be called on parameterized types. + panic(T) + + default: + panic(T) + } + } + visit(T, false) +} diff --git a/vendor/golang.org/x/tools/internal/typesinternal/errorcode.go b/vendor/golang.org/x/tools/internal/typesinternal/errorcode.go new file mode 100644 index 0000000..131caab --- /dev/null +++ b/vendor/golang.org/x/tools/internal/typesinternal/errorcode.go @@ -0,0 +1,1560 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package typesinternal + +//go:generate stringer -type=ErrorCode + +type ErrorCode int + +// This file defines the error codes that can be produced during type-checking. +// Collectively, these codes provide an identifier that may be used to +// implement special handling for certain types of errors. +// +// Error codes should be fine-grained enough that the exact nature of the error +// can be easily determined, but coarse enough that they are not an +// implementation detail of the type checking algorithm. As a rule-of-thumb, +// errors should be considered equivalent if there is a theoretical refactoring +// of the type checker in which they are emitted in exactly one place. For +// example, the type checker emits different error messages for "too many +// arguments" and "too few arguments", but one can imagine an alternative type +// checker where this check instead just emits a single "wrong number of +// arguments", so these errors should have the same code. +// +// Error code names should be as brief as possible while retaining accuracy and +// distinctiveness. In most cases names should start with an adjective +// describing the nature of the error (e.g. "invalid", "unused", "misplaced"), +// and end with a noun identifying the relevant language object. For example, +// "DuplicateDecl" or "InvalidSliceExpr". For brevity, naming follows the +// convention that "bad" implies a problem with syntax, and "invalid" implies a +// problem with types. + +const ( + // InvalidSyntaxTree occurs if an invalid syntax tree is provided + // to the type checker. It should never happen. + InvalidSyntaxTree ErrorCode = -1 +) + +const ( + _ ErrorCode = iota + + // Test is reserved for errors that only apply while in self-test mode. + Test + + /* package names */ + + // BlankPkgName occurs when a package name is the blank identifier "_". + // + // Per the spec: + // "The PackageName must not be the blank identifier." + BlankPkgName + + // MismatchedPkgName occurs when a file's package name doesn't match the + // package name already established by other files. + MismatchedPkgName + + // InvalidPkgUse occurs when a package identifier is used outside of a + // selector expression. + // + // Example: + // import "fmt" + // + // var _ = fmt + InvalidPkgUse + + /* imports */ + + // BadImportPath occurs when an import path is not valid. + BadImportPath + + // BrokenImport occurs when importing a package fails. + // + // Example: + // import "amissingpackage" + BrokenImport + + // ImportCRenamed occurs when the special import "C" is renamed. "C" is a + // pseudo-package, and must not be renamed. + // + // Example: + // import _ "C" + ImportCRenamed + + // UnusedImport occurs when an import is unused. + // + // Example: + // import "fmt" + // + // func main() {} + UnusedImport + + /* initialization */ + + // InvalidInitCycle occurs when an invalid cycle is detected within the + // initialization graph. + // + // Example: + // var x int = f() + // + // func f() int { return x } + InvalidInitCycle + + /* decls */ + + // DuplicateDecl occurs when an identifier is declared multiple times. + // + // Example: + // var x = 1 + // var x = 2 + DuplicateDecl + + // InvalidDeclCycle occurs when a declaration cycle is not valid. + // + // Example: + // import "unsafe" + // + // type T struct { + // a [n]int + // } + // + // var n = unsafe.Sizeof(T{}) + InvalidDeclCycle + + // InvalidTypeCycle occurs when a cycle in type definitions results in a + // type that is not well-defined. + // + // Example: + // import "unsafe" + // + // type T [unsafe.Sizeof(T{})]int + InvalidTypeCycle + + /* decls > const */ + + // InvalidConstInit occurs when a const declaration has a non-constant + // initializer. + // + // Example: + // var x int + // const _ = x + InvalidConstInit + + // InvalidConstVal occurs when a const value cannot be converted to its + // target type. + // + // TODO(findleyr): this error code and example are not very clear. Consider + // removing it. + // + // Example: + // const _ = 1 << "hello" + InvalidConstVal + + // InvalidConstType occurs when the underlying type in a const declaration + // is not a valid constant type. + // + // Example: + // const c *int = 4 + InvalidConstType + + /* decls > var (+ other variable assignment codes) */ + + // UntypedNilUse occurs when the predeclared (untyped) value nil is used to + // initialize a variable declared without an explicit type. + // + // Example: + // var x = nil + UntypedNilUse + + // WrongAssignCount occurs when the number of values on the right-hand side + // of an assignment or initialization expression does not match the number + // of variables on the left-hand side. + // + // Example: + // var x = 1, 2 + WrongAssignCount + + // UnassignableOperand occurs when the left-hand side of an assignment is + // not assignable. + // + // Example: + // func f() { + // const c = 1 + // c = 2 + // } + UnassignableOperand + + // NoNewVar occurs when a short variable declaration (':=') does not declare + // new variables. + // + // Example: + // func f() { + // x := 1 + // x := 2 + // } + NoNewVar + + // MultiValAssignOp occurs when an assignment operation (+=, *=, etc) does + // not have single-valued left-hand or right-hand side. + // + // Per the spec: + // "In assignment operations, both the left- and right-hand expression lists + // must contain exactly one single-valued expression" + // + // Example: + // func f() int { + // x, y := 1, 2 + // x, y += 1 + // return x + y + // } + MultiValAssignOp + + // InvalidIfaceAssign occurs when a value of type T is used as an + // interface, but T does not implement a method of the expected interface. + // + // Example: + // type I interface { + // f() + // } + // + // type T int + // + // var x I = T(1) + InvalidIfaceAssign + + // InvalidChanAssign occurs when a chan assignment is invalid. + // + // Per the spec, a value x is assignable to a channel type T if: + // "x is a bidirectional channel value, T is a channel type, x's type V and + // T have identical element types, and at least one of V or T is not a + // defined type." + // + // Example: + // type T1 chan int + // type T2 chan int + // + // var x T1 + // // Invalid assignment because both types are named + // var _ T2 = x + InvalidChanAssign + + // IncompatibleAssign occurs when the type of the right-hand side expression + // in an assignment cannot be assigned to the type of the variable being + // assigned. + // + // Example: + // var x []int + // var _ int = x + IncompatibleAssign + + // UnaddressableFieldAssign occurs when trying to assign to a struct field + // in a map value. + // + // Example: + // func f() { + // m := make(map[string]struct{i int}) + // m["foo"].i = 42 + // } + UnaddressableFieldAssign + + /* decls > type (+ other type expression codes) */ + + // NotAType occurs when the identifier used as the underlying type in a type + // declaration or the right-hand side of a type alias does not denote a type. + // + // Example: + // var S = 2 + // + // type T S + NotAType + + // InvalidArrayLen occurs when an array length is not a constant value. + // + // Example: + // var n = 3 + // var _ = [n]int{} + InvalidArrayLen + + // BlankIfaceMethod occurs when a method name is '_'. + // + // Per the spec: + // "The name of each explicitly specified method must be unique and not + // blank." + // + // Example: + // type T interface { + // _(int) + // } + BlankIfaceMethod + + // IncomparableMapKey occurs when a map key type does not support the == and + // != operators. + // + // Per the spec: + // "The comparison operators == and != must be fully defined for operands of + // the key type; thus the key type must not be a function, map, or slice." + // + // Example: + // var x map[T]int + // + // type T []int + IncomparableMapKey + + // InvalidIfaceEmbed occurs when a non-interface type is embedded in an + // interface. + // + // Example: + // type T struct {} + // + // func (T) m() + // + // type I interface { + // T + // } + InvalidIfaceEmbed + + // InvalidPtrEmbed occurs when an embedded field is of the pointer form *T, + // and T itself is itself a pointer, an unsafe.Pointer, or an interface. + // + // Per the spec: + // "An embedded field must be specified as a type name T or as a pointer to + // a non-interface type name *T, and T itself may not be a pointer type." + // + // Example: + // type T *int + // + // type S struct { + // *T + // } + InvalidPtrEmbed + + /* decls > func and method */ + + // BadRecv occurs when a method declaration does not have exactly one + // receiver parameter. + // + // Example: + // func () _() {} + BadRecv + + // InvalidRecv occurs when a receiver type expression is not of the form T + // or *T, or T is a pointer type. + // + // Example: + // type T struct {} + // + // func (**T) m() {} + InvalidRecv + + // DuplicateFieldAndMethod occurs when an identifier appears as both a field + // and method name. + // + // Example: + // type T struct { + // m int + // } + // + // func (T) m() {} + DuplicateFieldAndMethod + + // DuplicateMethod occurs when two methods on the same receiver type have + // the same name. + // + // Example: + // type T struct {} + // func (T) m() {} + // func (T) m(i int) int { return i } + DuplicateMethod + + /* decls > special */ + + // InvalidBlank occurs when a blank identifier is used as a value or type. + // + // Per the spec: + // "The blank identifier may appear as an operand only on the left-hand side + // of an assignment." + // + // Example: + // var x = _ + InvalidBlank + + // InvalidIota occurs when the predeclared identifier iota is used outside + // of a constant declaration. + // + // Example: + // var x = iota + InvalidIota + + // MissingInitBody occurs when an init function is missing its body. + // + // Example: + // func init() + MissingInitBody + + // InvalidInitSig occurs when an init function declares parameters or + // results. + // + // Example: + // func init() int { return 1 } + InvalidInitSig + + // InvalidInitDecl occurs when init is declared as anything other than a + // function. + // + // Example: + // var init = 1 + InvalidInitDecl + + // InvalidMainDecl occurs when main is declared as anything other than a + // function, in a main package. + InvalidMainDecl + + /* exprs */ + + // TooManyValues occurs when a function returns too many values for the + // expression context in which it is used. + // + // Example: + // func ReturnTwo() (int, int) { + // return 1, 2 + // } + // + // var x = ReturnTwo() + TooManyValues + + // NotAnExpr occurs when a type expression is used where a value expression + // is expected. + // + // Example: + // type T struct {} + // + // func f() { + // T + // } + NotAnExpr + + /* exprs > const */ + + // TruncatedFloat occurs when a float constant is truncated to an integer + // value. + // + // Example: + // var _ int = 98.6 + TruncatedFloat + + // NumericOverflow occurs when a numeric constant overflows its target type. + // + // Example: + // var x int8 = 1000 + NumericOverflow + + /* exprs > operation */ + + // UndefinedOp occurs when an operator is not defined for the type(s) used + // in an operation. + // + // Example: + // var c = "a" - "b" + UndefinedOp + + // MismatchedTypes occurs when operand types are incompatible in a binary + // operation. + // + // Example: + // var a = "hello" + // var b = 1 + // var c = a - b + MismatchedTypes + + // DivByZero occurs when a division operation is provable at compile + // time to be a division by zero. + // + // Example: + // const divisor = 0 + // var x int = 1/divisor + DivByZero + + // NonNumericIncDec occurs when an increment or decrement operator is + // applied to a non-numeric value. + // + // Example: + // func f() { + // var c = "c" + // c++ + // } + NonNumericIncDec + + /* exprs > ptr */ + + // UnaddressableOperand occurs when the & operator is applied to an + // unaddressable expression. + // + // Example: + // var x = &1 + UnaddressableOperand + + // InvalidIndirection occurs when a non-pointer value is indirected via the + // '*' operator. + // + // Example: + // var x int + // var y = *x + InvalidIndirection + + /* exprs > [] */ + + // NonIndexableOperand occurs when an index operation is applied to a value + // that cannot be indexed. + // + // Example: + // var x = 1 + // var y = x[1] + NonIndexableOperand + + // InvalidIndex occurs when an index argument is not of integer type, + // negative, or out-of-bounds. + // + // Example: + // var s = [...]int{1,2,3} + // var x = s[5] + // + // Example: + // var s = []int{1,2,3} + // var _ = s[-1] + // + // Example: + // var s = []int{1,2,3} + // var i string + // var _ = s[i] + InvalidIndex + + // SwappedSliceIndices occurs when constant indices in a slice expression + // are decreasing in value. + // + // Example: + // var _ = []int{1,2,3}[2:1] + SwappedSliceIndices + + /* operators > slice */ + + // NonSliceableOperand occurs when a slice operation is applied to a value + // whose type is not sliceable, or is unaddressable. + // + // Example: + // var x = [...]int{1, 2, 3}[:1] + // + // Example: + // var x = 1 + // var y = 1[:1] + NonSliceableOperand + + // InvalidSliceExpr occurs when a three-index slice expression (a[x:y:z]) is + // applied to a string. + // + // Example: + // var s = "hello" + // var x = s[1:2:3] + InvalidSliceExpr + + /* exprs > shift */ + + // InvalidShiftCount occurs when the right-hand side of a shift operation is + // either non-integer, negative, or too large. + // + // Example: + // var ( + // x string + // y int = 1 << x + // ) + InvalidShiftCount + + // InvalidShiftOperand occurs when the shifted operand is not an integer. + // + // Example: + // var s = "hello" + // var x = s << 2 + InvalidShiftOperand + + /* exprs > chan */ + + // InvalidReceive occurs when there is a channel receive from a value that + // is either not a channel, or is a send-only channel. + // + // Example: + // func f() { + // var x = 1 + // <-x + // } + InvalidReceive + + // InvalidSend occurs when there is a channel send to a value that is not a + // channel, or is a receive-only channel. + // + // Example: + // func f() { + // var x = 1 + // x <- "hello!" + // } + InvalidSend + + /* exprs > literal */ + + // DuplicateLitKey occurs when an index is duplicated in a slice, array, or + // map literal. + // + // Example: + // var _ = []int{0:1, 0:2} + // + // Example: + // var _ = map[string]int{"a": 1, "a": 2} + DuplicateLitKey + + // MissingLitKey occurs when a map literal is missing a key expression. + // + // Example: + // var _ = map[string]int{1} + MissingLitKey + + // InvalidLitIndex occurs when the key in a key-value element of a slice or + // array literal is not an integer constant. + // + // Example: + // var i = 0 + // var x = []string{i: "world"} + InvalidLitIndex + + // OversizeArrayLit occurs when an array literal exceeds its length. + // + // Example: + // var _ = [2]int{1,2,3} + OversizeArrayLit + + // MixedStructLit occurs when a struct literal contains a mix of positional + // and named elements. + // + // Example: + // var _ = struct{i, j int}{i: 1, 2} + MixedStructLit + + // InvalidStructLit occurs when a positional struct literal has an incorrect + // number of values. + // + // Example: + // var _ = struct{i, j int}{1,2,3} + InvalidStructLit + + // MissingLitField occurs when a struct literal refers to a field that does + // not exist on the struct type. + // + // Example: + // var _ = struct{i int}{j: 2} + MissingLitField + + // DuplicateLitField occurs when a struct literal contains duplicated + // fields. + // + // Example: + // var _ = struct{i int}{i: 1, i: 2} + DuplicateLitField + + // UnexportedLitField occurs when a positional struct literal implicitly + // assigns an unexported field of an imported type. + UnexportedLitField + + // InvalidLitField occurs when a field name is not a valid identifier. + // + // Example: + // var _ = struct{i int}{1: 1} + InvalidLitField + + // UntypedLit occurs when a composite literal omits a required type + // identifier. + // + // Example: + // type outer struct{ + // inner struct { i int } + // } + // + // var _ = outer{inner: {1}} + UntypedLit + + // InvalidLit occurs when a composite literal expression does not match its + // type. + // + // Example: + // type P *struct{ + // x int + // } + // var _ = P {} + InvalidLit + + /* exprs > selector */ + + // AmbiguousSelector occurs when a selector is ambiguous. + // + // Example: + // type E1 struct { i int } + // type E2 struct { i int } + // type T struct { E1; E2 } + // + // var x T + // var _ = x.i + AmbiguousSelector + + // UndeclaredImportedName occurs when a package-qualified identifier is + // undeclared by the imported package. + // + // Example: + // import "go/types" + // + // var _ = types.NotAnActualIdentifier + UndeclaredImportedName + + // UnexportedName occurs when a selector refers to an unexported identifier + // of an imported package. + // + // Example: + // import "reflect" + // + // type _ reflect.flag + UnexportedName + + // UndeclaredName occurs when an identifier is not declared in the current + // scope. + // + // Example: + // var x T + UndeclaredName + + // MissingFieldOrMethod occurs when a selector references a field or method + // that does not exist. + // + // Example: + // type T struct {} + // + // var x = T{}.f + MissingFieldOrMethod + + /* exprs > ... */ + + // BadDotDotDotSyntax occurs when a "..." occurs in a context where it is + // not valid. + // + // Example: + // var _ = map[int][...]int{0: {}} + BadDotDotDotSyntax + + // NonVariadicDotDotDot occurs when a "..." is used on the final argument to + // a non-variadic function. + // + // Example: + // func printArgs(s []string) { + // for _, a := range s { + // println(a) + // } + // } + // + // func f() { + // s := []string{"a", "b", "c"} + // printArgs(s...) + // } + NonVariadicDotDotDot + + // MisplacedDotDotDot occurs when a "..." is used somewhere other than the + // final argument to a function call. + // + // Example: + // func printArgs(args ...int) { + // for _, a := range args { + // println(a) + // } + // } + // + // func f() { + // a := []int{1,2,3} + // printArgs(0, a...) + // } + MisplacedDotDotDot + + // InvalidDotDotDotOperand occurs when a "..." operator is applied to a + // single-valued operand. + // + // Example: + // func printArgs(args ...int) { + // for _, a := range args { + // println(a) + // } + // } + // + // func f() { + // a := 1 + // printArgs(a...) + // } + // + // Example: + // func args() (int, int) { + // return 1, 2 + // } + // + // func printArgs(args ...int) { + // for _, a := range args { + // println(a) + // } + // } + // + // func g() { + // printArgs(args()...) + // } + InvalidDotDotDotOperand + + // InvalidDotDotDot occurs when a "..." is used in a non-variadic built-in + // function. + // + // Example: + // var s = []int{1, 2, 3} + // var l = len(s...) + InvalidDotDotDot + + /* exprs > built-in */ + + // UncalledBuiltin occurs when a built-in function is used as a + // function-valued expression, instead of being called. + // + // Per the spec: + // "The built-in functions do not have standard Go types, so they can only + // appear in call expressions; they cannot be used as function values." + // + // Example: + // var _ = copy + UncalledBuiltin + + // InvalidAppend occurs when append is called with a first argument that is + // not a slice. + // + // Example: + // var _ = append(1, 2) + InvalidAppend + + // InvalidCap occurs when an argument to the cap built-in function is not of + // supported type. + // + // See https://golang.org/ref/spec#Length_and_capacity for information on + // which underlying types are supported as arguments to cap and len. + // + // Example: + // var s = 2 + // var x = cap(s) + InvalidCap + + // InvalidClose occurs when close(...) is called with an argument that is + // not of channel type, or that is a receive-only channel. + // + // Example: + // func f() { + // var x int + // close(x) + // } + InvalidClose + + // InvalidCopy occurs when the arguments are not of slice type or do not + // have compatible type. + // + // See https://golang.org/ref/spec#Appending_and_copying_slices for more + // information on the type requirements for the copy built-in. + // + // Example: + // func f() { + // var x []int + // y := []int64{1,2,3} + // copy(x, y) + // } + InvalidCopy + + // InvalidComplex occurs when the complex built-in function is called with + // arguments with incompatible types. + // + // Example: + // var _ = complex(float32(1), float64(2)) + InvalidComplex + + // InvalidDelete occurs when the delete built-in function is called with a + // first argument that is not a map. + // + // Example: + // func f() { + // m := "hello" + // delete(m, "e") + // } + InvalidDelete + + // InvalidImag occurs when the imag built-in function is called with an + // argument that does not have complex type. + // + // Example: + // var _ = imag(int(1)) + InvalidImag + + // InvalidLen occurs when an argument to the len built-in function is not of + // supported type. + // + // See https://golang.org/ref/spec#Length_and_capacity for information on + // which underlying types are supported as arguments to cap and len. + // + // Example: + // var s = 2 + // var x = len(s) + InvalidLen + + // SwappedMakeArgs occurs when make is called with three arguments, and its + // length argument is larger than its capacity argument. + // + // Example: + // var x = make([]int, 3, 2) + SwappedMakeArgs + + // InvalidMake occurs when make is called with an unsupported type argument. + // + // See https://golang.org/ref/spec#Making_slices_maps_and_channels for + // information on the types that may be created using make. + // + // Example: + // var x = make(int) + InvalidMake + + // InvalidReal occurs when the real built-in function is called with an + // argument that does not have complex type. + // + // Example: + // var _ = real(int(1)) + InvalidReal + + /* exprs > assertion */ + + // InvalidAssert occurs when a type assertion is applied to a + // value that is not of interface type. + // + // Example: + // var x = 1 + // var _ = x.(float64) + InvalidAssert + + // ImpossibleAssert occurs for a type assertion x.(T) when the value x of + // interface cannot have dynamic type T, due to a missing or mismatching + // method on T. + // + // Example: + // type T int + // + // func (t *T) m() int { return int(*t) } + // + // type I interface { m() int } + // + // var x I + // var _ = x.(T) + ImpossibleAssert + + /* exprs > conversion */ + + // InvalidConversion occurs when the argument type cannot be converted to the + // target. + // + // See https://golang.org/ref/spec#Conversions for the rules of + // convertibility. + // + // Example: + // var x float64 + // var _ = string(x) + InvalidConversion + + // InvalidUntypedConversion occurs when an there is no valid implicit + // conversion from an untyped value satisfying the type constraints of the + // context in which it is used. + // + // Example: + // var _ = 1 + "" + InvalidUntypedConversion + + /* offsetof */ + + // BadOffsetofSyntax occurs when unsafe.Offsetof is called with an argument + // that is not a selector expression. + // + // Example: + // import "unsafe" + // + // var x int + // var _ = unsafe.Offsetof(x) + BadOffsetofSyntax + + // InvalidOffsetof occurs when unsafe.Offsetof is called with a method + // selector, rather than a field selector, or when the field is embedded via + // a pointer. + // + // Per the spec: + // + // "If f is an embedded field, it must be reachable without pointer + // indirections through fields of the struct. " + // + // Example: + // import "unsafe" + // + // type T struct { f int } + // type S struct { *T } + // var s S + // var _ = unsafe.Offsetof(s.f) + // + // Example: + // import "unsafe" + // + // type S struct{} + // + // func (S) m() {} + // + // var s S + // var _ = unsafe.Offsetof(s.m) + InvalidOffsetof + + /* control flow > scope */ + + // UnusedExpr occurs when a side-effect free expression is used as a + // statement. Such a statement has no effect. + // + // Example: + // func f(i int) { + // i*i + // } + UnusedExpr + + // UnusedVar occurs when a variable is declared but unused. + // + // Example: + // func f() { + // x := 1 + // } + UnusedVar + + // MissingReturn occurs when a function with results is missing a return + // statement. + // + // Example: + // func f() int {} + MissingReturn + + // WrongResultCount occurs when a return statement returns an incorrect + // number of values. + // + // Example: + // func ReturnOne() int { + // return 1, 2 + // } + WrongResultCount + + // OutOfScopeResult occurs when the name of a value implicitly returned by + // an empty return statement is shadowed in a nested scope. + // + // Example: + // func factor(n int) (i int) { + // for i := 2; i < n; i++ { + // if n%i == 0 { + // return + // } + // } + // return 0 + // } + OutOfScopeResult + + /* control flow > if */ + + // InvalidCond occurs when an if condition is not a boolean expression. + // + // Example: + // func checkReturn(i int) { + // if i { + // panic("non-zero return") + // } + // } + InvalidCond + + /* control flow > for */ + + // InvalidPostDecl occurs when there is a declaration in a for-loop post + // statement. + // + // Example: + // func f() { + // for i := 0; i < 10; j := 0 {} + // } + InvalidPostDecl + + // InvalidChanRange occurs when a send-only channel used in a range + // expression. + // + // Example: + // func sum(c chan<- int) { + // s := 0 + // for i := range c { + // s += i + // } + // } + InvalidChanRange + + // InvalidIterVar occurs when two iteration variables are used while ranging + // over a channel. + // + // Example: + // func f(c chan int) { + // for k, v := range c { + // println(k, v) + // } + // } + InvalidIterVar + + // InvalidRangeExpr occurs when the type of a range expression is not array, + // slice, string, map, or channel. + // + // Example: + // func f(i int) { + // for j := range i { + // println(j) + // } + // } + InvalidRangeExpr + + /* control flow > switch */ + + // MisplacedBreak occurs when a break statement is not within a for, switch, + // or select statement of the innermost function definition. + // + // Example: + // func f() { + // break + // } + MisplacedBreak + + // MisplacedContinue occurs when a continue statement is not within a for + // loop of the innermost function definition. + // + // Example: + // func sumeven(n int) int { + // proceed := func() { + // continue + // } + // sum := 0 + // for i := 1; i <= n; i++ { + // if i % 2 != 0 { + // proceed() + // } + // sum += i + // } + // return sum + // } + MisplacedContinue + + // MisplacedFallthrough occurs when a fallthrough statement is not within an + // expression switch. + // + // Example: + // func typename(i interface{}) string { + // switch i.(type) { + // case int64: + // fallthrough + // case int: + // return "int" + // } + // return "unsupported" + // } + MisplacedFallthrough + + // DuplicateCase occurs when a type or expression switch has duplicate + // cases. + // + // Example: + // func printInt(i int) { + // switch i { + // case 1: + // println("one") + // case 1: + // println("One") + // } + // } + DuplicateCase + + // DuplicateDefault occurs when a type or expression switch has multiple + // default clauses. + // + // Example: + // func printInt(i int) { + // switch i { + // case 1: + // println("one") + // default: + // println("One") + // default: + // println("1") + // } + // } + DuplicateDefault + + // BadTypeKeyword occurs when a .(type) expression is used anywhere other + // than a type switch. + // + // Example: + // type I interface { + // m() + // } + // var t I + // var _ = t.(type) + BadTypeKeyword + + // InvalidTypeSwitch occurs when .(type) is used on an expression that is + // not of interface type. + // + // Example: + // func f(i int) { + // switch x := i.(type) {} + // } + InvalidTypeSwitch + + // InvalidExprSwitch occurs when a switch expression is not comparable. + // + // Example: + // func _() { + // var a struct{ _ func() } + // switch a /* ERROR cannot switch on a */ { + // } + // } + InvalidExprSwitch + + /* control flow > select */ + + // InvalidSelectCase occurs when a select case is not a channel send or + // receive. + // + // Example: + // func checkChan(c <-chan int) bool { + // select { + // case c: + // return true + // default: + // return false + // } + // } + InvalidSelectCase + + /* control flow > labels and jumps */ + + // UndeclaredLabel occurs when an undeclared label is jumped to. + // + // Example: + // func f() { + // goto L + // } + UndeclaredLabel + + // DuplicateLabel occurs when a label is declared more than once. + // + // Example: + // func f() int { + // L: + // L: + // return 1 + // } + DuplicateLabel + + // MisplacedLabel occurs when a break or continue label is not on a for, + // switch, or select statement. + // + // Example: + // func f() { + // L: + // a := []int{1,2,3} + // for _, e := range a { + // if e > 10 { + // break L + // } + // println(a) + // } + // } + MisplacedLabel + + // UnusedLabel occurs when a label is declared but not used. + // + // Example: + // func f() { + // L: + // } + UnusedLabel + + // JumpOverDecl occurs when a label jumps over a variable declaration. + // + // Example: + // func f() int { + // goto L + // x := 2 + // L: + // x++ + // return x + // } + JumpOverDecl + + // JumpIntoBlock occurs when a forward jump goes to a label inside a nested + // block. + // + // Example: + // func f(x int) { + // goto L + // if x > 0 { + // L: + // print("inside block") + // } + // } + JumpIntoBlock + + /* control flow > calls */ + + // InvalidMethodExpr occurs when a pointer method is called but the argument + // is not addressable. + // + // Example: + // type T struct {} + // + // func (*T) m() int { return 1 } + // + // var _ = T.m(T{}) + InvalidMethodExpr + + // WrongArgCount occurs when too few or too many arguments are passed by a + // function call. + // + // Example: + // func f(i int) {} + // var x = f() + WrongArgCount + + // InvalidCall occurs when an expression is called that is not of function + // type. + // + // Example: + // var x = "x" + // var y = x() + InvalidCall + + /* control flow > suspended */ + + // UnusedResults occurs when a restricted expression-only built-in function + // is suspended via go or defer. Such a suspension discards the results of + // these side-effect free built-in functions, and therefore is ineffectual. + // + // Example: + // func f(a []int) int { + // defer len(a) + // return i + // } + UnusedResults + + // InvalidDefer occurs when a deferred expression is not a function call, + // for example if the expression is a type conversion. + // + // Example: + // func f(i int) int { + // defer int32(i) + // return i + // } + InvalidDefer + + // InvalidGo occurs when a go expression is not a function call, for example + // if the expression is a type conversion. + // + // Example: + // func f(i int) int { + // go int32(i) + // return i + // } + InvalidGo + + // All codes below were added in Go 1.17. + + /* decl */ + + // BadDecl occurs when a declaration has invalid syntax. + BadDecl + + // RepeatedDecl occurs when an identifier occurs more than once on the left + // hand side of a short variable declaration. + // + // Example: + // func _() { + // x, y, y := 1, 2, 3 + // } + RepeatedDecl + + /* unsafe */ + + // InvalidUnsafeAdd occurs when unsafe.Add is called with a + // length argument that is not of integer type. + // + // Example: + // import "unsafe" + // + // var p unsafe.Pointer + // var _ = unsafe.Add(p, float64(1)) + InvalidUnsafeAdd + + // InvalidUnsafeSlice occurs when unsafe.Slice is called with a + // pointer argument that is not of pointer type or a length argument + // that is not of integer type, negative, or out of bounds. + // + // Example: + // import "unsafe" + // + // var x int + // var _ = unsafe.Slice(x, 1) + // + // Example: + // import "unsafe" + // + // var x int + // var _ = unsafe.Slice(&x, float64(1)) + // + // Example: + // import "unsafe" + // + // var x int + // var _ = unsafe.Slice(&x, -1) + // + // Example: + // import "unsafe" + // + // var x int + // var _ = unsafe.Slice(&x, uint64(1) << 63) + InvalidUnsafeSlice + + // All codes below were added in Go 1.18. + + /* features */ + + // UnsupportedFeature occurs when a language feature is used that is not + // supported at this Go version. + UnsupportedFeature + + /* type params */ + + // NotAGenericType occurs when a non-generic type is used where a generic + // type is expected: in type or function instantiation. + // + // Example: + // type T int + // + // var _ T[int] + NotAGenericType + + // WrongTypeArgCount occurs when a type or function is instantiated with an + // incorrect number of type arguments, including when a generic type or + // function is used without instantiation. + // + // Errors involving failed type inference are assigned other error codes. + // + // Example: + // type T[p any] int + // + // var _ T[int, string] + // + // Example: + // func f[T any]() {} + // + // var x = f + WrongTypeArgCount + + // CannotInferTypeArgs occurs when type or function type argument inference + // fails to infer all type arguments. + // + // Example: + // func f[T any]() {} + // + // func _() { + // f() + // } + // + // Example: + // type N[P, Q any] struct{} + // + // var _ N[int] + CannotInferTypeArgs + + // InvalidTypeArg occurs when a type argument does not satisfy its + // corresponding type parameter constraints. + // + // Example: + // type T[P ~int] struct{} + // + // var _ T[string] + InvalidTypeArg // arguments? InferenceFailed + + // InvalidInstanceCycle occurs when an invalid cycle is detected + // within the instantiation graph. + // + // Example: + // func f[T any]() { f[*T]() } + InvalidInstanceCycle + + // InvalidUnion occurs when an embedded union or approximation element is + // not valid. + // + // Example: + // type _ interface { + // ~int | interface{ m() } + // } + InvalidUnion + + // MisplacedConstraintIface occurs when a constraint-type interface is used + // outside of constraint position. + // + // Example: + // type I interface { ~int } + // + // var _ I + MisplacedConstraintIface + + // InvalidMethodTypeParams occurs when methods have type parameters. + // + // It cannot be encountered with an AST parsed using go/parser. + InvalidMethodTypeParams + + // MisplacedTypeParam occurs when a type parameter is used in a place where + // it is not permitted. + // + // Example: + // type T[P any] P + // + // Example: + // type T[P any] struct{ *P } + MisplacedTypeParam + + // InvalidUnsafeSliceData occurs when unsafe.SliceData is called with + // an argument that is not of slice type. It also occurs if it is used + // in a package compiled for a language version before go1.20. + // + // Example: + // import "unsafe" + // + // var x int + // var _ = unsafe.SliceData(x) + InvalidUnsafeSliceData + + // InvalidUnsafeString occurs when unsafe.String is called with + // a length argument that is not of integer type, negative, or + // out of bounds. It also occurs if it is used in a package + // compiled for a language version before go1.20. + // + // Example: + // import "unsafe" + // + // var b [10]byte + // var _ = unsafe.String(&b[0], -1) + InvalidUnsafeString + + // InvalidUnsafeStringData occurs if it is used in a package + // compiled for a language version before go1.20. + _ // not used anymore + +) diff --git a/vendor/golang.org/x/tools/internal/typesinternal/errorcode_string.go b/vendor/golang.org/x/tools/internal/typesinternal/errorcode_string.go new file mode 100644 index 0000000..15ecf7c --- /dev/null +++ b/vendor/golang.org/x/tools/internal/typesinternal/errorcode_string.go @@ -0,0 +1,179 @@ +// Code generated by "stringer -type=ErrorCode"; DO NOT EDIT. + +package typesinternal + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[InvalidSyntaxTree - -1] + _ = x[Test-1] + _ = x[BlankPkgName-2] + _ = x[MismatchedPkgName-3] + _ = x[InvalidPkgUse-4] + _ = x[BadImportPath-5] + _ = x[BrokenImport-6] + _ = x[ImportCRenamed-7] + _ = x[UnusedImport-8] + _ = x[InvalidInitCycle-9] + _ = x[DuplicateDecl-10] + _ = x[InvalidDeclCycle-11] + _ = x[InvalidTypeCycle-12] + _ = x[InvalidConstInit-13] + _ = x[InvalidConstVal-14] + _ = x[InvalidConstType-15] + _ = x[UntypedNilUse-16] + _ = x[WrongAssignCount-17] + _ = x[UnassignableOperand-18] + _ = x[NoNewVar-19] + _ = x[MultiValAssignOp-20] + _ = x[InvalidIfaceAssign-21] + _ = x[InvalidChanAssign-22] + _ = x[IncompatibleAssign-23] + _ = x[UnaddressableFieldAssign-24] + _ = x[NotAType-25] + _ = x[InvalidArrayLen-26] + _ = x[BlankIfaceMethod-27] + _ = x[IncomparableMapKey-28] + _ = x[InvalidIfaceEmbed-29] + _ = x[InvalidPtrEmbed-30] + _ = x[BadRecv-31] + _ = x[InvalidRecv-32] + _ = x[DuplicateFieldAndMethod-33] + _ = x[DuplicateMethod-34] + _ = x[InvalidBlank-35] + _ = x[InvalidIota-36] + _ = x[MissingInitBody-37] + _ = x[InvalidInitSig-38] + _ = x[InvalidInitDecl-39] + _ = x[InvalidMainDecl-40] + _ = x[TooManyValues-41] + _ = x[NotAnExpr-42] + _ = x[TruncatedFloat-43] + _ = x[NumericOverflow-44] + _ = x[UndefinedOp-45] + _ = x[MismatchedTypes-46] + _ = x[DivByZero-47] + _ = x[NonNumericIncDec-48] + _ = x[UnaddressableOperand-49] + _ = x[InvalidIndirection-50] + _ = x[NonIndexableOperand-51] + _ = x[InvalidIndex-52] + _ = x[SwappedSliceIndices-53] + _ = x[NonSliceableOperand-54] + _ = x[InvalidSliceExpr-55] + _ = x[InvalidShiftCount-56] + _ = x[InvalidShiftOperand-57] + _ = x[InvalidReceive-58] + _ = x[InvalidSend-59] + _ = x[DuplicateLitKey-60] + _ = x[MissingLitKey-61] + _ = x[InvalidLitIndex-62] + _ = x[OversizeArrayLit-63] + _ = x[MixedStructLit-64] + _ = x[InvalidStructLit-65] + _ = x[MissingLitField-66] + _ = x[DuplicateLitField-67] + _ = x[UnexportedLitField-68] + _ = x[InvalidLitField-69] + _ = x[UntypedLit-70] + _ = x[InvalidLit-71] + _ = x[AmbiguousSelector-72] + _ = x[UndeclaredImportedName-73] + _ = x[UnexportedName-74] + _ = x[UndeclaredName-75] + _ = x[MissingFieldOrMethod-76] + _ = x[BadDotDotDotSyntax-77] + _ = x[NonVariadicDotDotDot-78] + _ = x[MisplacedDotDotDot-79] + _ = x[InvalidDotDotDotOperand-80] + _ = x[InvalidDotDotDot-81] + _ = x[UncalledBuiltin-82] + _ = x[InvalidAppend-83] + _ = x[InvalidCap-84] + _ = x[InvalidClose-85] + _ = x[InvalidCopy-86] + _ = x[InvalidComplex-87] + _ = x[InvalidDelete-88] + _ = x[InvalidImag-89] + _ = x[InvalidLen-90] + _ = x[SwappedMakeArgs-91] + _ = x[InvalidMake-92] + _ = x[InvalidReal-93] + _ = x[InvalidAssert-94] + _ = x[ImpossibleAssert-95] + _ = x[InvalidConversion-96] + _ = x[InvalidUntypedConversion-97] + _ = x[BadOffsetofSyntax-98] + _ = x[InvalidOffsetof-99] + _ = x[UnusedExpr-100] + _ = x[UnusedVar-101] + _ = x[MissingReturn-102] + _ = x[WrongResultCount-103] + _ = x[OutOfScopeResult-104] + _ = x[InvalidCond-105] + _ = x[InvalidPostDecl-106] + _ = x[InvalidChanRange-107] + _ = x[InvalidIterVar-108] + _ = x[InvalidRangeExpr-109] + _ = x[MisplacedBreak-110] + _ = x[MisplacedContinue-111] + _ = x[MisplacedFallthrough-112] + _ = x[DuplicateCase-113] + _ = x[DuplicateDefault-114] + _ = x[BadTypeKeyword-115] + _ = x[InvalidTypeSwitch-116] + _ = x[InvalidExprSwitch-117] + _ = x[InvalidSelectCase-118] + _ = x[UndeclaredLabel-119] + _ = x[DuplicateLabel-120] + _ = x[MisplacedLabel-121] + _ = x[UnusedLabel-122] + _ = x[JumpOverDecl-123] + _ = x[JumpIntoBlock-124] + _ = x[InvalidMethodExpr-125] + _ = x[WrongArgCount-126] + _ = x[InvalidCall-127] + _ = x[UnusedResults-128] + _ = x[InvalidDefer-129] + _ = x[InvalidGo-130] + _ = x[BadDecl-131] + _ = x[RepeatedDecl-132] + _ = x[InvalidUnsafeAdd-133] + _ = x[InvalidUnsafeSlice-134] + _ = x[UnsupportedFeature-135] + _ = x[NotAGenericType-136] + _ = x[WrongTypeArgCount-137] + _ = x[CannotInferTypeArgs-138] + _ = x[InvalidTypeArg-139] + _ = x[InvalidInstanceCycle-140] + _ = x[InvalidUnion-141] + _ = x[MisplacedConstraintIface-142] + _ = x[InvalidMethodTypeParams-143] + _ = x[MisplacedTypeParam-144] + _ = x[InvalidUnsafeSliceData-145] + _ = x[InvalidUnsafeString-146] +} + +const ( + _ErrorCode_name_0 = "InvalidSyntaxTree" + _ErrorCode_name_1 = "TestBlankPkgNameMismatchedPkgNameInvalidPkgUseBadImportPathBrokenImportImportCRenamedUnusedImportInvalidInitCycleDuplicateDeclInvalidDeclCycleInvalidTypeCycleInvalidConstInitInvalidConstValInvalidConstTypeUntypedNilUseWrongAssignCountUnassignableOperandNoNewVarMultiValAssignOpInvalidIfaceAssignInvalidChanAssignIncompatibleAssignUnaddressableFieldAssignNotATypeInvalidArrayLenBlankIfaceMethodIncomparableMapKeyInvalidIfaceEmbedInvalidPtrEmbedBadRecvInvalidRecvDuplicateFieldAndMethodDuplicateMethodInvalidBlankInvalidIotaMissingInitBodyInvalidInitSigInvalidInitDeclInvalidMainDeclTooManyValuesNotAnExprTruncatedFloatNumericOverflowUndefinedOpMismatchedTypesDivByZeroNonNumericIncDecUnaddressableOperandInvalidIndirectionNonIndexableOperandInvalidIndexSwappedSliceIndicesNonSliceableOperandInvalidSliceExprInvalidShiftCountInvalidShiftOperandInvalidReceiveInvalidSendDuplicateLitKeyMissingLitKeyInvalidLitIndexOversizeArrayLitMixedStructLitInvalidStructLitMissingLitFieldDuplicateLitFieldUnexportedLitFieldInvalidLitFieldUntypedLitInvalidLitAmbiguousSelectorUndeclaredImportedNameUnexportedNameUndeclaredNameMissingFieldOrMethodBadDotDotDotSyntaxNonVariadicDotDotDotMisplacedDotDotDotInvalidDotDotDotOperandInvalidDotDotDotUncalledBuiltinInvalidAppendInvalidCapInvalidCloseInvalidCopyInvalidComplexInvalidDeleteInvalidImagInvalidLenSwappedMakeArgsInvalidMakeInvalidRealInvalidAssertImpossibleAssertInvalidConversionInvalidUntypedConversionBadOffsetofSyntaxInvalidOffsetofUnusedExprUnusedVarMissingReturnWrongResultCountOutOfScopeResultInvalidCondInvalidPostDeclInvalidChanRangeInvalidIterVarInvalidRangeExprMisplacedBreakMisplacedContinueMisplacedFallthroughDuplicateCaseDuplicateDefaultBadTypeKeywordInvalidTypeSwitchInvalidExprSwitchInvalidSelectCaseUndeclaredLabelDuplicateLabelMisplacedLabelUnusedLabelJumpOverDeclJumpIntoBlockInvalidMethodExprWrongArgCountInvalidCallUnusedResultsInvalidDeferInvalidGoBadDeclRepeatedDeclInvalidUnsafeAddInvalidUnsafeSliceUnsupportedFeatureNotAGenericTypeWrongTypeArgCountCannotInferTypeArgsInvalidTypeArgInvalidInstanceCycleInvalidUnionMisplacedConstraintIfaceInvalidMethodTypeParamsMisplacedTypeParamInvalidUnsafeSliceDataInvalidUnsafeString" +) + +var ( + _ErrorCode_index_1 = [...]uint16{0, 4, 16, 33, 46, 59, 71, 85, 97, 113, 126, 142, 158, 174, 189, 205, 218, 234, 253, 261, 277, 295, 312, 330, 354, 362, 377, 393, 411, 428, 443, 450, 461, 484, 499, 511, 522, 537, 551, 566, 581, 594, 603, 617, 632, 643, 658, 667, 683, 703, 721, 740, 752, 771, 790, 806, 823, 842, 856, 867, 882, 895, 910, 926, 940, 956, 971, 988, 1006, 1021, 1031, 1041, 1058, 1080, 1094, 1108, 1128, 1146, 1166, 1184, 1207, 1223, 1238, 1251, 1261, 1273, 1284, 1298, 1311, 1322, 1332, 1347, 1358, 1369, 1382, 1398, 1415, 1439, 1456, 1471, 1481, 1490, 1503, 1519, 1535, 1546, 1561, 1577, 1591, 1607, 1621, 1638, 1658, 1671, 1687, 1701, 1718, 1735, 1752, 1767, 1781, 1795, 1806, 1818, 1831, 1848, 1861, 1872, 1885, 1897, 1906, 1913, 1925, 1941, 1959, 1977, 1992, 2009, 2028, 2042, 2062, 2074, 2098, 2121, 2139, 2161, 2180} +) + +func (i ErrorCode) String() string { + switch { + case i == -1: + return _ErrorCode_name_0 + case 1 <= i && i <= 146: + i -= 1 + return _ErrorCode_name_1[_ErrorCode_index_1[i]:_ErrorCode_index_1[i+1]] + default: + return "ErrorCode(" + strconv.FormatInt(int64(i), 10) + ")" + } +} diff --git a/vendor/golang.org/x/tools/internal/typesinternal/recv.go b/vendor/golang.org/x/tools/internal/typesinternal/recv.go new file mode 100644 index 0000000..ba6f4f4 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/typesinternal/recv.go @@ -0,0 +1,41 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package typesinternal + +import ( + "go/types" +) + +// ReceiverNamed returns the named type (if any) associated with the +// type of recv, which may be of the form N or *N, or aliases thereof. +// It also reports whether a Pointer was present. +func ReceiverNamed(recv *types.Var) (isPtr bool, named *types.Named) { + t := recv.Type() + if ptr, ok := types.Unalias(t).(*types.Pointer); ok { + isPtr = true + t = ptr.Elem() + } + named, _ = types.Unalias(t).(*types.Named) + return +} + +// Unpointer returns T given *T or an alias thereof. +// For all other types it is the identity function. +// It does not look at underlying types. +// The result may be an alias. +// +// Use this function to strip off the optional pointer on a receiver +// in a field or method selection, without losing the named type +// (which is needed to compute the method set). +// +// See also [typeparams.MustDeref], which removes one level of +// indirection from the type, regardless of named types (analogous to +// a LOAD instruction). +func Unpointer(t types.Type) types.Type { + if ptr, ok := types.Unalias(t).(*types.Pointer); ok { + return ptr.Elem() + } + return t +} diff --git a/vendor/golang.org/x/tools/internal/typesinternal/toonew.go b/vendor/golang.org/x/tools/internal/typesinternal/toonew.go new file mode 100644 index 0000000..cc86487 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/typesinternal/toonew.go @@ -0,0 +1,89 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package typesinternal + +import ( + "go/types" + + "golang.org/x/tools/internal/stdlib" + "golang.org/x/tools/internal/versions" +) + +// TooNewStdSymbols computes the set of package-level symbols +// exported by pkg that are not available at the specified version. +// The result maps each symbol to its minimum version. +// +// The pkg is allowed to contain type errors. +func TooNewStdSymbols(pkg *types.Package, version string) map[types.Object]string { + disallowed := make(map[types.Object]string) + + // Pass 1: package-level symbols. + symbols := stdlib.PackageSymbols[pkg.Path()] + for _, sym := range symbols { + symver := sym.Version.String() + if versions.Before(version, symver) { + switch sym.Kind { + case stdlib.Func, stdlib.Var, stdlib.Const, stdlib.Type: + disallowed[pkg.Scope().Lookup(sym.Name)] = symver + } + } + } + + // Pass 2: fields and methods. + // + // We allow fields and methods if their associated type is + // disallowed, as otherwise we would report false positives + // for compatibility shims. Consider: + // + // //go:build go1.22 + // type T struct { F std.Real } // correct new API + // + // //go:build !go1.22 + // type T struct { F fake } // shim + // type fake struct { ... } + // func (fake) M () {} + // + // These alternative declarations of T use either the std.Real + // type, introduced in go1.22, or a fake type, for the field + // F. (The fakery could be arbitrarily deep, involving more + // nested fields and methods than are shown here.) Clients + // that use the compatibility shim T will compile with any + // version of go, whether older or newer than go1.22, but only + // the newer version will use the std.Real implementation. + // + // Now consider a reference to method M in new(T).F.M() in a + // module that requires a minimum of go1.21. The analysis may + // occur using a version of Go higher than 1.21, selecting the + // first version of T, so the method M is Real.M. This would + // spuriously cause the analyzer to report a reference to a + // too-new symbol even though this expression compiles just + // fine (with the fake implementation) using go1.21. + for _, sym := range symbols { + symVersion := sym.Version.String() + if !versions.Before(version, symVersion) { + continue // allowed + } + + var obj types.Object + switch sym.Kind { + case stdlib.Field: + typename, name := sym.SplitField() + if t := pkg.Scope().Lookup(typename); t != nil && disallowed[t] == "" { + obj, _, _ = types.LookupFieldOrMethod(t.Type(), false, pkg, name) + } + + case stdlib.Method: + ptr, recvname, name := sym.SplitMethod() + if t := pkg.Scope().Lookup(recvname); t != nil && disallowed[t] == "" { + obj, _, _ = types.LookupFieldOrMethod(t.Type(), ptr, pkg, name) + } + } + if obj != nil { + disallowed[obj] = symVersion + } + } + + return disallowed +} diff --git a/vendor/golang.org/x/tools/internal/typesinternal/types.go b/vendor/golang.org/x/tools/internal/typesinternal/types.go new file mode 100644 index 0000000..8392328 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/typesinternal/types.go @@ -0,0 +1,65 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package typesinternal provides access to internal go/types APIs that are not +// yet exported. +package typesinternal + +import ( + "go/token" + "go/types" + "reflect" + "unsafe" +) + +func SetUsesCgo(conf *types.Config) bool { + v := reflect.ValueOf(conf).Elem() + + f := v.FieldByName("go115UsesCgo") + if !f.IsValid() { + f = v.FieldByName("UsesCgo") + if !f.IsValid() { + return false + } + } + + addr := unsafe.Pointer(f.UnsafeAddr()) + *(*bool)(addr) = true + + return true +} + +// ReadGo116ErrorData extracts additional information from types.Error values +// generated by Go version 1.16 and later: the error code, start position, and +// end position. If all positions are valid, start <= err.Pos <= end. +// +// If the data could not be read, the final result parameter will be false. +func ReadGo116ErrorData(err types.Error) (code ErrorCode, start, end token.Pos, ok bool) { + var data [3]int + // By coincidence all of these fields are ints, which simplifies things. + v := reflect.ValueOf(err) + for i, name := range []string{"go116code", "go116start", "go116end"} { + f := v.FieldByName(name) + if !f.IsValid() { + return 0, 0, 0, false + } + data[i] = int(f.Int()) + } + return ErrorCode(data[0]), token.Pos(data[1]), token.Pos(data[2]), true +} + +// NameRelativeTo returns a types.Qualifier that qualifies members of +// all packages other than pkg, using only the package name. +// (By contrast, [types.RelativeTo] uses the complete package path, +// which is often excessive.) +// +// If pkg is nil, it is equivalent to [*types.Package.Name]. +func NameRelativeTo(pkg *types.Package) types.Qualifier { + return func(other *types.Package) string { + if pkg != nil && pkg == other { + return "" // same package; unqualified + } + return other.Name() + } +} diff --git a/vendor/golang.org/x/tools/internal/versions/constraint.go b/vendor/golang.org/x/tools/internal/versions/constraint.go new file mode 100644 index 0000000..179063d --- /dev/null +++ b/vendor/golang.org/x/tools/internal/versions/constraint.go @@ -0,0 +1,13 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package versions + +import "go/build/constraint" + +// ConstraintGoVersion is constraint.GoVersion (if built with go1.21+). +// Otherwise nil. +// +// Deprecate once x/tools is after go1.21. +var ConstraintGoVersion func(x constraint.Expr) string diff --git a/vendor/golang.org/x/tools/internal/versions/constraint_go121.go b/vendor/golang.org/x/tools/internal/versions/constraint_go121.go new file mode 100644 index 0000000..3801140 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/versions/constraint_go121.go @@ -0,0 +1,14 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build go1.21 +// +build go1.21 + +package versions + +import "go/build/constraint" + +func init() { + ConstraintGoVersion = constraint.GoVersion +} diff --git a/vendor/golang.org/x/tools/internal/versions/features.go b/vendor/golang.org/x/tools/internal/versions/features.go new file mode 100644 index 0000000..b53f178 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/versions/features.go @@ -0,0 +1,43 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package versions + +// This file contains predicates for working with file versions to +// decide when a tool should consider a language feature enabled. + +// GoVersions that features in x/tools can be gated to. +const ( + Go1_18 = "go1.18" + Go1_19 = "go1.19" + Go1_20 = "go1.20" + Go1_21 = "go1.21" + Go1_22 = "go1.22" +) + +// Future is an invalid unknown Go version sometime in the future. +// Do not use directly with Compare. +const Future = "" + +// AtLeast reports whether the file version v comes after a Go release. +// +// Use this predicate to enable a behavior once a certain Go release +// has happened (and stays enabled in the future). +func AtLeast(v, release string) bool { + if v == Future { + return true // an unknown future version is always after y. + } + return Compare(Lang(v), Lang(release)) >= 0 +} + +// Before reports whether the file version v is strictly before a Go release. +// +// Use this predicate to disable a behavior once a certain Go release +// has happened (and stays enabled in the future). +func Before(v, release string) bool { + if v == Future { + return false // an unknown future version happens after y. + } + return Compare(Lang(v), Lang(release)) < 0 +} diff --git a/vendor/golang.org/x/tools/internal/versions/gover.go b/vendor/golang.org/x/tools/internal/versions/gover.go new file mode 100644 index 0000000..bbabcd2 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/versions/gover.go @@ -0,0 +1,172 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This is a fork of internal/gover for use by x/tools until +// go1.21 and earlier are no longer supported by x/tools. + +package versions + +import "strings" + +// A gover is a parsed Go gover: major[.Minor[.Patch]][kind[pre]] +// The numbers are the original decimal strings to avoid integer overflows +// and since there is very little actual math. (Probably overflow doesn't matter in practice, +// but at the time this code was written, there was an existing test that used +// go1.99999999999, which does not fit in an int on 32-bit platforms. +// The "big decimal" representation avoids the problem entirely.) +type gover struct { + major string // decimal + minor string // decimal or "" + patch string // decimal or "" + kind string // "", "alpha", "beta", "rc" + pre string // decimal or "" +} + +// compare returns -1, 0, or +1 depending on whether +// x < y, x == y, or x > y, interpreted as toolchain versions. +// The versions x and y must not begin with a "go" prefix: just "1.21" not "go1.21". +// Malformed versions compare less than well-formed versions and equal to each other. +// The language version "1.21" compares less than the release candidate and eventual releases "1.21rc1" and "1.21.0". +func compare(x, y string) int { + vx := parse(x) + vy := parse(y) + + if c := cmpInt(vx.major, vy.major); c != 0 { + return c + } + if c := cmpInt(vx.minor, vy.minor); c != 0 { + return c + } + if c := cmpInt(vx.patch, vy.patch); c != 0 { + return c + } + if c := strings.Compare(vx.kind, vy.kind); c != 0 { // "" < alpha < beta < rc + return c + } + if c := cmpInt(vx.pre, vy.pre); c != 0 { + return c + } + return 0 +} + +// lang returns the Go language version. For example, lang("1.2.3") == "1.2". +func lang(x string) string { + v := parse(x) + if v.minor == "" || v.major == "1" && v.minor == "0" { + return v.major + } + return v.major + "." + v.minor +} + +// isValid reports whether the version x is valid. +func isValid(x string) bool { + return parse(x) != gover{} +} + +// parse parses the Go version string x into a version. +// It returns the zero version if x is malformed. +func parse(x string) gover { + var v gover + + // Parse major version. + var ok bool + v.major, x, ok = cutInt(x) + if !ok { + return gover{} + } + if x == "" { + // Interpret "1" as "1.0.0". + v.minor = "0" + v.patch = "0" + return v + } + + // Parse . before minor version. + if x[0] != '.' { + return gover{} + } + + // Parse minor version. + v.minor, x, ok = cutInt(x[1:]) + if !ok { + return gover{} + } + if x == "" { + // Patch missing is same as "0" for older versions. + // Starting in Go 1.21, patch missing is different from explicit .0. + if cmpInt(v.minor, "21") < 0 { + v.patch = "0" + } + return v + } + + // Parse patch if present. + if x[0] == '.' { + v.patch, x, ok = cutInt(x[1:]) + if !ok || x != "" { + // Note that we are disallowing prereleases (alpha, beta, rc) for patch releases here (x != ""). + // Allowing them would be a bit confusing because we already have: + // 1.21 < 1.21rc1 + // But a prerelease of a patch would have the opposite effect: + // 1.21.3rc1 < 1.21.3 + // We've never needed them before, so let's not start now. + return gover{} + } + return v + } + + // Parse prerelease. + i := 0 + for i < len(x) && (x[i] < '0' || '9' < x[i]) { + if x[i] < 'a' || 'z' < x[i] { + return gover{} + } + i++ + } + if i == 0 { + return gover{} + } + v.kind, x = x[:i], x[i:] + if x == "" { + return v + } + v.pre, x, ok = cutInt(x) + if !ok || x != "" { + return gover{} + } + + return v +} + +// cutInt scans the leading decimal number at the start of x to an integer +// and returns that value and the rest of the string. +func cutInt(x string) (n, rest string, ok bool) { + i := 0 + for i < len(x) && '0' <= x[i] && x[i] <= '9' { + i++ + } + if i == 0 || x[0] == '0' && i != 1 { // no digits or unnecessary leading zero + return "", "", false + } + return x[:i], x[i:], true +} + +// cmpInt returns cmp.Compare(x, y) interpreting x and y as decimal numbers. +// (Copied from golang.org/x/mod/semver's compareInt.) +func cmpInt(x, y string) int { + if x == y { + return 0 + } + if len(x) < len(y) { + return -1 + } + if len(x) > len(y) { + return +1 + } + if x < y { + return -1 + } else { + return +1 + } +} diff --git a/vendor/golang.org/x/tools/internal/versions/types.go b/vendor/golang.org/x/tools/internal/versions/types.go new file mode 100644 index 0000000..f0bb0d1 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/versions/types.go @@ -0,0 +1,38 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package versions + +import ( + "go/ast" + "go/types" +) + +// FileVersion returns a file's Go version. +// The reported version is an unknown Future version if a +// version cannot be determined. +func FileVersion(info *types.Info, file *ast.File) string { + // In tools built with Go >= 1.22, the Go version of a file + // follow a cascades of sources: + // 1) types.Info.FileVersion, which follows the cascade: + // 1.a) file version (ast.File.GoVersion), + // 1.b) the package version (types.Config.GoVersion), or + // 2) is some unknown Future version. + // + // File versions require a valid package version to be provided to types + // in Config.GoVersion. Config.GoVersion is either from the package's module + // or the toolchain (go run). This value should be provided by go/packages + // or unitchecker.Config.GoVersion. + if v := info.FileVersions[file]; IsValid(v) { + return v + } + // Note: we could instead return runtime.Version() [if valid]. + // This would act as a max version on what a tool can support. + return Future +} + +// InitFileVersions initializes info to record Go versions for Go files. +func InitFileVersions(info *types.Info) { + info.FileVersions = make(map[*ast.File]string) +} diff --git a/vendor/golang.org/x/tools/internal/versions/versions.go b/vendor/golang.org/x/tools/internal/versions/versions.go new file mode 100644 index 0000000..8d1f745 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/versions/versions.go @@ -0,0 +1,57 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package versions + +import ( + "strings" +) + +// Note: If we use build tags to use go/versions when go >=1.22, +// we run into go.dev/issue/53737. Under some operations users would see an +// import of "go/versions" even if they would not compile the file. +// For example, during `go get -u ./...` (go.dev/issue/64490) we do not try to include +// For this reason, this library just a clone of go/versions for the moment. + +// Lang returns the Go language version for version x. +// If x is not a valid version, Lang returns the empty string. +// For example: +// +// Lang("go1.21rc2") = "go1.21" +// Lang("go1.21.2") = "go1.21" +// Lang("go1.21") = "go1.21" +// Lang("go1") = "go1" +// Lang("bad") = "" +// Lang("1.21") = "" +func Lang(x string) string { + v := lang(stripGo(x)) + if v == "" { + return "" + } + return x[:2+len(v)] // "go"+v without allocation +} + +// Compare returns -1, 0, or +1 depending on whether +// x < y, x == y, or x > y, interpreted as Go versions. +// The versions x and y must begin with a "go" prefix: "go1.21" not "1.21". +// Invalid versions, including the empty string, compare less than +// valid versions and equal to each other. +// The language version "go1.21" compares less than the +// release candidate and eventual releases "go1.21rc1" and "go1.21.0". +// Custom toolchain suffixes are ignored during comparison: +// "go1.21.0" and "go1.21.0-bigcorp" are equal. +func Compare(x, y string) int { return compare(stripGo(x), stripGo(y)) } + +// IsValid reports whether the version x is valid. +func IsValid(x string) bool { return isValid(stripGo(x)) } + +// stripGo converts from a "go1.21" version to a "1.21" version. +// If v does not start with "go", stripGo returns the empty string (a known invalid version). +func stripGo(v string) string { + v, _, _ = strings.Cut(v, "-") // strip -bigcorp suffix. + if len(v) < 2 || v[:2] != "go" { + return "" + } + return v[2:] +} diff --git a/vendor/gorm.io/driver/postgres/.gitignore b/vendor/gorm.io/driver/postgres/.gitignore new file mode 100644 index 0000000..485dee6 --- /dev/null +++ b/vendor/gorm.io/driver/postgres/.gitignore @@ -0,0 +1 @@ +.idea diff --git a/vendor/gorm.io/driver/postgres/error_translator.go b/vendor/gorm.io/driver/postgres/error_translator.go new file mode 100644 index 0000000..5f81350 --- /dev/null +++ b/vendor/gorm.io/driver/postgres/error_translator.go @@ -0,0 +1,50 @@ +package postgres + +import ( + "encoding/json" + + "gorm.io/gorm" + + "github.com/jackc/pgx/v5/pgconn" +) + +// The error codes to map PostgreSQL errors to gorm errors, here is the PostgreSQL error codes reference https://www.postgresql.org/docs/current/errcodes-appendix.html. +var errCodes = map[string]error{ + "23505": gorm.ErrDuplicatedKey, + "23503": gorm.ErrForeignKeyViolated, + "42703": gorm.ErrInvalidField, + "23514": gorm.ErrCheckConstraintViolated, +} + +type ErrMessage struct { + Code string + Severity string + Message string +} + +// Translate it will translate the error to native gorm errors. +// Since currently gorm supporting both pgx and pg drivers, only checking for pgx PgError types is not enough for translating errors, so we have additional error json marshal fallback. +func (dialector Dialector) Translate(err error) error { + if pgErr, ok := err.(*pgconn.PgError); ok { + if translatedErr, found := errCodes[pgErr.Code]; found { + return translatedErr + } + return err + } + + parsedErr, marshalErr := json.Marshal(err) + if marshalErr != nil { + return err + } + + var errMsg ErrMessage + unmarshalErr := json.Unmarshal(parsedErr, &errMsg) + if unmarshalErr != nil { + return err + } + + if translatedErr, found := errCodes[errMsg.Code]; found { + return translatedErr + } + return err +} diff --git a/vendor/gorm.io/driver/postgres/migrator.go b/vendor/gorm.io/driver/postgres/migrator.go index 5eb0acc..df18db1 100644 --- a/vendor/gorm.io/driver/postgres/migrator.go +++ b/vendor/gorm.io/driver/postgres/migrator.go @@ -6,40 +6,78 @@ import ( "regexp" "strings" - "github.com/jackc/pgx/v4" + "github.com/jackc/pgx/v5" "gorm.io/gorm" "gorm.io/gorm/clause" "gorm.io/gorm/migrator" "gorm.io/gorm/schema" ) +// See https://stackoverflow.com/questions/2204058/list-columns-with-indexes-in-postgresql +// Here are some changes: +// - use `LEFT JOIN` instead of `CROSS JOIN` +// - exclude indexes used to support constraints (they are auto-generated) const indexSql = ` -select - t.relname as table_name, - i.relname as index_name, - a.attname as column_name, - ix.indisunique as non_unique, - ix.indisprimary as primary -from - pg_class t, - pg_class i, - pg_index ix, - pg_attribute a -where - t.oid = ix.indrelid - and i.oid = ix.indexrelid - and a.attrelid = t.oid - and a.attnum = ANY(ix.indkey) - and t.relkind = 'r' - and t.relname = ? +SELECT + ct.relname AS table_name, + ci.relname AS index_name, + i.indisunique AS non_unique, + i.indisprimary AS primary, + a.attname AS column_name +FROM + pg_index i + LEFT JOIN pg_class ct ON ct.oid = i.indrelid + LEFT JOIN pg_class ci ON ci.oid = i.indexrelid + LEFT JOIN pg_attribute a ON a.attrelid = ct.oid + LEFT JOIN pg_constraint con ON con.conindid = i.indexrelid +WHERE + a.attnum = ANY(i.indkey) + AND con.oid IS NULL + AND ct.relkind = 'r' + AND ct.relname = ? ` +var typeAliasMap = map[string][]string{ + "int": {"integer"}, + "int2": {"smallint"}, + "int4": {"integer"}, + "int8": {"bigint"}, + "smallint": {"int2"}, + "integer": {"int4"}, + "bigint": {"int8"}, + "decimal": {"numeric"}, + "numeric": {"decimal"}, + "timestamptz": {"timestamp with time zone"}, + "timestamp with time zone": {"timestamptz"}, + "bool": {"boolean"}, + "boolean": {"bool"}, + "serial2": {"smallserial"}, + "serial4": {"serial"}, + "serial8": {"bigserial"}, + "varbit": {"bit varying"}, + "char": {"character"}, + "varchar": {"character varying"}, + "float4": {"real"}, + "float8": {"double precision"}, + "timetz": {"time with time zone"}, +} + type Migrator struct { migrator.Migrator } +// select querys ignore dryrun +func (m Migrator) queryRaw(sql string, values ...interface{}) (tx *gorm.DB) { + queryTx := m.DB + if m.DB.DryRun { + queryTx = m.DB.Session(&gorm.Session{}) + queryTx.DryRun = false + } + return queryTx.Raw(sql, values...) +} + func (m Migrator) CurrentDatabase() (name string) { - m.DB.Raw("SELECT CURRENT_DATABASE()").Scan(&name) + m.queryRaw("SELECT CURRENT_DATABASE()").Scan(&name) return } @@ -65,11 +103,13 @@ func (m Migrator) BuildIndexOptions(opts []schema.IndexOption, stmt *gorm.Statem func (m Migrator) HasIndex(value interface{}, name string) bool { var count int64 m.RunWithValue(value, func(stmt *gorm.Statement) error { - if idx := stmt.Schema.LookIndex(name); idx != nil { - name = idx.Name + if stmt.Schema != nil { + if idx := stmt.Schema.LookIndex(name); idx != nil { + name = idx.Name + } } currentSchema, curTable := m.CurrentSchema(stmt, stmt.Table) - return m.DB.Raw( + return m.queryRaw( "SELECT count(*) FROM pg_indexes WHERE tablename = ? AND indexname = ? AND schemaname = ?", curTable, name, currentSchema, ).Scan(&count).Error }) @@ -79,33 +119,35 @@ func (m Migrator) HasIndex(value interface{}, name string) bool { func (m Migrator) CreateIndex(value interface{}, name string) error { return m.RunWithValue(value, func(stmt *gorm.Statement) error { - if idx := stmt.Schema.LookIndex(name); idx != nil { - opts := m.BuildIndexOptions(idx.Fields, stmt) - values := []interface{}{clause.Column{Name: idx.Name}, m.CurrentTable(stmt), opts} + if stmt.Schema != nil { + if idx := stmt.Schema.LookIndex(name); idx != nil { + opts := m.BuildIndexOptions(idx.Fields, stmt) + values := []interface{}{clause.Column{Name: idx.Name}, m.CurrentTable(stmt), opts} - createIndexSQL := "CREATE " - if idx.Class != "" { - createIndexSQL += idx.Class + " " - } - createIndexSQL += "INDEX " + createIndexSQL := "CREATE " + if idx.Class != "" { + createIndexSQL += idx.Class + " " + } + createIndexSQL += "INDEX " - if strings.TrimSpace(strings.ToUpper(idx.Option)) == "CONCURRENTLY" { - createIndexSQL += "CONCURRENTLY " - } + if strings.TrimSpace(strings.ToUpper(idx.Option)) == "CONCURRENTLY" { + createIndexSQL += "CONCURRENTLY " + } - createIndexSQL += "IF NOT EXISTS ? ON ?" + createIndexSQL += "IF NOT EXISTS ? ON ?" - if idx.Type != "" { - createIndexSQL += " USING " + idx.Type + "(?)" - } else { - createIndexSQL += " ?" - } + if idx.Type != "" { + createIndexSQL += " USING " + idx.Type + "(?)" + } else { + createIndexSQL += " ?" + } - if idx.Where != "" { - createIndexSQL += " WHERE " + idx.Where - } + if idx.Where != "" { + createIndexSQL += " WHERE " + idx.Where + } - return m.DB.Exec(createIndexSQL, values...).Error + return m.DB.Exec(createIndexSQL, values...).Error + } } return fmt.Errorf("failed to create index with name %v", name) @@ -123,8 +165,10 @@ func (m Migrator) RenameIndex(value interface{}, oldName, newName string) error func (m Migrator) DropIndex(value interface{}, name string) error { return m.RunWithValue(value, func(stmt *gorm.Statement) error { - if idx := stmt.Schema.LookIndex(name); idx != nil { - name = idx.Name + if stmt.Schema != nil { + if idx := stmt.Schema.LookIndex(name); idx != nil { + name = idx.Name + } } return m.DB.Exec("DROP INDEX ?", clause.Column{Name: name}).Error @@ -133,7 +177,7 @@ func (m Migrator) DropIndex(value interface{}, name string) error { func (m Migrator) GetTables() (tableList []string, err error) { currentSchema, _ := m.CurrentSchema(m.DB.Statement, "") - return tableList, m.DB.Raw("SELECT table_name FROM information_schema.tables WHERE table_schema = ? AND table_type = ?", currentSchema, "BASE TABLE").Scan(&tableList).Error + return tableList, m.queryRaw("SELECT table_name FROM information_schema.tables WHERE table_schema = ? AND table_type = ?", currentSchema, "BASE TABLE").Scan(&tableList).Error } func (m Migrator) CreateTable(values ...interface{}) (err error) { @@ -142,13 +186,16 @@ func (m Migrator) CreateTable(values ...interface{}) (err error) { } for _, value := range m.ReorderModels(values, false) { if err = m.RunWithValue(value, func(stmt *gorm.Statement) error { - for _, field := range stmt.Schema.FieldsByDBName { - if field.Comment != "" { - if err := m.DB.Exec( - "COMMENT ON COLUMN ?.? IS ?", - m.CurrentTable(stmt), clause.Column{Name: field.DBName}, gorm.Expr(m.Migrator.Dialector.Explain("$1", field.Comment)), - ).Error; err != nil { - return err + if stmt.Schema != nil { + for _, fieldName := range stmt.Schema.DBNames { + field := stmt.Schema.FieldsByDBName[fieldName] + if field.Comment != "" { + if err := m.DB.Exec( + "COMMENT ON COLUMN ?.? IS ?", + m.CurrentTable(stmt), clause.Column{Name: field.DBName}, gorm.Expr(m.Migrator.Dialector.Explain("$1", field.Comment)), + ).Error; err != nil { + return err + } } } } @@ -164,7 +211,7 @@ func (m Migrator) HasTable(value interface{}) bool { var count int64 m.RunWithValue(value, func(stmt *gorm.Statement) error { currentSchema, curTable := m.CurrentSchema(stmt, stmt.Table) - return m.DB.Raw("SELECT count(*) FROM information_schema.tables WHERE table_schema = ? AND table_name = ? AND table_type = ?", currentSchema, curTable, "BASE TABLE").Scan(&count).Error + return m.queryRaw("SELECT count(*) FROM information_schema.tables WHERE table_schema = ? AND table_name = ? AND table_type = ?", currentSchema, curTable, "BASE TABLE").Scan(&count).Error }) return count > 0 } @@ -186,14 +233,18 @@ func (m Migrator) AddColumn(value interface{}, field string) error { if err := m.Migrator.AddColumn(value, field); err != nil { return err } + m.resetPreparedStmts() + return m.RunWithValue(value, func(stmt *gorm.Statement) error { - if field := stmt.Schema.LookUpField(field); field != nil { - if field.Comment != "" { - if err := m.DB.Exec( - "COMMENT ON COLUMN ?.? IS ?", - m.CurrentTable(stmt), clause.Column{Name: field.DBName}, gorm.Expr(m.Migrator.Dialector.Explain("$1", field.Comment)), - ).Error; err != nil { - return err + if stmt.Schema != nil { + if field := stmt.Schema.LookUpField(field); field != nil { + if field.Comment != "" { + if err := m.DB.Exec( + "COMMENT ON COLUMN ?.? IS ?", + m.CurrentTable(stmt), clause.Column{Name: field.DBName}, gorm.Expr(m.Migrator.Dialector.Explain("$1", field.Comment)), + ).Error; err != nil { + return err + } } } } @@ -212,7 +263,7 @@ func (m Migrator) HasColumn(value interface{}, field string) bool { } currentSchema, curTable := m.CurrentSchema(stmt, stmt.Table) - return m.DB.Raw( + return m.queryRaw( "SELECT count(*) FROM INFORMATION_SCHEMA.columns WHERE table_schema = ? AND table_name = ? AND column_name = ?", currentSchema, curTable, name, ).Scan(&count).Error @@ -237,11 +288,10 @@ func (m Migrator) MigrateColumn(value interface{}, field *schema.Field, columnTy checkSQL += "WHERE objsubid = (SELECT ordinal_position FROM information_schema.columns WHERE table_schema = ? AND table_name = ? AND column_name = ?) " checkSQL += "AND objoid = (SELECT oid FROM pg_catalog.pg_class WHERE relname = ? AND relnamespace = " checkSQL += "(SELECT oid FROM pg_catalog.pg_namespace WHERE nspname = ?))" - m.DB.Raw(checkSQL, values...).Scan(&description) - comment := field.Comment - if comment != "" { - comment = comment[1 : len(comment)-1] - } + m.queryRaw(checkSQL, values...).Scan(&description) + + comment := strings.Trim(field.Comment, "'") + comment = strings.Trim(comment, `"`) if field.Comment != "" && comment != description { if err := m.DB.Exec( "COMMENT ON COLUMN ?.? IS ?", @@ -256,100 +306,137 @@ func (m Migrator) MigrateColumn(value interface{}, field *schema.Field, columnTy // AlterColumn alter value's `field` column' type based on schema definition func (m Migrator) AlterColumn(value interface{}, field string) error { - return m.RunWithValue(value, func(stmt *gorm.Statement) error { - if field := stmt.Schema.LookUpField(field); field != nil { - var ( - columnTypes, _ = m.DB.Migrator().ColumnTypes(value) - fieldColumnType *migrator.ColumnType - ) - for _, columnType := range columnTypes { - if columnType.Name() == field.DBName { - fieldColumnType, _ = columnType.(*migrator.ColumnType) + err := m.RunWithValue(value, func(stmt *gorm.Statement) error { + if stmt.Schema != nil { + if field := stmt.Schema.LookUpField(field); field != nil { + var ( + columnTypes, _ = m.DB.Migrator().ColumnTypes(value) + fieldColumnType *migrator.ColumnType + ) + for _, columnType := range columnTypes { + if columnType.Name() == field.DBName { + fieldColumnType, _ = columnType.(*migrator.ColumnType) + } } - } - fileType := clause.Expr{SQL: m.DataTypeOf(field)} - if fieldColumnType.DatabaseTypeName() != fileType.SQL { - filedColumnAutoIncrement, _ := fieldColumnType.AutoIncrement() - if field.AutoIncrement && filedColumnAutoIncrement { // update - serialDatabaseType, _ := getSerialDatabaseType(fileType.SQL) - if t, _ := fieldColumnType.ColumnType(); t != serialDatabaseType { - if err := m.UpdateSequence(m.DB, stmt, field, serialDatabaseType); err != nil { - return err + fileType := clause.Expr{SQL: m.DataTypeOf(field)} + // check for typeName and SQL name + isSameType := true + if !strings.EqualFold(fieldColumnType.DatabaseTypeName(), fileType.SQL) { + isSameType = false + // if different, also check for aliases + aliases := m.GetTypeAliases(fieldColumnType.DatabaseTypeName()) + for _, alias := range aliases { + if strings.HasPrefix(fileType.SQL, alias) { + isSameType = true + break } } - } else if field.AutoIncrement && !filedColumnAutoIncrement { // create - serialDatabaseType, _ := getSerialDatabaseType(fileType.SQL) - if err := m.CreateSequence(m.DB, stmt, field, serialDatabaseType); err != nil { - return err - } - } else if !field.AutoIncrement && filedColumnAutoIncrement { // delete - if err := m.DeleteSequence(m.DB, stmt, field, fileType); err != nil { - return err - } - } else { - if err := m.DB.Exec("ALTER TABLE ? ALTER COLUMN ? TYPE ?", m.CurrentTable(stmt), clause.Column{Name: field.DBName}, fileType).Error; err != nil { - return err - } } - } - if null, _ := fieldColumnType.Nullable(); null == field.NotNull { - if field.NotNull { - if err := m.DB.Exec("ALTER TABLE ? ALTER COLUMN ? SET NOT NULL", m.CurrentTable(stmt), clause.Column{Name: field.DBName}).Error; err != nil { - return err - } - } else { - if err := m.DB.Exec("ALTER TABLE ? ALTER COLUMN ? DROP NOT NULL", m.CurrentTable(stmt), clause.Column{Name: field.DBName}).Error; err != nil { - return err + // not same, migrate + if !isSameType { + filedColumnAutoIncrement, _ := fieldColumnType.AutoIncrement() + if field.AutoIncrement && filedColumnAutoIncrement { // update + serialDatabaseType, _ := getSerialDatabaseType(fileType.SQL) + if t, _ := fieldColumnType.ColumnType(); t != serialDatabaseType { + if err := m.UpdateSequence(m.DB, stmt, field, serialDatabaseType); err != nil { + return err + } + } + } else if field.AutoIncrement && !filedColumnAutoIncrement { // create + serialDatabaseType, _ := getSerialDatabaseType(fileType.SQL) + if err := m.CreateSequence(m.DB, stmt, field, serialDatabaseType); err != nil { + return err + } + } else if !field.AutoIncrement && filedColumnAutoIncrement { // delete + if err := m.DeleteSequence(m.DB, stmt, field, fileType); err != nil { + return err + } + } else { + if err := m.modifyColumn(stmt, field, fileType, fieldColumnType); err != nil { + return err + } } } - } - - if uniq, _ := fieldColumnType.Unique(); uniq != field.Unique { - idxName := clause.Column{Name: m.DB.Config.NamingStrategy.IndexName(stmt.Table, field.DBName)} - if err := m.DB.Exec("ALTER TABLE ? ADD CONSTRAINT ? UNIQUE(?)", m.CurrentTable(stmt), idxName, clause.Column{Name: field.DBName}).Error; err != nil { - return err - } - } - if v, _ := fieldColumnType.DefaultValue(); v != field.DefaultValue { - if field.HasDefaultValue && (field.DefaultValueInterface != nil || field.DefaultValue != "") { - if field.DefaultValueInterface != nil { - defaultStmt := &gorm.Statement{Vars: []interface{}{field.DefaultValueInterface}} - m.Dialector.BindVarTo(defaultStmt, defaultStmt, field.DefaultValueInterface) - if err := m.DB.Exec("ALTER TABLE ? ALTER COLUMN ? SET DEFAULT ?", m.CurrentTable(stmt), clause.Column{Name: field.DBName}, clause.Expr{SQL: m.Dialector.Explain(defaultStmt.SQL.String(), field.DefaultValueInterface)}).Error; err != nil { - return err - } - } else if field.DefaultValue != "(-)" { - if err := m.DB.Exec("ALTER TABLE ? ALTER COLUMN ? SET DEFAULT ?", m.CurrentTable(stmt), clause.Column{Name: field.DBName}, clause.Expr{SQL: field.DefaultValue}).Error; err != nil { + if null, _ := fieldColumnType.Nullable(); null == field.NotNull { + if field.NotNull { + if err := m.DB.Exec("ALTER TABLE ? ALTER COLUMN ? SET NOT NULL", m.CurrentTable(stmt), clause.Column{Name: field.DBName}).Error; err != nil { return err } } else { - if err := m.DB.Exec("ALTER TABLE ? ALTER COLUMN ? DROP DEFAULT", m.CurrentTable(stmt), clause.Column{Name: field.DBName}, clause.Expr{SQL: field.DefaultValue}).Error; err != nil { + if err := m.DB.Exec("ALTER TABLE ? ALTER COLUMN ? DROP NOT NULL", m.CurrentTable(stmt), clause.Column{Name: field.DBName}).Error; err != nil { return err } } } + + if v, ok := fieldColumnType.DefaultValue(); (field.DefaultValueInterface == nil && ok) || v != field.DefaultValue { + if field.HasDefaultValue && (field.DefaultValueInterface != nil || field.DefaultValue != "") { + if field.DefaultValueInterface != nil { + defaultStmt := &gorm.Statement{Vars: []interface{}{field.DefaultValueInterface}} + m.Dialector.BindVarTo(defaultStmt, defaultStmt, field.DefaultValueInterface) + if err := m.DB.Exec("ALTER TABLE ? ALTER COLUMN ? SET DEFAULT ?", m.CurrentTable(stmt), clause.Column{Name: field.DBName}, clause.Expr{SQL: m.Dialector.Explain(defaultStmt.SQL.String(), field.DefaultValueInterface)}).Error; err != nil { + return err + } + } else if field.DefaultValue != "(-)" { + if err := m.DB.Exec("ALTER TABLE ? ALTER COLUMN ? SET DEFAULT ?", m.CurrentTable(stmt), clause.Column{Name: field.DBName}, clause.Expr{SQL: field.DefaultValue}).Error; err != nil { + return err + } + } else { + if err := m.DB.Exec("ALTER TABLE ? ALTER COLUMN ? DROP DEFAULT", m.CurrentTable(stmt), clause.Column{Name: field.DBName}, clause.Expr{SQL: field.DefaultValue}).Error; err != nil { + return err + } + } + } + } + return nil } - return nil } return fmt.Errorf("failed to look up field with name: %s", field) }) + + if err != nil { + return err + } + m.resetPreparedStmts() + return nil +} + +func (m Migrator) modifyColumn(stmt *gorm.Statement, field *schema.Field, targetType clause.Expr, existingColumn *migrator.ColumnType) error { + alterSQL := "ALTER TABLE ? ALTER COLUMN ? TYPE ? USING ?::?" + isUncastableDefaultValue := false + + if targetType.SQL == "boolean" { + switch existingColumn.DatabaseTypeName() { + case "int2", "int8", "numeric": + alterSQL = "ALTER TABLE ? ALTER COLUMN ? TYPE ? USING ?::int::?" + } + isUncastableDefaultValue = true + } + + if dv, _ := existingColumn.DefaultValue(); dv != "" && isUncastableDefaultValue { + if err := m.DB.Exec("ALTER TABLE ? ALTER COLUMN ? DROP DEFAULT", m.CurrentTable(stmt), clause.Column{Name: field.DBName}).Error; err != nil { + return err + } + } + if err := m.DB.Exec(alterSQL, m.CurrentTable(stmt), clause.Column{Name: field.DBName}, targetType, clause.Column{Name: field.DBName}, targetType).Error; err != nil { + return err + } + return nil } func (m Migrator) HasConstraint(value interface{}, name string) bool { var count int64 m.RunWithValue(value, func(stmt *gorm.Statement) error { - constraint, chk, table := m.GuessConstraintAndTable(stmt, name) - currentSchema, curTable := m.CurrentSchema(stmt, table) + constraint, table := m.GuessConstraintInterfaceAndTable(stmt, name) if constraint != nil { - name = constraint.Name - } else if chk != nil { - name = chk.Name + name = constraint.GetName() } + currentSchema, curTable := m.CurrentSchema(stmt, table) - return m.DB.Raw( + return m.queryRaw( "SELECT count(*) FROM INFORMATION_SCHEMA.table_constraints WHERE table_schema = ? AND table_name = ? AND constraint_name = ?", currentSchema, curTable, name, ).Scan(&count).Error @@ -364,8 +451,8 @@ func (m Migrator) ColumnTypes(value interface{}) (columnTypes []gorm.ColumnType, var ( currentDatabase = m.DB.Migrator().CurrentDatabase() currentSchema, table = m.CurrentSchema(stmt, stmt.Table) - columns, err = m.DB.Raw( - "SELECT c.column_name, c.is_nullable = 'YES', c.udt_name, c.character_maximum_length, c.numeric_precision, c.numeric_precision_radix, c.numeric_scale, c.datetime_precision, 8 * typlen, c.column_default, pd.description FROM information_schema.columns AS c JOIN pg_type AS pgt ON c.udt_name = pgt.typname LEFT JOIN pg_catalog.pg_description as pd ON pd.objsubid = c.ordinal_position AND pd.objoid = (SELECT oid FROM pg_catalog.pg_class WHERE relname = c.table_name AND relnamespace = (SELECT oid FROM pg_catalog.pg_namespace WHERE nspname = c.table_schema)) where table_catalog = ? AND table_schema = ? AND table_name = ?", + columns, err = m.queryRaw( + "SELECT c.column_name, c.is_nullable = 'YES', c.udt_name, c.character_maximum_length, c.numeric_precision, c.numeric_precision_radix, c.numeric_scale, c.datetime_precision, 8 * typlen, c.column_default, pd.description, c.identity_increment FROM information_schema.columns AS c JOIN pg_type AS pgt ON c.udt_name = pgt.typname LEFT JOIN pg_catalog.pg_description as pd ON pd.objsubid = c.ordinal_position AND pd.objoid = (SELECT oid FROM pg_catalog.pg_class WHERE relname = c.table_name AND relnamespace = (SELECT oid FROM pg_catalog.pg_namespace WHERE nspname = c.table_schema)) where table_catalog = ? AND table_schema = ? AND table_name = ?", currentDatabase, currentSchema, table).Rows() ) @@ -382,11 +469,12 @@ func (m Migrator) ColumnTypes(value interface{}) (columnTypes []gorm.ColumnType, datetimePrecision sql.NullInt64 radixValue sql.NullInt64 typeLenValue sql.NullInt64 + identityIncrement sql.NullString ) err = columns.Scan( &column.NameValue, &column.NullableValue, &column.DataTypeValue, &column.LengthValue, &column.DecimalSizeValue, - &radixValue, &column.ScaleValue, &datetimePrecision, &typeLenValue, &column.DefaultValueValue, &column.CommentValue, + &radixValue, &column.ScaleValue, &datetimePrecision, &typeLenValue, &column.DefaultValueValue, &column.CommentValue, &identityIncrement, ) if err != nil { return err @@ -396,13 +484,14 @@ func (m Migrator) ColumnTypes(value interface{}) (columnTypes []gorm.ColumnType, column.LengthValue = typeLenValue } - if strings.HasPrefix(column.DefaultValueValue.String, "nextval('") && strings.HasSuffix(column.DefaultValueValue.String, "seq'::regclass)") { + if (strings.HasPrefix(column.DefaultValueValue.String, "nextval('") && + strings.HasSuffix(column.DefaultValueValue.String, "seq'::regclass)")) || (identityIncrement.Valid && identityIncrement.String != "") { column.AutoIncrementValue = sql.NullBool{Bool: true, Valid: true} column.DefaultValueValue = sql.NullString{} } if column.DefaultValueValue.Valid { - column.DefaultValueValue.String = regexp.MustCompile(`'(.*)'::[\w]+$`).ReplaceAllString(column.DefaultValueValue.String, "$1") + column.DefaultValueValue.String = parseDefaultValueValue(column.DefaultValueValue.String) } if datetimePrecision.Valid { @@ -436,14 +525,25 @@ func (m Migrator) ColumnTypes(value interface{}) (columnTypes []gorm.ColumnType, // check primary, unique field { - columnTypeRows, err := m.DB.Raw("SELECT c.column_name, constraint_type FROM information_schema.table_constraints tc JOIN information_schema.constraint_column_usage AS ccu USING (constraint_schema, constraint_name) JOIN information_schema.columns AS c ON c.table_schema = tc.constraint_schema AND tc.table_name = c.table_name AND ccu.column_name = c.column_name WHERE constraint_type IN ('PRIMARY KEY', 'UNIQUE') AND c.table_catalog = ? AND c.table_schema = ? AND c.table_name = ?", currentDatabase, currentSchema, table).Rows() + columnTypeRows, err := m.queryRaw("SELECT constraint_name FROM information_schema.table_constraints tc JOIN information_schema.constraint_column_usage AS ccu USING (constraint_schema, constraint_catalog, table_name, constraint_name) JOIN information_schema.columns AS c ON c.table_schema = tc.constraint_schema AND tc.table_name = c.table_name AND ccu.column_name = c.column_name WHERE constraint_type IN ('PRIMARY KEY', 'UNIQUE') AND c.table_catalog = ? AND c.table_schema = ? AND c.table_name = ? AND constraint_type = ?", currentDatabase, currentSchema, table, "UNIQUE").Rows() if err != nil { return err } + uniqueContraints := map[string]int{} + for columnTypeRows.Next() { + var constraintName string + columnTypeRows.Scan(&constraintName) + uniqueContraints[constraintName]++ + } + columnTypeRows.Close() + columnTypeRows, err = m.queryRaw("SELECT c.column_name, constraint_name, constraint_type FROM information_schema.table_constraints tc JOIN information_schema.constraint_column_usage AS ccu USING (constraint_schema, constraint_catalog, table_name, constraint_name) JOIN information_schema.columns AS c ON c.table_schema = tc.constraint_schema AND tc.table_name = c.table_name AND ccu.column_name = c.column_name WHERE constraint_type IN ('PRIMARY KEY', 'UNIQUE') AND c.table_catalog = ? AND c.table_schema = ? AND c.table_name = ?", currentDatabase, currentSchema, table).Rows() + if err != nil { + return err + } for columnTypeRows.Next() { - var name, columnType string - columnTypeRows.Scan(&name, &columnType) + var name, constraintName, columnType string + columnTypeRows.Scan(&name, &constraintName, &columnType) for _, c := range columnTypes { mc := c.(*migrator.ColumnType) if mc.NameValue.String == name { @@ -451,7 +551,9 @@ func (m Migrator) ColumnTypes(value interface{}) (columnTypes []gorm.ColumnType, case "PRIMARY KEY": mc.PrimaryKeyValue = sql.NullBool{Bool: true, Valid: true} case "UNIQUE": - mc.UniqueValue = sql.NullBool{Bool: true, Valid: true} + if uniqueContraints[constraintName] == 1 { + mc.UniqueValue = sql.NullBool{Bool: true, Valid: true} + } } break } @@ -462,8 +564,8 @@ func (m Migrator) ColumnTypes(value interface{}) (columnTypes []gorm.ColumnType, // check column type { - dataTypeRows, err := m.DB.Raw(`SELECT a.attname as column_name, format_type(a.atttypid, a.atttypmod) AS data_type - FROM pg_attribute a JOIN pg_class b ON a.attrelid = b.relfilenode AND relnamespace = (SELECT oid FROM pg_catalog.pg_namespace WHERE nspname = ?) + dataTypeRows, err := m.queryRaw(`SELECT a.attname as column_name, format_type(a.atttypid, a.atttypmod) AS data_type + FROM pg_attribute a JOIN pg_class b ON a.attrelid = b.oid AND relnamespace = (SELECT oid FROM pg_catalog.pg_namespace WHERE nspname = ?) WHERE a.attnum > 0 -- hide internal columns AND NOT a.attisdropped -- hide deleted columns AND b.relname = ?`, currentSchema, table).Rows() @@ -503,9 +605,10 @@ func (m Migrator) GetRows(currentSchema interface{}, table interface{}) (*sql.Ro } return m.DB.Session(&gorm.Session{}).Table(name).Limit(1).Scopes(func(d *gorm.DB) *gorm.DB { + dialector, _ := m.Dialector.(Dialector) // use simple protocol - if !m.DB.PrepareStmt { - d.Statement.Vars = append(d.Statement.Vars, pgx.QuerySimpleProtocol(true)) + if !m.DB.PrepareStmt && (dialector.Config != nil && (dialector.Config.DriverName == "" || dialector.Config.DriverName == "pgx")) { + d.Statement.Vars = append([]interface{}{pgx.QueryExecModeSimpleProtocol}, d.Statement.Vars...) } return d }).Rows() @@ -619,7 +722,7 @@ func (m Migrator) GetIndexes(value interface{}) ([]gorm.Index, error) { err := m.RunWithValue(value, func(stmt *gorm.Statement) error { result := make([]*Index, 0) - scanErr := m.DB.Raw(indexSql, stmt.Table).Scan(&result).Error + scanErr := m.queryRaw(indexSql, stmt.Table).Scan(&result).Error if scanErr != nil { return scanErr } @@ -663,3 +766,39 @@ func groupByIndexName(indexList []*Index) map[string][]*Index { } return columnIndexMap } + +func (m Migrator) GetTypeAliases(databaseTypeName string) []string { + return typeAliasMap[databaseTypeName] +} + +// should reset prepared stmts when table changed +func (m Migrator) resetPreparedStmts() { + if m.DB.PrepareStmt { + if pdb, ok := m.DB.ConnPool.(*gorm.PreparedStmtDB); ok { + pdb.Reset() + } + } +} + +func (m Migrator) DropColumn(dst interface{}, field string) error { + if err := m.Migrator.DropColumn(dst, field); err != nil { + return err + } + + m.resetPreparedStmts() + return nil +} + +func (m Migrator) RenameColumn(dst interface{}, oldName, field string) error { + if err := m.Migrator.RenameColumn(dst, oldName, field); err != nil { + return err + } + + m.resetPreparedStmts() + return nil +} + +func parseDefaultValueValue(defaultValue string) string { + value := regexp.MustCompile(`^(.*?)(?:::.*)?$`).ReplaceAllString(defaultValue, "$1") + return strings.Trim(value, "'") +} diff --git a/vendor/gorm.io/driver/postgres/postgres.go b/vendor/gorm.io/driver/postgres/postgres.go index c0fdea0..e865b0f 100644 --- a/vendor/gorm.io/driver/postgres/postgres.go +++ b/vendor/gorm.io/driver/postgres/postgres.go @@ -5,9 +5,10 @@ import ( "fmt" "regexp" "strconv" + "strings" - "github.com/jackc/pgx/v4" - "github.com/jackc/pgx/v4/stdlib" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/stdlib" "gorm.io/gorm" "gorm.io/gorm/callbacks" "gorm.io/gorm/clause" @@ -23,11 +24,17 @@ type Dialector struct { type Config struct { DriverName string DSN string + WithoutQuotingCheck bool PreferSimpleProtocol bool WithoutReturning bool Conn gorm.ConnPool } +var ( + timeZoneMatcher = regexp.MustCompile("(time_zone|TimeZone)=(.*?)($|&| )") + defaultIdentifierLength = 63 //maximum identifier length for postgres +) + func Open(dsn string) gorm.Dialector { return &Dialector{&Config{DSN: dsn}} } @@ -40,15 +47,42 @@ func (dialector Dialector) Name() string { return "postgres" } -var timeZoneMatcher = regexp.MustCompile("(time_zone|TimeZone)=(.*?)($|&| )") +func (dialector Dialector) Apply(config *gorm.Config) error { + if config.NamingStrategy == nil { + config.NamingStrategy = schema.NamingStrategy{ + IdentifierMaxLength: defaultIdentifierLength, + } + return nil + } + + switch v := config.NamingStrategy.(type) { + case *schema.NamingStrategy: + if v.IdentifierMaxLength <= 0 { + v.IdentifierMaxLength = defaultIdentifierLength + } + case schema.NamingStrategy: + if v.IdentifierMaxLength <= 0 { + v.IdentifierMaxLength = defaultIdentifierLength + config.NamingStrategy = v + } + } + + return nil +} func (dialector Dialector) Initialize(db *gorm.DB) (err error) { + callbackConfig := &callbacks.Config{ + CreateClauses: []string{"INSERT", "VALUES", "ON CONFLICT"}, + UpdateClauses: []string{"UPDATE", "SET", "FROM", "WHERE"}, + DeleteClauses: []string{"DELETE", "FROM", "WHERE"}, + } // register callbacks - callbacks.RegisterDefaultCallbacks(db, &callbacks.Config{ - CreateClauses: []string{"INSERT", "VALUES", "ON CONFLICT", "RETURNING"}, - UpdateClauses: []string{"UPDATE", "SET", "WHERE", "RETURNING"}, - DeleteClauses: []string{"DELETE", "FROM", "WHERE", "RETURNING"}, - }) + if !dialector.WithoutReturning { + callbackConfig.CreateClauses = append(callbackConfig.CreateClauses, "RETURNING") + callbackConfig.UpdateClauses = append(callbackConfig.UpdateClauses, "RETURNING") + callbackConfig.DeleteClauses = append(callbackConfig.DeleteClauses, "RETURNING") + } + callbacks.RegisterDefaultCallbacks(db, callbackConfig) if dialector.Conn != nil { db.ConnPool = dialector.Conn @@ -62,7 +96,7 @@ func (dialector Dialector) Initialize(db *gorm.DB) (err error) { return } if dialector.Config.PreferSimpleProtocol { - config.PreferSimpleProtocol = true + config.DefaultQueryExecMode = pgx.QueryExecModeSimpleProtocol } result := timeZoneMatcher.FindStringSubmatch(dialector.Config.DSN) if len(result) > 2 { @@ -87,10 +121,23 @@ func (dialector Dialector) DefaultValueOf(field *schema.Field) clause.Expression func (dialector Dialector) BindVarTo(writer clause.Writer, stmt *gorm.Statement, v interface{}) { writer.WriteByte('$') - writer.WriteString(strconv.Itoa(len(stmt.Vars))) + index := 0 + varLen := len(stmt.Vars) + if varLen > 0 { + switch stmt.Vars[0].(type) { + case pgx.QueryExecMode: + index++ + } + } + writer.WriteString(strconv.Itoa(varLen - index)) } func (dialector Dialector) QuoteTo(writer clause.Writer, str string) { + if dialector.WithoutQuotingCheck { + writer.WriteString(str) + return + } + var ( underQuoted, selfQuoted bool continuousBacktick int8 @@ -192,17 +239,38 @@ func (dialector Dialector) DataTypeOf(field *schema.Field) string { return "timestamptz" case schema.Bytes: return "bytea" + default: + return dialector.getSchemaCustomType(field) + } +} + +func (dialector Dialector) getSchemaCustomType(field *schema.Field) string { + sqlType := string(field.DataType) + + if field.AutoIncrement && !strings.Contains(strings.ToLower(sqlType), "serial") { + size := field.Size + if field.GORMDataType == schema.Uint { + size++ + } + switch { + case size <= 16: + sqlType = "smallserial" + case size <= 32: + sqlType = "serial" + default: + sqlType = "bigserial" + } } - return string(field.DataType) + return sqlType } -func (dialectopr Dialector) SavePoint(tx *gorm.DB, name string) error { +func (dialector Dialector) SavePoint(tx *gorm.DB, name string) error { tx.Exec("SAVEPOINT " + name) return nil } -func (dialectopr Dialector) RollbackTo(tx *gorm.DB, name string) error { +func (dialector Dialector) RollbackTo(tx *gorm.DB, name string) error { tx.Exec("ROLLBACK TO SAVEPOINT " + name) return nil } diff --git a/vendor/gorm.io/gorm/.gitignore b/vendor/gorm.io/gorm/.gitignore index 45505cc..7273332 100644 --- a/vendor/gorm.io/gorm/.gitignore +++ b/vendor/gorm.io/gorm/.gitignore @@ -3,4 +3,5 @@ documents coverage.txt _book .idea -vendor \ No newline at end of file +vendor +.vscode diff --git a/vendor/gorm.io/gorm/.golangci.yml b/vendor/gorm.io/gorm/.golangci.yml index 16903ed..b88bf67 100644 --- a/vendor/gorm.io/gorm/.golangci.yml +++ b/vendor/gorm.io/gorm/.golangci.yml @@ -9,3 +9,12 @@ linters: - prealloc - unconvert - unparam + - goimports + - whitespace + +linters-settings: + whitespace: + multi-func: true + goimports: + local-prefixes: gorm.io/gorm + diff --git a/vendor/gorm.io/gorm/License b/vendor/gorm.io/gorm/LICENSE similarity index 100% rename from vendor/gorm.io/gorm/License rename to vendor/gorm.io/gorm/LICENSE diff --git a/vendor/gorm.io/gorm/README.md b/vendor/gorm.io/gorm/README.md index 312a3a5..745dad6 100644 --- a/vendor/gorm.io/gorm/README.md +++ b/vendor/gorm.io/gorm/README.md @@ -4,9 +4,6 @@ The fantastic ORM library for Golang, aims to be developer friendly. [![go report card](https://goreportcard.com/badge/github.com/go-gorm/gorm "go report card")](https://goreportcard.com/report/github.com/go-gorm/gorm) [![test status](https://github.com/go-gorm/gorm/workflows/tests/badge.svg?branch=master "test status")](https://github.com/go-gorm/gorm/actions) -[![Join the chat at https://gitter.im/jinzhu/gorm](https://img.shields.io/gitter/room/jinzhu/gorm.svg)](https://gitter.im/jinzhu/gorm?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -[![Open Collective Backer](https://opencollective.com/gorm/tiers/backer/badge.svg?label=backer&color=brightgreen "Open Collective Backer")](https://opencollective.com/gorm) -[![Open Collective Sponsor](https://opencollective.com/gorm/tiers/sponsor/badge.svg?label=sponsor&color=brightgreen "Open Collective Sponsor")](https://opencollective.com/gorm) [![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT) [![Go.Dev reference](https://img.shields.io/badge/go.dev-reference-blue?logo=go&logoColor=white)](https://pkg.go.dev/gorm.io/gorm?tab=doc) @@ -30,14 +27,18 @@ The fantastic ORM library for Golang, aims to be developer friendly. ## Getting Started * GORM Guides [https://gorm.io](https://gorm.io) -* GORM Gen [gorm/gen](https://github.com/go-gorm/gen#gormgen) +* Gen Guides [https://gorm.io/gen/index.html](https://gorm.io/gen/index.html) ## Contributing [You can help to deliver a better GORM, check out things you can do](https://gorm.io/contribute.html) +## Contributors + +[Thank you](https://github.com/go-gorm/gorm/graphs/contributors) for contributing to the GORM framework! + ## License © Jinzhu, 2013~time.Now -Released under the [MIT License](https://github.com/go-gorm/gorm/blob/master/License) +Released under the [MIT License](https://github.com/go-gorm/gorm/blob/master/LICENSE) diff --git a/vendor/gorm.io/gorm/association.go b/vendor/gorm.io/gorm/association.go index 35e10dd..e3f51d1 100644 --- a/vendor/gorm.io/gorm/association.go +++ b/vendor/gorm.io/gorm/association.go @@ -14,6 +14,7 @@ import ( type Association struct { DB *DB Relationship *schema.Relationship + Unscope bool Error error } @@ -40,6 +41,15 @@ func (db *DB) Association(column string) *Association { return association } +func (association *Association) Unscoped() *Association { + return &Association{ + DB: association.DB, + Relationship: association.Relationship, + Error: association.Error, + Unscope: true, + } +} + func (association *Association) Find(out interface{}, conds ...interface{}) error { if association.Error == nil { association.Error = association.buildCondition().Find(out, conds...).Error @@ -64,14 +74,30 @@ func (association *Association) Append(values ...interface{}) error { func (association *Association) Replace(values ...interface{}) error { if association.Error == nil { + reflectValue := association.DB.Statement.ReflectValue + rel := association.Relationship + + var oldBelongsToExpr clause.Expression + // we have to record the old BelongsTo value + if association.Unscope && rel.Type == schema.BelongsTo { + var foreignFields []*schema.Field + for _, ref := range rel.References { + if !ref.OwnPrimaryKey { + foreignFields = append(foreignFields, ref.ForeignKey) + } + } + if _, fvs := schema.GetIdentityFieldValuesMap(association.DB.Statement.Context, reflectValue, foreignFields); len(fvs) > 0 { + column, values := schema.ToQueryValues(rel.FieldSchema.Table, rel.FieldSchema.PrimaryFieldDBNames, fvs) + oldBelongsToExpr = clause.IN{Column: column, Values: values} + } + } + // save associations if association.saveAssociation( /*clear*/ true, values...); association.Error != nil { return association.Error } // set old associations's foreign key to null - reflectValue := association.DB.Statement.ReflectValue - rel := association.Relationship switch rel.Type { case schema.BelongsTo: if len(values) == 0 { @@ -91,6 +117,9 @@ func (association *Association) Replace(values ...interface{}) error { association.Error = association.DB.UpdateColumns(updateMap).Error } + if association.Unscope && oldBelongsToExpr != nil { + association.Error = association.DB.Model(nil).Where(oldBelongsToExpr).Delete(reflect.New(rel.FieldSchema.ModelType).Interface()).Error + } case schema.HasOne, schema.HasMany: var ( primaryFields []*schema.Field @@ -119,7 +148,11 @@ func (association *Association) Replace(values ...interface{}) error { if _, pvs := schema.GetIdentityFieldValuesMap(association.DB.Statement.Context, reflectValue, primaryFields); len(pvs) > 0 { column, values := schema.ToQueryValues(rel.FieldSchema.Table, foreignKeys, pvs) - association.Error = tx.Where(clause.IN{Column: column, Values: values}).UpdateColumns(updateMap).Error + if association.Unscope { + association.Error = tx.Where(clause.IN{Column: column, Values: values}).Delete(modelValue).Error + } else { + association.Error = tx.Where(clause.IN{Column: column, Values: values}).UpdateColumns(updateMap).Error + } } case schema.Many2Many: var ( @@ -184,7 +217,8 @@ func (association *Association) Delete(values ...interface{}) error { switch rel.Type { case schema.BelongsTo: - tx := association.DB.Model(reflect.New(rel.Schema.ModelType).Interface()) + associationDB := association.DB.Session(&Session{}) + tx := associationDB.Model(reflect.New(rel.Schema.ModelType).Interface()) _, pvs := schema.GetIdentityFieldValuesMap(association.DB.Statement.Context, reflectValue, rel.Schema.PrimaryFields) if pcolumn, pvalues := schema.ToQueryValues(rel.Schema.Table, rel.Schema.PrimaryFieldDBNames, pvs); len(pvalues) > 0 { @@ -198,8 +232,21 @@ func (association *Association) Delete(values ...interface{}) error { conds = append(conds, clause.IN{Column: relColumn, Values: relValues}) association.Error = tx.Clauses(conds...).UpdateColumns(updateAttrs).Error + if association.Unscope { + var foreignFields []*schema.Field + for _, ref := range rel.References { + if !ref.OwnPrimaryKey { + foreignFields = append(foreignFields, ref.ForeignKey) + } + } + if _, fvs := schema.GetIdentityFieldValuesMap(association.DB.Statement.Context, reflectValue, foreignFields); len(fvs) > 0 { + column, values := schema.ToQueryValues(rel.FieldSchema.Table, rel.FieldSchema.PrimaryFieldDBNames, fvs) + association.Error = associationDB.Model(nil).Where(clause.IN{Column: column, Values: values}).Delete(reflect.New(rel.FieldSchema.ModelType).Interface()).Error + } + } case schema.HasOne, schema.HasMany: - tx := association.DB.Model(reflect.New(rel.FieldSchema.ModelType).Interface()) + model := reflect.New(rel.FieldSchema.ModelType).Interface() + tx := association.DB.Model(model) _, pvs := schema.GetIdentityFieldValuesMap(association.DB.Statement.Context, reflectValue, primaryFields) if pcolumn, pvalues := schema.ToQueryValues(rel.FieldSchema.Table, foreignKeys, pvs); len(pvalues) > 0 { @@ -212,7 +259,11 @@ func (association *Association) Delete(values ...interface{}) error { relColumn, relValues := schema.ToQueryValues(rel.FieldSchema.Table, rel.FieldSchema.PrimaryFieldDBNames, rvs) conds = append(conds, clause.IN{Column: relColumn, Values: relValues}) - association.Error = tx.Clauses(conds...).UpdateColumns(updateAttrs).Error + if association.Unscope { + association.Error = tx.Clauses(conds...).Delete(model).Error + } else { + association.Error = tx.Clauses(conds...).UpdateColumns(updateAttrs).Error + } case schema.Many2Many: var ( primaryFields, relPrimaryFields []*schema.Field @@ -345,6 +396,10 @@ func (association *Association) saveAssociation(clear bool, values ...interface{ } } case reflect.Struct: + if !rv.CanAddr() { + association.Error = ErrInvalidValue + return + } association.Error = association.Relationship.Field.Set(association.DB.Statement.Context, source, rv.Addr().Interface()) if association.Relationship.Field.FieldType.Kind() == reflect.Struct { @@ -353,9 +408,13 @@ func (association *Association) saveAssociation(clear bool, values ...interface{ } case schema.HasMany, schema.Many2Many: elemType := association.Relationship.Field.IndirectFieldType.Elem() - fieldValue := reflect.Indirect(association.Relationship.Field.ReflectValueOf(association.DB.Statement.Context, source)) + oldFieldValue := reflect.Indirect(association.Relationship.Field.ReflectValueOf(association.DB.Statement.Context, source)) + var fieldValue reflect.Value if clear { - fieldValue = reflect.New(association.Relationship.Field.IndirectFieldType).Elem() + fieldValue = reflect.MakeSlice(oldFieldValue.Type(), 0, oldFieldValue.Cap()) + } else { + fieldValue = reflect.MakeSlice(oldFieldValue.Type(), oldFieldValue.Len(), oldFieldValue.Cap()) + reflect.Copy(fieldValue, oldFieldValue) } appendToFieldValues := func(ev reflect.Value) { @@ -378,6 +437,10 @@ func (association *Association) saveAssociation(clear bool, values ...interface{ appendToFieldValues(reflect.Indirect(rv.Index(i)).Addr()) } case reflect.Struct: + if !rv.CanAddr() { + association.Error = ErrInvalidValue + return + } appendToFieldValues(rv.Addr()) } @@ -455,6 +518,9 @@ func (association *Association) saveAssociation(clear bool, values ...interface{ for i := 0; i < reflectValue.Len(); i++ { appendToRelations(reflectValue.Index(i), reflect.Indirect(reflect.ValueOf(values[i])), clear) + if association.Error != nil { + return + } // TODO support save slice data, sql with case? association.Error = associationDB.Updates(reflectValue.Index(i).Addr().Interface()).Error @@ -476,6 +542,9 @@ func (association *Association) saveAssociation(clear bool, values ...interface{ for idx, value := range values { rv := reflect.Indirect(reflect.ValueOf(value)) appendToRelations(reflectValue, rv, clear && idx == 0) + if association.Error != nil { + return + } } if len(values) > 0 { @@ -507,7 +576,9 @@ func (association *Association) buildCondition() *DB { joinStmt.AddClause(queryClause) } joinStmt.Build("WHERE") - tx.Clauses(clause.Expr{SQL: strings.Replace(joinStmt.SQL.String(), "WHERE ", "", 1), Vars: joinStmt.Vars}) + if len(joinStmt.SQL.String()) > 0 { + tx.Clauses(clause.Expr{SQL: strings.Replace(joinStmt.SQL.String(), "WHERE ", "", 1), Vars: joinStmt.Vars}) + } } tx = tx.Session(&Session{QueryFields: true}).Clauses(clause.From{Joins: []clause.Join{{ diff --git a/vendor/gorm.io/gorm/callbacks.go b/vendor/gorm.io/gorm/callbacks.go index c060ea7..50b5b0e 100644 --- a/vendor/gorm.io/gorm/callbacks.go +++ b/vendor/gorm.io/gorm/callbacks.go @@ -75,11 +75,7 @@ func (cs *callbacks) Raw() *processor { func (p *processor) Execute(db *DB) *DB { // call scopes for len(db.Statement.scopes) > 0 { - scopes := db.Statement.scopes - db.Statement.scopes = nil - for _, scope := range scopes { - db = scope(db) - } + db = db.executeScopes() } var ( @@ -93,6 +89,10 @@ func (p *processor) Execute(db *DB) *DB { resetBuildClauses = true } + if optimizer, ok := db.Statement.Dest.(StatementModifier); ok { + optimizer.ModifyStatement(stmt) + } + // assign model values if stmt.Model == nil { stmt.Model = stmt.Dest @@ -132,7 +132,11 @@ func (p *processor) Execute(db *DB) *DB { if stmt.SQL.Len() > 0 { db.Logger.Trace(stmt.Context, curTime, func() (string, int64) { - return db.Dialector.Explain(stmt.SQL.String(), stmt.Vars...), db.RowsAffected + sql, vars := stmt.SQL.String(), stmt.Vars + if filter, ok := db.Logger.(ParamsFilter); ok { + sql, vars = filter.ParamsFilter(stmt.Context, stmt.SQL.String(), stmt.Vars...) + } + return db.Dialector.Explain(sql, vars...), db.RowsAffected }, db.Error) } @@ -183,10 +187,18 @@ func (p *processor) Replace(name string, fn func(*DB)) error { func (p *processor) compile() (err error) { var callbacks []*callback + removedMap := map[string]bool{} for _, callback := range p.callbacks { if callback.match == nil || callback.match(p.db) { callbacks = append(callbacks, callback) } + if callback.remove { + removedMap[callback.name] = true + } + } + + if len(removedMap) > 0 { + callbacks = removeCallbacks(callbacks, removedMap) } p.callbacks = callbacks @@ -245,7 +257,7 @@ func sortCallbacks(cs []*callback) (fns []func(*DB), err error) { names, sorted []string sortCallback func(*callback) error ) - sort.Slice(cs, func(i, j int) bool { + sort.SliceStable(cs, func(i, j int) bool { if cs[j].before == "*" && cs[i].before != "*" { return true } @@ -335,3 +347,14 @@ func sortCallbacks(cs []*callback) (fns []func(*DB), err error) { return } + +func removeCallbacks(cs []*callback, nameMap map[string]bool) []*callback { + callbacks := make([]*callback, 0, len(cs)) + for _, callback := range cs { + if nameMap[callback.name] { + continue + } + callbacks = append(callbacks, callback) + } + return callbacks +} diff --git a/vendor/gorm.io/gorm/callbacks/associations.go b/vendor/gorm.io/gorm/callbacks/associations.go index 4a50e6c..f3cd464 100644 --- a/vendor/gorm.io/gorm/callbacks/associations.go +++ b/vendor/gorm.io/gorm/callbacks/associations.go @@ -51,25 +51,40 @@ func SaveBeforeAssociations(create bool) func(db *gorm.DB) { } elems := reflect.MakeSlice(reflect.SliceOf(fieldType), 0, 10) + distinctElems := reflect.MakeSlice(reflect.SliceOf(fieldType), 0, 10) + identityMap := map[string]bool{} for i := 0; i < rValLen; i++ { obj := db.Statement.ReflectValue.Index(i) if reflect.Indirect(obj).Kind() != reflect.Struct { break } - if _, zero := rel.Field.ValueOf(db.Statement.Context, obj); !zero { // check belongs to relation value rv := rel.Field.ReflectValueOf(db.Statement.Context, obj) // relation reflect value + if !isPtr { + rv = rv.Addr() + } objs = append(objs, obj) - if isPtr { - elems = reflect.Append(elems, rv) - } else { - elems = reflect.Append(elems, rv.Addr()) + elems = reflect.Append(elems, rv) + + relPrimaryValues := make([]interface{}, 0, len(rel.FieldSchema.PrimaryFields)) + for _, pf := range rel.FieldSchema.PrimaryFields { + if pfv, ok := pf.ValueOf(db.Statement.Context, rv); !ok { + relPrimaryValues = append(relPrimaryValues, pfv) + } + } + cacheKey := utils.ToStringKey(relPrimaryValues...) + if len(relPrimaryValues) != len(rel.FieldSchema.PrimaryFields) || !identityMap[cacheKey] { + if cacheKey != "" { // has primary fields + identityMap[cacheKey] = true + } + + distinctElems = reflect.Append(distinctElems, rv) } } } if elems.Len() > 0 { - if saveAssociations(db, rel, elems, selectColumns, restricted, nil) == nil { + if saveAssociations(db, rel, distinctElems, selectColumns, restricted, nil) == nil { for i := 0; i < elems.Len(); i++ { setupReferences(objs[i], elems.Index(i)) } @@ -206,9 +221,12 @@ func SaveAfterAssociations(create bool) func(db *gorm.DB) { } } - cacheKey := utils.ToStringKey(relPrimaryValues) + cacheKey := utils.ToStringKey(relPrimaryValues...) if len(relPrimaryValues) != len(rel.FieldSchema.PrimaryFields) || !identityMap[cacheKey] { - identityMap[cacheKey] = true + if cacheKey != "" { // has primary fields + identityMap[cacheKey] = true + } + if isPtr { elems = reflect.Append(elems, elem) } else { @@ -292,9 +310,12 @@ func SaveAfterAssociations(create bool) func(db *gorm.DB) { } } - cacheKey := utils.ToStringKey(relPrimaryValues) + cacheKey := utils.ToStringKey(relPrimaryValues...) if len(relPrimaryValues) != len(rel.FieldSchema.PrimaryFields) || !identityMap[cacheKey] { - identityMap[cacheKey] = true + if cacheKey != "" { // has primary fields + identityMap[cacheKey] = true + } + distinctElems = reflect.Append(distinctElems, elem) } diff --git a/vendor/gorm.io/gorm/callbacks/callmethod.go b/vendor/gorm.io/gorm/callbacks/callmethod.go index bcaa03f..fb90003 100644 --- a/vendor/gorm.io/gorm/callbacks/callmethod.go +++ b/vendor/gorm.io/gorm/callbacks/callmethod.go @@ -13,11 +13,20 @@ func callMethod(db *gorm.DB, fc func(value interface{}, tx *gorm.DB) bool) { case reflect.Slice, reflect.Array: db.Statement.CurDestIndex = 0 for i := 0; i < db.Statement.ReflectValue.Len(); i++ { - fc(reflect.Indirect(db.Statement.ReflectValue.Index(i)).Addr().Interface(), tx) + if value := reflect.Indirect(db.Statement.ReflectValue.Index(i)); value.CanAddr() { + fc(value.Addr().Interface(), tx) + } else { + db.AddError(gorm.ErrInvalidValue) + return + } db.Statement.CurDestIndex++ } case reflect.Struct: - fc(db.Statement.ReflectValue.Addr().Interface(), tx) + if db.Statement.ReflectValue.CanAddr() { + fc(db.Statement.ReflectValue.Addr().Interface(), tx) + } else { + db.AddError(gorm.ErrInvalidValue) + } } } } diff --git a/vendor/gorm.io/gorm/callbacks/create.go b/vendor/gorm.io/gorm/callbacks/create.go index 0fe1dc9..8b7846b 100644 --- a/vendor/gorm.io/gorm/callbacks/create.go +++ b/vendor/gorm.io/gorm/callbacks/create.go @@ -3,6 +3,7 @@ package callbacks import ( "fmt" "reflect" + "strings" "gorm.io/gorm" "gorm.io/gorm/clause" @@ -102,13 +103,62 @@ func Create(config *Config) func(db *gorm.DB) { } db.RowsAffected, _ = result.RowsAffected() - if db.RowsAffected != 0 && db.Statement.Schema != nil && - db.Statement.Schema.PrioritizedPrimaryField != nil && - db.Statement.Schema.PrioritizedPrimaryField.HasDefaultValue { - insertID, err := result.LastInsertId() - insertOk := err == nil && insertID > 0 - if !insertOk { + if db.RowsAffected == 0 { + return + } + + var ( + pkField *schema.Field + pkFieldName = "@id" + ) + + insertID, err := result.LastInsertId() + insertOk := err == nil && insertID > 0 + + if !insertOk { + if !supportReturning { db.AddError(err) + } + return + } + + if db.Statement.Schema != nil { + if db.Statement.Schema.PrioritizedPrimaryField == nil || !db.Statement.Schema.PrioritizedPrimaryField.HasDefaultValue { + return + } + pkField = db.Statement.Schema.PrioritizedPrimaryField + pkFieldName = db.Statement.Schema.PrioritizedPrimaryField.DBName + } + + // append @id column with value for auto-increment primary key + // the @id value is correct, when: 1. without setting auto-increment primary key, 2. database AutoIncrementIncrement = 1 + switch values := db.Statement.Dest.(type) { + case map[string]interface{}: + values[pkFieldName] = insertID + case *map[string]interface{}: + (*values)[pkFieldName] = insertID + case []map[string]interface{}, *[]map[string]interface{}: + mapValues, ok := values.([]map[string]interface{}) + if !ok { + if v, ok := values.(*[]map[string]interface{}); ok { + if *v != nil { + mapValues = *v + } + } + } + + if config.LastInsertIDReversed { + insertID -= int64(len(mapValues)-1) * schema.DefaultAutoIncrementIncrement + } + + for _, mapValue := range mapValues { + if mapValue != nil { + mapValue[pkFieldName] = insertID + } + insertID += schema.DefaultAutoIncrementIncrement + } + default: + if pkField == nil { return } @@ -121,10 +171,10 @@ func Create(config *Config) func(db *gorm.DB) { break } - _, isZero := db.Statement.Schema.PrioritizedPrimaryField.ValueOf(db.Statement.Context, rv) + _, isZero := pkField.ValueOf(db.Statement.Context, rv) if isZero { - db.AddError(db.Statement.Schema.PrioritizedPrimaryField.Set(db.Statement.Context, rv, insertID)) - insertID -= db.Statement.Schema.PrioritizedPrimaryField.AutoIncrementIncrement + db.AddError(pkField.Set(db.Statement.Context, rv, insertID)) + insertID -= pkField.AutoIncrementIncrement } } } else { @@ -134,16 +184,16 @@ func Create(config *Config) func(db *gorm.DB) { break } - if _, isZero := db.Statement.Schema.PrioritizedPrimaryField.ValueOf(db.Statement.Context, rv); isZero { - db.AddError(db.Statement.Schema.PrioritizedPrimaryField.Set(db.Statement.Context, rv, insertID)) - insertID += db.Statement.Schema.PrioritizedPrimaryField.AutoIncrementIncrement + if _, isZero := pkField.ValueOf(db.Statement.Context, rv); isZero { + db.AddError(pkField.Set(db.Statement.Context, rv, insertID)) + insertID += pkField.AutoIncrementIncrement } } } case reflect.Struct: - _, isZero := db.Statement.Schema.PrioritizedPrimaryField.ValueOf(db.Statement.Context, db.Statement.ReflectValue) + _, isZero := pkField.ValueOf(db.Statement.Context, db.Statement.ReflectValue) if isZero { - db.AddError(db.Statement.Schema.PrioritizedPrimaryField.Set(db.Statement.Context, db.Statement.ReflectValue, insertID)) + db.AddError(pkField.Set(db.Statement.Context, db.Statement.ReflectValue, insertID)) } } } @@ -252,13 +302,15 @@ func ConvertToCreateValues(stmt *gorm.Statement) (values clause.Values) { } } - for field, vs := range defaultValueFieldsHavingValue { - values.Columns = append(values.Columns, clause.Column{Name: field.DBName}) - for idx := range values.Values { - if vs[idx] == nil { - values.Values[idx] = append(values.Values[idx], stmt.Dialector.DefaultValueOf(field)) - } else { - values.Values[idx] = append(values.Values[idx], vs[idx]) + for _, field := range stmt.Schema.FieldsWithDefaultDBValue { + if vs, ok := defaultValueFieldsHavingValue[field]; ok { + values.Columns = append(values.Columns, clause.Column{Name: field.DBName}) + for idx := range values.Values { + if vs[idx] == nil { + values.Values[idx] = append(values.Values[idx], stmt.DefaultValueOf(field)) + } else { + values.Values[idx] = append(values.Values[idx], vs[idx]) + } } } } @@ -281,7 +333,7 @@ func ConvertToCreateValues(stmt *gorm.Statement) (values clause.Values) { } for _, field := range stmt.Schema.FieldsWithDefaultDBValue { - if v, ok := selectColumns[field.DBName]; (ok && v) || (!ok && !restricted) { + if v, ok := selectColumns[field.DBName]; (ok && v) || (!ok && !restricted) && field.DefaultValueInterface == nil { if rvOfvalue, isZero := field.ValueOf(stmt.Context, stmt.ReflectValue); !isZero { values.Columns = append(values.Columns, clause.Column{Name: field.DBName}) values.Values[0] = append(values.Values[0], rvOfvalue) @@ -302,14 +354,15 @@ func ConvertToCreateValues(stmt *gorm.Statement) (values clause.Values) { for _, column := range values.Columns { if field := stmt.Schema.LookUpField(column.Name); field != nil { if v, ok := selectColumns[field.DBName]; (ok && v) || (!ok && !restricted) { - if !field.PrimaryKey && (!field.HasDefaultValue || field.DefaultValueInterface != nil) && field.AutoCreateTime == 0 { + if !field.PrimaryKey && (!field.HasDefaultValue || field.DefaultValueInterface != nil || + strings.EqualFold(field.DefaultValue, "NULL")) && field.AutoCreateTime == 0 { if field.AutoUpdateTime > 0 { assignment := clause.Assignment{Column: clause.Column{Name: field.DBName}, Value: curTime} switch field.AutoUpdateTime { case schema.UnixNanosecond: assignment.Value = curTime.UnixNano() case schema.UnixMillisecond: - assignment.Value = curTime.UnixNano() / 1e6 + assignment.Value = curTime.UnixMilli() case schema.UnixSecond: assignment.Value = curTime.Unix() } diff --git a/vendor/gorm.io/gorm/callbacks/preload.go b/vendor/gorm.io/gorm/callbacks/preload.go index ea2570b..fd8214b 100644 --- a/vendor/gorm.io/gorm/callbacks/preload.go +++ b/vendor/gorm.io/gorm/callbacks/preload.go @@ -3,6 +3,8 @@ package callbacks import ( "fmt" "reflect" + "sort" + "strings" "gorm.io/gorm" "gorm.io/gorm/clause" @@ -10,6 +12,176 @@ import ( "gorm.io/gorm/utils" ) +// parsePreloadMap extracts nested preloads. e.g. +// +// // schema has a "k0" relation and a "k7.k8" embedded relation +// parsePreloadMap(schema, map[string][]interface{}{ +// clause.Associations: {"arg1"}, +// "k1": {"arg2"}, +// "k2.k3": {"arg3"}, +// "k4.k5.k6": {"arg4"}, +// }) +// // preloadMap is +// map[string]map[string][]interface{}{ +// "k0": {}, +// "k7": { +// "k8": {}, +// }, +// "k1": {}, +// "k2": { +// "k3": {"arg3"}, +// }, +// "k4": { +// "k5.k6": {"arg4"}, +// }, +// } +func parsePreloadMap(s *schema.Schema, preloads map[string][]interface{}) map[string]map[string][]interface{} { + preloadMap := map[string]map[string][]interface{}{} + setPreloadMap := func(name, value string, args []interface{}) { + if _, ok := preloadMap[name]; !ok { + preloadMap[name] = map[string][]interface{}{} + } + if value != "" { + preloadMap[name][value] = args + } + } + + for name, args := range preloads { + preloadFields := strings.Split(name, ".") + value := strings.TrimPrefix(strings.TrimPrefix(name, preloadFields[0]), ".") + if preloadFields[0] == clause.Associations { + for _, relation := range s.Relationships.Relations { + if relation.Schema == s { + setPreloadMap(relation.Name, value, args) + } + } + + for embedded, embeddedRelations := range s.Relationships.EmbeddedRelations { + for _, value := range embeddedValues(embeddedRelations) { + setPreloadMap(embedded, value, args) + } + } + } else { + setPreloadMap(preloadFields[0], value, args) + } + } + return preloadMap +} + +func embeddedValues(embeddedRelations *schema.Relationships) []string { + if embeddedRelations == nil { + return nil + } + names := make([]string, 0, len(embeddedRelations.Relations)+len(embeddedRelations.EmbeddedRelations)) + for _, relation := range embeddedRelations.Relations { + // skip first struct name + names = append(names, strings.Join(relation.Field.EmbeddedBindNames[1:], ".")) + } + for _, relations := range embeddedRelations.EmbeddedRelations { + names = append(names, embeddedValues(relations)...) + } + return names +} + +// preloadEntryPoint enters layer by layer. It will call real preload if it finds the right entry point. +// If the current relationship is embedded or joined, current query will be ignored. +// +//nolint:cyclop +func preloadEntryPoint(db *gorm.DB, joins []string, relationships *schema.Relationships, preloads map[string][]interface{}, associationsConds []interface{}) error { + preloadMap := parsePreloadMap(db.Statement.Schema, preloads) + + // avoid random traversal of the map + preloadNames := make([]string, 0, len(preloadMap)) + for key := range preloadMap { + preloadNames = append(preloadNames, key) + } + sort.Strings(preloadNames) + + isJoined := func(name string) (joined bool, nestedJoins []string) { + for _, join := range joins { + if _, ok := relationships.Relations[join]; ok && name == join { + joined = true + continue + } + joinNames := strings.SplitN(join, ".", 2) + if len(joinNames) == 2 { + if _, ok := relationships.Relations[joinNames[0]]; ok && name == joinNames[0] { + joined = true + nestedJoins = append(nestedJoins, joinNames[1]) + } + } + } + return joined, nestedJoins + } + + for _, name := range preloadNames { + if relations := relationships.EmbeddedRelations[name]; relations != nil { + if err := preloadEntryPoint(db, joins, relations, preloadMap[name], associationsConds); err != nil { + return err + } + } else if rel := relationships.Relations[name]; rel != nil { + if joined, nestedJoins := isJoined(name); joined { + switch rv := db.Statement.ReflectValue; rv.Kind() { + case reflect.Slice, reflect.Array: + if rv.Len() > 0 { + reflectValue := rel.FieldSchema.MakeSlice().Elem() + for i := 0; i < rv.Len(); i++ { + frv := rel.Field.ReflectValueOf(db.Statement.Context, rv.Index(i)) + if frv.Kind() != reflect.Ptr { + reflectValue = reflect.Append(reflectValue, frv.Addr()) + } else { + if frv.IsNil() { + continue + } + reflectValue = reflect.Append(reflectValue, frv) + } + } + + tx := preloadDB(db, reflectValue, reflectValue.Interface()) + if err := preloadEntryPoint(tx, nestedJoins, &tx.Statement.Schema.Relationships, preloadMap[name], associationsConds); err != nil { + return err + } + } + case reflect.Struct, reflect.Pointer: + reflectValue := rel.Field.ReflectValueOf(db.Statement.Context, rv) + tx := preloadDB(db, reflectValue, reflectValue.Interface()) + if err := preloadEntryPoint(tx, nestedJoins, &tx.Statement.Schema.Relationships, preloadMap[name], associationsConds); err != nil { + return err + } + default: + return gorm.ErrInvalidData + } + } else { + tx := db.Table("").Session(&gorm.Session{Context: db.Statement.Context, SkipHooks: db.Statement.SkipHooks}) + tx.Statement.ReflectValue = db.Statement.ReflectValue + tx.Statement.Unscoped = db.Statement.Unscoped + if err := preload(tx, rel, append(preloads[name], associationsConds...), preloadMap[name]); err != nil { + return err + } + } + } else { + return fmt.Errorf("%s: %w for schema %s", name, gorm.ErrUnsupportedRelation, db.Statement.Schema.Name) + } + } + return nil +} + +func preloadDB(db *gorm.DB, reflectValue reflect.Value, dest interface{}) *gorm.DB { + tx := db.Session(&gorm.Session{Context: db.Statement.Context, NewDB: true, SkipHooks: db.Statement.SkipHooks, Initialized: true}) + db.Statement.Settings.Range(func(k, v interface{}) bool { + tx.Statement.Settings.Store(k, v) + return true + }) + + if err := tx.Statement.Parse(dest); err != nil { + tx.AddError(err) + return tx + } + tx.Statement.ReflectValue = reflectValue + tx.Statement.Unscoped = db.Statement.Unscoped + return tx +} + func preload(tx *gorm.DB, rel *schema.Relationship, conds []interface{}, preloads map[string][]interface{}) error { var ( reflectValue = tx.Statement.ReflectValue diff --git a/vendor/gorm.io/gorm/callbacks/query.go b/vendor/gorm.io/gorm/callbacks/query.go index 26ee8c3..bbf238a 100644 --- a/vendor/gorm.io/gorm/callbacks/query.go +++ b/vendor/gorm.io/gorm/callbacks/query.go @@ -3,11 +3,12 @@ package callbacks import ( "fmt" "reflect" - "sort" "strings" "gorm.io/gorm" "gorm.io/gorm/clause" + "gorm.io/gorm/schema" + "gorm.io/gorm/utils" ) func Query(db *gorm.DB) { @@ -109,78 +110,141 @@ func BuildQuerySQL(db *gorm.DB) { } } + specifiedRelationsName := make(map[string]interface{}) for _, join := range db.Statement.Joins { - if db.Statement.Schema == nil { - fromClause.Joins = append(fromClause.Joins, clause.Join{ - Expression: clause.NamedExpr{SQL: join.Name, Vars: join.Conds}, - }) - } else if relation, ok := db.Statement.Schema.Relationships.Relations[join.Name]; ok { - tableAliasName := relation.Name - - for _, s := range relation.FieldSchema.DBNames { - clauseSelect.Columns = append(clauseSelect.Columns, clause.Column{ - Table: tableAliasName, - Name: s, - Alias: tableAliasName + "__" + s, - }) + if db.Statement.Schema != nil { + var isRelations bool // is relations or raw sql + var relations []*schema.Relationship + relation, ok := db.Statement.Schema.Relationships.Relations[join.Name] + if ok { + isRelations = true + relations = append(relations, relation) + } else { + // handle nested join like "Manager.Company" + nestedJoinNames := strings.Split(join.Name, ".") + if len(nestedJoinNames) > 1 { + isNestedJoin := true + gussNestedRelations := make([]*schema.Relationship, 0, len(nestedJoinNames)) + currentRelations := db.Statement.Schema.Relationships.Relations + for _, relname := range nestedJoinNames { + // incomplete match, only treated as raw sql + if relation, ok = currentRelations[relname]; ok { + gussNestedRelations = append(gussNestedRelations, relation) + currentRelations = relation.FieldSchema.Relationships.Relations + } else { + isNestedJoin = false + break + } + } + + if isNestedJoin { + isRelations = true + relations = gussNestedRelations + } + } } - exprs := make([]clause.Expression, len(relation.References)) - for idx, ref := range relation.References { - if ref.OwnPrimaryKey { - exprs[idx] = clause.Eq{ - Column: clause.Column{Table: clause.CurrentTable, Name: ref.PrimaryKey.DBName}, - Value: clause.Column{Table: tableAliasName, Name: ref.ForeignKey.DBName}, + if isRelations { + genJoinClause := func(joinType clause.JoinType, parentTableName string, relation *schema.Relationship) clause.Join { + tableAliasName := relation.Name + if parentTableName != clause.CurrentTable { + tableAliasName = utils.NestedRelationName(parentTableName, tableAliasName) } - } else { - if ref.PrimaryValue == "" { - exprs[idx] = clause.Eq{ - Column: clause.Column{Table: clause.CurrentTable, Name: ref.ForeignKey.DBName}, - Value: clause.Column{Table: tableAliasName, Name: ref.PrimaryKey.DBName}, + + columnStmt := gorm.Statement{ + Table: tableAliasName, DB: db, Schema: relation.FieldSchema, + Selects: join.Selects, Omits: join.Omits, + } + + selectColumns, restricted := columnStmt.SelectAndOmitColumns(false, false) + for _, s := range relation.FieldSchema.DBNames { + if v, ok := selectColumns[s]; (ok && v) || (!ok && !restricted) { + clauseSelect.Columns = append(clauseSelect.Columns, clause.Column{ + Table: tableAliasName, + Name: s, + Alias: utils.NestedRelationName(tableAliasName, s), + }) } - } else { - exprs[idx] = clause.Eq{ - Column: clause.Column{Table: tableAliasName, Name: ref.ForeignKey.DBName}, - Value: ref.PrimaryValue, + } + + exprs := make([]clause.Expression, len(relation.References)) + for idx, ref := range relation.References { + if ref.OwnPrimaryKey { + exprs[idx] = clause.Eq{ + Column: clause.Column{Table: parentTableName, Name: ref.PrimaryKey.DBName}, + Value: clause.Column{Table: tableAliasName, Name: ref.ForeignKey.DBName}, + } + } else { + if ref.PrimaryValue == "" { + exprs[idx] = clause.Eq{ + Column: clause.Column{Table: parentTableName, Name: ref.ForeignKey.DBName}, + Value: clause.Column{Table: tableAliasName, Name: ref.PrimaryKey.DBName}, + } + } else { + exprs[idx] = clause.Eq{ + Column: clause.Column{Table: tableAliasName, Name: ref.ForeignKey.DBName}, + Value: ref.PrimaryValue, + } + } } } - } - } - { - onStmt := gorm.Statement{Table: tableAliasName, DB: db, Clauses: map[string]clause.Clause{}} - for _, c := range relation.FieldSchema.QueryClauses { - onStmt.AddClause(c) - } + { + onStmt := gorm.Statement{Table: tableAliasName, DB: db, Clauses: map[string]clause.Clause{}} + for _, c := range relation.FieldSchema.QueryClauses { + onStmt.AddClause(c) + } - if join.On != nil { - onStmt.AddClause(join.On) - } + if join.On != nil { + onStmt.AddClause(join.On) + } - if cs, ok := onStmt.Clauses["WHERE"]; ok { - if where, ok := cs.Expression.(clause.Where); ok { - where.Build(&onStmt) - - if onSQL := onStmt.SQL.String(); onSQL != "" { - vars := onStmt.Vars - for idx, v := range vars { - bindvar := strings.Builder{} - onStmt.Vars = vars[0 : idx+1] - db.Dialector.BindVarTo(&bindvar, &onStmt, v) - onSQL = strings.Replace(onSQL, bindvar.String(), "?", 1) + if cs, ok := onStmt.Clauses["WHERE"]; ok { + if where, ok := cs.Expression.(clause.Where); ok { + where.Build(&onStmt) + + if onSQL := onStmt.SQL.String(); onSQL != "" { + vars := onStmt.Vars + for idx, v := range vars { + bindvar := strings.Builder{} + onStmt.Vars = vars[0 : idx+1] + db.Dialector.BindVarTo(&bindvar, &onStmt, v) + onSQL = strings.Replace(onSQL, bindvar.String(), "?", 1) + } + + exprs = append(exprs, clause.Expr{SQL: onSQL, Vars: vars}) + } } - - exprs = append(exprs, clause.Expr{SQL: onSQL, Vars: vars}) } } + + return clause.Join{ + Type: joinType, + Table: clause.Table{Name: relation.FieldSchema.Table, Alias: tableAliasName}, + ON: clause.Where{Exprs: exprs}, + } } - } - fromClause.Joins = append(fromClause.Joins, clause.Join{ - Type: clause.LeftJoin, - Table: clause.Table{Name: relation.FieldSchema.Table, Alias: tableAliasName}, - ON: clause.Where{Exprs: exprs}, - }) + parentTableName := clause.CurrentTable + for _, rel := range relations { + // joins table alias like "Manager, Company, Manager__Company" + nestedAlias := utils.NestedRelationName(parentTableName, rel.Name) + if _, ok := specifiedRelationsName[nestedAlias]; !ok { + fromClause.Joins = append(fromClause.Joins, genJoinClause(join.JoinType, parentTableName, rel)) + specifiedRelationsName[nestedAlias] = nil + } + + if parentTableName != clause.CurrentTable { + parentTableName = utils.NestedRelationName(parentTableName, rel.Name) + } else { + parentTableName = rel.Name + } + } + } else { + fromClause.Joins = append(fromClause.Joins, clause.Join{ + Expression: clause.NamedExpr{SQL: join.Name, Vars: join.Conds}, + }) + } } else { fromClause.Joins = append(fromClause.Joins, clause.Join{ Expression: clause.NamedExpr{SQL: join.Name, Vars: join.Conds}, @@ -189,7 +253,6 @@ func BuildQuerySQL(db *gorm.DB) { } db.Statement.AddClause(fromClause) - db.Statement.Joins = nil } else { db.Statement.AddClauseIfNotExists(clause.From{}) } @@ -207,60 +270,27 @@ func Preload(db *gorm.DB) { return } - preloadMap := map[string]map[string][]interface{}{} - for name := range db.Statement.Preloads { - preloadFields := strings.Split(name, ".") - if preloadFields[0] == clause.Associations { - for _, rel := range db.Statement.Schema.Relationships.Relations { - if rel.Schema == db.Statement.Schema { - if _, ok := preloadMap[rel.Name]; !ok { - preloadMap[rel.Name] = map[string][]interface{}{} - } - - if value := strings.TrimPrefix(strings.TrimPrefix(name, preloadFields[0]), "."); value != "" { - preloadMap[rel.Name][value] = db.Statement.Preloads[name] - } - } - } - } else { - if _, ok := preloadMap[preloadFields[0]]; !ok { - preloadMap[preloadFields[0]] = map[string][]interface{}{} - } - - if value := strings.TrimPrefix(strings.TrimPrefix(name, preloadFields[0]), "."); value != "" { - preloadMap[preloadFields[0]][value] = db.Statement.Preloads[name] - } - } - } - - preloadNames := make([]string, 0, len(preloadMap)) - for key := range preloadMap { - preloadNames = append(preloadNames, key) + joins := make([]string, 0, len(db.Statement.Joins)) + for _, join := range db.Statement.Joins { + joins = append(joins, join.Name) } - sort.Strings(preloadNames) - - preloadDB := db.Session(&gorm.Session{Context: db.Statement.Context, NewDB: true, SkipHooks: db.Statement.SkipHooks, Initialized: true}) - db.Statement.Settings.Range(func(k, v interface{}) bool { - preloadDB.Statement.Settings.Store(k, v) - return true - }) - if err := preloadDB.Statement.Parse(db.Statement.Dest); err != nil { + tx := preloadDB(db, db.Statement.ReflectValue, db.Statement.Dest) + if tx.Error != nil { return } - preloadDB.Statement.ReflectValue = db.Statement.ReflectValue - for _, name := range preloadNames { - if rel := preloadDB.Statement.Schema.Relationships.Relations[name]; rel != nil { - db.AddError(preload(preloadDB.Table("").Session(&gorm.Session{Context: db.Statement.Context, SkipHooks: db.Statement.SkipHooks}), rel, append(db.Statement.Preloads[name], db.Statement.Preloads[clause.Associations]...), preloadMap[name])) - } else { - db.AddError(fmt.Errorf("%s: %w for schema %s", name, gorm.ErrUnsupportedRelation, db.Statement.Schema.Name)) - } - } + db.AddError(preloadEntryPoint(tx, joins, &tx.Statement.Schema.Relationships, db.Statement.Preloads, db.Statement.Preloads[clause.Associations])) } } func AfterQuery(db *gorm.DB) { + // clear the joins after query because preload need it + if v, ok := db.Statement.Clauses["FROM"].Expression.(clause.From); ok { + fromClause := db.Statement.Clauses["FROM"] + fromClause.Expression = clause.From{Tables: v.Tables, Joins: utils.RTrimSlice(v.Joins, len(db.Statement.Joins))} // keep the original From Joins + db.Statement.Clauses["FROM"] = fromClause + } if db.Error == nil && db.Statement.Schema != nil && !db.Statement.SkipHooks && db.Statement.Schema.AfterFind && db.RowsAffected > 0 { callMethod(db, func(value interface{}, tx *gorm.DB) bool { if i, ok := value.(AfterFindInterface); ok { diff --git a/vendor/gorm.io/gorm/callbacks/row.go b/vendor/gorm.io/gorm/callbacks/row.go index 56be742..beaa189 100644 --- a/vendor/gorm.io/gorm/callbacks/row.go +++ b/vendor/gorm.io/gorm/callbacks/row.go @@ -7,7 +7,7 @@ import ( func RowQuery(db *gorm.DB) { if db.Error == nil { BuildQuerySQL(db) - if db.DryRun { + if db.DryRun || db.Error != nil { return } diff --git a/vendor/gorm.io/gorm/callbacks/update.go b/vendor/gorm.io/gorm/callbacks/update.go index 42ffe2f..7cde7f6 100644 --- a/vendor/gorm.io/gorm/callbacks/update.go +++ b/vendor/gorm.io/gorm/callbacks/update.go @@ -70,10 +70,13 @@ func Update(config *Config) func(db *gorm.DB) { if db.Statement.SQL.Len() == 0 { db.Statement.SQL.Grow(180) db.Statement.AddClauseIfNotExists(clause.Update{}) - if set := ConvertToAssignments(db.Statement); len(set) != 0 { - db.Statement.AddClause(set) - } else if _, ok := db.Statement.Clauses["SET"]; !ok { - return + if _, ok := db.Statement.Clauses["SET"]; !ok { + if set := ConvertToAssignments(db.Statement); len(set) != 0 { + defer delete(db.Statement.Clauses, "SET") + db.Statement.AddClause(set) + } else { + return + } } db.Statement.Build(db.Statement.BuildClauses...) @@ -135,7 +138,9 @@ func ConvertToAssignments(stmt *gorm.Statement) (set clause.Set) { case reflect.Slice, reflect.Array: assignValue = func(field *schema.Field, value interface{}) { for i := 0; i < stmt.ReflectValue.Len(); i++ { - field.Set(stmt.Context, stmt.ReflectValue.Index(i), value) + if stmt.ReflectValue.CanAddr() { + field.Set(stmt.Context, stmt.ReflectValue.Index(i), value) + } } } case reflect.Struct: @@ -158,21 +163,21 @@ func ConvertToAssignments(stmt *gorm.Statement) (set clause.Set) { switch stmt.ReflectValue.Kind() { case reflect.Slice, reflect.Array: if size := stmt.ReflectValue.Len(); size > 0 { - var primaryKeyExprs []clause.Expression + var isZero bool for i := 0; i < size; i++ { - exprs := make([]clause.Expression, len(stmt.Schema.PrimaryFields)) - var notZero bool - for idx, field := range stmt.Schema.PrimaryFields { - value, isZero := field.ValueOf(stmt.Context, stmt.ReflectValue.Index(i)) - exprs[idx] = clause.Eq{Column: field.DBName, Value: value} - notZero = notZero || !isZero - } - if notZero { - primaryKeyExprs = append(primaryKeyExprs, clause.And(exprs...)) + for _, field := range stmt.Schema.PrimaryFields { + _, isZero = field.ValueOf(stmt.Context, stmt.ReflectValue.Index(i)) + if !isZero { + break + } } } - stmt.AddClause(clause.Where{Exprs: []clause.Expression{clause.And(clause.Or(primaryKeyExprs...))}}) + if !isZero { + _, primaryValues := schema.GetIdentityFieldValuesMap(stmt.Context, stmt.ReflectValue, stmt.Schema.PrimaryFields) + column, values := schema.ToQueryValues("", stmt.Schema.PrimaryFieldDBNames, primaryValues) + stmt.AddClause(clause.Where{Exprs: []clause.Expression{clause.IN{Column: column, Values: values}}}) + } } case reflect.Struct: for _, field := range stmt.Schema.PrimaryFields { @@ -229,7 +234,7 @@ func ConvertToAssignments(stmt *gorm.Statement) (set clause.Set) { if field.AutoUpdateTime == schema.UnixNanosecond { set = append(set, clause.Assignment{Column: clause.Column{Name: field.DBName}, Value: now.UnixNano()}) } else if field.AutoUpdateTime == schema.UnixMillisecond { - set = append(set, clause.Assignment{Column: clause.Column{Name: field.DBName}, Value: now.UnixNano() / 1e6}) + set = append(set, clause.Assignment{Column: clause.Column{Name: field.DBName}, Value: now.UnixMilli()}) } else if field.AutoUpdateTime == schema.UnixSecond { set = append(set, clause.Assignment{Column: clause.Column{Name: field.DBName}, Value: now.Unix()}) } else { @@ -241,11 +246,13 @@ func ConvertToAssignments(stmt *gorm.Statement) (set clause.Set) { } default: updatingSchema := stmt.Schema + var isDiffSchema bool if !updatingValue.CanAddr() || stmt.Dest != stmt.Model { // different schema updatingStmt := &gorm.Statement{DB: stmt.DB} if err := updatingStmt.Parse(stmt.Dest); err == nil { updatingSchema = updatingStmt.Schema + isDiffSchema = true } } @@ -261,7 +268,7 @@ func ConvertToAssignments(stmt *gorm.Statement) (set clause.Set) { if field.AutoUpdateTime == schema.UnixNanosecond { value = stmt.DB.NowFunc().UnixNano() } else if field.AutoUpdateTime == schema.UnixMillisecond { - value = stmt.DB.NowFunc().UnixNano() / 1e6 + value = stmt.DB.NowFunc().UnixMilli() } else if field.AutoUpdateTime == schema.UnixSecond { value = stmt.DB.NowFunc().Unix() } else { @@ -272,7 +279,13 @@ func ConvertToAssignments(stmt *gorm.Statement) (set clause.Set) { if (ok || !isZero) && field.Updatable { set = append(set, clause.Assignment{Column: clause.Column{Name: field.DBName}, Value: value}) - assignValue(field, value) + assignField := field + if isDiffSchema { + if originField := stmt.Schema.LookUpField(dbName); originField != nil { + assignField = originField + } + } + assignValue(assignField, value) } } } else { diff --git a/vendor/gorm.io/gorm/chainable_api.go b/vendor/gorm.io/gorm/chainable_api.go index 68b4d1a..8953413 100644 --- a/vendor/gorm.io/gorm/chainable_api.go +++ b/vendor/gorm.io/gorm/chainable_api.go @@ -10,10 +10,11 @@ import ( ) // Model specify the model you would like to run db operations -// // update all users's name to `hello` -// db.Model(&User{}).Update("name", "hello") -// // if user's primary key is non-blank, will use it as condition, then will only update the user's name to `hello` -// db.Model(&user).Update("name", "hello") +// +// // update all users's name to `hello` +// db.Model(&User{}).Update("name", "hello") +// // if user's primary key is non-blank, will use it as condition, then will only update that user's name to `hello` +// db.Model(&user).Update("name", "hello") func (db *DB) Model(value interface{}) (tx *DB) { tx = db.getInstance() tx.Statement.Model = value @@ -21,6 +22,19 @@ func (db *DB) Model(value interface{}) (tx *DB) { } // Clauses Add clauses +// +// This supports both standard clauses (clause.OrderBy, clause.Limit, clause.Where) and more +// advanced techniques like specifying lock strength and optimizer hints. See the +// [docs] for more depth. +// +// // add a simple limit clause +// db.Clauses(clause.Limit{Limit: 1}).Find(&User{}) +// // tell the optimizer to use the `idx_user_name` index +// db.Clauses(hints.UseIndex("idx_user_name")).Find(&User{}) +// // specify the lock strength to UPDATE +// db.Clauses(clause.Locking{Strength: "UPDATE"}).Find(&users) +// +// [docs]: https://gorm.io/docs/sql_builder.html#Clauses func (db *DB) Clauses(conds ...clause.Expression) (tx *DB) { tx = db.getInstance() var whereConds []interface{} @@ -41,15 +55,22 @@ func (db *DB) Clauses(conds ...clause.Expression) (tx *DB) { return } -var tableRegexp = regexp.MustCompile(`(?i).+? AS (\w+)\s*(?:$|,)`) +var tableRegexp = regexp.MustCompile(`(?i)(?:.+? AS (\w+)\s*(?:$|,)|^\w+\s+(\w+)$)`) // Table specify the table you would like to run db operations +// +// // Get a user +// db.Table("users").Take(&result) func (db *DB) Table(name string, args ...interface{}) (tx *DB) { tx = db.getInstance() if strings.Contains(name, " ") || strings.Contains(name, "`") || len(args) > 0 { tx.Statement.TableExpr = &clause.Expr{SQL: name, Vars: args} - if results := tableRegexp.FindStringSubmatch(name); len(results) == 2 { - tx.Statement.Table = results[1] + if results := tableRegexp.FindStringSubmatch(name); len(results) == 3 { + if results[1] != "" { + tx.Statement.Table = results[1] + } else { + tx.Statement.Table = results[2] + } } } else if tables := strings.Split(name, "."); len(tables) == 2 { tx.Statement.TableExpr = &clause.Expr{SQL: tx.Statement.Quote(name)} @@ -65,6 +86,11 @@ func (db *DB) Table(name string, args ...interface{}) (tx *DB) { } // Distinct specify distinct fields that you want querying +// +// // Select distinct names of users +// db.Distinct("name").Find(&results) +// // Select distinct name/age pairs from users +// db.Distinct("name", "age").Find(&results) func (db *DB) Distinct(args ...interface{}) (tx *DB) { tx = db.getInstance() tx.Statement.Distinct = true @@ -75,6 +101,14 @@ func (db *DB) Distinct(args ...interface{}) (tx *DB) { } // Select specify fields that you want when querying, creating, updating +// +// Use Select when you only want a subset of the fields. By default, GORM will select all fields. +// Select accepts both string arguments and arrays. +// +// // Select name and age of user using multiple arguments +// db.Select("name", "age").Find(&users) +// // Select name and age of user using an array +// db.Select([]string{"name", "age"}).Find(&users) func (db *DB) Select(query interface{}, args ...interface{}) (tx *DB) { tx = db.getInstance() @@ -151,7 +185,25 @@ func (db *DB) Omit(columns ...string) (tx *DB) { return } +// MapColumns modify the column names in the query results to facilitate align to the corresponding structural fields +func (db *DB) MapColumns(m map[string]string) (tx *DB) { + tx = db.getInstance() + tx.Statement.ColumnMapping = m + return +} + // Where add conditions +// +// See the [docs] for details on the various formats that where clauses can take. By default, where clauses chain with AND. +// +// // Find the first user with name jinzhu +// db.Where("name = ?", "jinzhu").First(&user) +// // Find the first user with name jinzhu and age 20 +// db.Where(&User{Name: "jinzhu", Age: 20}).First(&user) +// // Find the first user with name jinzhu and age not equal to 20 +// db.Where("name = ?", "jinzhu").Where("age <> ?", "20").First(&user) +// +// [docs]: https://gorm.io/docs/query.html#Conditions func (db *DB) Where(query interface{}, args ...interface{}) (tx *DB) { tx = db.getInstance() if conds := tx.Statement.BuildCondition(query, args...); len(conds) > 0 { @@ -161,6 +213,11 @@ func (db *DB) Where(query interface{}, args ...interface{}) (tx *DB) { } // Not add NOT conditions +// +// Not works similarly to where, and has the same syntax. +// +// // Find the first user with name not equal to jinzhu +// db.Not("name = ?", "jinzhu").First(&user) func (db *DB) Not(query interface{}, args ...interface{}) (tx *DB) { tx = db.getInstance() if conds := tx.Statement.BuildCondition(query, args...); len(conds) > 0 { @@ -170,6 +227,11 @@ func (db *DB) Not(query interface{}, args ...interface{}) (tx *DB) { } // Or add OR conditions +// +// Or is used to chain together queries with an OR. +// +// // Find the first user with name equal to jinzhu or john +// db.Where("name = ?", "jinzhu").Or("name = ?", "john").First(&user) func (db *DB) Or(query interface{}, args ...interface{}) (tx *DB) { tx = db.getInstance() if conds := tx.Statement.BuildCondition(query, args...); len(conds) > 0 { @@ -179,26 +241,45 @@ func (db *DB) Or(query interface{}, args ...interface{}) (tx *DB) { } // Joins specify Joins conditions -// db.Joins("Account").Find(&user) -// db.Joins("JOIN emails ON emails.user_id = users.id AND emails.email = ?", "jinzhu@example.org").Find(&user) -// db.Joins("Account", DB.Select("id").Where("user_id = users.id AND name = ?", "someName").Model(&Account{})) +// +// db.Joins("Account").Find(&user) +// db.Joins("JOIN emails ON emails.user_id = users.id AND emails.email = ?", "jinzhu@example.org").Find(&user) +// db.Joins("Account", DB.Select("id").Where("user_id = users.id AND name = ?", "someName").Model(&Account{})) func (db *DB) Joins(query string, args ...interface{}) (tx *DB) { + return joins(db, clause.LeftJoin, query, args...) +} + +// InnerJoins specify inner joins conditions +// db.InnerJoins("Account").Find(&user) +func (db *DB) InnerJoins(query string, args ...interface{}) (tx *DB) { + return joins(db, clause.InnerJoin, query, args...) +} + +func joins(db *DB, joinType clause.JoinType, query string, args ...interface{}) (tx *DB) { tx = db.getInstance() if len(args) == 1 { if db, ok := args[0].(*DB); ok { + j := join{ + Name: query, Conds: args, Selects: db.Statement.Selects, + Omits: db.Statement.Omits, JoinType: joinType, + } if where, ok := db.Statement.Clauses["WHERE"].Expression.(clause.Where); ok { - tx.Statement.Joins = append(tx.Statement.Joins, join{Name: query, Conds: args, On: &where}) - return + j.On = &where } + tx.Statement.Joins = append(tx.Statement.Joins, j) + return } } - tx.Statement.Joins = append(tx.Statement.Joins, join{Name: query, Conds: args}) + tx.Statement.Joins = append(tx.Statement.Joins, join{Name: query, Conds: args, JoinType: joinType}) return } // Group specify the group method on the find +// +// // Select the sum age of users with given names +// db.Model(&User{}).Select("name, sum(age) as total").Group("name").Find(&results) func (db *DB) Group(name string) (tx *DB) { tx = db.getInstance() @@ -210,6 +291,9 @@ func (db *DB) Group(name string) (tx *DB) { } // Having specify HAVING conditions for GROUP BY +// +// // Select the sum age of users with name jinzhu +// db.Model(&User{}).Select("name, sum(age) as total").Group("name").Having("name = ?", "jinzhu").Find(&result) func (db *DB) Having(query interface{}, args ...interface{}) (tx *DB) { tx = db.getInstance() tx.Statement.AddClause(clause.GroupBy{ @@ -218,13 +302,20 @@ func (db *DB) Having(query interface{}, args ...interface{}) (tx *DB) { return } -// Order specify order when retrieve records from database -// db.Order("name DESC") -// db.Order(clause.OrderByColumn{Column: clause.Column{Name: "name"}, Desc: true}) +// Order specify order when retrieving records from database +// +// db.Order("name DESC") +// db.Order(clause.OrderByColumn{Column: clause.Column{Name: "name"}, Desc: true}) +// db.Order(clause.OrderBy{Columns: []clause.OrderByColumn{ +// {Column: clause.Column{Name: "name"}, Desc: true}, +// {Column: clause.Column{Name: "age"}, Desc: true}, +// }}) func (db *DB) Order(value interface{}) (tx *DB) { tx = db.getInstance() switch v := value.(type) { + case clause.OrderBy: + tx.Statement.AddClause(v) case clause.OrderByColumn: tx.Statement.AddClause(clause.OrderBy{ Columns: []clause.OrderByColumn{v}, @@ -242,13 +333,27 @@ func (db *DB) Order(value interface{}) (tx *DB) { } // Limit specify the number of records to be retrieved +// +// Limit conditions can be cancelled by using `Limit(-1)`. +// +// // retrieve 3 users +// db.Limit(3).Find(&users) +// // retrieve 3 users into users1, and all users into users2 +// db.Limit(3).Find(&users1).Limit(-1).Find(&users2) func (db *DB) Limit(limit int) (tx *DB) { tx = db.getInstance() - tx.Statement.AddClause(clause.Limit{Limit: limit}) + tx.Statement.AddClause(clause.Limit{Limit: &limit}) return } // Offset specify the number of records to skip before starting to return the records +// +// Offset conditions can be cancelled by using `Offset(-1)`. +// +// // select the third user +// db.Offset(2).First(&user) +// // select the first user by cancelling an earlier chained offset +// db.Offset(5).Offset(-1).First(&user) func (db *DB) Offset(offset int) (tx *DB) { tx = db.getInstance() tx.Statement.AddClause(clause.Limit{Offset: offset}) @@ -256,25 +361,37 @@ func (db *DB) Offset(offset int) (tx *DB) { } // Scopes pass current database connection to arguments `func(DB) DB`, which could be used to add conditions dynamically -// func AmountGreaterThan1000(db *gorm.DB) *gorm.DB { -// return db.Where("amount > ?", 1000) -// } // -// func OrderStatus(status []string) func (db *gorm.DB) *gorm.DB { -// return func (db *gorm.DB) *gorm.DB { -// return db.Scopes(AmountGreaterThan1000).Where("status in (?)", status) -// } -// } +// func AmountGreaterThan1000(db *gorm.DB) *gorm.DB { +// return db.Where("amount > ?", 1000) +// } +// +// func OrderStatus(status []string) func (db *gorm.DB) *gorm.DB { +// return func (db *gorm.DB) *gorm.DB { +// return db.Scopes(AmountGreaterThan1000).Where("status in (?)", status) +// } +// } // -// db.Scopes(AmountGreaterThan1000, OrderStatus([]string{"paid", "shipped"})).Find(&orders) +// db.Scopes(AmountGreaterThan1000, OrderStatus([]string{"paid", "shipped"})).Find(&orders) func (db *DB) Scopes(funcs ...func(*DB) *DB) (tx *DB) { tx = db.getInstance() tx.Statement.scopes = append(tx.Statement.scopes, funcs...) return tx } +func (db *DB) executeScopes() (tx *DB) { + scopes := db.Statement.scopes + db.Statement.scopes = nil + for _, scope := range scopes { + db = scope(db) + } + return db +} + // Preload preload associations with given conditions -// db.Preload("Orders", "state NOT IN (?)", "cancelled").Find(&users) +// +// // get all users, and preload all non-cancelled orders +// db.Preload("Orders", "state NOT IN (?)", "cancelled").Find(&users) func (db *DB) Preload(query string, args ...interface{}) (tx *DB) { tx = db.getInstance() if tx.Statement.Preloads == nil { @@ -284,18 +401,56 @@ func (db *DB) Preload(query string, args ...interface{}) (tx *DB) { return } +// Attrs provide attributes used in [FirstOrCreate] or [FirstOrInit] +// +// Attrs only adds attributes if the record is not found. +// +// // assign an email if the record is not found +// db.Where(User{Name: "non_existing"}).Attrs(User{Email: "fake@fake.org"}).FirstOrInit(&user) +// // user -> User{Name: "non_existing", Email: "fake@fake.org"} +// +// // assign an email if the record is not found, otherwise ignore provided email +// db.Where(User{Name: "jinzhu"}).Attrs(User{Email: "fake@fake.org"}).FirstOrInit(&user) +// // user -> User{Name: "jinzhu", Age: 20} +// +// [FirstOrCreate]: https://gorm.io/docs/advanced_query.html#FirstOrCreate +// [FirstOrInit]: https://gorm.io/docs/advanced_query.html#FirstOrInit func (db *DB) Attrs(attrs ...interface{}) (tx *DB) { tx = db.getInstance() tx.Statement.attrs = attrs return } +// Assign provide attributes used in [FirstOrCreate] or [FirstOrInit] +// +// Assign adds attributes even if the record is found. If using FirstOrCreate, this means that +// records will be updated even if they are found. +// +// // assign an email regardless of if the record is not found +// db.Where(User{Name: "non_existing"}).Assign(User{Email: "fake@fake.org"}).FirstOrInit(&user) +// // user -> User{Name: "non_existing", Email: "fake@fake.org"} +// +// // assign email regardless of if record is found +// db.Where(User{Name: "jinzhu"}).Assign(User{Email: "fake@fake.org"}).FirstOrInit(&user) +// // user -> User{Name: "jinzhu", Age: 20, Email: "fake@fake.org"} +// +// [FirstOrCreate]: https://gorm.io/docs/advanced_query.html#FirstOrCreate +// [FirstOrInit]: https://gorm.io/docs/advanced_query.html#FirstOrInit func (db *DB) Assign(attrs ...interface{}) (tx *DB) { tx = db.getInstance() tx.Statement.assigns = attrs return } +// Unscoped disables the global scope of soft deletion in a query. +// By default, GORM uses soft deletion, marking records as "deleted" +// by setting a timestamp on a specific field (e.g., `deleted_at`). +// Unscoped allows queries to include records marked as deleted, +// overriding the soft deletion behavior. +// Example: +// var users []User +// db.Unscoped().Find(&users) +// // Retrieves all users, including deleted ones. func (db *DB) Unscoped() (tx *DB) { tx = db.getInstance() tx.Statement.Unscoped = true diff --git a/vendor/gorm.io/gorm/clause/clause.go b/vendor/gorm.io/gorm/clause/clause.go index de19f2e..1354fc0 100644 --- a/vendor/gorm.io/gorm/clause/clause.go +++ b/vendor/gorm.io/gorm/clause/clause.go @@ -20,6 +20,7 @@ type Builder interface { Writer WriteQuoted(field interface{}) AddVar(Writer, ...interface{}) + AddError(error) error } // Clause diff --git a/vendor/gorm.io/gorm/clause/expression.go b/vendor/gorm.io/gorm/clause/expression.go index 92ac7f2..3140846 100644 --- a/vendor/gorm.io/gorm/clause/expression.go +++ b/vendor/gorm.io/gorm/clause/expression.go @@ -126,7 +126,7 @@ func (expr NamedExpr) Build(builder Builder) { for _, v := range []byte(expr.SQL) { if v == '@' && !inName { inName = true - name = []byte{} + name = name[:0] } else if v == ' ' || v == ',' || v == ')' || v == '"' || v == '\'' || v == '`' || v == '\r' || v == '\n' || v == ';' { if inName { if nv, ok := namedMap[string(name)]; ok { @@ -246,15 +246,19 @@ func (eq Eq) Build(builder Builder) { switch eq.Value.(type) { case []string, []int, []int32, []int64, []uint, []uint32, []uint64, []interface{}: - builder.WriteString(" IN (") rv := reflect.ValueOf(eq.Value) - for i := 0; i < rv.Len(); i++ { - if i > 0 { - builder.WriteByte(',') + if rv.Len() == 0 { + builder.WriteString(" IN (NULL)") + } else { + builder.WriteString(" IN (") + for i := 0; i < rv.Len(); i++ { + if i > 0 { + builder.WriteByte(',') + } + builder.AddVar(builder, rv.Index(i).Interface()) } - builder.AddVar(builder, rv.Index(i).Interface()) + builder.WriteByte(')') } - builder.WriteByte(')') default: if eqNil(eq.Value) { builder.WriteString(" IS NULL") diff --git a/vendor/gorm.io/gorm/clause/joins.go b/vendor/gorm.io/gorm/clause/joins.go index f3e373f..879892b 100644 --- a/vendor/gorm.io/gorm/clause/joins.go +++ b/vendor/gorm.io/gorm/clause/joins.go @@ -9,7 +9,7 @@ const ( RightJoin JoinType = "RIGHT" ) -// Join join clause for from +// Join clause for from type Join struct { Type JoinType Table Table diff --git a/vendor/gorm.io/gorm/clause/limit.go b/vendor/gorm.io/gorm/clause/limit.go index 184f602..3edde43 100644 --- a/vendor/gorm.io/gorm/clause/limit.go +++ b/vendor/gorm.io/gorm/clause/limit.go @@ -1,10 +1,8 @@ package clause -import "strconv" - // Limit limit clause type Limit struct { - Limit int + Limit *int Offset int } @@ -15,16 +13,16 @@ func (limit Limit) Name() string { // Build build where clause func (limit Limit) Build(builder Builder) { - if limit.Limit > 0 { + if limit.Limit != nil && *limit.Limit >= 0 { builder.WriteString("LIMIT ") - builder.WriteString(strconv.Itoa(limit.Limit)) + builder.AddVar(builder, *limit.Limit) } if limit.Offset > 0 { - if limit.Limit > 0 { + if limit.Limit != nil && *limit.Limit >= 0 { builder.WriteByte(' ') } builder.WriteString("OFFSET ") - builder.WriteString(strconv.Itoa(limit.Offset)) + builder.AddVar(builder, limit.Offset) } } @@ -33,7 +31,7 @@ func (limit Limit) MergeClause(clause *Clause) { clause.Name = "" if v, ok := clause.Expression.(Limit); ok { - if limit.Limit == 0 && v.Limit != 0 { + if (limit.Limit == nil || *limit.Limit == 0) && v.Limit != nil { limit.Limit = v.Limit } diff --git a/vendor/gorm.io/gorm/clause/locking.go b/vendor/gorm.io/gorm/clause/locking.go index 290aac9..2bc48ce 100644 --- a/vendor/gorm.io/gorm/clause/locking.go +++ b/vendor/gorm.io/gorm/clause/locking.go @@ -1,5 +1,12 @@ package clause +const ( + LockingStrengthUpdate = "UPDATE" + LockingStrengthShare = "SHARE" + LockingOptionsSkipLocked = "SKIP LOCKED" + LockingOptionsNoWait = "NOWAIT" +) + type Locking struct { Strength string Table Table diff --git a/vendor/gorm.io/gorm/clause/on_conflict.go b/vendor/gorm.io/gorm/clause/on_conflict.go index 309c5fc..032bf4a 100644 --- a/vendor/gorm.io/gorm/clause/on_conflict.go +++ b/vendor/gorm.io/gorm/clause/on_conflict.go @@ -16,27 +16,27 @@ func (OnConflict) Name() string { // Build build onConflict clause func (onConflict OnConflict) Build(builder Builder) { - if len(onConflict.Columns) > 0 { - builder.WriteByte('(') - for idx, column := range onConflict.Columns { - if idx > 0 { - builder.WriteByte(',') - } - builder.WriteQuoted(column) - } - builder.WriteString(`) `) - } - - if len(onConflict.TargetWhere.Exprs) > 0 { - builder.WriteString(" WHERE ") - onConflict.TargetWhere.Build(builder) - builder.WriteByte(' ') - } - if onConflict.OnConstraint != "" { builder.WriteString("ON CONSTRAINT ") builder.WriteString(onConflict.OnConstraint) builder.WriteByte(' ') + } else { + if len(onConflict.Columns) > 0 { + builder.WriteByte('(') + for idx, column := range onConflict.Columns { + if idx > 0 { + builder.WriteByte(',') + } + builder.WriteQuoted(column) + } + builder.WriteString(`) `) + } + + if len(onConflict.TargetWhere.Exprs) > 0 { + builder.WriteString(" WHERE ") + onConflict.TargetWhere.Build(builder) + builder.WriteByte(' ') + } } if onConflict.DoNothing { diff --git a/vendor/gorm.io/gorm/clause/where.go b/vendor/gorm.io/gorm/clause/where.go index a29401c..2c3c90f 100644 --- a/vendor/gorm.io/gorm/clause/where.go +++ b/vendor/gorm.io/gorm/clause/where.go @@ -21,6 +21,12 @@ func (where Where) Name() string { // Build build where clause func (where Where) Build(builder Builder) { + if len(where.Exprs) == 1 { + if andCondition, ok := where.Exprs[0].(AndConditions); ok { + where.Exprs = andCondition.Exprs + } + } + // Switch position if the first query expression is a single Or condition for idx, expr := range where.Exprs { if v, ok := expr.(OrConditions); !ok || len(v.Exprs) > 1 { @@ -147,6 +153,11 @@ func Not(exprs ...Expression) Expression { if len(exprs) == 0 { return nil } + if len(exprs) == 1 { + if andCondition, ok := exprs[0].(AndConditions); ok { + exprs = andCondition.Exprs + } + } return NotConditions{Exprs: exprs} } @@ -155,19 +166,63 @@ type NotConditions struct { } func (not NotConditions) Build(builder Builder) { - if len(not.Exprs) > 1 { - builder.WriteByte('(') + anyNegationBuilder := false + for _, c := range not.Exprs { + if _, ok := c.(NegationExpressionBuilder); ok { + anyNegationBuilder = true + break + } } - for idx, c := range not.Exprs { - if idx > 0 { - builder.WriteString(AndWithSpace) + if anyNegationBuilder { + if len(not.Exprs) > 1 { + builder.WriteByte('(') } - if negationBuilder, ok := c.(NegationExpressionBuilder); ok { - negationBuilder.NegationBuild(builder) - } else { - builder.WriteString("NOT ") + for idx, c := range not.Exprs { + if idx > 0 { + builder.WriteString(AndWithSpace) + } + + if negationBuilder, ok := c.(NegationExpressionBuilder); ok { + negationBuilder.NegationBuild(builder) + } else { + builder.WriteString("NOT ") + e, wrapInParentheses := c.(Expr) + if wrapInParentheses { + sql := strings.ToUpper(e.SQL) + if wrapInParentheses = strings.Contains(sql, AndWithSpace) || strings.Contains(sql, OrWithSpace); wrapInParentheses { + builder.WriteByte('(') + } + } + + c.Build(builder) + + if wrapInParentheses { + builder.WriteByte(')') + } + } + } + + if len(not.Exprs) > 1 { + builder.WriteByte(')') + } + } else { + builder.WriteString("NOT ") + if len(not.Exprs) > 1 { + builder.WriteByte('(') + } + + for idx, c := range not.Exprs { + if idx > 0 { + switch c.(type) { + case OrConditions: + builder.WriteString(OrWithSpace) + default: + builder.WriteString(AndWithSpace) + } + } + e, wrapInParentheses := c.(Expr) if wrapInParentheses { sql := strings.ToUpper(e.SQL) @@ -182,9 +237,9 @@ func (not NotConditions) Build(builder Builder) { builder.WriteByte(')') } } - } - if len(not.Exprs) > 1 { - builder.WriteByte(')') + if len(not.Exprs) > 1 { + builder.WriteByte(')') + } } } diff --git a/vendor/gorm.io/gorm/errors.go b/vendor/gorm.io/gorm/errors.go index 49cbfe6..025f5d6 100644 --- a/vendor/gorm.io/gorm/errors.go +++ b/vendor/gorm.io/gorm/errors.go @@ -21,6 +21,10 @@ var ( ErrPrimaryKeyRequired = errors.New("primary key required") // ErrModelValueRequired model value required ErrModelValueRequired = errors.New("model value required") + // ErrModelAccessibleFieldsRequired model accessible fields required + ErrModelAccessibleFieldsRequired = errors.New("model accessible fields required") + // ErrSubQueryRequired sub query required + ErrSubQueryRequired = errors.New("sub query required") // ErrInvalidData unsupported data ErrInvalidData = errors.New("unsupported data") // ErrUnsupportedDriver unsupported driver @@ -41,4 +45,10 @@ var ( ErrInvalidValueOfLength = errors.New("invalid association values, length doesn't match") // ErrPreloadNotAllowed preload is not allowed when count is used ErrPreloadNotAllowed = errors.New("preload is not allowed when count is used") + // ErrDuplicatedKey occurs when there is a unique key constraint violation + ErrDuplicatedKey = errors.New("duplicated key not allowed") + // ErrForeignKeyViolated occurs when there is a foreign key constraint violation + ErrForeignKeyViolated = errors.New("violates foreign key constraint") + // ErrCheckConstraintViolated occurs when there is a check constraint violation + ErrCheckConstraintViolated = errors.New("violates check constraint") ) diff --git a/vendor/gorm.io/gorm/finisher_api.go b/vendor/gorm.io/gorm/finisher_api.go index 7a3f27b..f97571e 100644 --- a/vendor/gorm.io/gorm/finisher_api.go +++ b/vendor/gorm.io/gorm/finisher_api.go @@ -13,7 +13,7 @@ import ( "gorm.io/gorm/utils" ) -// Create insert the value into database +// Create inserts value, returning the inserted data's primary key in value's id func (db *DB) Create(value interface{}) (tx *DB) { if db.CreateBatchSize > 0 { return db.CreateInBatches(value, db.CreateBatchSize) @@ -24,7 +24,7 @@ func (db *DB) Create(value interface{}) (tx *DB) { return tx.callbacks.Create().Execute(tx) } -// CreateInBatches insert the value in batches into database +// CreateInBatches inserts value in batches of batchSize func (db *DB) CreateInBatches(value interface{}, batchSize int) (tx *DB) { reflectValue := reflect.Indirect(reflect.ValueOf(value)) @@ -33,9 +33,10 @@ func (db *DB) CreateInBatches(value interface{}, batchSize int) (tx *DB) { var rowsAffected int64 tx = db.getInstance() + // the reflection length judgment of the optimized value + reflectLen := reflectValue.Len() + callFc := func(tx *DB) error { - // the reflection length judgment of the optimized value - reflectLen := reflectValue.Len() for i := 0; i < reflectLen; i += batchSize { ends := i + batchSize if ends > reflectLen { @@ -53,7 +54,7 @@ func (db *DB) CreateInBatches(value interface{}, batchSize int) (tx *DB) { return nil } - if tx.SkipDefaultTransaction { + if tx.SkipDefaultTransaction || reflectLen <= batchSize { tx.AddError(callFc(tx.Session(&Session{}))) } else { tx.AddError(tx.Transaction(callFc)) @@ -68,7 +69,7 @@ func (db *DB) CreateInBatches(value interface{}, batchSize int) (tx *DB) { return } -// Save update value in database, if the value doesn't have primary key, will insert it +// Save updates value in database. If value doesn't contain a matching primary key, value is inserted. func (db *DB) Save(value interface{}) (tx *DB) { tx = db.getInstance() tx.Statement.Dest = value @@ -101,20 +102,19 @@ func (db *DB) Save(value interface{}) (tx *DB) { tx.Statement.Selects = append(tx.Statement.Selects, "*") } - tx = tx.callbacks.Update().Execute(tx) + updateTx := tx.callbacks.Update().Execute(tx.Session(&Session{Initialized: true})) - if tx.Error == nil && tx.RowsAffected == 0 && !tx.DryRun && !selectedUpdate { - result := reflect.New(tx.Statement.Schema.ModelType).Interface() - if result := tx.Session(&Session{}).Limit(1).Find(result); result.RowsAffected == 0 { - return tx.Create(value) - } + if updateTx.Error == nil && updateTx.RowsAffected == 0 && !updateTx.DryRun && !selectedUpdate { + return tx.Session(&Session{SkipHooks: true}).Clauses(clause.OnConflict{UpdateAll: true}).Create(value) } + + return updateTx } return } -// First find first record that match given conditions, order by primary key +// First finds the first record ordered by primary key, matching given conditions conds func (db *DB) First(dest interface{}, conds ...interface{}) (tx *DB) { tx = db.Limit(1).Order(clause.OrderByColumn{ Column: clause.Column{Table: clause.CurrentTable, Name: clause.PrimaryKey}, @@ -129,7 +129,7 @@ func (db *DB) First(dest interface{}, conds ...interface{}) (tx *DB) { return tx.callbacks.Query().Execute(tx) } -// Take return a record that match given conditions, the order will depend on the database implementation +// Take finds the first record returned by the database in no specified order, matching given conditions conds func (db *DB) Take(dest interface{}, conds ...interface{}) (tx *DB) { tx = db.Limit(1) if len(conds) > 0 { @@ -142,7 +142,7 @@ func (db *DB) Take(dest interface{}, conds ...interface{}) (tx *DB) { return tx.callbacks.Query().Execute(tx) } -// Last find last record that match given conditions, order by primary key +// Last finds the last record ordered by primary key, matching given conditions conds func (db *DB) Last(dest interface{}, conds ...interface{}) (tx *DB) { tx = db.Limit(1).Order(clause.OrderByColumn{ Column: clause.Column{Table: clause.CurrentTable, Name: clause.PrimaryKey}, @@ -158,7 +158,7 @@ func (db *DB) Last(dest interface{}, conds ...interface{}) (tx *DB) { return tx.callbacks.Query().Execute(tx) } -// Find find records that match given conditions +// Find finds all records matching given conditions conds func (db *DB) Find(dest interface{}, conds ...interface{}) (tx *DB) { tx = db.getInstance() if len(conds) > 0 { @@ -170,7 +170,7 @@ func (db *DB) Find(dest interface{}, conds ...interface{}) (tx *DB) { return tx.callbacks.Query().Execute(tx) } -// FindInBatches find records in batches +// FindInBatches finds all records in batches of batchSize func (db *DB) FindInBatches(dest interface{}, batchSize int, fc func(tx *DB, batch int) error) *DB { var ( tx = db.Order(clause.OrderByColumn{ @@ -185,7 +185,9 @@ func (db *DB) FindInBatches(dest interface{}, batchSize int, fc func(tx *DB, bat var totalSize int if c, ok := tx.Statement.Clauses["LIMIT"]; ok { if limit, ok := c.Expression.(clause.Limit); ok { - totalSize = limit.Limit + if limit.Limit != nil { + totalSize = *limit.Limit + } if totalSize > 0 && batchSize > totalSize { batchSize = totalSize @@ -202,7 +204,9 @@ func (db *DB) FindInBatches(dest interface{}, batchSize int, fc func(tx *DB, bat batch++ if result.Error == nil && result.RowsAffected != 0 { - tx.AddError(fc(result, batch)) + fcTx := result.Session(&Session{NewDB: true}) + fcTx.RowsAffected = result.RowsAffected + tx.AddError(fc(fcTx, batch)) } else if result.Error != nil { tx.AddError(result.Error) } @@ -227,7 +231,11 @@ func (db *DB) FindInBatches(dest interface{}, batchSize int, fc func(tx *DB, bat break } - primaryValue, _ := result.Statement.Schema.PrioritizedPrimaryField.ValueOf(tx.Statement.Context, resultsValue.Index(resultsValue.Len()-1)) + primaryValue, zero := result.Statement.Schema.PrioritizedPrimaryField.ValueOf(tx.Statement.Context, resultsValue.Index(resultsValue.Len()-1)) + if zero { + tx.AddError(ErrPrimaryKeyRequired) + break + } queryDB = tx.Clauses(clause.Gt{Column: clause.Column{Table: clause.CurrentTable, Name: clause.PrimaryKey}, Value: primaryValue}) } @@ -284,7 +292,18 @@ func (db *DB) assignInterfacesToValue(values ...interface{}) { } } -// FirstOrInit gets the first matched record or initialize a new instance with given conditions (only works with struct or map conditions) +// FirstOrInit finds the first matching record, otherwise if not found initializes a new instance with given conds. +// Each conds must be a struct or map. +// +// FirstOrInit never modifies the database. It is often used with Assign and Attrs. +// +// // assign an email if the record is not found +// db.Where(User{Name: "non_existing"}).Attrs(User{Email: "fake@fake.org"}).FirstOrInit(&user) +// // user -> User{Name: "non_existing", Email: "fake@fake.org"} +// +// // assign email regardless of if record is found +// db.Where(User{Name: "jinzhu"}).Assign(User{Email: "fake@fake.org"}).FirstOrInit(&user) +// // user -> User{Name: "jinzhu", Age: 20, Email: "fake@fake.org"} func (db *DB) FirstOrInit(dest interface{}, conds ...interface{}) (tx *DB) { queryTx := db.Limit(1).Order(clause.OrderByColumn{ Column: clause.Column{Table: clause.CurrentTable, Name: clause.PrimaryKey}, @@ -310,62 +329,82 @@ func (db *DB) FirstOrInit(dest interface{}, conds ...interface{}) (tx *DB) { return } -// FirstOrCreate gets the first matched record or create a new one with given conditions (only works with struct, map conditions) +// FirstOrCreate finds the first matching record, otherwise if not found creates a new instance with given conds. +// Each conds must be a struct or map. +// +// Using FirstOrCreate in conjunction with Assign will result in an update to the database even if the record exists. +// +// // assign an email if the record is not found +// result := db.Where(User{Name: "non_existing"}).Attrs(User{Email: "fake@fake.org"}).FirstOrCreate(&user) +// // user -> User{Name: "non_existing", Email: "fake@fake.org"} +// // result.RowsAffected -> 1 +// +// // assign email regardless of if record is found +// result := db.Where(User{Name: "jinzhu"}).Assign(User{Email: "fake@fake.org"}).FirstOrCreate(&user) +// // user -> User{Name: "jinzhu", Age: 20, Email: "fake@fake.org"} +// // result.RowsAffected -> 1 func (db *DB) FirstOrCreate(dest interface{}, conds ...interface{}) (tx *DB) { tx = db.getInstance() queryTx := db.Session(&Session{}).Limit(1).Order(clause.OrderByColumn{ Column: clause.Column{Table: clause.CurrentTable, Name: clause.PrimaryKey}, }) - if result := queryTx.Find(dest, conds...); result.Error == nil { - if result.RowsAffected == 0 { - if c, ok := result.Statement.Clauses["WHERE"]; ok { - if where, ok := c.Expression.(clause.Where); ok { - result.assignInterfacesToValue(where.Exprs) - } - } - // initialize with attrs, conds - if len(db.Statement.attrs) > 0 { - result.assignInterfacesToValue(db.Statement.attrs...) - } + result := queryTx.Find(dest, conds...) + if result.Error != nil { + tx.Error = result.Error + return tx + } - // initialize with attrs, conds - if len(db.Statement.assigns) > 0 { - result.assignInterfacesToValue(db.Statement.assigns...) + if result.RowsAffected == 0 { + if c, ok := result.Statement.Clauses["WHERE"]; ok { + if where, ok := c.Expression.(clause.Where); ok { + result.assignInterfacesToValue(where.Exprs) } + } - return tx.Create(dest) - } else if len(db.Statement.assigns) > 0 { - exprs := tx.Statement.BuildCondition(db.Statement.assigns[0], db.Statement.assigns[1:]...) - assigns := map[string]interface{}{} - for _, expr := range exprs { - if eq, ok := expr.(clause.Eq); ok { - switch column := eq.Column.(type) { - case string: - assigns[column] = eq.Value - case clause.Column: - assigns[column.Name] = eq.Value - default: - } + // initialize with attrs, conds + if len(db.Statement.attrs) > 0 { + result.assignInterfacesToValue(db.Statement.attrs...) + } + + // initialize with attrs, conds + if len(db.Statement.assigns) > 0 { + result.assignInterfacesToValue(db.Statement.assigns...) + } + + return tx.Create(dest) + } else if len(db.Statement.assigns) > 0 { + exprs := tx.Statement.BuildCondition(db.Statement.assigns[0], db.Statement.assigns[1:]...) + assigns := map[string]interface{}{} + for i := 0; i < len(exprs); i++ { + expr := exprs[i] + + if eq, ok := expr.(clause.AndConditions); ok { + exprs = append(exprs, eq.Exprs...) + } else if eq, ok := expr.(clause.Eq); ok { + switch column := eq.Column.(type) { + case string: + assigns[column] = eq.Value + case clause.Column: + assigns[column.Name] = eq.Value } } - - return tx.Model(dest).Updates(assigns) } - } else { - tx.Error = result.Error + + return tx.Model(dest).Updates(assigns) } + return tx } -// Update update attributes with callbacks, refer: https://gorm.io/docs/update.html#Update-Changed-Fields +// Update updates column with value using callbacks. Reference: https://gorm.io/docs/update.html#Update-Changed-Fields func (db *DB) Update(column string, value interface{}) (tx *DB) { tx = db.getInstance() tx.Statement.Dest = map[string]interface{}{column: value} return tx.callbacks.Update().Execute(tx) } -// Updates update attributes with callbacks, refer: https://gorm.io/docs/update.html#Update-Changed-Fields +// Updates updates attributes using callbacks. values must be a struct or map. Reference: https://gorm.io/docs/update.html#Update-Changed-Fields func (db *DB) Updates(values interface{}) (tx *DB) { tx = db.getInstance() tx.Statement.Dest = values @@ -386,7 +425,9 @@ func (db *DB) UpdateColumns(values interface{}) (tx *DB) { return tx.callbacks.Update().Execute(tx) } -// Delete delete value match given conditions, if the value has primary key, then will including the primary key as condition +// Delete deletes value matching given conditions. If value contains primary key it is included in the conditions. If +// value includes a deleted_at field, then Delete performs a soft delete instead by setting deleted_at with the current +// time if null. func (db *DB) Delete(value interface{}, conds ...interface{}) (tx *DB) { tx = db.getInstance() if len(conds) > 0 { @@ -480,7 +521,7 @@ func (db *DB) Rows() (*sql.Rows, error) { return rows, tx.Error } -// Scan scan value to a struct +// Scan scans selected value to the struct dest func (db *DB) Scan(dest interface{}) (tx *DB) { config := *db.Config currentLogger, newLogger := config.Logger, logger.Recorder.New() @@ -494,6 +535,7 @@ func (db *DB) Scan(dest interface{}) (tx *DB) { tx.ScanRows(rows, dest) } else { tx.RowsAffected = 0 + tx.AddError(rows.Err()) } tx.AddError(rows.Close()) } @@ -505,9 +547,10 @@ func (db *DB) Scan(dest interface{}) (tx *DB) { return } -// Pluck used to query single column from a model as a map -// var ages []int64 -// db.Model(&users).Pluck("age", &ages) +// Pluck queries a single column from a model, returning in the slice dest. E.g.: +// +// var ages []int64 +// db.Model(&users).Pluck("age", &ages) func (db *DB) Pluck(column string, dest interface{}) (tx *DB) { tx = db.getInstance() if tx.Statement.Model != nil { @@ -548,7 +591,8 @@ func (db *DB) ScanRows(rows *sql.Rows, dest interface{}) error { return tx.Error } -// Connection use a db conn to execute Multiple commands,this conn will put conn pool after it is executed. +// Connection uses a db connection to execute an arbitrary number of commands in fc. When finished, the connection is +// returned to the connection pool. func (db *DB) Connection(fc func(tx *DB) error) (err error) { if db.Error != nil { return db.Error @@ -570,7 +614,9 @@ func (db *DB) Connection(fc func(tx *DB) error) (err error) { return fc(tx) } -// Transaction start a transaction as a block, return error will rollback, otherwise to commit. +// Transaction start a transaction as a block, return error will rollback, otherwise to commit. Transaction executes an +// arbitrary number of commands in fc within a transaction. On success the changes are committed; if an error occurs +// they are rolled back. func (db *DB) Transaction(fc func(tx *DB) error, opts ...*sql.TxOptions) (err error) { panicked := true @@ -581,7 +627,6 @@ func (db *DB) Transaction(fc func(tx *DB) error, opts ...*sql.TxOptions) (err er if err != nil { return } - defer func() { // Make sure to rollback when panic, Block error or Commit error if panicked || err != nil { @@ -613,7 +658,7 @@ func (db *DB) Transaction(fc func(tx *DB) error, opts ...*sql.TxOptions) (err er return } -// Begin begins a transaction +// Begin begins a transaction with any transaction options opts func (db *DB) Begin(opts ...*sql.TxOptions) *DB { var ( // clone statement @@ -642,7 +687,7 @@ func (db *DB) Begin(opts ...*sql.TxOptions) *DB { return tx } -// Commit commit a transaction +// Commit commits the changes in a transaction func (db *DB) Commit() *DB { if committer, ok := db.Statement.ConnPool.(TxCommitter); ok && committer != nil && !reflect.ValueOf(committer).IsNil() { db.AddError(committer.Commit()) @@ -652,7 +697,7 @@ func (db *DB) Commit() *DB { return db } -// Rollback rollback a transaction +// Rollback rollbacks the changes in a transaction func (db *DB) Rollback() *DB { if committer, ok := db.Statement.ConnPool.(TxCommitter); ok && committer != nil { if !reflect.ValueOf(committer).IsNil() { @@ -666,7 +711,21 @@ func (db *DB) Rollback() *DB { func (db *DB) SavePoint(name string) *DB { if savePointer, ok := db.Dialector.(SavePointerDialectorInterface); ok { + // close prepared statement, because SavePoint not support prepared statement. + // e.g. mysql8.0 doc: https://dev.mysql.com/doc/refman/8.0/en/sql-prepared-statements.html + var ( + preparedStmtTx *PreparedStmtTX + isPreparedStmtTx bool + ) + // close prepared statement, because SavePoint not support prepared statement. + if preparedStmtTx, isPreparedStmtTx = db.Statement.ConnPool.(*PreparedStmtTX); isPreparedStmtTx { + db.Statement.ConnPool = preparedStmtTx.Tx + } db.AddError(savePointer.SavePoint(db, name)) + // restore prepared statement + if isPreparedStmtTx { + db.Statement.ConnPool = preparedStmtTx + } } else { db.AddError(ErrUnsupportedDriver) } @@ -675,14 +734,28 @@ func (db *DB) SavePoint(name string) *DB { func (db *DB) RollbackTo(name string) *DB { if savePointer, ok := db.Dialector.(SavePointerDialectorInterface); ok { + // close prepared statement, because RollbackTo not support prepared statement. + // e.g. mysql8.0 doc: https://dev.mysql.com/doc/refman/8.0/en/sql-prepared-statements.html + var ( + preparedStmtTx *PreparedStmtTX + isPreparedStmtTx bool + ) + // close prepared statement, because SavePoint not support prepared statement. + if preparedStmtTx, isPreparedStmtTx = db.Statement.ConnPool.(*PreparedStmtTX); isPreparedStmtTx { + db.Statement.ConnPool = preparedStmtTx.Tx + } db.AddError(savePointer.RollbackTo(db, name)) + // restore prepared statement + if isPreparedStmtTx { + db.Statement.ConnPool = preparedStmtTx + } } else { db.AddError(ErrUnsupportedDriver) } return db } -// Exec execute raw sql +// Exec executes raw sql func (db *DB) Exec(sql string, values ...interface{}) (tx *DB) { tx = db.getInstance() tx.Statement.SQL = strings.Builder{} diff --git a/vendor/gorm.io/gorm/gorm.go b/vendor/gorm.io/gorm/gorm.go index 6a6bb03..117d2fd 100644 --- a/vendor/gorm.io/gorm/gorm.go +++ b/vendor/gorm.io/gorm/gorm.go @@ -4,6 +4,7 @@ import ( "context" "database/sql" "fmt" + "reflect" "sort" "sync" "time" @@ -37,6 +38,8 @@ type Config struct { DisableAutomaticPing bool // DisableForeignKeyConstraintWhenMigrating DisableForeignKeyConstraintWhenMigrating bool + // IgnoreRelationshipsWhenMigrating + IgnoreRelationshipsWhenMigrating bool // DisableNestedTransaction disable nested transaction DisableNestedTransaction bool // AllowGlobalUpdate allow global update @@ -45,6 +48,10 @@ type Config struct { QueryFields bool // CreateBatchSize default create batch size CreateBatchSize int + // TranslateError enabling error translation + TranslateError bool + // PropagateUnscoped propagate Unscoped to every other nested statement + PropagateUnscoped bool // ClauseBuilders clause builder ClauseBuilders map[string]clause.ClauseBuilder @@ -105,6 +112,7 @@ type Session struct { DisableNestedTransaction bool AllowGlobalUpdate bool FullSaveAssociations bool + PropagateUnscoped bool QueryFields bool Context context.Context Logger logger.Interface @@ -142,7 +150,7 @@ func Open(dialector Dialector, opts ...Option) (db *DB, err error) { } if config.NamingStrategy == nil { - config.NamingStrategy = schema.NamingStrategy{} + config.NamingStrategy = schema.NamingStrategy{IdentifierMaxLength: 64} // Default Identifier length is 64 } if config.Logger == nil { @@ -175,17 +183,17 @@ func Open(dialector Dialector, opts ...Option) (db *DB, err error) { if config.Dialector != nil { err = config.Dialector.Initialize(db) - } - preparedStmt := &PreparedStmtDB{ - ConnPool: db.ConnPool, - Stmts: map[string]Stmt{}, - Mux: &sync.RWMutex{}, - PreparedSQL: make([]string, 0, 100), + if err != nil { + if db, _ := db.DB(); db != nil { + _ = db.Close() + } + } } - db.cacheStore.Store(preparedStmtDBKey, preparedStmt) if config.PrepareStmt { + preparedStmt := NewPreparedStmtDB(db.ConnPool) + db.cacheStore.Store(preparedStmtDBKey, preparedStmt) db.ConnPool = preparedStmt } @@ -236,6 +244,10 @@ func (db *DB) Session(config *Session) *DB { txConfig.FullSaveAssociations = true } + if config.PropagateUnscoped { + txConfig.PropagateUnscoped = true + } + if config.Context != nil || config.PrepareStmt || config.SkipHooks { tx.Statement = tx.Statement.clone() tx.Statement.DB = tx @@ -246,16 +258,30 @@ func (db *DB) Session(config *Session) *DB { } if config.PrepareStmt { + var preparedStmt *PreparedStmtDB + if v, ok := db.cacheStore.Load(preparedStmtDBKey); ok { - preparedStmt := v.(*PreparedStmtDB) + preparedStmt = v.(*PreparedStmtDB) + } else { + preparedStmt = NewPreparedStmtDB(db.ConnPool) + db.cacheStore.Store(preparedStmtDBKey, preparedStmt) + } + + switch t := tx.Statement.ConnPool.(type) { + case Tx: + tx.Statement.ConnPool = &PreparedStmtTX{ + Tx: t, + PreparedStmtDB: preparedStmt, + } + default: tx.Statement.ConnPool = &PreparedStmtDB{ ConnPool: db.Config.ConnPool, Mux: preparedStmt.Mux, Stmts: preparedStmt.Stmts, } - txConfig.ConnPool = tx.Statement.ConnPool - txConfig.PrepareStmt = true } + txConfig.ConnPool = tx.Statement.ConnPool + txConfig.PrepareStmt = true } if config.SkipHooks { @@ -300,7 +326,8 @@ func (db *DB) WithContext(ctx context.Context) *DB { // Debug start debug mode func (db *DB) Debug() (tx *DB) { - return db.Session(&Session{ + tx = db.getInstance() + return tx.Session(&Session{ Logger: db.Logger.LogMode(logger.Info), }) } @@ -336,10 +363,18 @@ func (db *DB) Callback() *callbacks { // AddError add error to db func (db *DB) AddError(err error) error { - if db.Error == nil { - db.Error = err - } else if err != nil { - db.Error = fmt.Errorf("%v; %w", db.Error, err) + if err != nil { + if db.Config.TranslateError { + if errTranslator, ok := db.Dialector.(ErrorTranslator); ok { + err = errTranslator.Translate(err) + } + } + + if db.Error == nil { + db.Error = err + } else { + db.Error = fmt.Errorf("%v; %w", db.Error, err) + } } return db.Error } @@ -347,12 +382,20 @@ func (db *DB) AddError(err error) error { // DB returns `*sql.DB` func (db *DB) DB() (*sql.DB, error) { connPool := db.ConnPool + if db.Statement != nil && db.Statement.ConnPool != nil { + connPool = db.Statement.ConnPool + } + if tx, ok := connPool.(*sql.Tx); ok && tx != nil { + return (*sql.DB)(reflect.ValueOf(tx).Elem().FieldByName("db").UnsafePointer()), nil + } if dbConnector, ok := connPool.(GetDBConnector); ok && dbConnector != nil { - return dbConnector.GetDBConn() + if sqldb, err := dbConnector.GetDBConn(); sqldb != nil || err != nil { + return sqldb, err + } } - if sqldb, ok := connPool.(*sql.DB); ok { + if sqldb, ok := connPool.(*sql.DB); ok && sqldb != nil { return sqldb, nil } @@ -366,11 +409,15 @@ func (db *DB) getInstance() *DB { if db.clone == 1 { // clone with new statement tx.Statement = &Statement{ - DB: tx, - ConnPool: db.Statement.ConnPool, - Context: db.Statement.Context, - Clauses: map[string]clause.Clause{}, - Vars: make([]interface{}, 0, 8), + DB: tx, + ConnPool: db.Statement.ConnPool, + Context: db.Statement.Context, + Clauses: map[string]clause.Clause{}, + Vars: make([]interface{}, 0, 8), + SkipHooks: db.Statement.SkipHooks, + } + if db.Config.PropagateUnscoped { + tx.Statement.Unscoped = db.Statement.Unscoped } } else { // with clone statement @@ -412,7 +459,7 @@ func (db *DB) SetupJoinTable(model interface{}, field string, joinTable interfac relation, ok := modelSchema.Relationships.Relations[field] isRelation := ok && relation.JoinTable != nil if !isRelation { - return fmt.Errorf("failed to found relation: %s", field) + return fmt.Errorf("failed to find relation: %s", field) } for _, ref := range relation.References { @@ -455,12 +502,12 @@ func (db *DB) Use(plugin Plugin) error { // ToSQL for generate SQL string. // -// db.ToSQL(func(tx *gorm.DB) *gorm.DB { -// return tx.Model(&User{}).Where(&User{Name: "foo", Age: 20}) -// .Limit(10).Offset(5) -// .Order("name ASC") -// .First(&User{}) -// }) +// db.ToSQL(func(tx *gorm.DB) *gorm.DB { +// return tx.Model(&User{}).Where(&User{Name: "foo", Age: 20}) +// .Limit(10).Offset(5) +// .Order("name ASC") +// .First(&User{}) +// }) func (db *DB) ToSQL(queryFn func(tx *DB) *DB) string { tx := queryFn(db.Session(&Session{DryRun: true, SkipDefaultTransaction: true})) stmt := tx.Statement diff --git a/vendor/gorm.io/gorm/interfaces.go b/vendor/gorm.io/gorm/interfaces.go index 32d4960..3bcc3d5 100644 --- a/vendor/gorm.io/gorm/interfaces.go +++ b/vendor/gorm.io/gorm/interfaces.go @@ -26,6 +26,10 @@ type Plugin interface { Initialize(*DB) error } +type ParamsFilter interface { + ParamsFilter(ctx context.Context, sql string, params ...interface{}) (string, []interface{}) +} + // ConnPool db conns pool interface type ConnPool interface { PrepareContext(ctx context.Context, query string) (*sql.Stmt, error) @@ -82,3 +86,7 @@ type Rows interface { Err() error Close() error } + +type ErrorTranslator interface { + Translate(err error) error +} diff --git a/vendor/gorm.io/gorm/logger/logger.go b/vendor/gorm.io/gorm/logger/logger.go index 2ffd28d..253f032 100644 --- a/vendor/gorm.io/gorm/logger/logger.go +++ b/vendor/gorm.io/gorm/logger/logger.go @@ -4,7 +4,7 @@ import ( "context" "errors" "fmt" - "io/ioutil" + "io" "log" "os" "time" @@ -55,6 +55,7 @@ type Config struct { SlowThreshold time.Duration Colorful bool IgnoreRecordNotFoundError bool + ParameterizedQueries bool LogLevel LogLevel } @@ -68,8 +69,8 @@ type Interface interface { } var ( - // Discard Discard logger will print any log to ioutil.Discard - Discard = New(log.New(ioutil.Discard, "", log.LstdFlags), Config{}) + // Discard logger will print any log to io.Discard + Discard = New(log.New(io.Discard, "", log.LstdFlags), Config{}) // Default Default logger Default = New(log.New(os.Stdout, "\r\n", log.LstdFlags), Config{ SlowThreshold: 200 * time.Millisecond, @@ -77,7 +78,7 @@ var ( IgnoreRecordNotFoundError: false, Colorful: true, }) - // Recorder Recorder logger records running SQL into a recorder instance + // Recorder logger records running SQL into a recorder instance Recorder = traceRecorder{Interface: Default, BeginAt: time.Now()} ) @@ -128,28 +129,30 @@ func (l *logger) LogMode(level LogLevel) Interface { } // Info print info -func (l logger) Info(ctx context.Context, msg string, data ...interface{}) { +func (l *logger) Info(ctx context.Context, msg string, data ...interface{}) { if l.LogLevel >= Info { l.Printf(l.infoStr+msg, append([]interface{}{utils.FileWithLineNum()}, data...)...) } } // Warn print warn messages -func (l logger) Warn(ctx context.Context, msg string, data ...interface{}) { +func (l *logger) Warn(ctx context.Context, msg string, data ...interface{}) { if l.LogLevel >= Warn { l.Printf(l.warnStr+msg, append([]interface{}{utils.FileWithLineNum()}, data...)...) } } // Error print error messages -func (l logger) Error(ctx context.Context, msg string, data ...interface{}) { +func (l *logger) Error(ctx context.Context, msg string, data ...interface{}) { if l.LogLevel >= Error { l.Printf(l.errStr+msg, append([]interface{}{utils.FileWithLineNum()}, data...)...) } } // Trace print sql message -func (l logger) Trace(ctx context.Context, begin time.Time, fc func() (string, int64), err error) { +// +//nolint:cyclop +func (l *logger) Trace(ctx context.Context, begin time.Time, fc func() (string, int64), err error) { if l.LogLevel <= Silent { return } @@ -181,6 +184,14 @@ func (l logger) Trace(ctx context.Context, begin time.Time, fc func() (string, i } } +// ParamsFilter filter params +func (l *logger) ParamsFilter(ctx context.Context, sql string, params ...interface{}) (string, []interface{}) { + if l.Config.ParameterizedQueries { + return sql, nil + } + return sql, params +} + type traceRecorder struct { Interface BeginAt time.Time @@ -189,8 +200,8 @@ type traceRecorder struct { Err error } -// New new trace recorder -func (l traceRecorder) New() *traceRecorder { +// New trace recorder +func (l *traceRecorder) New() *traceRecorder { return &traceRecorder{Interface: l.Interface, BeginAt: time.Now()} } diff --git a/vendor/gorm.io/gorm/logger/sql.go b/vendor/gorm.io/gorm/logger/sql.go index c8b194c..ad47879 100644 --- a/vendor/gorm.io/gorm/logger/sql.go +++ b/vendor/gorm.io/gorm/logger/sql.go @@ -28,8 +28,25 @@ func isPrintable(s string) bool { return true } +// A list of Go types that should be converted to SQL primitives var convertibleTypes = []reflect.Type{reflect.TypeOf(time.Time{}), reflect.TypeOf(false), reflect.TypeOf([]byte{})} +// RegEx matches only numeric values +var numericPlaceholderRe = regexp.MustCompile(`\$\d+\$`) + +func isNumeric(k reflect.Kind) bool { + switch k { + case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: + return true + case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64: + return true + case reflect.Float32, reflect.Float64: + return true + default: + return false + } +} + // ExplainSQL generate SQL string with given parameters, the generated SQL is expected to be used in logger, execute it might introduce a SQL injection vulnerability func ExplainSQL(sql string, numericPlaceholder *regexp.Regexp, escaper string, avars ...interface{}) string { var ( @@ -75,26 +92,28 @@ func ExplainSQL(sql string, numericPlaceholder *regexp.Regexp, escaper string, a case reflect.Bool: vars[idx] = fmt.Sprintf("%t", reflectValue.Interface()) case reflect.String: - vars[idx] = escaper + strings.ReplaceAll(fmt.Sprintf("%v", v), escaper, "\\"+escaper) + escaper + vars[idx] = escaper + strings.ReplaceAll(fmt.Sprintf("%v", v), escaper, escaper+escaper) + escaper default: if v != nil && reflectValue.IsValid() && ((reflectValue.Kind() == reflect.Ptr && !reflectValue.IsNil()) || reflectValue.Kind() != reflect.Ptr) { - vars[idx] = escaper + strings.ReplaceAll(fmt.Sprintf("%v", v), escaper, "\\"+escaper) + escaper + vars[idx] = escaper + strings.ReplaceAll(fmt.Sprintf("%v", v), escaper, escaper+escaper) + escaper } else { vars[idx] = nullStr } } case []byte: if s := string(v); isPrintable(s) { - vars[idx] = escaper + strings.ReplaceAll(s, escaper, "\\"+escaper) + escaper + vars[idx] = escaper + strings.ReplaceAll(s, escaper, escaper+escaper) + escaper } else { vars[idx] = escaper + "" + escaper } case int, int8, int16, int32, int64, uint, uint8, uint16, uint32, uint64: vars[idx] = utils.ToString(v) - case float64, float32: - vars[idx] = fmt.Sprintf("%.6f", v) + case float32: + vars[idx] = strconv.FormatFloat(float64(v), 'f', -1, 32) + case float64: + vars[idx] = strconv.FormatFloat(v, 'f', -1, 64) case string: - vars[idx] = escaper + strings.ReplaceAll(v, escaper, "\\"+escaper) + escaper + vars[idx] = escaper + strings.ReplaceAll(v, escaper, escaper+escaper) + escaper default: rv := reflect.ValueOf(v) if v == nil || !rv.IsValid() || rv.Kind() == reflect.Ptr && rv.IsNil() { @@ -104,6 +123,12 @@ func ExplainSQL(sql string, numericPlaceholder *regexp.Regexp, escaper string, a convertParams(v, idx) } else if rv.Kind() == reflect.Ptr && !rv.IsZero() { convertParams(reflect.Indirect(rv).Interface(), idx) + } else if isNumeric(rv.Kind()) { + if rv.CanInt() || rv.CanUint() { + vars[idx] = fmt.Sprintf("%d", rv.Interface()) + } else { + vars[idx] = fmt.Sprintf("%.6f", rv.Interface()) + } } else { for _, t := range convertibleTypes { if rv.Type().ConvertibleTo(t) { @@ -111,7 +136,7 @@ func ExplainSQL(sql string, numericPlaceholder *regexp.Regexp, escaper string, a return } } - vars[idx] = escaper + strings.ReplaceAll(fmt.Sprint(v), escaper, "\\"+escaper) + escaper + vars[idx] = escaper + strings.ReplaceAll(fmt.Sprint(v), escaper, escaper+escaper) + escaper } } } @@ -138,9 +163,18 @@ func ExplainSQL(sql string, numericPlaceholder *regexp.Regexp, escaper string, a sql = newSQL.String() } else { sql = numericPlaceholder.ReplaceAllString(sql, "$$$1$$") - for idx, v := range vars { - sql = strings.Replace(sql, "$"+strconv.Itoa(idx+1)+"$", v, 1) - } + + sql = numericPlaceholderRe.ReplaceAllStringFunc(sql, func(v string) string { + num := v[1 : len(v)-1] + n, _ := strconv.Atoi(num) + + // position var start from 1 ($1, $2) + n -= 1 + if n >= 0 && n <= len(vars)-1 { + return vars[n] + } + return v + }) } return sql diff --git a/vendor/gorm.io/gorm/migrator.go b/vendor/gorm.io/gorm/migrator.go index 34e888f..3d2b032 100644 --- a/vendor/gorm.io/gorm/migrator.go +++ b/vendor/gorm.io/gorm/migrator.go @@ -13,11 +13,7 @@ func (db *DB) Migrator() Migrator { // apply scopes to migrator for len(tx.Statement.scopes) > 0 { - scopes := tx.Statement.scopes - tx.Statement.scopes = nil - for _, scope := range scopes { - tx = scope(tx) - } + tx = tx.executeScopes() } return tx.Dialector.Migrator(tx.Session(&Session{})) @@ -30,9 +26,9 @@ func (db *DB) AutoMigrate(dst ...interface{}) error { // ViewOption view option type ViewOption struct { - Replace bool - CheckOption string - Query *DB + Replace bool // If true, exec `CREATE`. If false, exec `CREATE OR REPLACE` + CheckOption string // optional. e.g. `WITH [ CASCADED | LOCAL ] CHECK OPTION` + Query *DB // required subquery. } // ColumnType column type interface @@ -60,6 +56,14 @@ type Index interface { Option() string } +// TableType table type interface +type TableType interface { + Schema() string + Name() string + Type() string + Comment() (comment string, ok bool) +} + // Migrator migrator interface type Migrator interface { // AutoMigrate @@ -68,6 +72,7 @@ type Migrator interface { // Database CurrentDatabase() string FullDataTypeOf(*schema.Field) clause.Expr + GetTypeAliases(databaseTypeName string) []string // Tables CreateTable(dst ...interface{}) error @@ -75,12 +80,15 @@ type Migrator interface { HasTable(dst interface{}) bool RenameTable(oldName, newName interface{}) error GetTables() (tableList []string, err error) + TableType(dst interface{}) (TableType, error) // Columns AddColumn(dst interface{}, field string) error DropColumn(dst interface{}, field string) error AlterColumn(dst interface{}, field string) error MigrateColumn(dst interface{}, field *schema.Field, columnType ColumnType) error + // MigrateColumnUnique migrate column's UNIQUE constraint, it's part of MigrateColumn. + MigrateColumnUnique(dst interface{}, field *schema.Field, columnType ColumnType) error HasColumn(dst interface{}, field string) bool RenameColumn(dst interface{}, oldName, field string) error ColumnTypes(dst interface{}) ([]ColumnType, error) diff --git a/vendor/gorm.io/gorm/migrator/index.go b/vendor/gorm.io/gorm/migrator/index.go index fe686e5..8845da9 100644 --- a/vendor/gorm.io/gorm/migrator/index.go +++ b/vendor/gorm.io/gorm/migrator/index.go @@ -17,12 +17,12 @@ func (idx Index) Table() string { return idx.TableName } -// Name return the name of the index. +// Name return the name of the index. func (idx Index) Name() string { return idx.NameValue } -// Columns return the columns fo the index +// Columns return the columns of the index func (idx Index) Columns() []string { return idx.ColumnList } @@ -37,7 +37,7 @@ func (idx Index) Unique() (unique bool, ok bool) { return idx.UniqueValue.Bool, idx.UniqueValue.Valid } -// Option return the optional attribute fo the index +// Option return the optional attribute of the index func (idx Index) Option() string { return idx.OptionValue } diff --git a/vendor/gorm.io/gorm/migrator/migrator.go b/vendor/gorm.io/gorm/migrator/migrator.go index 87ac774..189a141 100644 --- a/vendor/gorm.io/gorm/migrator/migrator.go +++ b/vendor/gorm.io/gorm/migrator/migrator.go @@ -7,16 +7,28 @@ import ( "fmt" "reflect" "regexp" + "strconv" "strings" + "time" "gorm.io/gorm" "gorm.io/gorm/clause" + "gorm.io/gorm/logger" "gorm.io/gorm/schema" ) -var ( - regFullDataType = regexp.MustCompile(`[^\d]*(\d+)[^\d]?`) -) +// This regular expression seeks to find a sequence of digits (\d+) among zero or more non-digit characters (\D*), +// with a possible trailing non-digit character (\D?). + +// For example, values that can pass this regular expression are: +// - "123" +// - "abc456" +// -"%$#@789" +var regFullDataType = regexp.MustCompile(`\D*(\d+)\D?`) + +// TODO:? Create const vars for raw sql queries ? + +var _ gorm.Migrator = (*Migrator)(nil) // Migrator m struct type Migrator struct { @@ -30,6 +42,16 @@ type Config struct { gorm.Dialector } +type printSQLLogger struct { + logger.Interface +} + +func (l *printSQLLogger) Trace(ctx context.Context, begin time.Time, fc func() (sql string, rowsAffected int64), err error) { + sql, _ := fc() + fmt.Println(sql + ";") + l.Interface.Trace(ctx, begin, fc, err) +} + // GormDataTypeInterface gorm data type interface type GormDataTypeInterface interface { GormDBDataType(*gorm.DB, *schema.Field) string @@ -72,10 +94,6 @@ func (m Migrator) FullDataTypeOf(field *schema.Field) (expr clause.Expr) { expr.SQL += " NOT NULL" } - if field.Unique { - expr.SQL += " UNIQUE" - } - if field.HasDefaultValue && (field.DefaultValueInterface != nil || field.DefaultValue != "") { if field.DefaultValueInterface != nil { defaultStmt := &gorm.Statement{Vars: []interface{}{field.DefaultValueInterface}} @@ -89,23 +107,40 @@ func (m Migrator) FullDataTypeOf(field *schema.Field) (expr clause.Expr) { return } +func (m Migrator) GetQueryAndExecTx() (queryTx, execTx *gorm.DB) { + queryTx = m.DB.Session(&gorm.Session{}) + execTx = queryTx + if m.DB.DryRun { + queryTx.DryRun = false + execTx = m.DB.Session(&gorm.Session{Logger: &printSQLLogger{Interface: m.DB.Logger}}) + } + return queryTx, execTx +} + // AutoMigrate auto migrate values func (m Migrator) AutoMigrate(values ...interface{}) error { for _, value := range m.ReorderModels(values, true) { - tx := m.DB.Session(&gorm.Session{}) - if !tx.Migrator().HasTable(value) { - if err := tx.Migrator().CreateTable(value); err != nil { + queryTx, execTx := m.GetQueryAndExecTx() + if !queryTx.Migrator().HasTable(value) { + if err := execTx.Migrator().CreateTable(value); err != nil { return err } } else { - if err := m.RunWithValue(value, func(stmt *gorm.Statement) (errr error) { - columnTypes, err := m.DB.Migrator().ColumnTypes(value) + if err := m.RunWithValue(value, func(stmt *gorm.Statement) error { + + if stmt.Schema == nil { + return errors.New("failed to get schema") + } + + columnTypes, err := queryTx.Migrator().ColumnTypes(value) if err != nil { return err } - + var ( + parseIndexes = stmt.Schema.ParseIndexes() + parseCheckConstraints = stmt.Schema.ParseCheckConstraints() + ) for _, dbName := range stmt.Schema.DBNames { - field := stmt.Schema.FieldsByDBName[dbName] var foundColumn gorm.ColumnType for _, columnType := range columnTypes { @@ -117,37 +152,43 @@ func (m Migrator) AutoMigrate(values ...interface{}) error { if foundColumn == nil { // not found, add column - if err := tx.Migrator().AddColumn(value, dbName); err != nil { + if err = execTx.Migrator().AddColumn(value, dbName); err != nil { + return err + } + } else { + // found, smartly migrate + field := stmt.Schema.FieldsByDBName[dbName] + if err = execTx.Migrator().MigrateColumn(value, field, foundColumn); err != nil { return err } - } else if err := m.DB.Migrator().MigrateColumn(value, field, foundColumn); err != nil { - // found, smart migrate - return err } } - for _, rel := range stmt.Schema.Relationships.Relations { - if !m.DB.Config.DisableForeignKeyConstraintWhenMigrating { + if !m.DB.DisableForeignKeyConstraintWhenMigrating && !m.DB.IgnoreRelationshipsWhenMigrating { + for _, rel := range stmt.Schema.Relationships.Relations { + if rel.Field.IgnoreMigration { + continue + } if constraint := rel.ParseConstraint(); constraint != nil && - constraint.Schema == stmt.Schema && !tx.Migrator().HasConstraint(value, constraint.Name) { - if err := tx.Migrator().CreateConstraint(value, constraint.Name); err != nil { + constraint.Schema == stmt.Schema && !queryTx.Migrator().HasConstraint(value, constraint.Name) { + if err := execTx.Migrator().CreateConstraint(value, constraint.Name); err != nil { return err } } } + } - for _, chk := range stmt.Schema.ParseCheckConstraints() { - if !tx.Migrator().HasConstraint(value, chk.Name) { - if err := tx.Migrator().CreateConstraint(value, chk.Name); err != nil { - return err - } + for _, chk := range parseCheckConstraints { + if !queryTx.Migrator().HasConstraint(value, chk.Name) { + if err := execTx.Migrator().CreateConstraint(value, chk.Name); err != nil { + return err } } } - for _, idx := range stmt.Schema.ParseIndexes() { - if !tx.Migrator().HasIndex(value, idx.Name) { - if err := tx.Migrator().CreateIndex(value, idx.Name); err != nil { + for _, idx := range parseIndexes { + if !queryTx.Migrator().HasIndex(value, idx.Name) { + if err := execTx.Migrator().CreateIndex(value, idx.Name); err != nil { return err } } @@ -174,7 +215,12 @@ func (m Migrator) GetTables() (tableList []string, err error) { func (m Migrator) CreateTable(values ...interface{}) error { for _, value := range m.ReorderModels(values, false) { tx := m.DB.Session(&gorm.Session{}) - if err := m.RunWithValue(value, func(stmt *gorm.Statement) (errr error) { + if err := m.RunWithValue(value, func(stmt *gorm.Statement) (err error) { + + if stmt.Schema == nil { + return errors.New("failed to get schema") + } + var ( createTableSQL = "CREATE TABLE ? (" values = []interface{}{m.CurrentTable(stmt)} @@ -185,7 +231,7 @@ func (m Migrator) CreateTable(values ...interface{}) error { field := stmt.Schema.FieldsByDBName[dbName] if !field.IgnoreMigration { createTableSQL += "? ?" - hasPrimaryKeyInDataType = hasPrimaryKeyInDataType || strings.Contains(strings.ToUpper(string(field.DataType)), "PRIMARY KEY") + hasPrimaryKeyInDataType = hasPrimaryKeyInDataType || strings.Contains(strings.ToUpper(m.DataTypeOf(field)), "PRIMARY KEY") values = append(values, clause.Column{Name: dbName}, m.DB.Migrator().FullDataTypeOf(field)) createTableSQL += "," } @@ -193,7 +239,7 @@ func (m Migrator) CreateTable(values ...interface{}) error { if !hasPrimaryKeyInDataType && len(stmt.Schema.PrimaryFields) > 0 { createTableSQL += "PRIMARY KEY ?," - primaryKeys := []interface{}{} + primaryKeys := make([]interface{}, 0, len(stmt.Schema.PrimaryFields)) for _, field := range stmt.Schema.PrimaryFields { primaryKeys = append(primaryKeys, clause.Column{Name: field.DBName}) } @@ -204,8 +250,8 @@ func (m Migrator) CreateTable(values ...interface{}) error { for _, idx := range stmt.Schema.ParseIndexes() { if m.CreateIndexAfterCreateTable { defer func(value interface{}, name string) { - if errr == nil { - errr = tx.Migrator().CreateIndex(value, name) + if err == nil { + err = tx.Migrator().CreateIndex(value, name) } }(value, idx.Name) } else { @@ -227,11 +273,14 @@ func (m Migrator) CreateTable(values ...interface{}) error { } } - for _, rel := range stmt.Schema.Relationships.Relations { - if !m.DB.DisableForeignKeyConstraintWhenMigrating { + if !m.DB.DisableForeignKeyConstraintWhenMigrating && !m.DB.IgnoreRelationshipsWhenMigrating { + for _, rel := range stmt.Schema.Relationships.Relations { + if rel.Field.IgnoreMigration { + continue + } if constraint := rel.ParseConstraint(); constraint != nil { if constraint.Schema == stmt.Schema { - sql, vars := buildConstraint(constraint) + sql, vars := constraint.Build() createTableSQL += sql + "," values = append(values, vars...) } @@ -239,6 +288,11 @@ func (m Migrator) CreateTable(values ...interface{}) error { } } + for _, uni := range stmt.Schema.ParseUniqueConstraints() { + createTableSQL += "CONSTRAINT ? UNIQUE (?)," + values = append(values, clause.Column{Name: uni.Name}, clause.Expr{SQL: stmt.Quote(uni.Field.DBName)}) + } + for _, chk := range stmt.Schema.ParseCheckConstraints() { createTableSQL += "CONSTRAINT ? CHECK (?)," values = append(values, clause.Column{Name: chk.Name}, clause.Expr{SQL: chk.Constraint}) @@ -252,8 +306,8 @@ func (m Migrator) CreateTable(values ...interface{}) error { createTableSQL += fmt.Sprint(tableOption) } - errr = tx.Exec(createTableSQL, values...).Error - return errr + err = tx.Exec(createTableSQL, values...).Error + return err }); err != nil { return err } @@ -319,6 +373,9 @@ func (m Migrator) RenameTable(oldName, newName interface{}) error { func (m Migrator) AddColumn(value interface{}, name string) error { return m.RunWithValue(value, func(stmt *gorm.Statement) error { // avoid using the same name field + if stmt.Schema == nil { + return errors.New("failed to get schema") + } f := stmt.Schema.LookUpField(name) if f == nil { return fmt.Errorf("failed to look up field with name: %s", name) @@ -338,8 +395,10 @@ func (m Migrator) AddColumn(value interface{}, name string) error { // DropColumn drop value's `name` column func (m Migrator) DropColumn(value interface{}, name string) error { return m.RunWithValue(value, func(stmt *gorm.Statement) error { - if field := stmt.Schema.LookUpField(name); field != nil { - name = field.DBName + if stmt.Schema != nil { + if field := stmt.Schema.LookUpField(name); field != nil { + name = field.DBName + } } return m.DB.Exec( @@ -351,13 +410,15 @@ func (m Migrator) DropColumn(value interface{}, name string) error { // AlterColumn alter value's `field` column' type based on schema definition func (m Migrator) AlterColumn(value interface{}, field string) error { return m.RunWithValue(value, func(stmt *gorm.Statement) error { - if field := stmt.Schema.LookUpField(field); field != nil { - fileType := m.FullDataTypeOf(field) - return m.DB.Exec( - "ALTER TABLE ? ALTER COLUMN ? TYPE ?", - m.CurrentTable(stmt), clause.Column{Name: field.DBName}, fileType, - ).Error + if stmt.Schema != nil { + if field := stmt.Schema.LookUpField(field); field != nil { + fileType := m.FullDataTypeOf(field) + return m.DB.Exec( + "ALTER TABLE ? ALTER COLUMN ? TYPE ?", + m.CurrentTable(stmt), clause.Column{Name: field.DBName}, fileType, + ).Error + } } return fmt.Errorf("failed to look up field with name: %s", field) }) @@ -369,8 +430,10 @@ func (m Migrator) HasColumn(value interface{}, field string) bool { m.RunWithValue(value, func(stmt *gorm.Statement) error { currentDatabase := m.DB.Migrator().CurrentDatabase() name := field - if field := stmt.Schema.LookUpField(field); field != nil { - name = field.DBName + if stmt.Schema != nil { + if field := stmt.Schema.LookUpField(field); field != nil { + name = field.DBName + } } return m.DB.Raw( @@ -385,12 +448,14 @@ func (m Migrator) HasColumn(value interface{}, field string) bool { // RenameColumn rename value's field name from oldName to newName func (m Migrator) RenameColumn(value interface{}, oldName, newName string) error { return m.RunWithValue(value, func(stmt *gorm.Statement) error { - if field := stmt.Schema.LookUpField(oldName); field != nil { - oldName = field.DBName - } + if stmt.Schema != nil { + if field := stmt.Schema.LookUpField(oldName); field != nil { + oldName = field.DBName + } - if field := stmt.Schema.LookUpField(newName); field != nil { - newName = field.DBName + if field := stmt.Schema.LookUpField(newName); field != nil { + newName = field.DBName + } } return m.DB.Exec( @@ -402,36 +467,58 @@ func (m Migrator) RenameColumn(value interface{}, oldName, newName string) error // MigrateColumn migrate column func (m Migrator) MigrateColumn(value interface{}, field *schema.Field, columnType gorm.ColumnType) error { + if field.IgnoreMigration { + return nil + } + // found, smart migrate fullDataType := strings.TrimSpace(strings.ToLower(m.DB.Migrator().FullDataTypeOf(field).SQL)) realDataType := strings.ToLower(columnType.DatabaseTypeName()) - alterColumn := false + var ( + alterColumn bool + isSameType = fullDataType == realDataType + ) + + if !field.PrimaryKey { + // check type + if !strings.HasPrefix(fullDataType, realDataType) { + // check type aliases + aliases := m.DB.Migrator().GetTypeAliases(realDataType) + for _, alias := range aliases { + if strings.HasPrefix(fullDataType, alias) { + isSameType = true + break + } + } - // check type - if !field.PrimaryKey && !strings.HasPrefix(fullDataType, realDataType) { - alterColumn = true + if !isSameType { + alterColumn = true + } + } } - // check size - if length, ok := columnType.Length(); length != int64(field.Size) { - if length > 0 && field.Size > 0 { - alterColumn = true - } else { - // has size in data type and not equal - // Since the following code is frequently called in the for loop, reg optimization is needed here - matches2 := regFullDataType.FindAllStringSubmatch(fullDataType, -1) - if !field.PrimaryKey && - (len(matches2) == 1 && matches2[0][1] != fmt.Sprint(length) && ok) { + if !isSameType { + // check size + if length, ok := columnType.Length(); length != int64(field.Size) { + if length > 0 && field.Size > 0 { alterColumn = true + } else { + // has size in data type and not equal + // Since the following code is frequently called in the for loop, reg optimization is needed here + matches2 := regFullDataType.FindAllStringSubmatch(fullDataType, -1) + if !field.PrimaryKey && + (len(matches2) == 1 && matches2[0][1] != fmt.Sprint(length) && ok) { + alterColumn = true + } } } - } - // check precision - if precision, _, ok := columnType.DecimalSize(); ok && int64(field.Precision) != precision { - if regexp.MustCompile(fmt.Sprintf("[^0-9]%d[^0-9]", field.Precision)).MatchString(m.DataTypeOf(field)) { - alterColumn = true + // check precision + if precision, _, ok := columnType.DecimalSize(); ok && int64(field.Precision) != precision { + if regexp.MustCompile(fmt.Sprintf("[^0-9]%d[^0-9]", field.Precision)).MatchString(m.DataTypeOf(field)) { + alterColumn = true + } } } @@ -443,28 +530,28 @@ func (m Migrator) MigrateColumn(value interface{}, field *schema.Field, columnTy } } - // check unique - if unique, ok := columnType.Unique(); ok && unique != field.Unique { - // not primary key - if !field.PrimaryKey { - alterColumn = true - } - } - // check default value if !field.PrimaryKey { + currentDefaultNotNull := field.HasDefaultValue && (field.DefaultValueInterface != nil || !strings.EqualFold(field.DefaultValue, "NULL")) dv, dvNotNull := columnType.DefaultValue() - if dvNotNull && field.DefaultValueInterface == nil { - // defalut value -> null + if dvNotNull && !currentDefaultNotNull { + // default value -> null alterColumn = true - } else if !dvNotNull && field.DefaultValueInterface != nil { + } else if !dvNotNull && currentDefaultNotNull { // null -> default value alterColumn = true - } else if dv != field.DefaultValue { - // default value not equal - // not both null - if !(field.DefaultValueInterface == nil && !dvNotNull) { - alterColumn = true + } else if currentDefaultNotNull || dvNotNull { + switch field.GORMDataType { + case schema.Time: + if !strings.EqualFold(strings.TrimSuffix(dv, "()"), strings.TrimSuffix(field.DefaultValue, "()")) { + alterColumn = true + } + case schema.Bool: + v1, _ := strconv.ParseBool(dv) + v2, _ := strconv.ParseBool(field.DefaultValue) + alterColumn = v1 != v2 + default: + alterColumn = dv != field.DefaultValue } } } @@ -477,13 +564,39 @@ func (m Migrator) MigrateColumn(value interface{}, field *schema.Field, columnTy } } - if alterColumn && !field.IgnoreMigration { - return m.DB.Migrator().AlterColumn(value, field.Name) + if alterColumn { + if err := m.DB.Migrator().AlterColumn(value, field.DBName); err != nil { + return err + } + } + + if err := m.DB.Migrator().MigrateColumnUnique(value, field, columnType); err != nil { + return err } return nil } +func (m Migrator) MigrateColumnUnique(value interface{}, field *schema.Field, columnType gorm.ColumnType) error { + unique, ok := columnType.Unique() + if !ok || field.PrimaryKey { + return nil // skip primary key + } + // By default, ColumnType's Unique is not affected by UniqueIndex, so we don't care about UniqueIndex. + return m.RunWithValue(value, func(stmt *gorm.Statement) error { + // We're currently only receiving boolean values on `Unique` tag, + // so the UniqueConstraint name is fixed + constraint := m.DB.NamingStrategy.UniqueName(stmt.Table, field.DBName) + if unique && !field.Unique { + return m.DB.Migrator().DropConstraint(value, constraint) + } + if !unique && field.Unique { + return m.DB.Migrator().CreateConstraint(value, constraint) + } + return nil + }) +} + // ColumnTypes return columnTypes []gorm.ColumnType and execErr error func (m Migrator) ColumnTypes(value interface{}) ([]gorm.ColumnType, error) { columnTypes := make([]gorm.ColumnType, 0) @@ -513,47 +626,76 @@ func (m Migrator) ColumnTypes(value interface{}) ([]gorm.ColumnType, error) { return columnTypes, execErr } -// CreateView create view +// CreateView create view from Query in gorm.ViewOption. +// Query in gorm.ViewOption is a [subquery] +// +// // CREATE VIEW `user_view` AS SELECT * FROM `users` WHERE age > 20 +// q := DB.Model(&User{}).Where("age > ?", 20) +// DB.Debug().Migrator().CreateView("user_view", gorm.ViewOption{Query: q}) +// +// // CREATE OR REPLACE VIEW `users_view` AS SELECT * FROM `users` WITH CHECK OPTION +// q := DB.Model(&User{}) +// DB.Debug().Migrator().CreateView("user_view", gorm.ViewOption{Query: q, Replace: true, CheckOption: "WITH CHECK OPTION"}) +// +// [subquery]: https://gorm.io/docs/advanced_query.html#SubQuery func (m Migrator) CreateView(name string, option gorm.ViewOption) error { - return gorm.ErrNotImplemented -} - -// DropView drop view -func (m Migrator) DropView(name string) error { - return gorm.ErrNotImplemented -} - -func buildConstraint(constraint *schema.Constraint) (sql string, results []interface{}) { - sql = "CONSTRAINT ? FOREIGN KEY ? REFERENCES ??" - if constraint.OnDelete != "" { - sql += " ON DELETE " + constraint.OnDelete + if option.Query == nil { + return gorm.ErrSubQueryRequired } - if constraint.OnUpdate != "" { - sql += " ON UPDATE " + constraint.OnUpdate + sql := new(strings.Builder) + sql.WriteString("CREATE ") + if option.Replace { + sql.WriteString("OR REPLACE ") } + sql.WriteString("VIEW ") + m.QuoteTo(sql, name) + sql.WriteString(" AS ") - var foreignKeys, references []interface{} - for _, field := range constraint.ForeignKeys { - foreignKeys = append(foreignKeys, clause.Column{Name: field.DBName}) - } + m.DB.Statement.AddVar(sql, option.Query) - for _, field := range constraint.References { - references = append(references, clause.Column{Name: field.DBName}) + if option.CheckOption != "" { + sql.WriteString(" ") + sql.WriteString(option.CheckOption) } - results = append(results, clause.Table{Name: constraint.Name}, foreignKeys, clause.Table{Name: constraint.ReferenceSchema.Table}, references) - return + return m.DB.Exec(m.Explain(sql.String(), m.DB.Statement.Vars...)).Error +} + +// DropView drop view +func (m Migrator) DropView(name string) error { + return m.DB.Exec("DROP VIEW IF EXISTS ?", clause.Table{Name: name}).Error } // GuessConstraintAndTable guess statement's constraint and it's table based on name -func (m Migrator) GuessConstraintAndTable(stmt *gorm.Statement, name string) (_ *schema.Constraint, _ *schema.Check, table string) { +// +// Deprecated: use GuessConstraintInterfaceAndTable instead. +func (m Migrator) GuessConstraintAndTable(stmt *gorm.Statement, name string) (*schema.Constraint, *schema.CheckConstraint, string) { + constraint, table := m.GuessConstraintInterfaceAndTable(stmt, name) + switch c := constraint.(type) { + case *schema.Constraint: + return c, nil, table + case *schema.CheckConstraint: + return nil, c, table + default: + return nil, nil, table + } +} + +// GuessConstraintInterfaceAndTable guess statement's constraint and it's table based on name +// nolint:cyclop +func (m Migrator) GuessConstraintInterfaceAndTable(stmt *gorm.Statement, name string) (_ schema.ConstraintInterface, table string) { if stmt.Schema == nil { - return nil, nil, stmt.Table + return nil, stmt.Table } checkConstraints := stmt.Schema.ParseCheckConstraints() if chk, ok := checkConstraints[name]; ok { - return nil, &chk, stmt.Table + return &chk, stmt.Table + } + + uniqueConstraints := stmt.Schema.ParseUniqueConstraints() + if uni, ok := uniqueConstraints[name]; ok { + return &uni, stmt.Table } getTable := func(rel *schema.Relationship) string { @@ -568,7 +710,7 @@ func (m Migrator) GuessConstraintAndTable(stmt *gorm.Statement, name string) (_ for _, rel := range stmt.Schema.Relationships.Relations { if constraint := rel.ParseConstraint(); constraint != nil && constraint.Name == name { - return constraint, nil, getTable(rel) + return constraint, getTable(rel) } } @@ -576,40 +718,39 @@ func (m Migrator) GuessConstraintAndTable(stmt *gorm.Statement, name string) (_ for k := range checkConstraints { if checkConstraints[k].Field == field { v := checkConstraints[k] - return nil, &v, stmt.Table + return &v, stmt.Table + } + } + + for k := range uniqueConstraints { + if uniqueConstraints[k].Field == field { + v := uniqueConstraints[k] + return &v, stmt.Table } } for _, rel := range stmt.Schema.Relationships.Relations { if constraint := rel.ParseConstraint(); constraint != nil && rel.Field == field { - return constraint, nil, getTable(rel) + return constraint, getTable(rel) } } } - return nil, nil, stmt.Schema.Table + return nil, stmt.Schema.Table } // CreateConstraint create constraint func (m Migrator) CreateConstraint(value interface{}, name string) error { return m.RunWithValue(value, func(stmt *gorm.Statement) error { - constraint, chk, table := m.GuessConstraintAndTable(stmt, name) - if chk != nil { - return m.DB.Exec( - "ALTER TABLE ? ADD CONSTRAINT ? CHECK (?)", - m.CurrentTable(stmt), clause.Column{Name: chk.Name}, clause.Expr{SQL: chk.Constraint}, - ).Error - } - + constraint, table := m.GuessConstraintInterfaceAndTable(stmt, name) if constraint != nil { vars := []interface{}{clause.Table{Name: table}} if stmt.TableExpr != nil { vars[0] = stmt.TableExpr } - sql, values := buildConstraint(constraint) + sql, values := constraint.Build() return m.DB.Exec("ALTER TABLE ? ADD "+sql, append(vars, values...)...).Error } - return nil }) } @@ -617,11 +758,9 @@ func (m Migrator) CreateConstraint(value interface{}, name string) error { // DropConstraint drop constraint func (m Migrator) DropConstraint(value interface{}, name string) error { return m.RunWithValue(value, func(stmt *gorm.Statement) error { - constraint, chk, table := m.GuessConstraintAndTable(stmt, name) + constraint, table := m.GuessConstraintInterfaceAndTable(stmt, name) if constraint != nil { - name = constraint.Name - } else if chk != nil { - name = chk.Name + name = constraint.GetName() } return m.DB.Exec("ALTER TABLE ? DROP CONSTRAINT ?", clause.Table{Name: table}, clause.Column{Name: name}).Error }) @@ -632,11 +771,9 @@ func (m Migrator) HasConstraint(value interface{}, name string) bool { var count int64 m.RunWithValue(value, func(stmt *gorm.Statement) error { currentDatabase := m.DB.Migrator().CurrentDatabase() - constraint, chk, table := m.GuessConstraintAndTable(stmt, name) + constraint, table := m.GuessConstraintInterfaceAndTable(stmt, name) if constraint != nil { - name = constraint.Name - } else if chk != nil { - name = chk.Name + name = constraint.GetName() } return m.DB.Raw( @@ -678,6 +815,9 @@ type BuildIndexOptionsInterface interface { // CreateIndex create index `name` func (m Migrator) CreateIndex(value interface{}, name string) error { return m.RunWithValue(value, func(stmt *gorm.Statement) error { + if stmt.Schema == nil { + return errors.New("failed to get schema") + } if idx := stmt.Schema.LookIndex(name); idx != nil { opts := m.DB.Migrator().(BuildIndexOptionsInterface).BuildIndexOptions(idx.Fields, stmt) values := []interface{}{clause.Column{Name: idx.Name}, m.CurrentTable(stmt), opts} @@ -710,8 +850,10 @@ func (m Migrator) CreateIndex(value interface{}, name string) error { // DropIndex drop index `name` func (m Migrator) DropIndex(value interface{}, name string) error { return m.RunWithValue(value, func(stmt *gorm.Statement) error { - if idx := stmt.Schema.LookIndex(name); idx != nil { - name = idx.Name + if stmt.Schema != nil { + if idx := stmt.Schema.LookIndex(name); idx != nil { + name = idx.Name + } } return m.DB.Exec("DROP INDEX ? ON ?", clause.Column{Name: name}, m.CurrentTable(stmt)).Error @@ -723,8 +865,10 @@ func (m Migrator) HasIndex(value interface{}, name string) bool { var count int64 m.RunWithValue(value, func(stmt *gorm.Statement) error { currentDatabase := m.DB.Migrator().CurrentDatabase() - if idx := stmt.Schema.LookIndex(name); idx != nil { - name = idx.Name + if stmt.Schema != nil { + if idx := stmt.Schema.LookIndex(name); idx != nil { + name = idx.Name + } } return m.DB.Raw( @@ -782,26 +926,31 @@ func (m Migrator) ReorderModels(values []interface{}, autoAdd bool) (results []i } parsedSchemas[dep.Statement.Schema] = true - for _, rel := range dep.Schema.Relationships.Relations { - if c := rel.ParseConstraint(); c != nil && c.Schema == dep.Statement.Schema && c.Schema != c.ReferenceSchema { - dep.Depends = append(dep.Depends, c.ReferenceSchema) - } + if !m.DB.IgnoreRelationshipsWhenMigrating { + for _, rel := range dep.Schema.Relationships.Relations { + if rel.Field.IgnoreMigration { + continue + } + if c := rel.ParseConstraint(); c != nil && c.Schema == dep.Statement.Schema && c.Schema != c.ReferenceSchema { + dep.Depends = append(dep.Depends, c.ReferenceSchema) + } - if rel.Type == schema.HasOne || rel.Type == schema.HasMany { - beDependedOn[rel.FieldSchema] = true - } + if rel.Type == schema.HasOne || rel.Type == schema.HasMany { + beDependedOn[rel.FieldSchema] = true + } - if rel.JoinTable != nil { - // append join value - defer func(rel *schema.Relationship, joinValue interface{}) { - if !beDependedOn[rel.FieldSchema] { - dep.Depends = append(dep.Depends, rel.FieldSchema) - } else { - fieldValue := reflect.New(rel.FieldSchema.ModelType).Interface() - parseDependence(fieldValue, autoAdd) - } - parseDependence(joinValue, autoAdd) - }(rel, reflect.New(rel.JoinTable.ModelType).Interface()) + if rel.JoinTable != nil { + // append join value + defer func(rel *schema.Relationship, joinValue interface{}) { + if !beDependedOn[rel.FieldSchema] { + dep.Depends = append(dep.Depends, rel.FieldSchema) + } else { + fieldValue := reflect.New(rel.FieldSchema.ModelType).Interface() + parseDependence(fieldValue, autoAdd) + } + parseDependence(joinValue, autoAdd) + }(rel, reflect.New(rel.JoinTable.ModelType).Interface()) + } } } @@ -863,3 +1012,13 @@ func (m Migrator) CurrentTable(stmt *gorm.Statement) interface{} { func (m Migrator) GetIndexes(dst interface{}) ([]gorm.Index, error) { return nil, errors.New("not support") } + +// GetTypeAliases return database type aliases +func (m Migrator) GetTypeAliases(databaseTypeName string) []string { + return nil +} + +// TableType return tableType gorm.TableType and execErr error +func (m Migrator) TableType(dst interface{}) (gorm.TableType, error) { + return nil, errors.New("not support") +} diff --git a/vendor/gorm.io/gorm/migrator/table_type.go b/vendor/gorm.io/gorm/migrator/table_type.go new file mode 100644 index 0000000..ed6e42a --- /dev/null +++ b/vendor/gorm.io/gorm/migrator/table_type.go @@ -0,0 +1,33 @@ +package migrator + +import ( + "database/sql" +) + +// TableType table type implements TableType interface +type TableType struct { + SchemaValue string + NameValue string + TypeValue string + CommentValue sql.NullString +} + +// Schema returns the schema of the table. +func (ct TableType) Schema() string { + return ct.SchemaValue +} + +// Name returns the name of the table. +func (ct TableType) Name() string { + return ct.NameValue +} + +// Type returns the type of the table. +func (ct TableType) Type() string { + return ct.TypeValue +} + +// Comment returns the comment of current table. +func (ct TableType) Comment() (comment string, ok bool) { + return ct.CommentValue.String, ct.CommentValue.Valid +} diff --git a/vendor/gorm.io/gorm/model.go b/vendor/gorm.io/gorm/model.go index 3334d17..fa705df 100644 --- a/vendor/gorm.io/gorm/model.go +++ b/vendor/gorm.io/gorm/model.go @@ -4,9 +4,10 @@ import "time" // Model a basic GoLang struct which includes the following fields: ID, CreatedAt, UpdatedAt, DeletedAt // It may be embedded into your model or you may build your own model without it -// type User struct { -// gorm.Model -// } +// +// type User struct { +// gorm.Model +// } type Model struct { ID uint `gorm:"primarykey"` CreatedAt time.Time diff --git a/vendor/gorm.io/gorm/prepare_stmt.go b/vendor/gorm.io/gorm/prepare_stmt.go index b062b0d..094bb47 100644 --- a/vendor/gorm.io/gorm/prepare_stmt.go +++ b/vendor/gorm.io/gorm/prepare_stmt.go @@ -3,30 +3,42 @@ package gorm import ( "context" "database/sql" + "database/sql/driver" + "errors" + "reflect" "sync" ) type Stmt struct { *sql.Stmt Transaction bool + prepared chan struct{} + prepareErr error } type PreparedStmtDB struct { - Stmts map[string]Stmt - PreparedSQL []string - Mux *sync.RWMutex + Stmts map[string]*Stmt + Mux *sync.RWMutex ConnPool } -func (db *PreparedStmtDB) GetDBConn() (*sql.DB, error) { - if dbConnector, ok := db.ConnPool.(GetDBConnector); ok && dbConnector != nil { - return dbConnector.GetDBConn() +func NewPreparedStmtDB(connPool ConnPool) *PreparedStmtDB { + return &PreparedStmtDB{ + ConnPool: connPool, + Stmts: make(map[string]*Stmt), + Mux: &sync.RWMutex{}, } +} +func (db *PreparedStmtDB) GetDBConn() (*sql.DB, error) { if sqldb, ok := db.ConnPool.(*sql.DB); ok { return sqldb, nil } + if dbConnector, ok := db.ConnPool.(GetDBConnector); ok && dbConnector != nil { + return dbConnector.GetDBConn() + } + return nil, ErrInvalidDB } @@ -34,39 +46,94 @@ func (db *PreparedStmtDB) Close() { db.Mux.Lock() defer db.Mux.Unlock() - for _, query := range db.PreparedSQL { - if stmt, ok := db.Stmts[query]; ok { - delete(db.Stmts, query) - go stmt.Close() - } + for _, stmt := range db.Stmts { + go func(s *Stmt) { + // make sure the stmt must finish preparation first + <-s.prepared + if s.Stmt != nil { + _ = s.Close() + } + }(stmt) + } + // setting db.Stmts to nil to avoid further using + db.Stmts = nil +} + +func (sdb *PreparedStmtDB) Reset() { + sdb.Mux.Lock() + defer sdb.Mux.Unlock() + + for _, stmt := range sdb.Stmts { + go func(s *Stmt) { + // make sure the stmt must finish preparation first + <-s.prepared + if s.Stmt != nil { + _ = s.Close() + } + }(stmt) } + sdb.Stmts = make(map[string]*Stmt) } func (db *PreparedStmtDB) prepare(ctx context.Context, conn ConnPool, isTransaction bool, query string) (Stmt, error) { db.Mux.RLock() if stmt, ok := db.Stmts[query]; ok && (!stmt.Transaction || isTransaction) { db.Mux.RUnlock() - return stmt, nil + // wait for other goroutines prepared + <-stmt.prepared + if stmt.prepareErr != nil { + return Stmt{}, stmt.prepareErr + } + + return *stmt, nil } db.Mux.RUnlock() db.Mux.Lock() - defer db.Mux.Unlock() - // double check if stmt, ok := db.Stmts[query]; ok && (!stmt.Transaction || isTransaction) { - return stmt, nil - } else if ok { - go stmt.Close() + db.Mux.Unlock() + // wait for other goroutines prepared + <-stmt.prepared + if stmt.prepareErr != nil { + return Stmt{}, stmt.prepareErr + } + + return *stmt, nil + } + // check db.Stmts first to avoid Segmentation Fault(setting value to nil map) + // which cause by calling Close and executing SQL concurrently + if db.Stmts == nil { + db.Mux.Unlock() + return Stmt{}, ErrInvalidDB } + // cache preparing stmt first + cacheStmt := Stmt{Transaction: isTransaction, prepared: make(chan struct{})} + db.Stmts[query] = &cacheStmt + db.Mux.Unlock() + // prepare completed + defer close(cacheStmt.prepared) + + // Reason why cannot lock conn.PrepareContext + // suppose the maxopen is 1, g1 is creating record and g2 is querying record. + // 1. g1 begin tx, g1 is requeue because of waiting for the system call, now `db.ConnPool` db.numOpen == 1. + // 2. g2 select lock `conn.PrepareContext(ctx, query)`, now db.numOpen == db.maxOpen , wait for release. + // 3. g1 tx exec insert, wait for unlock `conn.PrepareContext(ctx, query)` to finish tx and release. stmt, err := conn.PrepareContext(ctx, query) - if err == nil { - db.Stmts[query] = Stmt{Stmt: stmt, Transaction: isTransaction} - db.PreparedSQL = append(db.PreparedSQL, query) + if err != nil { + cacheStmt.prepareErr = err + db.Mux.Lock() + delete(db.Stmts, query) + db.Mux.Unlock() + return Stmt{}, err } - return db.Stmts[query], err + db.Mux.Lock() + cacheStmt.Stmt = stmt + db.Mux.Unlock() + + return cacheStmt, nil } func (db *PreparedStmtDB) BeginTx(ctx context.Context, opt *sql.TxOptions) (ConnPool, error) { @@ -74,6 +141,19 @@ func (db *PreparedStmtDB) BeginTx(ctx context.Context, opt *sql.TxOptions) (Conn tx, err := beginner.BeginTx(ctx, opt) return &PreparedStmtTX{PreparedStmtDB: db, Tx: tx}, err } + + beginner, ok := db.ConnPool.(ConnPoolBeginner) + if !ok { + return nil, ErrInvalidTransaction + } + + connPool, err := beginner.BeginTx(ctx, opt) + if err != nil { + return nil, err + } + if tx, ok := connPool.(Tx); ok { + return &PreparedStmtTX{PreparedStmtDB: db, Tx: tx}, nil + } return nil, ErrInvalidTransaction } @@ -81,7 +161,7 @@ func (db *PreparedStmtDB) ExecContext(ctx context.Context, query string, args .. stmt, err := db.prepare(ctx, db.ConnPool, false, query) if err == nil { result, err = stmt.ExecContext(ctx, args...) - if err != nil { + if errors.Is(err, driver.ErrBadConn) { db.Mux.Lock() defer db.Mux.Unlock() go stmt.Close() @@ -95,7 +175,7 @@ func (db *PreparedStmtDB) QueryContext(ctx context.Context, query string, args . stmt, err := db.prepare(ctx, db.ConnPool, false, query) if err == nil { rows, err = stmt.QueryContext(ctx, args...) - if err != nil { + if errors.Is(err, driver.ErrBadConn) { db.Mux.Lock() defer db.Mux.Unlock() @@ -114,20 +194,32 @@ func (db *PreparedStmtDB) QueryRowContext(ctx context.Context, query string, arg return &sql.Row{} } +func (db *PreparedStmtDB) Ping() error { + conn, err := db.GetDBConn() + if err != nil { + return err + } + return conn.Ping() +} + type PreparedStmtTX struct { Tx PreparedStmtDB *PreparedStmtDB } +func (db *PreparedStmtTX) GetDBConn() (*sql.DB, error) { + return db.PreparedStmtDB.GetDBConn() +} + func (tx *PreparedStmtTX) Commit() error { - if tx.Tx != nil { + if tx.Tx != nil && !reflect.ValueOf(tx.Tx).IsNil() { return tx.Tx.Commit() } return ErrInvalidTransaction } func (tx *PreparedStmtTX) Rollback() error { - if tx.Tx != nil { + if tx.Tx != nil && !reflect.ValueOf(tx.Tx).IsNil() { return tx.Tx.Rollback() } return ErrInvalidTransaction @@ -137,7 +229,7 @@ func (tx *PreparedStmtTX) ExecContext(ctx context.Context, query string, args .. stmt, err := tx.PreparedStmtDB.prepare(ctx, tx.Tx, true, query) if err == nil { result, err = tx.Tx.StmtContext(ctx, stmt.Stmt).ExecContext(ctx, args...) - if err != nil { + if errors.Is(err, driver.ErrBadConn) { tx.PreparedStmtDB.Mux.Lock() defer tx.PreparedStmtDB.Mux.Unlock() @@ -152,7 +244,7 @@ func (tx *PreparedStmtTX) QueryContext(ctx context.Context, query string, args . stmt, err := tx.PreparedStmtDB.prepare(ctx, tx.Tx, true, query) if err == nil { rows, err = tx.Tx.StmtContext(ctx, stmt.Stmt).QueryContext(ctx, args...) - if err != nil { + if errors.Is(err, driver.ErrBadConn) { tx.PreparedStmtDB.Mux.Lock() defer tx.PreparedStmtDB.Mux.Unlock() @@ -170,3 +262,11 @@ func (tx *PreparedStmtTX) QueryRowContext(ctx context.Context, query string, arg } return &sql.Row{} } + +func (tx *PreparedStmtTX) Ping() error { + conn, err := tx.GetDBConn() + if err != nil { + return err + } + return conn.Ping() +} diff --git a/vendor/gorm.io/gorm/scan.go b/vendor/gorm.io/gorm/scan.go index 6250fb5..d852c2c 100644 --- a/vendor/gorm.io/gorm/scan.go +++ b/vendor/gorm.io/gorm/scan.go @@ -4,10 +4,10 @@ import ( "database/sql" "database/sql/driver" "reflect" - "strings" "time" "gorm.io/gorm/schema" + "gorm.io/gorm/utils" ) // prepareValues prepare values slice @@ -50,7 +50,7 @@ func scanIntoMap(mapValue map[string]interface{}, values []interface{}, columns } } -func (db *DB) scanIntoStruct(rows Rows, reflectValue reflect.Value, values []interface{}, fields []*schema.Field, joinFields [][2]*schema.Field) { +func (db *DB) scanIntoStruct(rows Rows, reflectValue reflect.Value, values []interface{}, fields []*schema.Field, joinFields [][]*schema.Field) { for idx, field := range fields { if field != nil { values[idx] = field.NewValuePool.Get() @@ -65,31 +65,49 @@ func (db *DB) scanIntoStruct(rows Rows, reflectValue reflect.Value, values []int db.RowsAffected++ db.AddError(rows.Scan(values...)) - - joinedSchemaMap := make(map[*schema.Field]interface{}, 0) + joinedNestedSchemaMap := make(map[string]interface{}) for idx, field := range fields { - if field != nil { - if len(joinFields) == 0 || joinFields[idx][0] == nil { - db.AddError(field.Set(db.Statement.Context, reflectValue, values[idx])) - } else { - joinSchema := joinFields[idx][0] - relValue := joinSchema.ReflectValueOf(db.Statement.Context, reflectValue) + if field == nil { + continue + } + + if len(joinFields) == 0 || len(joinFields[idx]) == 0 { + db.AddError(field.Set(db.Statement.Context, reflectValue, values[idx])) + } else { // joinFields count is larger than 2 when using join + var isNilPtrValue bool + var relValue reflect.Value + // does not contain raw dbname + nestedJoinSchemas := joinFields[idx][:len(joinFields[idx])-1] + // current reflect value + currentReflectValue := reflectValue + fullRels := make([]string, 0, len(nestedJoinSchemas)) + for _, joinSchema := range nestedJoinSchemas { + fullRels = append(fullRels, joinSchema.Name) + relValue = joinSchema.ReflectValueOf(db.Statement.Context, currentReflectValue) if relValue.Kind() == reflect.Ptr { - if _, ok := joinedSchemaMap[joinSchema]; !ok { + fullRelsName := utils.JoinNestedRelationNames(fullRels) + // same nested structure + if _, ok := joinedNestedSchemaMap[fullRelsName]; !ok { if value := reflect.ValueOf(values[idx]).Elem(); value.Kind() == reflect.Ptr && value.IsNil() { - continue + isNilPtrValue = true + break } relValue.Set(reflect.New(relValue.Type().Elem())) - joinedSchemaMap[joinSchema] = nil + joinedNestedSchemaMap[fullRelsName] = nil } } - db.AddError(joinFields[idx][1].Set(db.Statement.Context, relValue, values[idx])) + currentReflectValue = relValue } - // release data to pool - field.NewValuePool.Put(values[idx]) + if !isNilPtrValue { // ignore if value is nil + f := joinFields[idx][len(joinFields[idx])-1] + db.AddError(f.Set(db.Statement.Context, relValue, values[idx])) + } } + + // release data to pool + field.NewValuePool.Put(values[idx]) } } @@ -113,6 +131,15 @@ func Scan(rows Rows, db *DB, mode ScanMode) { onConflictDonothing = mode&ScanOnConflictDoNothing != 0 ) + if len(db.Statement.ColumnMapping) > 0 { + for i, column := range columns { + v, ok := db.Statement.ColumnMapping[column] + if ok { + columns[i] = v + } + } + } + db.RowsAffected = 0 switch dest := db.Statement.Dest.(type) { @@ -161,11 +188,10 @@ func Scan(rows Rows, db *DB, mode ScanMode) { } default: var ( - fields = make([]*schema.Field, len(columns)) - selectedColumnsMap = make(map[string]int, len(columns)) - joinFields [][2]*schema.Field - sch = db.Statement.Schema - reflectValue = db.Statement.ReflectValue + fields = make([]*schema.Field, len(columns)) + joinFields [][]*schema.Field + sch = db.Statement.Schema + reflectValue = db.Statement.ReflectValue ) if reflectValue.Kind() == reflect.Interface { @@ -198,42 +224,53 @@ func Scan(rows Rows, db *DB, mode ScanMode) { // Not Pluck if sch != nil { - schFieldsCount := len(sch.Fields) + matchedFieldCount := make(map[string]int, len(columns)) for idx, column := range columns { if field := sch.LookUpField(column); field != nil && field.Readable { - if curIndex, ok := selectedColumnsMap[column]; ok { - fields[idx] = field // handle duplicate fields - offset := curIndex + 1 - // handle sch inconsistent with database - // like Raw(`...`).Scan - if schFieldsCount > offset { - for fieldIndex, selectField := range sch.Fields[offset:] { - if selectField.DBName == column && selectField.Readable { - selectedColumnsMap[column] = curIndex + fieldIndex + 1 + fields[idx] = field + if count, ok := matchedFieldCount[column]; ok { + // handle duplicate fields + for _, selectField := range sch.Fields { + if selectField.DBName == column && selectField.Readable { + if count == 0 { + matchedFieldCount[column]++ fields[idx] = selectField break } + count-- } } } else { - fields[idx] = field - selectedColumnsMap[column] = idx + matchedFieldCount[column] = 1 } - } else if names := strings.Split(column, "__"); len(names) > 1 { + } else if names := utils.SplitNestedRelationName(column); len(names) > 1 { // has nested relation if rel, ok := sch.Relationships.Relations[names[0]]; ok { - if field := rel.FieldSchema.LookUpField(strings.Join(names[1:], "__")); field != nil && field.Readable { + subNameCount := len(names) + // nested relation fields + relFields := make([]*schema.Field, 0, subNameCount-1) + relFields = append(relFields, rel.Field) + for _, name := range names[1 : subNameCount-1] { + rel = rel.FieldSchema.Relationships.Relations[name] + relFields = append(relFields, rel.Field) + } + // latest name is raw dbname + dbName := names[subNameCount-1] + if field := rel.FieldSchema.LookUpField(dbName); field != nil && field.Readable { fields[idx] = field if len(joinFields) == 0 { - joinFields = make([][2]*schema.Field, len(columns)) + joinFields = make([][]*schema.Field, len(columns)) } - joinFields[idx] = [2]*schema.Field{rel.Field, field} + relFields = append(relFields, field) + joinFields[idx] = relFields continue } } - values[idx] = &sql.RawBytes{} + var val interface{} + values[idx] = &val } else { - values[idx] = &sql.RawBytes{} + var val interface{} + values[idx] = &val } } } @@ -241,12 +278,24 @@ func Scan(rows Rows, db *DB, mode ScanMode) { switch reflectValue.Kind() { case reflect.Slice, reflect.Array: - var elem reflect.Value - recyclableStruct := reflect.New(reflectValueType) + var ( + elem reflect.Value + isArrayKind = reflectValue.Kind() == reflect.Array + ) if !update || reflectValue.Len() == 0 { update = false - db.Statement.ReflectValue.Set(reflect.MakeSlice(reflectValue.Type(), 0, 20)) + if isArrayKind { + db.Statement.ReflectValue.Set(reflect.Zero(reflectValue.Type())) + } else { + // if the slice cap is externally initialized, the externally initialized slice is directly used here + if reflectValue.Cap() == 0 { + db.Statement.ReflectValue.Set(reflect.MakeSlice(reflectValue.Type(), 0, 20)) + } else { + reflectValue.SetLen(0) + db.Statement.ReflectValue.Set(reflectValue) + } + } } for initialized || rows.Next() { @@ -267,20 +316,21 @@ func Scan(rows Rows, db *DB, mode ScanMode) { } } } else { - if isPtr && db.RowsAffected > 0 { - elem = reflect.New(reflectValueType) - } else { - elem = recyclableStruct - } + elem = reflect.New(reflectValueType) } db.scanIntoStruct(rows, elem, values, fields, joinFields) if !update { - if isPtr { - reflectValue = reflect.Append(reflectValue, elem) + if !isPtr { + elem = elem.Elem() + } + if isArrayKind { + if reflectValue.Len() >= int(db.RowsAffected) { + reflectValue.Index(int(db.RowsAffected - 1)).Set(elem) + } } else { - reflectValue = reflect.Append(reflectValue, elem.Elem()) + reflectValue = reflect.Append(reflectValue, elem) } } } @@ -290,6 +340,9 @@ func Scan(rows Rows, db *DB, mode ScanMode) { } case reflect.Struct, reflect.Ptr: if initialized || rows.Next() { + if mode == ScanInitialized && reflectValue.Kind() == reflect.Struct { + db.Statement.ReflectValue.Set(reflect.Zero(reflectValue.Type())) + } db.scanIntoStruct(rows, reflectValue, values, fields, joinFields) } default: diff --git a/vendor/gorm.io/gorm/schema/check.go b/vendor/gorm.io/gorm/schema/check.go deleted file mode 100644 index 89e732d..0000000 --- a/vendor/gorm.io/gorm/schema/check.go +++ /dev/null @@ -1,35 +0,0 @@ -package schema - -import ( - "regexp" - "strings" -) - -// reg match english letters and midline -var regEnLetterAndMidline = regexp.MustCompile("^[A-Za-z-_]+$") - -type Check struct { - Name string - Constraint string // length(phone) >= 10 - *Field -} - -// ParseCheckConstraints parse schema check constraints -func (schema *Schema) ParseCheckConstraints() map[string]Check { - checks := map[string]Check{} - for _, field := range schema.FieldsByDBName { - if chk := field.TagSettings["CHECK"]; chk != "" { - names := strings.Split(chk, ",") - if len(names) > 1 && regEnLetterAndMidline.MatchString(names[0]) { - checks[names[0]] = Check{Name: names[0], Constraint: strings.Join(names[1:], ","), Field: field} - } else { - if names[0] == "" { - chk = strings.Join(names[1:], ",") - } - name := schema.namer.CheckerName(schema.Table, field.DBName) - checks[name] = Check{Name: name, Constraint: chk, Field: field} - } - } - } - return checks -} diff --git a/vendor/gorm.io/gorm/schema/constraint.go b/vendor/gorm.io/gorm/schema/constraint.go new file mode 100644 index 0000000..80a743a --- /dev/null +++ b/vendor/gorm.io/gorm/schema/constraint.go @@ -0,0 +1,66 @@ +package schema + +import ( + "regexp" + "strings" + + "gorm.io/gorm/clause" +) + +// reg match english letters and midline +var regEnLetterAndMidline = regexp.MustCompile(`^[\w-]+$`) + +type CheckConstraint struct { + Name string + Constraint string // length(phone) >= 10 + *Field +} + +func (chk *CheckConstraint) GetName() string { return chk.Name } + +func (chk *CheckConstraint) Build() (sql string, vars []interface{}) { + return "CONSTRAINT ? CHECK (?)", []interface{}{clause.Column{Name: chk.Name}, clause.Expr{SQL: chk.Constraint}} +} + +// ParseCheckConstraints parse schema check constraints +func (schema *Schema) ParseCheckConstraints() map[string]CheckConstraint { + checks := map[string]CheckConstraint{} + for _, field := range schema.FieldsByDBName { + if chk := field.TagSettings["CHECK"]; chk != "" { + names := strings.Split(chk, ",") + if len(names) > 1 && regEnLetterAndMidline.MatchString(names[0]) { + checks[names[0]] = CheckConstraint{Name: names[0], Constraint: strings.Join(names[1:], ","), Field: field} + } else { + if names[0] == "" { + chk = strings.Join(names[1:], ",") + } + name := schema.namer.CheckerName(schema.Table, field.DBName) + checks[name] = CheckConstraint{Name: name, Constraint: chk, Field: field} + } + } + } + return checks +} + +type UniqueConstraint struct { + Name string + Field *Field +} + +func (uni *UniqueConstraint) GetName() string { return uni.Name } + +func (uni *UniqueConstraint) Build() (sql string, vars []interface{}) { + return "CONSTRAINT ? UNIQUE (?)", []interface{}{clause.Column{Name: uni.Name}, clause.Column{Name: uni.Field.DBName}} +} + +// ParseUniqueConstraints parse schema unique constraints +func (schema *Schema) ParseUniqueConstraints() map[string]UniqueConstraint { + uniques := make(map[string]UniqueConstraint) + for _, field := range schema.Fields { + if field.Unique { + name := schema.namer.UniqueName(schema.Table, field.DBName) + uniques[name] = UniqueConstraint{Name: name, Field: field} + } + } + return uniques +} diff --git a/vendor/gorm.io/gorm/schema/field.go b/vendor/gorm.io/gorm/schema/field.go index d4dfbd6..a16c98a 100644 --- a/vendor/gorm.io/gorm/schema/field.go +++ b/vendor/gorm.io/gorm/schema/field.go @@ -49,11 +49,14 @@ const ( Bytes DataType = "bytes" ) +const DefaultAutoIncrementIncrement int64 = 1 + // Field is the representation of model schema's field type Field struct { Name string DBName string BindNames []string + EmbeddedBindNames []string DataType DataType GORMDataType DataType PrimaryKey bool @@ -87,6 +90,16 @@ type Field struct { Set func(context.Context, reflect.Value, interface{}) error Serializer SerializerInterface NewValuePool FieldNewValuePool + + // In some db (e.g. MySQL), Unique and UniqueIndex are indistinguishable. + // When a column has a (not Mul) UniqueIndex, Migrator always reports its gorm.ColumnType is Unique. + // It causes field unnecessarily migration. + // Therefore, we need to record the UniqueIndex on this column (exclude Mul UniqueIndex) for MigrateColumnUnique. + UniqueIndex string +} + +func (field *Field) BindName() string { + return strings.Join(field.BindNames, ".") } // ParseField parses reflect.StructField to Field @@ -100,6 +113,7 @@ func (schema *Schema) ParseField(fieldStruct reflect.StructField) *Field { Name: fieldStruct.Name, DBName: tagSetting["COLUMN"], BindNames: []string{fieldStruct.Name}, + EmbeddedBindNames: []string{fieldStruct.Name}, FieldType: fieldStruct.Type, IndirectFieldType: fieldStruct.Type, StructField: fieldStruct, @@ -115,7 +129,7 @@ func (schema *Schema) ParseField(fieldStruct reflect.StructField) *Field { NotNull: utils.CheckTruth(tagSetting["NOT NULL"], tagSetting["NOTNULL"]), Unique: utils.CheckTruth(tagSetting["UNIQUE"]), Comment: tagSetting["COMMENT"], - AutoIncrementIncrement: 1, + AutoIncrementIncrement: DefaultAutoIncrementIncrement, } for field.IndirectFieldType.Kind() == reflect.Ptr { @@ -174,7 +188,7 @@ func (schema *Schema) ParseField(fieldStruct reflect.StructField) *Field { field.DataType = String field.Serializer = v } else { - var serializerName = field.TagSettings["JSON"] + serializerName := field.TagSettings["JSON"] if serializerName == "" { serializerName = field.TagSettings["SERIALIZER"] } @@ -391,6 +405,9 @@ func (schema *Schema) ParseField(fieldStruct reflect.StructField) *Field { ef.Schema = schema ef.OwnerSchema = field.EmbeddedSchema ef.BindNames = append([]string{fieldStruct.Name}, ef.BindNames...) + if _, ok := field.TagSettings["EMBEDDED"]; ok || !fieldStruct.Anonymous { + ef.EmbeddedBindNames = append([]string{fieldStruct.Name}, ef.EmbeddedBindNames...) + } // index is negative means is pointer if field.FieldType.Kind() == reflect.Struct { ef.StructField.Index = append([]int{fieldStruct.Index[0]}, ef.StructField.Index...) @@ -403,18 +420,14 @@ func (schema *Schema) ParseField(fieldStruct reflect.StructField) *Field { } if ef.PrimaryKey { - if val, ok := ef.TagSettings["PRIMARYKEY"]; ok && utils.CheckTruth(val) { - ef.PrimaryKey = true - } else if val, ok := ef.TagSettings["PRIMARY_KEY"]; ok && utils.CheckTruth(val) { - ef.PrimaryKey = true - } else { + if !utils.CheckTruth(ef.TagSettings["PRIMARYKEY"], ef.TagSettings["PRIMARY_KEY"]) { ef.PrimaryKey = false if val, ok := ef.TagSettings["AUTOINCREMENT"]; !ok || !utils.CheckTruth(val) { ef.AutoIncrement = false } - if ef.DefaultValue == "" { + if !ef.AutoIncrement && ef.DefaultValue == "" { ef.HasDefaultValue = false } } @@ -472,9 +485,6 @@ func (field *Field) setupValuerAndSetter() { oldValuerOf := field.ValueOf field.ValueOf = func(ctx context.Context, v reflect.Value) (interface{}, bool) { value, zero := oldValuerOf(ctx, v) - if zero { - return value, zero - } s, ok := value.(SerializerValuerInterface) if !ok { @@ -487,7 +497,7 @@ func (field *Field) setupValuerAndSetter() { Destination: v, Context: ctx, fieldValue: value, - }, false + }, zero } } @@ -587,8 +597,6 @@ func (field *Field) setupValuerAndSetter() { case **bool: if data != nil && *data != nil { field.ReflectValueOf(ctx, value).SetBool(**data) - } else { - field.ReflectValueOf(ctx, value).SetBool(false) } case bool: field.ReflectValueOf(ctx, value).SetBool(data) @@ -608,8 +616,22 @@ func (field *Field) setupValuerAndSetter() { case **int64: if data != nil && *data != nil { field.ReflectValueOf(ctx, value).SetInt(**data) - } else { - field.ReflectValueOf(ctx, value).SetInt(0) + } + case **int: + if data != nil && *data != nil { + field.ReflectValueOf(ctx, value).SetInt(int64(**data)) + } + case **int8: + if data != nil && *data != nil { + field.ReflectValueOf(ctx, value).SetInt(int64(**data)) + } + case **int16: + if data != nil && *data != nil { + field.ReflectValueOf(ctx, value).SetInt(int64(**data)) + } + case **int32: + if data != nil && *data != nil { + field.ReflectValueOf(ctx, value).SetInt(int64(**data)) } case int64: field.ReflectValueOf(ctx, value).SetInt(data) @@ -647,7 +669,7 @@ func (field *Field) setupValuerAndSetter() { if field.AutoCreateTime == UnixNanosecond || field.AutoUpdateTime == UnixNanosecond { field.ReflectValueOf(ctx, value).SetInt(data.UnixNano()) } else if field.AutoCreateTime == UnixMillisecond || field.AutoUpdateTime == UnixMillisecond { - field.ReflectValueOf(ctx, value).SetInt(data.UnixNano() / 1e6) + field.ReflectValueOf(ctx, value).SetInt(data.UnixMilli()) } else { field.ReflectValueOf(ctx, value).SetInt(data.Unix()) } @@ -656,7 +678,7 @@ func (field *Field) setupValuerAndSetter() { if field.AutoCreateTime == UnixNanosecond || field.AutoUpdateTime == UnixNanosecond { field.ReflectValueOf(ctx, value).SetInt(data.UnixNano()) } else if field.AutoCreateTime == UnixMillisecond || field.AutoUpdateTime == UnixMillisecond { - field.ReflectValueOf(ctx, value).SetInt(data.UnixNano() / 1e6) + field.ReflectValueOf(ctx, value).SetInt(data.UnixMilli()) } else { field.ReflectValueOf(ctx, value).SetInt(data.Unix()) } @@ -674,8 +696,22 @@ func (field *Field) setupValuerAndSetter() { case **uint64: if data != nil && *data != nil { field.ReflectValueOf(ctx, value).SetUint(**data) - } else { - field.ReflectValueOf(ctx, value).SetUint(0) + } + case **uint: + if data != nil && *data != nil { + field.ReflectValueOf(ctx, value).SetUint(uint64(**data)) + } + case **uint8: + if data != nil && *data != nil { + field.ReflectValueOf(ctx, value).SetUint(uint64(**data)) + } + case **uint16: + if data != nil && *data != nil { + field.ReflectValueOf(ctx, value).SetUint(uint64(**data)) + } + case **uint32: + if data != nil && *data != nil { + field.ReflectValueOf(ctx, value).SetUint(uint64(**data)) } case uint64: field.ReflectValueOf(ctx, value).SetUint(data) @@ -707,7 +743,7 @@ func (field *Field) setupValuerAndSetter() { if field.AutoCreateTime == UnixNanosecond || field.AutoUpdateTime == UnixNanosecond { field.ReflectValueOf(ctx, value).SetUint(uint64(data.UnixNano())) } else if field.AutoCreateTime == UnixMillisecond || field.AutoUpdateTime == UnixMillisecond { - field.ReflectValueOf(ctx, value).SetUint(uint64(data.UnixNano() / 1e6)) + field.ReflectValueOf(ctx, value).SetUint(uint64(data.UnixMilli())) } else { field.ReflectValueOf(ctx, value).SetUint(uint64(data.Unix())) } @@ -728,8 +764,10 @@ func (field *Field) setupValuerAndSetter() { case **float64: if data != nil && *data != nil { field.ReflectValueOf(ctx, value).SetFloat(**data) - } else { - field.ReflectValueOf(ctx, value).SetFloat(0) + } + case **float32: + if data != nil && *data != nil { + field.ReflectValueOf(ctx, value).SetFloat(float64(**data)) } case float64: field.ReflectValueOf(ctx, value).SetFloat(data) @@ -774,8 +812,6 @@ func (field *Field) setupValuerAndSetter() { case **string: if data != nil && *data != nil { field.ReflectValueOf(ctx, value).SetString(**data) - } else { - field.ReflectValueOf(ctx, value).SetString("") } case string: field.ReflectValueOf(ctx, value).SetString(data) @@ -823,7 +859,7 @@ func (field *Field) setupValuerAndSetter() { field.Set = func(ctx context.Context, value reflect.Value, v interface{}) error { switch data := v.(type) { case **time.Time: - if data != nil { + if data != nil && *data != nil { field.ReflectValueOf(ctx, value).Set(reflect.ValueOf(*data)) } case time.Time: @@ -859,14 +895,12 @@ func (field *Field) setupValuerAndSetter() { reflectV := reflect.ValueOf(v) if !reflectV.IsValid() { field.ReflectValueOf(ctx, value).Set(reflect.New(field.FieldType).Elem()) + } else if reflectV.Kind() == reflect.Ptr && reflectV.IsNil() { + return } else if reflectV.Type().AssignableTo(field.FieldType) { field.ReflectValueOf(ctx, value).Set(reflectV) } else if reflectV.Kind() == reflect.Ptr { - if reflectV.IsNil() || !reflectV.IsValid() { - field.ReflectValueOf(ctx, value).Set(reflect.New(field.FieldType).Elem()) - } else { - return field.Set(ctx, value, reflectV.Elem().Interface()) - } + return field.Set(ctx, value, reflectV.Elem().Interface()) } else { fieldValue := field.ReflectValueOf(ctx, value) if fieldValue.IsNil() { @@ -887,14 +921,12 @@ func (field *Field) setupValuerAndSetter() { reflectV := reflect.ValueOf(v) if !reflectV.IsValid() { field.ReflectValueOf(ctx, value).Set(reflect.New(field.FieldType).Elem()) + } else if reflectV.Kind() == reflect.Ptr && reflectV.IsNil() { + return } else if reflectV.Type().AssignableTo(field.FieldType) { field.ReflectValueOf(ctx, value).Set(reflectV) } else if reflectV.Kind() == reflect.Ptr { - if reflectV.IsNil() || !reflectV.IsValid() { - field.ReflectValueOf(ctx, value).Set(reflect.New(field.FieldType).Elem()) - } else { - return field.Set(ctx, value, reflectV.Elem().Interface()) - } + return field.Set(ctx, value, reflectV.Elem().Interface()) } else { if valuer, ok := v.(driver.Valuer); ok { v, _ = valuer.Value() @@ -923,6 +955,8 @@ func (field *Field) setupValuerAndSetter() { sameElemType = field.FieldType == reflect.ValueOf(field.Serializer).Type().Elem() } + serializerValue := reflect.Indirect(reflect.ValueOf(field.Serializer)) + serializerType := serializerValue.Type() field.Set = func(ctx context.Context, value reflect.Value, v interface{}) (err error) { if s, ok := v.(*serializer); ok { if s.fieldValue != nil { @@ -930,11 +964,12 @@ func (field *Field) setupValuerAndSetter() { } else if err = s.Serializer.Scan(ctx, field, value, s.value); err == nil { if sameElemType { field.ReflectValueOf(ctx, value).Set(reflect.ValueOf(s.Serializer).Elem()) - s.Serializer = reflect.New(reflect.Indirect(reflect.ValueOf(field.Serializer)).Type()).Interface().(SerializerInterface) } else if sameType { field.ReflectValueOf(ctx, value).Set(reflect.ValueOf(s.Serializer)) - s.Serializer = reflect.New(reflect.Indirect(reflect.ValueOf(field.Serializer)).Type()).Interface().(SerializerInterface) } + si := reflect.New(serializerType) + si.Elem().Set(serializerValue) + s.Serializer = si.Interface().(SerializerInterface) } } else { err = oldFieldSetter(ctx, value, v) @@ -946,11 +981,15 @@ func (field *Field) setupValuerAndSetter() { func (field *Field) setupNewValuePool() { if field.Serializer != nil { + serializerValue := reflect.Indirect(reflect.ValueOf(field.Serializer)) + serializerType := serializerValue.Type() field.NewValuePool = &sync.Pool{ New: func() interface{} { + si := reflect.New(serializerType) + si.Elem().Set(serializerValue) return &serializer{ Field: field, - Serializer: field.Serializer, + Serializer: si.Interface().(SerializerInterface), } }, } diff --git a/vendor/gorm.io/gorm/schema/index.go b/vendor/gorm.io/gorm/schema/index.go index 5003c74..f4f3675 100644 --- a/vendor/gorm.io/gorm/schema/index.go +++ b/vendor/gorm.io/gorm/schema/index.go @@ -13,8 +13,8 @@ type Index struct { Type string // btree, hash, gist, spgist, gin, and brin Where string Comment string - Option string // WITH PARSER parser_name - Fields []IndexOption + Option string // WITH PARSER parser_name + Fields []IndexOption // Note: IndexOption's Field maybe the same } type IndexOption struct { @@ -65,7 +65,11 @@ func (schema *Schema) ParseIndexes() map[string]Index { } } } - + for _, index := range indexes { + if index.Class == "UNIQUE" && len(index.Fields) == 1 { + index.Fields[0].Field.UniqueIndex = index.Name + } + } return indexes } diff --git a/vendor/gorm.io/gorm/schema/interfaces.go b/vendor/gorm.io/gorm/schema/interfaces.go index a75a33c..306d4f4 100644 --- a/vendor/gorm.io/gorm/schema/interfaces.go +++ b/vendor/gorm.io/gorm/schema/interfaces.go @@ -4,6 +4,12 @@ import ( "gorm.io/gorm/clause" ) +// ConstraintInterface database constraint interface +type ConstraintInterface interface { + GetName() string + Build() (sql string, vars []interface{}) +} + // GormDataTypeInterface gorm data type interface type GormDataTypeInterface interface { GormDataType() string diff --git a/vendor/gorm.io/gorm/schema/naming.go b/vendor/gorm.io/gorm/schema/naming.go index a258bee..6248bde 100644 --- a/vendor/gorm.io/gorm/schema/naming.go +++ b/vendor/gorm.io/gorm/schema/naming.go @@ -8,6 +8,8 @@ import ( "unicode/utf8" "github.com/jinzhu/inflection" + "golang.org/x/text/cases" + "golang.org/x/text/language" ) // Namer namer interface @@ -19,6 +21,7 @@ type Namer interface { RelationshipFKName(Relationship) string CheckerName(table, column string) string IndexName(table, column string) string + UniqueName(table, column string) string } // Replacer replacer interface like strings.Replacer @@ -26,12 +29,15 @@ type Replacer interface { Replace(name string) string } +var _ Namer = (*NamingStrategy)(nil) + // NamingStrategy tables, columns naming strategy type NamingStrategy struct { - TablePrefix string - SingularTable bool - NameReplacer Replacer - NoLowerCase bool + TablePrefix string + SingularTable bool + NameReplacer Replacer + NoLowerCase bool + IdentifierMaxLength int } // TableName convert string to table name @@ -84,17 +90,26 @@ func (ns NamingStrategy) IndexName(table, column string) string { return ns.formatName("idx", table, ns.toDBName(column)) } +// UniqueName generate unique constraint name +func (ns NamingStrategy) UniqueName(table, column string) string { + return ns.formatName("uni", table, ns.toDBName(column)) +} + func (ns NamingStrategy) formatName(prefix, table, name string) string { formattedName := strings.ReplaceAll(strings.Join([]string{ prefix, table, name, }, "_"), ".", "_") - if utf8.RuneCountInString(formattedName) > 64 { + if ns.IdentifierMaxLength == 0 { + ns.IdentifierMaxLength = 64 + } + + if utf8.RuneCountInString(formattedName) > ns.IdentifierMaxLength { h := sha1.New() h.Write([]byte(formattedName)) bs := h.Sum(nil) - formattedName = formattedName[0:56] + hex.EncodeToString(bs)[:8] + formattedName = formattedName[0:ns.IdentifierMaxLength-8] + hex.EncodeToString(bs)[:8] } return formattedName } @@ -108,7 +123,7 @@ var ( func init() { commonInitialismsForReplacer := make([]string, 0, len(commonInitialisms)) for _, initialism := range commonInitialisms { - commonInitialismsForReplacer = append(commonInitialismsForReplacer, initialism, strings.Title(strings.ToLower(initialism))) + commonInitialismsForReplacer = append(commonInitialismsForReplacer, initialism, cases.Title(language.Und).String(initialism)) } commonInitialismsReplacer = strings.NewReplacer(commonInitialismsForReplacer...) } @@ -173,9 +188,9 @@ func (ns NamingStrategy) toDBName(name string) string { } func (ns NamingStrategy) toSchemaName(name string) string { - result := strings.ReplaceAll(strings.Title(strings.ReplaceAll(name, "_", " ")), " ", "") + result := strings.ReplaceAll(cases.Title(language.Und, cases.NoLower).String(strings.ReplaceAll(name, "_", " ")), " ", "") for _, initialism := range commonInitialisms { - result = regexp.MustCompile(strings.Title(strings.ToLower(initialism))+"([A-Z]|$|_)").ReplaceAllString(result, initialism+"$1") + result = regexp.MustCompile(cases.Title(language.Und, cases.NoLower).String(strings.ToLower(initialism))+"([A-Z]|$|_)").ReplaceAllString(result, initialism+"$1") } return result } diff --git a/vendor/gorm.io/gorm/schema/relationship.go b/vendor/gorm.io/gorm/schema/relationship.go index 0aa33e5..32676b3 100644 --- a/vendor/gorm.io/gorm/schema/relationship.go +++ b/vendor/gorm.io/gorm/schema/relationship.go @@ -7,6 +7,9 @@ import ( "strings" "github.com/jinzhu/inflection" + "golang.org/x/text/cases" + "golang.org/x/text/language" + "gorm.io/gorm/clause" ) @@ -27,6 +30,8 @@ type Relationships struct { HasMany []*Relationship Many2Many []*Relationship Relations map[string]*Relationship + + EmbeddedRelations map[string]*Relationships } type Relationship struct { @@ -74,8 +79,8 @@ func (schema *Schema) parseRelation(field *Field) *Relationship { return nil } - if polymorphic := field.TagSettings["POLYMORPHIC"]; polymorphic != "" { - schema.buildPolymorphicRelation(relation, field, polymorphic) + if hasPolymorphicRelation(field.TagSettings) { + schema.buildPolymorphicRelation(relation, field) } else if many2many := field.TagSettings["MANY2MANY"]; many2many != "" { schema.buildMany2ManyRelation(relation, field, many2many) } else if belongsTo := field.TagSettings["BELONGSTO"]; belongsTo != "" { @@ -87,7 +92,8 @@ func (schema *Schema) parseRelation(field *Field) *Relationship { case reflect.Slice: schema.guessRelation(relation, field, guessHas) default: - schema.err = fmt.Errorf("unsupported data type %v for %v on field %s", relation.FieldSchema, schema, field.Name) + schema.err = fmt.Errorf("unsupported data type %v for %v on field %s", relation.FieldSchema, schema, + field.Name) } } @@ -106,7 +112,7 @@ func (schema *Schema) parseRelation(field *Field) *Relationship { } if schema.err == nil { - schema.Relationships.Relations[relation.Name] = relation + schema.setRelation(relation) switch relation.Type { case HasOne: schema.Relationships.HasOne = append(schema.Relationships.HasOne, relation) @@ -122,34 +128,100 @@ func (schema *Schema) parseRelation(field *Field) *Relationship { return relation } +// hasPolymorphicRelation check if has polymorphic relation +// 1. `POLYMORPHIC` tag +// 2. `POLYMORPHICTYPE` and `POLYMORPHICID` tag +func hasPolymorphicRelation(tagSettings map[string]string) bool { + if _, ok := tagSettings["POLYMORPHIC"]; ok { + return true + } + + _, hasType := tagSettings["POLYMORPHICTYPE"] + _, hasId := tagSettings["POLYMORPHICID"] + + return hasType && hasId +} + +func (schema *Schema) setRelation(relation *Relationship) { + // set non-embedded relation + if rel := schema.Relationships.Relations[relation.Name]; rel != nil { + if len(rel.Field.BindNames) > 1 { + schema.Relationships.Relations[relation.Name] = relation + } + } else { + schema.Relationships.Relations[relation.Name] = relation + } + + // set embedded relation + if len(relation.Field.EmbeddedBindNames) <= 1 { + return + } + relationships := &schema.Relationships + for i, name := range relation.Field.EmbeddedBindNames { + if i < len(relation.Field.EmbeddedBindNames)-1 { + if relationships.EmbeddedRelations == nil { + relationships.EmbeddedRelations = map[string]*Relationships{} + } + if r := relationships.EmbeddedRelations[name]; r == nil { + relationships.EmbeddedRelations[name] = &Relationships{} + } + relationships = relationships.EmbeddedRelations[name] + } else { + if relationships.Relations == nil { + relationships.Relations = map[string]*Relationship{} + } + relationships.Relations[relation.Name] = relation + } + } +} + // User has many Toys, its `Polymorphic` is `Owner`, Pet has one Toy, its `Polymorphic` is `Owner` -// type User struct { -// Toys []Toy `gorm:"polymorphic:Owner;"` -// } -// type Pet struct { -// Toy Toy `gorm:"polymorphic:Owner;"` -// } -// type Toy struct { -// OwnerID int -// OwnerType string -// } -func (schema *Schema) buildPolymorphicRelation(relation *Relationship, field *Field, polymorphic string) { +// +// type User struct { +// Toys []Toy `gorm:"polymorphic:Owner;"` +// } +// type Pet struct { +// Toy Toy `gorm:"polymorphic:Owner;"` +// } +// type Toy struct { +// OwnerID int +// OwnerType string +// } +func (schema *Schema) buildPolymorphicRelation(relation *Relationship, field *Field) { + polymorphic := field.TagSettings["POLYMORPHIC"] + relation.Polymorphic = &Polymorphic{ - Value: schema.Table, - PolymorphicType: relation.FieldSchema.FieldsByName[polymorphic+"Type"], - PolymorphicID: relation.FieldSchema.FieldsByName[polymorphic+"ID"], + Value: schema.Table, + } + + var ( + typeName = polymorphic + "Type" + typeId = polymorphic + "ID" + ) + + if value, ok := field.TagSettings["POLYMORPHICTYPE"]; ok { + typeName = strings.TrimSpace(value) } + if value, ok := field.TagSettings["POLYMORPHICID"]; ok { + typeId = strings.TrimSpace(value) + } + + relation.Polymorphic.PolymorphicType = relation.FieldSchema.FieldsByName[typeName] + relation.Polymorphic.PolymorphicID = relation.FieldSchema.FieldsByName[typeId] + if value, ok := field.TagSettings["POLYMORPHICVALUE"]; ok { relation.Polymorphic.Value = strings.TrimSpace(value) } if relation.Polymorphic.PolymorphicType == nil { - schema.err = fmt.Errorf("invalid polymorphic type %v for %v on field %s, missing field %s", relation.FieldSchema, schema, field.Name, polymorphic+"Type") + schema.err = fmt.Errorf("invalid polymorphic type %v for %v on field %s, missing field %s", + relation.FieldSchema, schema, field.Name, polymorphic+"Type") } if relation.Polymorphic.PolymorphicID == nil { - schema.err = fmt.Errorf("invalid polymorphic type %v for %v on field %s, missing field %s", relation.FieldSchema, schema, field.Name, polymorphic+"ID") + schema.err = fmt.Errorf("invalid polymorphic type %v for %v on field %s, missing field %s", + relation.FieldSchema, schema, field.Name, polymorphic+"ID") } if schema.err == nil { @@ -161,10 +233,17 @@ func (schema *Schema) buildPolymorphicRelation(relation *Relationship, field *Fi primaryKeyField := schema.PrioritizedPrimaryField if len(relation.foreignKeys) > 0 { if primaryKeyField = schema.LookUpField(relation.foreignKeys[0]); primaryKeyField == nil || len(relation.foreignKeys) > 1 { - schema.err = fmt.Errorf("invalid polymorphic foreign keys %+v for %v on field %s", relation.foreignKeys, schema, field.Name) + schema.err = fmt.Errorf("invalid polymorphic foreign keys %+v for %v on field %s", relation.foreignKeys, + schema, field.Name) } } + if primaryKeyField == nil { + schema.err = fmt.Errorf("invalid polymorphic type %v for %v on field %s, missing primaryKey field", + relation.FieldSchema, schema, field.Name) + return + } + // use same data type for foreign keys if copyableDataType(primaryKeyField.DataType) { relation.Polymorphic.PolymorphicID.DataType = primaryKeyField.DataType @@ -191,7 +270,8 @@ func (schema *Schema) buildMany2ManyRelation(relation *Relationship, field *Fiel err error joinTableFields []reflect.StructField fieldsMap = map[string]*Field{} - ownFieldsMap = map[string]bool{} // fix self join many2many + ownFieldsMap = map[string]*Field{} // fix self join many2many + referFieldsMap = map[string]*Field{} joinForeignKeys = toColumns(field.TagSettings["JOINFOREIGNKEY"]) joinReferences = toColumns(field.TagSettings["JOINREFERENCES"]) ) @@ -224,12 +304,12 @@ func (schema *Schema) buildMany2ManyRelation(relation *Relationship, field *Fiel } for idx, ownField := range ownForeignFields { - joinFieldName := strings.Title(schema.Name) + ownField.Name + joinFieldName := cases.Title(language.Und, cases.NoLower).String(schema.Name) + ownField.Name if len(joinForeignKeys) > idx { - joinFieldName = strings.Title(joinForeignKeys[idx]) + joinFieldName = cases.Title(language.Und, cases.NoLower).String(joinForeignKeys[idx]) } - ownFieldsMap[joinFieldName] = true + ownFieldsMap[joinFieldName] = ownField fieldsMap[joinFieldName] = ownField joinTableFields = append(joinTableFields, reflect.StructField{ Name: joinFieldName, @@ -241,10 +321,7 @@ func (schema *Schema) buildMany2ManyRelation(relation *Relationship, field *Fiel } for idx, relField := range refForeignFields { - joinFieldName := strings.Title(relation.FieldSchema.Name) + relField.Name - if len(joinReferences) > idx { - joinFieldName = strings.Title(joinReferences[idx]) - } + joinFieldName := cases.Title(language.Und, cases.NoLower).String(relation.FieldSchema.Name) + relField.Name if _, ok := ownFieldsMap[joinFieldName]; ok { if field.Name != relation.FieldSchema.Name { @@ -254,23 +331,32 @@ func (schema *Schema) buildMany2ManyRelation(relation *Relationship, field *Fiel } } - fieldsMap[joinFieldName] = relField - joinTableFields = append(joinTableFields, reflect.StructField{ - Name: joinFieldName, - PkgPath: relField.StructField.PkgPath, - Type: relField.StructField.Type, - Tag: removeSettingFromTag(appendSettingFromTag(relField.StructField.Tag, "primaryKey"), - "column", "autoincrement", "index", "unique", "uniqueindex"), - }) + if len(joinReferences) > idx { + joinFieldName = cases.Title(language.Und, cases.NoLower).String(joinReferences[idx]) + } + + referFieldsMap[joinFieldName] = relField + + if _, ok := fieldsMap[joinFieldName]; !ok { + fieldsMap[joinFieldName] = relField + joinTableFields = append(joinTableFields, reflect.StructField{ + Name: joinFieldName, + PkgPath: relField.StructField.PkgPath, + Type: relField.StructField.Type, + Tag: removeSettingFromTag(appendSettingFromTag(relField.StructField.Tag, "primaryKey"), + "column", "autoincrement", "index", "unique", "uniqueindex"), + }) + } } joinTableFields = append(joinTableFields, reflect.StructField{ - Name: strings.Title(schema.Name) + field.Name, + Name: cases.Title(language.Und, cases.NoLower).String(schema.Name) + field.Name, Type: schema.ModelType, Tag: `gorm:"-"`, }) - if relation.JoinTable, err = Parse(reflect.New(reflect.StructOf(joinTableFields)).Interface(), schema.cacheStore, schema.namer); err != nil { + if relation.JoinTable, err = Parse(reflect.New(reflect.StructOf(joinTableFields)).Interface(), schema.cacheStore, + schema.namer); err != nil { schema.err = err } relation.JoinTable.Name = many2many @@ -317,31 +403,37 @@ func (schema *Schema) buildMany2ManyRelation(relation *Relationship, field *Fiel f.Size = fieldsMap[f.Name].Size } relation.JoinTable.PrimaryFields = append(relation.JoinTable.PrimaryFields, f) - ownPrimaryField := schema == fieldsMap[f.Name].Schema && ownFieldsMap[f.Name] - if ownPrimaryField { + if of, ok := ownFieldsMap[f.Name]; ok { joinRel := relation.JoinTable.Relationships.Relations[relName] joinRel.Field = relation.Field joinRel.References = append(joinRel.References, &Reference{ - PrimaryKey: fieldsMap[f.Name], + PrimaryKey: of, ForeignKey: f, }) - } else { + + relation.References = append(relation.References, &Reference{ + PrimaryKey: of, + ForeignKey: f, + OwnPrimaryKey: true, + }) + } + + if rf, ok := referFieldsMap[f.Name]; ok { joinRefRel := relation.JoinTable.Relationships.Relations[relRefName] if joinRefRel.Field == nil { joinRefRel.Field = relation.Field } joinRefRel.References = append(joinRefRel.References, &Reference{ - PrimaryKey: fieldsMap[f.Name], + PrimaryKey: rf, ForeignKey: f, }) - } - relation.References = append(relation.References, &Reference{ - PrimaryKey: fieldsMap[f.Name], - ForeignKey: f, - OwnPrimaryKey: ownPrimaryField, - }) + relation.References = append(relation.References, &Reference{ + PrimaryKey: rf, + ForeignKey: f, + }) + } } } } @@ -383,7 +475,8 @@ func (schema *Schema) guessRelation(relation *Relationship, field *Field, cgl gu schema.guessRelation(relation, field, guessEmbeddedHas) // case guessEmbeddedHas: default: - schema.err = fmt.Errorf("invalid field found for struct %v's field %s: define a valid foreign key for relations or implement the Valuer/Scanner interface", schema, field.Name) + schema.err = fmt.Errorf("invalid field found for struct %v's field %s: define a valid foreign key for relations or implement the Valuer/Scanner interface", + schema, field.Name) } } @@ -391,34 +484,31 @@ func (schema *Schema) guessRelation(relation *Relationship, field *Field, cgl gu case guessBelongs: primarySchema, foreignSchema = relation.FieldSchema, schema case guessEmbeddedBelongs: - if field.OwnerSchema != nil { - primarySchema, foreignSchema = relation.FieldSchema, field.OwnerSchema - } else { + if field.OwnerSchema == nil { reguessOrErr() return } + primarySchema, foreignSchema = relation.FieldSchema, field.OwnerSchema case guessHas: case guessEmbeddedHas: - if field.OwnerSchema != nil { - primarySchema, foreignSchema = field.OwnerSchema, relation.FieldSchema - } else { + if field.OwnerSchema == nil { reguessOrErr() return } + primarySchema, foreignSchema = field.OwnerSchema, relation.FieldSchema } if len(relation.foreignKeys) > 0 { for _, foreignKey := range relation.foreignKeys { - if f := foreignSchema.LookUpField(foreignKey); f != nil { - foreignFields = append(foreignFields, f) - } else { + f := foreignSchema.LookUpField(foreignKey) + if f == nil { reguessOrErr() return } + foreignFields = append(foreignFields, f) } } else { - var primaryFields []*Field - var primarySchemaName = primarySchema.Name + primarySchemaName := primarySchema.Name if primarySchemaName == "" { primarySchemaName = relation.FieldSchema.Name } @@ -433,6 +523,7 @@ func (schema *Schema) guessRelation(relation *Relationship, field *Field, cgl gu primaryFields = primarySchema.PrimaryFields } + primaryFieldLoop: for _, primaryField := range primaryFields { lookUpName := primarySchemaName + primaryField.Name if gl == guessBelongs { @@ -441,23 +532,33 @@ func (schema *Schema) guessRelation(relation *Relationship, field *Field, cgl gu lookUpNames := []string{lookUpName} if len(primaryFields) == 1 { - lookUpNames = append(lookUpNames, strings.TrimSuffix(lookUpName, primaryField.Name)+"ID", strings.TrimSuffix(lookUpName, primaryField.Name)+"Id", schema.namer.ColumnName(foreignSchema.Table, strings.TrimSuffix(lookUpName, primaryField.Name)+"ID")) + lookUpNames = append(lookUpNames, strings.TrimSuffix(lookUpName, primaryField.Name)+"ID", + strings.TrimSuffix(lookUpName, primaryField.Name)+"Id", schema.namer.ColumnName(foreignSchema.Table, + strings.TrimSuffix(lookUpName, primaryField.Name)+"ID")) } + for _, name := range lookUpNames { + if f := foreignSchema.LookUpFieldByBindName(field.BindNames, name); f != nil { + foreignFields = append(foreignFields, f) + primaryFields = append(primaryFields, primaryField) + continue primaryFieldLoop + } + } for _, name := range lookUpNames { if f := foreignSchema.LookUpField(name); f != nil { foreignFields = append(foreignFields, f) primaryFields = append(primaryFields, primaryField) - break + continue primaryFieldLoop } } } } - if len(foreignFields) == 0 { + switch { + case len(foreignFields) == 0: reguessOrErr() return - } else if len(relation.primaryKeys) > 0 { + case len(relation.primaryKeys) > 0: for idx, primaryKey := range relation.primaryKeys { if f := primarySchema.LookUpField(primaryKey); f != nil { if len(primaryFields) < idx+1 { @@ -471,7 +572,7 @@ func (schema *Schema) guessRelation(relation *Relationship, field *Field, cgl gu return } } - } else if len(primaryFields) == 0 { + case len(primaryFields) == 0: if len(foreignFields) == 1 && primarySchema.PrioritizedPrimaryField != nil { primaryFields = append(primaryFields, primarySchema.PrioritizedPrimaryField) } else if len(primarySchema.PrimaryFields) == len(foreignFields) { @@ -507,6 +608,7 @@ func (schema *Schema) guessRelation(relation *Relationship, field *Field, cgl gu } } +// Constraint is ForeignKey Constraint type Constraint struct { Name string Field *Field @@ -518,6 +620,31 @@ type Constraint struct { OnUpdate string } +func (constraint *Constraint) GetName() string { return constraint.Name } + +func (constraint *Constraint) Build() (sql string, vars []interface{}) { + sql = "CONSTRAINT ? FOREIGN KEY ? REFERENCES ??" + if constraint.OnDelete != "" { + sql += " ON DELETE " + constraint.OnDelete + } + + if constraint.OnUpdate != "" { + sql += " ON UPDATE " + constraint.OnUpdate + } + + foreignKeys := make([]interface{}, 0, len(constraint.ForeignKeys)) + for _, field := range constraint.ForeignKeys { + foreignKeys = append(foreignKeys, clause.Column{Name: field.DBName}) + } + + references := make([]interface{}, 0, len(constraint.References)) + for _, field := range constraint.References { + references = append(references, clause.Column{Name: field.DBName}) + } + vars = append(vars, clause.Table{Name: constraint.Name}, foreignKeys, clause.Table{Name: constraint.ReferenceSchema.Table}, references) + return +} + func (rel *Relationship) ParseConstraint() *Constraint { str := rel.Field.TagSettings["CONSTRAINT"] if str == "-" { diff --git a/vendor/gorm.io/gorm/schema/schema.go b/vendor/gorm.io/gorm/schema/schema.go index eca113e..db23679 100644 --- a/vendor/gorm.io/gorm/schema/schema.go +++ b/vendor/gorm.io/gorm/schema/schema.go @@ -6,12 +6,27 @@ import ( "fmt" "go/ast" "reflect" + "strings" "sync" "gorm.io/gorm/clause" "gorm.io/gorm/logger" ) +type callbackType string + +const ( + callbackTypeBeforeCreate callbackType = "BeforeCreate" + callbackTypeBeforeUpdate callbackType = "BeforeUpdate" + callbackTypeAfterCreate callbackType = "AfterCreate" + callbackTypeAfterUpdate callbackType = "AfterUpdate" + callbackTypeBeforeSave callbackType = "BeforeSave" + callbackTypeAfterSave callbackType = "AfterSave" + callbackTypeBeforeDelete callbackType = "BeforeDelete" + callbackTypeAfterDelete callbackType = "AfterDelete" + callbackTypeAfterFind callbackType = "AfterFind" +) + // ErrUnsupportedDataType unsupported data type var ErrUnsupportedDataType = errors.New("unsupported data type") @@ -25,6 +40,7 @@ type Schema struct { PrimaryFieldDBNames []string Fields []*Field FieldsByName map[string]*Field + FieldsByBindName map[string]*Field // embedded fields is 'Embed.Field' FieldsByDBName map[string]*Field FieldsWithDefaultDBValue []*Field // fields with default value assigned by database Relationships Relationships @@ -51,9 +67,10 @@ func (schema Schema) String() string { } func (schema Schema) MakeSlice() reflect.Value { - slice := reflect.MakeSlice(reflect.SliceOf(reflect.PtrTo(schema.ModelType)), 0, 20) + slice := reflect.MakeSlice(reflect.SliceOf(reflect.PointerTo(schema.ModelType)), 0, 20) results := reflect.New(slice.Type()) results.Elem().Set(slice) + return results } @@ -67,10 +84,35 @@ func (schema Schema) LookUpField(name string) *Field { return nil } +// LookUpFieldByBindName looks for the closest field in the embedded struct. +// +// type Struct struct { +// Embedded struct { +// ID string // is selected by LookUpFieldByBindName([]string{"Embedded", "ID"}, "ID") +// } +// ID string // is selected by LookUpFieldByBindName([]string{"ID"}, "ID") +// } +func (schema Schema) LookUpFieldByBindName(bindNames []string, name string) *Field { + if len(bindNames) == 0 { + return nil + } + for i := len(bindNames) - 1; i >= 0; i-- { + find := strings.Join(bindNames[:i], ".") + "." + name + if field, ok := schema.FieldsByBindName[find]; ok { + return field + } + } + return nil +} + type Tabler interface { TableName() string } +type TablerWithNamer interface { + TableName(Namer) string +} + // Parse get data type from dialector func Parse(dest interface{}, cacheStore *sync.Map, namer Namer) (*Schema, error) { return ParseWithSpecialTableName(dest, cacheStore, namer, "") @@ -112,7 +154,7 @@ func ParseWithSpecialTableName(dest interface{}, cacheStore *sync.Map, namer Nam schemaCacheKey = modelType } - // Load exist schmema cache, return if exists + // Load exist schema cache, return if exists if v, ok := cacheStore.Load(schemaCacheKey); ok { s := v.(*Schema) // Wait for the initialization of other goroutines to complete @@ -125,6 +167,9 @@ func ParseWithSpecialTableName(dest interface{}, cacheStore *sync.Map, namer Nam if tabler, ok := modelValue.Interface().(Tabler); ok { tableName = tabler.TableName() } + if tabler, ok := modelValue.Interface().(TablerWithNamer); ok { + tableName = tabler.TableName(namer) + } if en, ok := namer.(embeddedNamer); ok { tableName = en.Table } @@ -133,20 +178,21 @@ func ParseWithSpecialTableName(dest interface{}, cacheStore *sync.Map, namer Nam } schema := &Schema{ - Name: modelType.Name(), - ModelType: modelType, - Table: tableName, - FieldsByName: map[string]*Field{}, - FieldsByDBName: map[string]*Field{}, - Relationships: Relationships{Relations: map[string]*Relationship{}}, - cacheStore: cacheStore, - namer: namer, - initialized: make(chan struct{}), + Name: modelType.Name(), + ModelType: modelType, + Table: tableName, + FieldsByName: map[string]*Field{}, + FieldsByBindName: map[string]*Field{}, + FieldsByDBName: map[string]*Field{}, + Relationships: Relationships{Relations: map[string]*Relationship{}}, + cacheStore: cacheStore, + namer: namer, + initialized: make(chan struct{}), } // When the schema initialization is completed, the channel will be closed defer close(schema.initialized) - // Load exist schmema cache, return if exists + // Load exist schema cache, return if exists if v, ok := cacheStore.Load(schemaCacheKey); ok { s := v.(*Schema) // Wait for the initialization of other goroutines to complete @@ -169,6 +215,7 @@ func ParseWithSpecialTableName(dest interface{}, cacheStore *sync.Map, namer Nam field.DBName = namer.ColumnName(schema.Table, field.Name) } + bindName := field.BindName() if field.DBName != "" { // nonexistence or shortest path or first appear prioritized if has permission if v, ok := schema.FieldsByDBName[field.DBName]; !ok || ((field.Creatable || field.Updatable || field.Readable) && len(field.BindNames) < len(v.BindNames)) { @@ -177,6 +224,7 @@ func ParseWithSpecialTableName(dest interface{}, cacheStore *sync.Map, namer Nam } schema.FieldsByDBName[field.DBName] = field schema.FieldsByName[field.Name] = field + schema.FieldsByBindName[bindName] = field if v != nil && v.PrimaryKey { for idx, f := range schema.PrimaryFields { @@ -195,6 +243,9 @@ func ParseWithSpecialTableName(dest interface{}, cacheStore *sync.Map, namer Nam if of, ok := schema.FieldsByName[field.Name]; !ok || of.TagSettings["-"] == "-" { schema.FieldsByName[field.Name] = field } + if of, ok := schema.FieldsByBindName[bindName]; !ok || of.TagSettings["-"] == "-" { + schema.FieldsByBindName[bindName] = field + } field.setupValuerAndSetter() } @@ -214,8 +265,18 @@ func ParseWithSpecialTableName(dest interface{}, cacheStore *sync.Map, namer Nam } } - if schema.PrioritizedPrimaryField == nil && len(schema.PrimaryFields) == 1 { - schema.PrioritizedPrimaryField = schema.PrimaryFields[0] + if schema.PrioritizedPrimaryField == nil { + if len(schema.PrimaryFields) == 1 { + schema.PrioritizedPrimaryField = schema.PrimaryFields[0] + } else if len(schema.PrimaryFields) > 1 { + // If there are multiple primary keys, the AUTOINCREMENT field is prioritized + for _, field := range schema.PrimaryFields { + if field.AutoIncrement { + schema.PrioritizedPrimaryField = field + break + } + } + } } for _, field := range schema.PrimaryFields { @@ -223,7 +284,7 @@ func ParseWithSpecialTableName(dest interface{}, cacheStore *sync.Map, namer Nam } for _, field := range schema.Fields { - if field.HasDefaultValue && field.DefaultValueInterface == nil { + if field.DataType != "" && field.HasDefaultValue && field.DefaultValueInterface == nil { schema.FieldsWithDefaultDBValue = append(schema.FieldsWithDefaultDBValue, field) } } @@ -242,14 +303,20 @@ func ParseWithSpecialTableName(dest interface{}, cacheStore *sync.Map, namer Nam } } - callbacks := []string{"BeforeCreate", "AfterCreate", "BeforeUpdate", "AfterUpdate", "BeforeSave", "AfterSave", "BeforeDelete", "AfterDelete", "AfterFind"} - for _, name := range callbacks { - if methodValue := modelValue.MethodByName(name); methodValue.IsValid() { + callbackTypes := []callbackType{ + callbackTypeBeforeCreate, callbackTypeAfterCreate, + callbackTypeBeforeUpdate, callbackTypeAfterUpdate, + callbackTypeBeforeSave, callbackTypeAfterSave, + callbackTypeBeforeDelete, callbackTypeAfterDelete, + callbackTypeAfterFind, + } + for _, cbName := range callbackTypes { + if methodValue := callBackToMethodValue(modelValue, cbName); methodValue.IsValid() { switch methodValue.Type().String() { case "func(*gorm.DB) error": // TODO hack - reflect.Indirect(reflect.ValueOf(schema)).FieldByName(name).SetBool(true) + reflect.Indirect(reflect.ValueOf(schema)).FieldByName(string(cbName)).SetBool(true) default: - logger.Default.Warn(context.Background(), "Model %v don't match %vInterface, should be `%v(*gorm.DB) error`. Please see https://gorm.io/docs/hooks.html", schema, name, name) + logger.Default.Warn(context.Background(), "Model %v don't match %vInterface, should be `%v(*gorm.DB) error`. Please see https://gorm.io/docs/hooks.html", schema, cbName, cbName) } } } @@ -271,11 +338,12 @@ func ParseWithSpecialTableName(dest interface{}, cacheStore *sync.Map, namer Nam if _, embedded := schema.cacheStore.Load(embeddedCacheKey); !embedded { for _, field := range schema.Fields { - if field.DataType == "" && (field.Creatable || field.Updatable || field.Readable) { + if field.DataType == "" && field.GORMDataType == "" && (field.Creatable || field.Updatable || field.Readable) { if schema.parseRelation(field); schema.err != nil { return schema, schema.err } else { schema.FieldsByName[field.Name] = field + schema.FieldsByBindName[field.BindName()] = field } } @@ -302,6 +370,39 @@ func ParseWithSpecialTableName(dest interface{}, cacheStore *sync.Map, namer Nam return schema, schema.err } +// This unrolling is needed to show to the compiler the exact set of methods +// that can be used on the modelType. +// Prior to go1.22 any use of MethodByName would cause the linker to +// abandon dead code elimination for the entire binary. +// As of go1.22 the compiler supports one special case of a string constant +// being passed to MethodByName. For enterprise customers or those building +// large binaries, this gives a significant reduction in binary size. +// https://github.com/golang/go/issues/62257 +func callBackToMethodValue(modelType reflect.Value, cbType callbackType) reflect.Value { + switch cbType { + case callbackTypeBeforeCreate: + return modelType.MethodByName(string(callbackTypeBeforeCreate)) + case callbackTypeAfterCreate: + return modelType.MethodByName(string(callbackTypeAfterCreate)) + case callbackTypeBeforeUpdate: + return modelType.MethodByName(string(callbackTypeBeforeUpdate)) + case callbackTypeAfterUpdate: + return modelType.MethodByName(string(callbackTypeAfterUpdate)) + case callbackTypeBeforeSave: + return modelType.MethodByName(string(callbackTypeBeforeSave)) + case callbackTypeAfterSave: + return modelType.MethodByName(string(callbackTypeAfterSave)) + case callbackTypeBeforeDelete: + return modelType.MethodByName(string(callbackTypeBeforeDelete)) + case callbackTypeAfterDelete: + return modelType.MethodByName(string(callbackTypeAfterDelete)) + case callbackTypeAfterFind: + return modelType.MethodByName(string(callbackTypeAfterFind)) + default: + return reflect.ValueOf(nil) + } +} + func getOrParse(dest interface{}, cacheStore *sync.Map, namer Namer) (*Schema, error) { modelType := reflect.ValueOf(dest).Type() for modelType.Kind() == reflect.Slice || modelType.Kind() == reflect.Array || modelType.Kind() == reflect.Ptr { diff --git a/vendor/gorm.io/gorm/schema/serializer.go b/vendor/gorm.io/gorm/schema/serializer.go index 758a642..0fafbcb 100644 --- a/vendor/gorm.io/gorm/schema/serializer.go +++ b/vendor/gorm.io/gorm/schema/serializer.go @@ -70,8 +70,7 @@ type SerializerValuerInterface interface { } // JSONSerializer json serializer -type JSONSerializer struct { -} +type JSONSerializer struct{} // Scan implements serializer interface func (JSONSerializer) Scan(ctx context.Context, field *Field, dst reflect.Value, dbValue interface{}) (err error) { @@ -85,10 +84,15 @@ func (JSONSerializer) Scan(ctx context.Context, field *Field, dst reflect.Value, case string: bytes = []byte(v) default: - return fmt.Errorf("failed to unmarshal JSONB value: %#v", dbValue) + bytes, err = json.Marshal(v) + if err != nil { + return err + } } - err = json.Unmarshal(bytes, fieldValue.Interface()) + if len(bytes) > 0 { + err = json.Unmarshal(bytes, fieldValue.Interface()) + } } field.ReflectValueOf(ctx, dst).Set(fieldValue.Elem()) @@ -98,12 +102,17 @@ func (JSONSerializer) Scan(ctx context.Context, field *Field, dst reflect.Value, // Value implements serializer interface func (JSONSerializer) Value(ctx context.Context, field *Field, dst reflect.Value, fieldValue interface{}) (interface{}, error) { result, err := json.Marshal(fieldValue) + if string(result) == "null" { + if field.TagSettings["NOT NULL"] != "" { + return "", nil + } + return nil, err + } return string(result), err } // UnixSecondSerializer json serializer -type UnixSecondSerializer struct { -} +type UnixSecondSerializer struct{} // Scan implements serializer interface func (UnixSecondSerializer) Scan(ctx context.Context, field *Field, dst reflect.Value, dbValue interface{}) (err error) { @@ -117,9 +126,15 @@ func (UnixSecondSerializer) Scan(ctx context.Context, field *Field, dst reflect. // Value implements serializer interface func (UnixSecondSerializer) Value(ctx context.Context, field *Field, dst reflect.Value, fieldValue interface{}) (result interface{}, err error) { + rv := reflect.ValueOf(fieldValue) switch v := fieldValue.(type) { - case int64, int, uint, uint64, int32, uint32, int16, uint16, *int64, *int, *uint, *uint64, *int32, *uint32, *int16, *uint16: - result = time.Unix(reflect.Indirect(reflect.ValueOf(v)).Int(), 0) + case int64, int, uint, uint64, int32, uint32, int16, uint16: + result = time.Unix(reflect.Indirect(rv).Int(), 0).UTC() + case *int64, *int, *uint, *uint64, *int32, *uint32, *int16, *uint16: + if rv.IsZero() { + return nil, nil + } + result = time.Unix(reflect.Indirect(rv).Int(), 0).UTC() default: err = fmt.Errorf("invalid field type %#v for UnixSecondSerializer, only int, uint supported", v) } @@ -127,8 +142,7 @@ func (UnixSecondSerializer) Value(ctx context.Context, field *Field, dst reflect } // GobSerializer gob serializer -type GobSerializer struct { -} +type GobSerializer struct{} // Scan implements serializer interface func (GobSerializer) Scan(ctx context.Context, field *Field, dst reflect.Value, dbValue interface{}) (err error) { @@ -142,8 +156,10 @@ func (GobSerializer) Scan(ctx context.Context, field *Field, dst reflect.Value, default: return fmt.Errorf("failed to unmarshal gob value: %#v", dbValue) } - decoder := gob.NewDecoder(bytes.NewBuffer(bytesValue)) - err = decoder.Decode(fieldValue.Interface()) + if len(bytesValue) > 0 { + decoder := gob.NewDecoder(bytes.NewBuffer(bytesValue)) + err = decoder.Decode(fieldValue.Interface()) + } } field.ReflectValueOf(ctx, dst).Set(fieldValue.Elem()) return diff --git a/vendor/gorm.io/gorm/schema/utils.go b/vendor/gorm.io/gorm/schema/utils.go index acf1a73..7fdda18 100644 --- a/vendor/gorm.io/gorm/schema/utils.go +++ b/vendor/gorm.io/gorm/schema/utils.go @@ -115,6 +115,11 @@ func GetIdentityFieldValuesMap(ctx context.Context, reflectValue reflect.Value, notZero, zero bool ) + if reflectValue.Kind() == reflect.Ptr || + reflectValue.Kind() == reflect.Interface { + reflectValue = reflectValue.Elem() + } + switch reflectValue.Kind() { case reflect.Struct: results = [][]interface{}{make([]interface{}, len(fields))} @@ -133,7 +138,7 @@ func GetIdentityFieldValuesMap(ctx context.Context, reflectValue reflect.Value, for i := 0; i < reflectValue.Len(); i++ { elem := reflectValue.Index(i) elemKey := elem.Interface() - if elem.Kind() != reflect.Ptr { + if elem.Kind() != reflect.Ptr && elem.CanAddr() { elemKey = elem.Addr().Interface() } diff --git a/vendor/gorm.io/gorm/soft_delete.go b/vendor/gorm.io/gorm/soft_delete.go index 6d64628..5673d3b 100644 --- a/vendor/gorm.io/gorm/soft_delete.go +++ b/vendor/gorm.io/gorm/soft_delete.go @@ -6,6 +6,7 @@ import ( "encoding/json" "reflect" + "github.com/jinzhu/now" "gorm.io/gorm/clause" "gorm.io/gorm/schema" ) @@ -45,11 +46,21 @@ func (n *DeletedAt) UnmarshalJSON(b []byte) error { } func (DeletedAt) QueryClauses(f *schema.Field) []clause.Interface { - return []clause.Interface{SoftDeleteQueryClause{Field: f}} + return []clause.Interface{SoftDeleteQueryClause{Field: f, ZeroValue: parseZeroValueTag(f)}} +} + +func parseZeroValueTag(f *schema.Field) sql.NullString { + if v, ok := f.TagSettings["ZEROVALUE"]; ok { + if _, err := now.Parse(v); err == nil { + return sql.NullString{String: v, Valid: true} + } + } + return sql.NullString{Valid: false} } type SoftDeleteQueryClause struct { - Field *schema.Field + ZeroValue sql.NullString + Field *schema.Field } func (sd SoftDeleteQueryClause) Name() string { @@ -78,18 +89,19 @@ func (sd SoftDeleteQueryClause) ModifyStatement(stmt *Statement) { } stmt.AddClause(clause.Where{Exprs: []clause.Expression{ - clause.Eq{Column: clause.Column{Table: clause.CurrentTable, Name: sd.Field.DBName}, Value: nil}, + clause.Eq{Column: clause.Column{Table: clause.CurrentTable, Name: sd.Field.DBName}, Value: sd.ZeroValue}, }}) stmt.Clauses["soft_delete_enabled"] = clause.Clause{} } } func (DeletedAt) UpdateClauses(f *schema.Field) []clause.Interface { - return []clause.Interface{SoftDeleteUpdateClause{Field: f}} + return []clause.Interface{SoftDeleteUpdateClause{Field: f, ZeroValue: parseZeroValueTag(f)}} } type SoftDeleteUpdateClause struct { - Field *schema.Field + ZeroValue sql.NullString + Field *schema.Field } func (sd SoftDeleteUpdateClause) Name() string { @@ -109,11 +121,12 @@ func (sd SoftDeleteUpdateClause) ModifyStatement(stmt *Statement) { } func (DeletedAt) DeleteClauses(f *schema.Field) []clause.Interface { - return []clause.Interface{SoftDeleteDeleteClause{Field: f}} + return []clause.Interface{SoftDeleteDeleteClause{Field: f, ZeroValue: parseZeroValueTag(f)}} } type SoftDeleteDeleteClause struct { - Field *schema.Field + ZeroValue sql.NullString + Field *schema.Field } func (sd SoftDeleteDeleteClause) Name() string { diff --git a/vendor/gorm.io/gorm/statement.go b/vendor/gorm.io/gorm/statement.go index 850af6c..39e05d0 100644 --- a/vendor/gorm.io/gorm/statement.go +++ b/vendor/gorm.io/gorm/statement.go @@ -30,8 +30,9 @@ type Statement struct { Clauses map[string]clause.Clause BuildClauses []string Distinct bool - Selects []string // selected columns - Omits []string // omit columns + Selects []string // selected columns + Omits []string // omit columns + ColumnMapping map[string]string // map columns Joins []join Preloads map[string][]interface{} Settings sync.Map @@ -49,9 +50,12 @@ type Statement struct { } type join struct { - Name string - Conds []interface{} - On *clause.Where + Name string + Conds []interface{} + On *clause.Where + Selects []string + Omits []string + JoinType clause.JoinType } // StatementModifier statement modifier interface @@ -117,6 +121,8 @@ func (stmt *Statement) QuoteTo(writer clause.Writer, field interface{}) { write(v.Raw, stmt.Schema.PrioritizedPrimaryField.DBName) } else if len(stmt.Schema.DBNames) > 0 { write(v.Raw, stmt.Schema.DBNames[0]) + } else { + stmt.DB.AddError(ErrModelAccessibleFieldsRequired) //nolint:typecheck,errcheck } } else { write(v.Raw, v.Name) @@ -179,6 +185,10 @@ func (stmt *Statement) AddVar(writer clause.Writer, vars ...interface{}) { } else { stmt.AddVar(writer, v.GormValue(stmt.Context, stmt.DB)) } + case clause.Interface: + c := clause.Clause{Name: v.Name()} + v.MergeClause(&c) + c.Build(stmt) case clause.Expression: v.Build(stmt) case driver.Valuer: @@ -304,6 +314,9 @@ func (stmt *Statement) BuildCondition(query interface{}, args ...interface{}) [] conds := make([]clause.Expression, 0, 4) args = append([]interface{}{query}, args...) for idx, arg := range args { + if arg == nil { + continue + } if valuer, ok := arg.(driver.Valuer); ok { arg, _ = valuer.Value() } @@ -312,9 +325,7 @@ func (stmt *Statement) BuildCondition(query interface{}, args ...interface{}) [] case clause.Expression: conds = append(conds, v) case *DB: - for _, scope := range v.Statement.scopes { - v = scope(v) - } + v.executeScopes() if cs, ok := v.Statement.Clauses["WHERE"]; ok { if where, ok := cs.Expression.(clause.Where); ok { @@ -437,8 +448,9 @@ func (stmt *Statement) BuildCondition(query interface{}, args ...interface{}) [] if len(values) > 0 { conds = append(conds, clause.IN{Column: clause.PrimaryColumn, Values: values}) + return []clause.Expression{clause.And(conds...)} } - return conds + return nil } } @@ -447,7 +459,10 @@ func (stmt *Statement) BuildCondition(query interface{}, args ...interface{}) [] } } - return conds + if len(conds) > 0 { + return []clause.Expression{clause.And(conds...)} + } + return nil } // Build build sql with clauses names @@ -499,6 +514,7 @@ func (stmt *Statement) clone() *Statement { Distinct: stmt.Distinct, Selects: stmt.Selects, Omits: stmt.Omits, + ColumnMapping: stmt.ColumnMapping, Preloads: map[string][]interface{}{}, ConnPool: stmt.ConnPool, Schema: stmt.Schema, @@ -540,8 +556,9 @@ func (stmt *Statement) clone() *Statement { } // SetColumn set column's value -// stmt.SetColumn("Name", "jinzhu") // Hooks Method -// stmt.SetColumn("Name", "jinzhu", true) // Callbacks Method +// +// stmt.SetColumn("Name", "jinzhu") // Hooks Method +// stmt.SetColumn("Name", "jinzhu", true) // Callbacks Method func (stmt *Statement) SetColumn(name string, value interface{}, fromCallbacks ...bool) { if v, ok := stmt.Dest.(map[string]interface{}); ok { v[name] = value @@ -650,54 +667,62 @@ func (stmt *Statement) Changed(fields ...string) bool { return false } -var nameMatcher = regexp.MustCompile(`^[\W]?(?:[a-z_0-9]+?)[\W]?\.[\W]?([a-z_0-9]+?)[\W]?$`) +var matchName = func() func(tableColumn string) (table, column string) { + nameMatcher := regexp.MustCompile(`^(?:\W?(\w+?)\W?\.)?(?:(\*)|\W?(\w+?)\W?)$`) + return func(tableColumn string) (table, column string) { + if matches := nameMatcher.FindStringSubmatch(tableColumn); len(matches) == 4 { + table = matches[1] + star := matches[2] + columnName := matches[3] + if star != "" { + return table, star + } + return table, columnName + } + return "", "" + } +}() // SelectAndOmitColumns get select and omit columns, select -> true, omit -> false func (stmt *Statement) SelectAndOmitColumns(requireCreate, requireUpdate bool) (map[string]bool, bool) { results := map[string]bool{} notRestricted := false - // select columns - for _, column := range stmt.Selects { + processColumn := func(column string, result bool) { if stmt.Schema == nil { - results[column] = true + results[column] = result } else if column == "*" { - notRestricted = true + notRestricted = result for _, dbName := range stmt.Schema.DBNames { - results[dbName] = true + results[dbName] = result } } else if column == clause.Associations { for _, rel := range stmt.Schema.Relationships.Relations { - results[rel.Name] = true + results[rel.Name] = result } } else if field := stmt.Schema.LookUpField(column); field != nil && field.DBName != "" { - results[field.DBName] = true - } else if matches := nameMatcher.FindStringSubmatch(column); len(matches) == 2 { - results[matches[1]] = true + results[field.DBName] = result + } else if table, col := matchName(column); col != "" && (table == stmt.Table || table == "") { + if col == "*" { + for _, dbName := range stmt.Schema.DBNames { + results[dbName] = result + } + } else { + results[col] = result + } } else { - results[column] = true + results[column] = result } } + // select columns + for _, column := range stmt.Selects { + processColumn(column, true) + } + // omit columns - for _, omit := range stmt.Omits { - if stmt.Schema == nil { - results[omit] = false - } else if omit == "*" { - for _, dbName := range stmt.Schema.DBNames { - results[dbName] = false - } - } else if omit == clause.Associations { - for _, rel := range stmt.Schema.Relationships.Relations { - results[rel.Name] = false - } - } else if field := stmt.Schema.LookUpField(omit); field != nil && field.DBName != "" { - results[field.DBName] = false - } else if matches := nameMatcher.FindStringSubmatch(omit); len(matches) == 2 { - results[matches[1]] = false - } else { - results[omit] = false - } + for _, column := range stmt.Omits { + processColumn(column, false) } if stmt.Schema != nil { diff --git a/vendor/gorm.io/gorm/utils/utils.go b/vendor/gorm.io/gorm/utils/utils.go index 296917b..fc615d7 100644 --- a/vendor/gorm.io/gorm/utils/utils.go +++ b/vendor/gorm.io/gorm/utils/utils.go @@ -3,8 +3,8 @@ package utils import ( "database/sql/driver" "fmt" + "path/filepath" "reflect" - "regexp" "runtime" "strconv" "strings" @@ -16,16 +16,32 @@ var gormSourceDir string func init() { _, file, _, _ := runtime.Caller(0) // compatible solution to get gorm source directory with various operating systems - gormSourceDir = regexp.MustCompile(`utils.utils\.go`).ReplaceAllString(file, "") + gormSourceDir = sourceDir(file) +} + +func sourceDir(file string) string { + dir := filepath.Dir(file) + dir = filepath.Dir(dir) + + s := filepath.Dir(dir) + if filepath.Base(s) != "gorm.io" { + s = dir + } + return filepath.ToSlash(s) + "/" } // FileWithLineNum return the file name and line number of the current file func FileWithLineNum() string { - // the second caller usually from gorm internal, so set i start from 2 - for i := 2; i < 15; i++ { - _, file, line, ok := runtime.Caller(i) - if ok && (!strings.HasPrefix(file, gormSourceDir) || strings.HasSuffix(file, "_test.go")) { - return file + ":" + strconv.FormatInt(int64(line), 10) + pcs := [13]uintptr{} + // the third caller usually from gorm internal + len := runtime.Callers(3, pcs[:]) + frames := runtime.CallersFrames(pcs[:len]) + for i := 0; i < len; i++ { + // second return value is "more", not "ok" + frame, _ := frames.Next() + if (!strings.HasPrefix(frame.File, gormSourceDir) || + strings.HasSuffix(frame.File, "_test.go")) && !strings.HasSuffix(frame.File, ".gen.go") { + return string(strconv.AppendInt(append([]byte(frame.File), ':'), int64(frame.Line), 10)) } } @@ -62,7 +78,11 @@ func ToStringKey(values ...interface{}) string { case uint: results[idx] = strconv.FormatUint(uint64(v), 10) default: - results[idx] = fmt.Sprint(reflect.Indirect(reflect.ValueOf(v)).Interface()) + results[idx] = "nil" + vv := reflect.ValueOf(v) + if vv.IsValid() && !vv.IsZero() { + results[idx] = fmt.Sprint(reflect.Indirect(vv).Interface()) + } } } @@ -78,19 +98,28 @@ func Contains(elems []string, elem string) bool { return false } -func AssertEqual(src, dst interface{}) bool { - if !reflect.DeepEqual(src, dst) { - if valuer, ok := src.(driver.Valuer); ok { - src, _ = valuer.Value() - } +func AssertEqual(x, y interface{}) bool { + if reflect.DeepEqual(x, y) { + return true + } + if x == nil || y == nil { + return false + } - if valuer, ok := dst.(driver.Valuer); ok { - dst, _ = valuer.Value() - } + xval := reflect.ValueOf(x) + yval := reflect.ValueOf(y) + if xval.Kind() == reflect.Ptr && xval.IsNil() || + yval.Kind() == reflect.Ptr && yval.IsNil() { + return false + } - return reflect.DeepEqual(src, dst) + if valuer, ok := x.(driver.Valuer); ok { + x, _ = valuer.Value() } - return true + if valuer, ok := y.(driver.Valuer); ok { + y, _ = valuer.Value() + } + return reflect.DeepEqual(x, y) } func ToString(value interface{}) string { @@ -120,3 +149,31 @@ func ToString(value interface{}) string { } return "" } + +const nestedRelationSplit = "__" + +// NestedRelationName nested relationships like `Manager__Company` +func NestedRelationName(prefix, name string) string { + return prefix + nestedRelationSplit + name +} + +// SplitNestedRelationName Split nested relationships to `[]string{"Manager","Company"}` +func SplitNestedRelationName(name string) []string { + return strings.Split(name, nestedRelationSplit) +} + +// JoinNestedRelationNames nested relationships like `Manager__Company` +func JoinNestedRelationNames(relationNames []string) string { + return strings.Join(relationNames, nestedRelationSplit) +} + +// RTrimSlice Right trims the given slice by given length +func RTrimSlice[T any](v []T, trimLen int) []T { + if trimLen >= len(v) { // trimLen greater than slice len means fully sliced + return v[:0] + } + if trimLen < 0 { // negative trimLen is ignored + return v[:] + } + return v[:len(v)-trimLen] +} diff --git a/vendor/modules.txt b/vendor/modules.txt index a8885a7..d645bea 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1,29 +1,32 @@ # github.com/AudriusButkevicius/pfilter v0.0.11 ## explicit; go 1.20 github.com/AudriusButkevicius/pfilter -# github.com/VictoriaMetrics/metrics v1.24.0 -## explicit; go 1.20 +# github.com/VictoriaMetrics/metrics v1.35.1 +## explicit; go 1.17 github.com/VictoriaMetrics/metrics # github.com/ccding/go-stun/stun v0.0.0-20200514191101-4dc67bcdb029 ## explicit; go 1.14 github.com/ccding/go-stun/stun -# github.com/cespare/xxhash/v2 v2.1.2 +# github.com/cespare/xxhash/v2 v2.3.0 ## explicit; go 1.11 github.com/cespare/xxhash/v2 +# github.com/chen3feng/safecast v0.0.0-20220908170618-81b2ecd47937 +## explicit; go 1.18 +github.com/chen3feng/safecast # github.com/davecgh/go-spew v1.1.1 ## explicit github.com/davecgh/go-spew/spew # github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f ## explicit github.com/dgryski/go-rendezvous -# github.com/fatih/color v1.15.0 +# github.com/fatih/color v1.17.0 ## explicit; go 1.17 github.com/fatih/color # github.com/go-chi/chi v4.1.2+incompatible ## explicit github.com/go-chi/chi github.com/go-chi/chi/middleware -# github.com/go-chi/chi/v5 v5.0.11 +# github.com/go-chi/chi/v5 v5.1.0 ## explicit; go 1.14 github.com/go-chi/chi/v5 github.com/go-chi/chi/v5/middleware @@ -49,20 +52,20 @@ github.com/go-redis/redis/v8/internal/pool github.com/go-redis/redis/v8/internal/proto github.com/go-redis/redis/v8/internal/rand github.com/go-redis/redis/v8/internal/util -# github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 -## explicit; go 1.13 -github.com/go-task/slim-sprig -# github.com/gocarina/gocsv v0.0.0-20230616125104-99d496ca653d +# github.com/go-task/slim-sprig/v3 v3.0.0 +## explicit; go 1.20 +github.com/go-task/slim-sprig/v3 +# github.com/gocarina/gocsv v0.0.0-20240520201108-78e41c74b4b1 ## explicit; go 1.13 github.com/gocarina/gocsv -# github.com/google/pprof v0.0.0-20230821062121-407c9e7a662f -## explicit; go 1.19 +# github.com/google/pprof v0.0.0-20241017200806-017d972448fc +## explicit; go 1.22 github.com/google/pprof/profile -# github.com/google/uuid v1.3.1 +# github.com/google/uuid v1.6.0 ## explicit github.com/google/uuid -# github.com/hashicorp/yamux v0.1.1 -## explicit; go 1.15 +# github.com/hashicorp/yamux v0.1.2 +## explicit; go 1.20 github.com/hashicorp/yamux # github.com/inconshreveable/mousetrap v1.1.0 ## explicit; go 1.18 @@ -70,34 +73,30 @@ github.com/inconshreveable/mousetrap # github.com/ivanpirog/coloredcobra v1.0.1 ## explicit; go 1.15 github.com/ivanpirog/coloredcobra -# github.com/jackc/chunkreader/v2 v2.0.1 -## explicit; go 1.12 -github.com/jackc/chunkreader/v2 -# github.com/jackc/pgconn v1.14.3 -## explicit; go 1.17 -github.com/jackc/pgconn -github.com/jackc/pgconn/internal/ctxwatch -github.com/jackc/pgconn/stmtcache -# github.com/jackc/pgio v1.0.0 -## explicit; go 1.12 -github.com/jackc/pgio # github.com/jackc/pgpassfile v1.0.0 ## explicit; go 1.12 github.com/jackc/pgpassfile -# github.com/jackc/pgproto3/v2 v2.3.3 -## explicit; go 1.12 -github.com/jackc/pgproto3/v2 -# github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a +# github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 ## explicit; go 1.14 github.com/jackc/pgservicefile -# github.com/jackc/pgtype v1.14.0 -## explicit; go 1.13 -github.com/jackc/pgtype -# github.com/jackc/pgx/v4 v4.18.2 -## explicit; go 1.17 -github.com/jackc/pgx/v4 -github.com/jackc/pgx/v4/internal/sanitize -github.com/jackc/pgx/v4/stdlib +# github.com/jackc/pgx/v5 v5.7.1 +## explicit; go 1.21 +github.com/jackc/pgx/v5 +github.com/jackc/pgx/v5/internal/iobufpool +github.com/jackc/pgx/v5/internal/pgio +github.com/jackc/pgx/v5/internal/sanitize +github.com/jackc/pgx/v5/internal/stmtcache +github.com/jackc/pgx/v5/pgconn +github.com/jackc/pgx/v5/pgconn/ctxwatch +github.com/jackc/pgx/v5/pgconn/internal/bgreader +github.com/jackc/pgx/v5/pgproto3 +github.com/jackc/pgx/v5/pgtype +github.com/jackc/pgx/v5/pgxpool +github.com/jackc/pgx/v5/stdlib +# github.com/jackc/puddle/v2 v2.2.2 +## explicit; go 1.19 +github.com/jackc/puddle/v2 +github.com/jackc/puddle/v2/internal/genstack # github.com/jinzhu/inflection v1.0.0 ## explicit github.com/jinzhu/inflection @@ -107,11 +106,11 @@ github.com/jinzhu/now # github.com/json-iterator/go v1.1.12 ## explicit; go 1.12 github.com/json-iterator/go -# github.com/klauspost/cpuid/v2 v2.2.5 +# github.com/klauspost/cpuid/v2 v2.2.8 ## explicit; go 1.15 github.com/klauspost/cpuid/v2 -# github.com/klauspost/reedsolomon v1.11.8 -## explicit; go 1.17 +# github.com/klauspost/reedsolomon v1.12.4 +## explicit; go 1.21 github.com/klauspost/reedsolomon # github.com/lib/pq v1.10.9 ## explicit; go 1.13 @@ -121,7 +120,7 @@ github.com/lib/pq/scram # github.com/mattn/go-colorable v0.1.13 ## explicit; go 1.15 github.com/mattn/go-colorable -# github.com/mattn/go-isatty v0.0.19 +# github.com/mattn/go-isatty v0.0.20 ## explicit; go 1.15 github.com/mattn/go-isatty # github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d @@ -133,8 +132,8 @@ github.com/modern-go/concurrent # github.com/modern-go/reflect2 v1.0.2 ## explicit; go 1.12 github.com/modern-go/reflect2 -# github.com/onsi/ginkgo/v2 v2.12.0 -## explicit; go 1.18 +# github.com/onsi/ginkgo/v2 v2.20.2 +## explicit; go 1.22 github.com/onsi/ginkgo/v2/config github.com/onsi/ginkgo/v2/formatter github.com/onsi/ginkgo/v2/ginkgo @@ -151,8 +150,8 @@ github.com/onsi/ginkgo/v2/internal/interrupt_handler github.com/onsi/ginkgo/v2/internal/parallel_support github.com/onsi/ginkgo/v2/reporters github.com/onsi/ginkgo/v2/types -# github.com/pires/go-proxyproto v0.6.2 -## explicit; go 1.13 +# github.com/pires/go-proxyproto v0.8.0 +## explicit; go 1.18 github.com/pires/go-proxyproto # github.com/pkg/errors v0.9.1 ## explicit @@ -160,14 +159,13 @@ github.com/pkg/errors # github.com/pmezard/go-difflib v1.0.0 ## explicit github.com/pmezard/go-difflib/difflib -# github.com/quic-go/quic-go v0.42.0 -## explicit; go 1.21 +# github.com/quic-go/quic-go v0.48.0 +## explicit; go 1.22 github.com/quic-go/quic-go github.com/quic-go/quic-go/internal/ackhandler github.com/quic-go/quic-go/internal/congestion github.com/quic-go/quic-go/internal/flowcontrol github.com/quic-go/quic-go/internal/handshake -github.com/quic-go/quic-go/internal/logutils github.com/quic-go/quic-go/internal/protocol github.com/quic-go/quic-go/internal/qerr github.com/quic-go/quic-go/internal/qtls @@ -177,12 +175,14 @@ github.com/quic-go/quic-go/internal/utils/ringbuffer github.com/quic-go/quic-go/internal/wire github.com/quic-go/quic-go/logging github.com/quic-go/quic-go/quicvarint +# github.com/rogpeppe/go-internal v1.13.1 +## explicit; go 1.22 # github.com/sirupsen/logrus v1.9.3 ## explicit; go 1.13 github.com/sirupsen/logrus github.com/sirupsen/logrus/hooks/syslog -# github.com/skycoin/dmsg v1.3.28 -## explicit; go 1.21 +# github.com/skycoin/dmsg v1.3.29-0.20241019182716-022283c93835 +## explicit; go 1.22 github.com/skycoin/dmsg/internal/servermetrics github.com/skycoin/dmsg/pkg/direct github.com/skycoin/dmsg/pkg/disc @@ -231,7 +231,7 @@ github.com/skycoin/skywire-utilities/pkg/netutil github.com/skycoin/skywire-utilities/pkg/networkmonitor github.com/skycoin/skywire-utilities/pkg/storeconfig github.com/skycoin/skywire-utilities/pkg/tcpproxy -# github.com/spf13/cobra v1.7.0 +# github.com/spf13/cobra v1.8.1 ## explicit; go 1.15 github.com/spf13/cobra # github.com/spf13/pflag v1.0.5 @@ -263,15 +263,15 @@ github.com/valyala/histogram # github.com/xtaci/kcp-go v5.4.20+incompatible ## explicit github.com/xtaci/kcp-go -# go.etcd.io/bbolt v1.3.7 -## explicit; go 1.17 +# go.etcd.io/bbolt v1.3.11 +## explicit; go 1.22 go.etcd.io/bbolt -# go.uber.org/mock v0.4.0 -## explicit; go 1.20 +# go.uber.org/mock v0.5.0 +## explicit; go 1.22 go.uber.org/mock/mockgen go.uber.org/mock/mockgen/model -# golang.org/x/crypto v0.20.0 -## explicit; go 1.18 +# golang.org/x/crypto v0.28.0 +## explicit; go 1.20 golang.org/x/crypto/blake2b golang.org/x/crypto/blake2s golang.org/x/crypto/blowfish @@ -279,7 +279,6 @@ golang.org/x/crypto/cast5 golang.org/x/crypto/chacha20 golang.org/x/crypto/chacha20poly1305 golang.org/x/crypto/curve25519 -golang.org/x/crypto/curve25519/internal/field golang.org/x/crypto/hkdf golang.org/x/crypto/internal/alias golang.org/x/crypto/internal/poly1305 @@ -290,33 +289,36 @@ golang.org/x/crypto/ssh/terminal golang.org/x/crypto/tea golang.org/x/crypto/twofish golang.org/x/crypto/xtea -# golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 -## explicit; go 1.20 +# golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c +## explicit; go 1.22.0 golang.org/x/exp/rand -# golang.org/x/mod v0.12.0 -## explicit; go 1.17 +# golang.org/x/mod v0.21.0 +## explicit; go 1.22.0 golang.org/x/mod/internal/lazyregexp golang.org/x/mod/modfile golang.org/x/mod/module golang.org/x/mod/semver -# golang.org/x/net v0.21.0 +# golang.org/x/net v0.30.0 ## explicit; go 1.18 golang.org/x/net/bpf golang.org/x/net/internal/iana golang.org/x/net/internal/socket golang.org/x/net/ipv4 golang.org/x/net/ipv6 -# golang.org/x/sys v0.20.0 +# golang.org/x/sync v0.8.0 +## explicit; go 1.18 +golang.org/x/sync/errgroup +golang.org/x/sync/semaphore +# golang.org/x/sys v0.26.0 ## explicit; go 1.18 golang.org/x/sys/cpu -golang.org/x/sys/execabs golang.org/x/sys/plan9 golang.org/x/sys/unix golang.org/x/sys/windows -# golang.org/x/term v0.17.0 +# golang.org/x/term v0.25.0 ## explicit; go 1.18 golang.org/x/term -# golang.org/x/text v0.14.0 +# golang.org/x/text v0.19.0 ## explicit; go 1.18 golang.org/x/text/cases golang.org/x/text/internal @@ -331,29 +333,39 @@ golang.org/x/text/transform golang.org/x/text/unicode/bidi golang.org/x/text/unicode/norm golang.org/x/text/width -# golang.org/x/tools v0.12.1-0.20230815132531-74c255bcf846 -## explicit; go 1.18 +# golang.org/x/tools v0.26.0 +## explicit; go 1.22.0 +golang.org/x/tools/cover golang.org/x/tools/go/ast/astutil golang.org/x/tools/go/ast/inspector +golang.org/x/tools/go/gcexportdata +golang.org/x/tools/go/packages +golang.org/x/tools/go/types/objectpath +golang.org/x/tools/go/types/typeutil golang.org/x/tools/imports +golang.org/x/tools/internal/aliases golang.org/x/tools/internal/event golang.org/x/tools/internal/event/core golang.org/x/tools/internal/event/keys golang.org/x/tools/internal/event/label -golang.org/x/tools/internal/event/tag -golang.org/x/tools/internal/fastwalk +golang.org/x/tools/internal/gcimporter golang.org/x/tools/internal/gocommand golang.org/x/tools/internal/gopathwalk golang.org/x/tools/internal/imports +golang.org/x/tools/internal/packagesinternal +golang.org/x/tools/internal/pkgbits +golang.org/x/tools/internal/stdlib golang.org/x/tools/internal/typeparams +golang.org/x/tools/internal/typesinternal +golang.org/x/tools/internal/versions # gopkg.in/yaml.v3 v3.0.1 ## explicit gopkg.in/yaml.v3 -# gorm.io/driver/postgres v1.3.8 -## explicit; go 1.14 +# gorm.io/driver/postgres v1.5.9 +## explicit; go 1.19 gorm.io/driver/postgres -# gorm.io/gorm v1.23.8 -## explicit; go 1.14 +# gorm.io/gorm v1.25.12 +## explicit; go 1.18 gorm.io/gorm gorm.io/gorm/callbacks gorm.io/gorm/clause