GCM for SM4
The GCM assembly code for SM4 was adapted from the AES GCM implementation. It mainly consists of the following three functions:
```go
//go:noescape
func precomputeTableAsm(productTable *[256]byte, src *[16]byte)

//go:noescape
func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)

//go:noescape
func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
```
- gcmSm4Finish is unchanged and is identical to gcmAesFinish;
- precomputeTableAsm differs from gcmAesInit in that it contains no encryption step; its input parameter is already the encrypted result (the hash key H);
- gcmSm4Data differs from gcmAesData in that its T parameter is both an input and an output, whereas in the latter it is an output only (the sketch after this list shows the resulting call flow).
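To make the division of labor concrete, here is a minimal, hedged sketch of how the three routines can be wired together, mirroring the call pattern of Go's AES-GCM assembly. It assumes it lives in the same package as the declarations above with crypto/cipher imported; the names makeTag, c (an SM4 cipher.Block), counter0 (the pre-counter block J0), additionalData, and ciphertext are illustrative, not the package's actual code.

```go
// makeTag is an illustrative sketch, not the package's real Seal path.
func makeTag(c cipher.Block, counter0 [16]byte, additionalData, ciphertext []byte) [16]byte {
	var key [16]byte
	c.Encrypt(key[:], key[:]) // H = SM4_K(0^128); the encryption happens here, not inside precomputeTableAsm
	var productTable [256]byte
	precomputeTableAsm(&productTable, &key)

	var tagMask [16]byte
	c.Encrypt(tagMask[:], counter0[:]) // E_K(J0), applied to the tag by gcmSm4Finish

	var T [16]byte
	gcmSm4Data(&productTable, additionalData, &T) // T is read and updated in place
	gcmSm4Data(&productTable, ciphertext, &T)
	gcmSm4Finish(&productTable, &tagMask, &T, uint64(len(ciphertext)), uint64(len(additionalData)))
	return T
}
```

Note that H is computed with the generic block cipher before precomputeTableAsm is called, which is exactly the difference from gcmAesInit noted above.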
Main difficulties:
- The data fed to the underlying CTR encryption is not block-aligned, let alone aligned to 4 or 8 blocks, so handling the tail encryption and the XOR is awkward;
- AMD64 has to support both AVX2 and non-AVX2 paths, so the code is large and fairly complex;
- Interleaving with GHASH to improve performance.

The plan is to first convert the following method to assembly:
```go
// counterCrypt crypts in to out using g.cipher in counter mode.
func (g *gcm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte) {
	mask := make([]byte, g.cipher.blocksSize)
	counters := make([]byte, g.cipher.blocksSize)
	// Full batches: encrypt batchBlocks counter blocks at a time and XOR them in.
	for len(in) >= g.cipher.blocksSize {
		for i := 0; i < g.cipher.batchBlocks; i++ {
			copy(counters[i*gcmBlockSize:(i+1)*gcmBlockSize], counter[:])
			gcmInc32(counter)
		}
		g.cipher.EncryptBlocks(mask, counters)
		xor.XorWords(out, in, mask[:])
		out = out[g.cipher.blocksSize:]
		in = in[g.cipher.blocksSize:]
	}
	// Tail: fewer than batchBlocks blocks remain; encrypt only as many whole
	// counter blocks as needed and XOR just len(in) bytes of keystream.
	if len(in) > 0 {
		blocks := (len(in) + gcmBlockSize - 1) / gcmBlockSize
		for i := 0; i < blocks; i++ {
			copy(counters[i*gcmBlockSize:], counter[:])
			gcmInc32(counter)
		}
		g.cipher.EncryptBlocks(mask, counters)
		xor.XorBytes(out, in, mask[:blocks*gcmBlockSize])
	}
}
```
The interleaving with GHASH will be handled last.
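For completeness, gcmInc32 used above increments the last 32 bits of the counter block as a big-endian integer (the GCM inc32 operation). A minimal sketch with the same behavior as the Go standard library helper of the same name (assumes encoding/binary is imported):

```go
// gcmInc32 treats the final four bytes of counterBlock as a big-endian counter
// and increments it, wrapping on overflow.
func gcmInc32(counterBlock *[gcmBlockSize]byte) {
	ctr := counterBlock[len(counterBlock)-4:]
	binary.BigEndian.PutUint32(ctr, binary.BigEndian.Uint32(ctr)+1)
}
```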
January 14, 2022
The gcmSm4Init function is done for both AMD64 and ARM64; to keep supporting Go 1.15.x, the VMOVQ instruction had to be dropped.
A first non-AVX(2) AMD64 version of gcmSm4Enc is done; further testing and optimization are in progress.
January 18, 2022
gcmSm4Enc and gcmSm4Dec: both the non-AVX(2) and AVX(2) versions for AMD64 are complete, though the code is somewhat bloated; the ARM64 version is also complete, and the remaining optimization target is the matrix row/column transposition.
ARM64 matrix transposition
```asm
// lanes shown from the most significant to the least significant:
// s0 = s0.S3, s0.S2, s0.S1, s0.S0
// s1 = s1.S3, s1.S2, s1.S1, s1.S0
// s2 = s2.S3, s2.S2, s2.S1, s2.S0
// s3 = s3.S3, s3.S2, s3.S1, s3.S0
#define transpose_4x4(s0, s1, s2, s3) \
	zip1 RTMP0.4s, s0.4s, s1.4s; \ // RTMP0 = s1.S1, s0.S1, s1.S0, s0.S0
	zip1 RTMP1.4s, s2.4s, s3.4s; \ // RTMP1 = s3.S1, s2.S1, s3.S0, s2.S0
	zip2 RTMP2.4s, s0.4s, s1.4s; \ // RTMP2 = s1.S3, s0.S3, s1.S2, s0.S2
	zip2 RTMP3.4s, s2.4s, s3.4s; \ // RTMP3 = s3.S3, s2.S3, s3.S2, s2.S2
	zip1 s0.2d, RTMP0.2d, RTMP1.2d; \ // s0 = s3.S0, s2.S0, s1.S0, s0.S0
	zip2 s1.2d, RTMP0.2d, RTMP1.2d; \ // s1 = s3.S1, s2.S1, s1.S1, s0.S1
	zip1 s2.2d, RTMP2.2d, RTMP3.2d; \ // s2 = s3.S2, s2.S2, s1.S2, s0.S2
	zip2 s3.2d, RTMP2.2d, RTMP3.2d;   // s3 = s3.S3, s2.S3, s1.S3, s0.S3

#define rotate_clockwise_90(s0, s1, s2, s3) \
	zip1 RTMP0.4s, s1.4s, s0.4s; \ // RTMP0 = s0.S1, s1.S1, s0.S0, s1.S0
	zip2 RTMP1.4s, s1.4s, s0.4s; \ // RTMP1 = s0.S3, s1.S3, s0.S2, s1.S2
	zip1 RTMP2.4s, s3.4s, s2.4s; \ // RTMP2 = s2.S1, s3.S1, s2.S0, s3.S0
	zip2 RTMP3.4s, s3.4s, s2.4s; \ // RTMP3 = s2.S3, s3.S3, s2.S2, s3.S2
	zip1 s0.2d, RTMP2.2d, RTMP0.2d; \ // s0 = s0.S0, s1.S0, s2.S0, s3.S0
	zip2 s1.2d, RTMP2.2d, RTMP0.2d; \ // s1 = s0.S1, s1.S1, s2.S1, s3.S1
	zip1 s2.2d, RTMP3.2d, RTMP1.2d; \ // s2 = s0.S2, s1.S2, s2.S2, s3.S2
	zip2 s3.2d, RTMP3.2d, RTMP1.2d;   // s3 = s0.S3, s1.S3, s2.S3, s3.S3
```
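As a reading aid, here is a plain-Go model (illustrative only; the function names are mine) of what the two macros compute when each 128-bit register si is viewed as four uint32 lanes, with lane 0 being .S0: transpose_4x4 produces out[i][j] = in[j][i], while rotate_clockwise_90 produces out[i][j] = in[3-j][i], i.e. a transpose combined with reversing the lane order.

```go
package main

import "fmt"

// transpose4x4 models transpose_4x4: out[i][j] = in[j][i].
func transpose4x4(in [4][4]uint32) (out [4][4]uint32) {
	for i := range out {
		for j := range out[i] {
			out[i][j] = in[j][i]
		}
	}
	return
}

// rotateClockwise90 models rotate_clockwise_90: out[i][j] = in[3-j][i].
func rotateClockwise90(in [4][4]uint32) (out [4][4]uint32) {
	for i := range out {
		for j := range out[i] {
			out[i][j] = in[3-j][i]
		}
	}
	return
}

func main() {
	in := [4][4]uint32{{0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}, {12, 13, 14, 15}}
	fmt.Println(transpose4x4(in))      // [[0 4 8 12] [1 5 9 13] [2 6 10 14] [3 7 11 15]]
	fmt.Println(rotateClockwise90(in)) // [[12 8 4 0] [13 9 5 1] [14 10 6 2] [15 11 7 3]]
}
```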
Reference: Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode (Intel white paper)