-
Notifications
You must be signed in to change notification settings - Fork 288
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
mask.go: Use SIMD masking for amd64 and arm64
goos: windows goarch: amd64 pkg: nhooyr.io/websocket cpu: Intel(R) Core(TM) i5-9300H CPU @ 2.40GHz Benchmark_mask/2/basic-8 425339004 2.795 ns/op 715.66 MB/s Benchmark_mask/2/nhooyr-8 379937766 3.186 ns/op 627.78 MB/s Benchmark_mask/2/gorilla-8 392164167 3.071 ns/op 651.24 MB/s Benchmark_mask/2/gobwas-8 310037222 3.880 ns/op 515.46 MB/s Benchmark_mask/3/basic-8 321408024 3.806 ns/op 788.32 MB/s Benchmark_mask/3/nhooyr-8 350726338 3.478 ns/op 862.58 MB/s Benchmark_mask/3/gorilla-8 332217727 3.634 ns/op 825.43 MB/s Benchmark_mask/3/gobwas-8 247376214 4.886 ns/op 614.01 MB/s Benchmark_mask/4/basic-8 261182472 4.582 ns/op 872.91 MB/s Benchmark_mask/4/nhooyr-8 381830712 3.262 ns/op 1226.05 MB/s Benchmark_mask/4/gorilla-8 272616304 4.395 ns/op 910.04 MB/s Benchmark_mask/4/gobwas-8 204574558 5.855 ns/op 683.19 MB/s Benchmark_mask/8/basic-8 191330037 6.162 ns/op 1298.24 MB/s Benchmark_mask/8/nhooyr-8 369694992 3.285 ns/op 2435.65 MB/s Benchmark_mask/8/gorilla-8 175388466 6.743 ns/op 1186.48 MB/s Benchmark_mask/8/gobwas-8 241719933 4.886 ns/op 1637.45 MB/s Benchmark_mask/16/basic-8 100000000 10.92 ns/op 1464.83 MB/s Benchmark_mask/16/nhooyr-8 272565096 4.436 ns/op 3606.98 MB/s Benchmark_mask/16/gorilla-8 100000000 11.20 ns/op 1428.53 MB/s Benchmark_mask/16/gobwas-8 221356798 5.405 ns/op 2960.45 MB/s Benchmark_mask/32/basic-8 61476984 20.40 ns/op 1568.80 MB/s Benchmark_mask/32/nhooyr-8 238665572 5.050 ns/op 6337.22 MB/s Benchmark_mask/32/gorilla-8 100000000 12.09 ns/op 2647.28 MB/s Benchmark_mask/32/gobwas-8 186077235 6.477 ns/op 4940.36 MB/s Benchmark_mask/128/basic-8 14629720 80.90 ns/op 1582.19 MB/s Benchmark_mask/128/nhooyr-8 181241968 6.565 ns/op 19497.98 MB/s Benchmark_mask/128/gorilla-8 68308342 16.76 ns/op 7639.37 MB/s Benchmark_mask/128/gobwas-8 94582026 12.97 ns/op 9872.11 MB/s Benchmark_mask/512/basic-8 3921001 305.6 ns/op 1675.55 MB/s Benchmark_mask/512/nhooyr-8 123102199 9.721 ns/op 52669.11 MB/s Benchmark_mask/512/gorilla-8 32355914 38.18 ns/op 13411.43 MB/s 
Benchmark_mask/512/gobwas-8 31528501 37.80 ns/op 13544.37 MB/s Benchmark_mask/4096/basic-8 491804 2381 ns/op 1720.39 MB/s Benchmark_mask/4096/nhooyr-8 26159691 46.98 ns/op 87187.73 MB/s Benchmark_mask/4096/gorilla-8 4898440 243.6 ns/op 16817.89 MB/s Benchmark_mask/4096/gobwas-8 4336398 277.2 ns/op 14776.40 MB/s Benchmark_mask/16384/basic-8 113842 9623 ns/op 1702.66 MB/s Benchmark_mask/16384/nhooyr-8 8088847 154.5 ns/op 106058.18 MB/s Benchmark_mask/16384/gorilla-8 1282993 933.6 ns/op 17549.90 MB/s Benchmark_mask/16384/gobwas-8 997347 1086 ns/op 15093.49 MB/s We're about 4-5x faster than gorilla now.
- Loading branch information
1 parent
63c0405
commit d4b8f48
Showing
7 changed files
with
257 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
module nhooyr.io/websocket | ||
|
||
go 1.19 | ||
|
||
require golang.org/x/sys v0.13.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= | ||
golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
#include "textflag.h" | ||
|
||
// func maskAsm(b *byte, len int, key uint32) uint32
//
// maskAsm XORs the len bytes starting at b with the repeating 4-byte key,
// in place. The returned key is the input key rotated right by 8 bits for
// every byte masked beyond a multiple of four, so the caller can continue
// masking the following bytes with it.
TEXT ·maskAsm(SB), NOSPLIT, $0-28
	// AX = b
	// CX = len (left length)
	// SI = key (uint32)
	// DI = uint64(SI) | uint64(SI)<<32
	MOVQ b+0(FP), AX
	MOVQ len+8(FP), CX
	MOVL key+16(FP), SI

	// calculate the DI
	// DI = SI<<32 | SI
	MOVL SI, DI
	MOVQ DI, DX
	SHLQ $32, DI
	ORQ  DX, DI

	// Dispatch on length: short inputs go straight to the tail code,
	// medium inputs to the SSE loop, and only inputs above 128 bytes
	// are aligned to 32 bytes first.
	CMPQ  CX, $15
	JLE   less_than_16
	CMPQ  CX, $63
	JLE   less_than_64
	CMPQ  CX, $128
	JLE   sse
	TESTQ $31, AX
	JNZ   unaligned

aligned:
	CMPB ·useAVX2(SB), $1
	JE   avx2
	JMP  sse

unaligned_loop_1byte:
	// Mask one byte at a time until AX is 8-byte aligned. The key is
	// rotated after each byte so its low byte is the next mask byte.
	XORB  SI, (AX)
	INCQ  AX
	DECQ  CX
	ROLL  $24, SI
	TESTQ $7, AX
	JNZ   unaligned_loop_1byte

	// calculate DI again since SI was modified
	// DI = SI<<32 | SI
	MOVL SI, DI
	MOVQ DI, DX
	SHLQ $32, DI
	ORQ  DX, DI

	TESTQ $31, AX
	JZ    aligned

unaligned:
	// Test the low 3 bits of the address; if it is not 8-byte aligned,
	// jump to unaligned_loop_1byte.
	TESTQ $7, AX
	JNZ   unaligned_loop_1byte

unaligned_loop:
	// No need to check CX here: it was above 128 on entry and at most
	// 31 bytes are consumed while aligning AX to 32 bytes.
	XORQ  DI, (AX)
	ADDQ  $8, AX
	SUBQ  $8, CX
	TESTQ $31, AX
	JNZ   unaligned_loop
	JMP   aligned

avx2:
	// Alignment may have dropped CX below 128; fall back to SSE then.
	CMPQ         CX, $0x80
	JL           sse
	VMOVQ        DI, X0
	VPBROADCASTQ X0, Y0

avx2_loop:
	VPXOR   (AX), Y0, Y1
	VPXOR   32(AX), Y0, Y2
	VPXOR   64(AX), Y0, Y3
	VPXOR   96(AX), Y0, Y4
	VMOVDQU Y1, (AX)
	VMOVDQU Y2, 32(AX)
	VMOVDQU Y3, 64(AX)
	VMOVDQU Y4, 96(AX)
	ADDQ    $0x80, AX
	SUBQ    $0x80, CX
	CMPQ    CX, $0x80
	JAE     avx2_loop // loop if CX >= 0x80

	// Clear the upper halves of the YMM registers before running the
	// legacy-SSE code below (and before returning), so that mixing AVX
	// and SSE instructions does not incur transition penalties.
	VZEROUPPER

sse:
	CMPQ       CX, $0x40
	JL         less_than_64
	MOVQ       DI, X0
	PUNPCKLQDQ X0, X0

sse_loop:
	MOVOU 0*16(AX), X1
	MOVOU 1*16(AX), X2
	MOVOU 2*16(AX), X3
	MOVOU 3*16(AX), X4
	PXOR  X0, X1
	PXOR  X0, X2
	PXOR  X0, X3
	PXOR  X0, X4
	MOVOU X1, 0*16(AX)
	MOVOU X2, 1*16(AX)
	MOVOU X3, 2*16(AX)
	MOVOU X4, 3*16(AX)
	ADDQ  $0x40, AX
	SUBQ  $0x40, CX
	CMPQ  CX, $0x40
	JAE   sse_loop

	// From here on CX < 64, so each remaining power-of-two chunk is
	// handled by testing the corresponding bit of CX.
less_than_64:
	TESTQ $32, CX
	JZ    less_than_32
	XORQ  DI, (AX)
	XORQ  DI, 8(AX)
	XORQ  DI, 16(AX)
	XORQ  DI, 24(AX)
	ADDQ  $32, AX

less_than_32:
	TESTQ $16, CX
	JZ    less_than_16
	XORQ  DI, (AX)
	XORQ  DI, 8(AX)
	ADDQ  $16, AX

less_than_16:
	TESTQ $8, CX
	JZ    less_than_8
	XORQ  DI, (AX)
	ADDQ  $8, AX

less_than_8:
	TESTQ $4, CX
	JZ    less_than_4
	XORL  SI, (AX)
	ADDQ  $4, AX

less_than_4:
	TESTQ $2, CX
	JZ    less_than_2
	XORW  SI, (AX)
	ROLL  $16, SI
	ADDQ  $2, AX

less_than_2:
	TESTQ $1, CX
	JZ    done
	XORB  SI, (AX)
	ROLL  $24, SI

done:
	MOVL SI, ret+24(FP)
	RET
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#include "textflag.h" | ||
|
||
// func maskAsm(b *byte, len int, key uint32) uint32
//
// maskAsm XORs the len bytes starting at b with the repeating 4-byte key,
// in place. The returned key is the input key rotated right by 8 bits for
// every byte masked beyond a multiple of four, so the caller can continue
// masking the following bytes with it.
TEXT ·maskAsm(SB), NOSPLIT, $0-28
	// R0 = b
	// R1 = len
	// R2 = uint64(key)<<32 | uint64(key)
	// R3 = key (uint32)
	//
	// The argument names match the Go declaration (b, len, key) so that
	// go vet's asmdecl check accepts the frame references.
	MOVD  b+0(FP), R0
	MOVD  len+8(FP), R1
	MOVWU key+16(FP), R3
	MOVD  R3, R2
	ORR   R2<<32, R2, R2
	VDUP  R2, V0.D2
	CMP   $64, R1
	BLT   less_than_64

// todo: optimize unaligned case
loop_64:
	VLD1   (R0), [V1.B16, V2.B16, V3.B16, V4.B16]
	VEOR   V1.B16, V0.B16, V1.B16
	VEOR   V2.B16, V0.B16, V2.B16
	VEOR   V3.B16, V0.B16, V3.B16
	VEOR   V4.B16, V0.B16, V4.B16
	VST1.P [V1.B16, V2.B16, V3.B16, V4.B16], 64(R0)
	SUBS   $64, R1
	CMP    $64, R1
	BGE    loop_64

	// From here on R1 < 64, so each remaining power-of-two chunk is
	// handled by testing the corresponding bit of R1.
less_than_64:
	// quick end
	CBZ    R1, end
	TBZ    $5, R1, less_than32
	VLD1   (R0), [V1.B16, V2.B16]
	VEOR   V1.B16, V0.B16, V1.B16
	VEOR   V2.B16, V0.B16, V2.B16
	VST1.P [V1.B16, V2.B16], 32(R0)

less_than32:
	TBZ   $4, R1, less_than16
	LDP   (R0), (R11, R12)
	EOR   R11, R2, R11
	EOR   R12, R2, R12
	STP.P (R11, R12), 16(R0)

less_than16:
	TBZ    $3, R1, less_than8
	MOVD   (R0), R11
	EOR    R2, R11, R11
	MOVD.P R11, 8(R0)

less_than8:
	TBZ     $2, R1, less_than4
	MOVWU   (R0), R11
	EORW    R2, R11, R11
	MOVWU.P R11, 4(R0)

less_than4:
	// Two bytes consumed: rotate the key so its low byte is the next
	// mask byte.
	TBZ     $1, R1, less_than2
	MOVHU   (R0), R11
	EORW    R3, R11, R11
	MOVHU.P R11, 2(R0)
	RORW    $16, R3

less_than2:
	// One byte consumed: rotate the key by a further 8 bits.
	TBZ     $0, R1, end
	MOVBU   (R0), R11
	EORW    R3, R11, R11
	MOVBU.P R11, 1(R0)
	RORW    $8, R3

end:
	MOVWU R3, ret+24(FP)
	RET
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
//go:build !appengine && (amd64 || arm64) | ||
// +build !appengine | ||
// +build amd64 arm64 | ||
|
||
package websocket | ||
|
||
import "golang.org/x/sys/cpu" | ||
|
||
func mask(key uint32, b []byte) uint32 { | ||
if len(b) > 0 { | ||
return maskAsm(&b[0], len(b), key) | ||
} | ||
return key | ||
} | ||
|
||
// useAVX2 is read by the amd64 maskAsm assembly to choose between its AVX2 and SSE loops. | ||
var useAVX2 = cpu.X86.HasAVX2 | ||
|
||
// maskAsm is the Go declaration of the assembly routine (TEXT ·maskAsm) provided for amd64 and arm64. | ||
// It masks len bytes starting at b with key, in place, and returns the key rotated for the bytes that follow. | ||
//go:noescape | ||
func maskAsm(b *byte, len int, key uint32) uint32 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
//go:build appengine || (!amd64 && !arm64 && !js) | ||
|
||
package websocket | ||
|
||
func mask(key uint32, b []byte) uint32 { | ||
return maskGo(key, b) | ||
} |