本篇介绍
AVX是SSE的扩展版,使用256位(32字节)的ymm寄存器,本篇看下AVX相关的指令。
AVX
AVX(Advanced Vector Extensions)用的是ymm寄存器,每个256位(32字节)。 先看一个数据运算的例子:
; avx_unaligned.asm
extern printf

section .data
; Two single-precision input vectors: 8 x 4 bytes = 32 bytes each,
; i.e. exactly one ymm register. No alignment directive is used here;
; the code reads them with unaligned moves.
spvector1       dd      1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1
spvector2       dd      1.2, 1.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2

; Two double-precision input vectors: 4 x 8 bytes = 32 bytes each.
dpvector1       dq      1.1, 2.2, 3.3, 4.4
dpvector2       dq      5.5, 6.6, 7.7, 8.8

; Headings printed by main (NUL-terminated, 10 = line feed).
fmt1            db      "Single Precision Vector 1:",10,0
fmt2            db      10,"Single Precision Vector 2:",10,0
fmt3            db      10,"Sum of Single Precision Vector 1 and Vector 2:",10,0
fmt4            db      10,"Double Precision Vector 1:",10,0
fmt5            db      10,"Double Precision Vector 2:",10,0
fmt6            db      10,"Sum of Double Precision Vector 1 and Vector 2:",10,0

section .bss
; Result buffers, each 32 bytes (one ymm register).
spvector_res    resd    8
dpvector_res    resq    4
section .text
global main
;-----------------------------------------------------------------------
; int main(void)
; ABI:   SysV AMD64 (NASM syntax)
; Adds two 8-element single-precision vectors and two 4-element
; double-precision vectors with AVX, stores the sums in spvector_res /
; dpvector_res and prints operands and results.
; Out:   eax = 0
; Calls: printf, printspfpv (rsi = float[8]), printdpfpv (rsi = double[4])
;-----------------------------------------------------------------------
main:
        push    rbp
        mov     rbp, rsp                ; rsp is 16-byte aligned from here on

; SINGLE PRECISION FLOATING POINT VECTORS
        vmovups ymm0, [rel spvector1]   ; unaligned load of 8 floats
        ; Split ymm0 into its two 128-bit halves. Separate destinations are
        ; used so both halves stay visible (e.g. in a debugger).
        vextractf128 xmm2, ymm0, 0      ; low  half of ymm0
        vextractf128 xmm3, ymm0, 1      ; high half of ymm0

        vmovups ymm1, [rel spvector2]
        vextractf128 xmm2, ymm1, 0      ; low  half of ymm1
        vextractf128 xmm3, ymm1, 1      ; high half of ymm1

        vaddps  ymm2, ymm0, ymm1        ; 8 float additions at once
        vmovups [rel spvector_res], ymm2
        vzeroupper                      ; clean upper halves before libc/SSE code

        ; print the vectors
        lea     rdi, [rel fmt1]
        xor     eax, eax                ; variadic call: no vector register args
        call    printf
        lea     rsi, [rel spvector1]
        call    printspfpv

        lea     rdi, [rel fmt2]
        xor     eax, eax
        call    printf
        lea     rsi, [rel spvector2]
        call    printspfpv

        lea     rdi, [rel fmt3]
        xor     eax, eax
        call    printf
        lea     rsi, [rel spvector_res]
        call    printspfpv

; DOUBLE PRECISION FLOATING POINT VECTORS
        vmovupd ymm0, [rel dpvector1]   ; unaligned load of 4 doubles
        vextractf128 xmm2, ymm0, 0      ; low  half of ymm0
        vextractf128 xmm3, ymm0, 1      ; high half of ymm0

        vmovupd ymm1, [rel dpvector2]
        vextractf128 xmm2, ymm1, 0      ; low  half of ymm1
        vextractf128 xmm3, ymm1, 1      ; high half of ymm1

        vaddpd  ymm2, ymm0, ymm1        ; 4 double additions at once
        vmovupd [rel dpvector_res], ymm2
        vzeroupper

        ; print the vectors
        lea     rdi, [rel fmt4]
        xor     eax, eax
        call    printf
        lea     rsi, [rel dpvector1]
        call    printdpfpv

        lea     rdi, [rel fmt5]
        xor     eax, eax
        call    printf
        lea     rsi, [rel dpvector2]
        call    printdpfpv

        lea     rdi, [rel fmt6]
        xor     eax, eax
        call    printf
        lea     rsi, [rel dpvector_res]
        call    printdpfpv

        xor     eax, eax                ; exit status 0
        leave
        ret
;-----------------------------------------------------------------------
; printspfpv - print 8 single-precision floats as "%.1f, " plus newline
; ABI:   SysV AMD64 (NASM syntax)
; In:    rsi = address of 8 consecutive 32-bit floats
; Keeps: rcx, rbx, r12 (saved and restored here)
; Clobb: rax, rdx, rsi, rdi, r8-r11, xmm0 and whatever printf clobbers
;-----------------------------------------------------------------------
printspfpv:
section .data
.NL     db      10,0
.fmt1   db      "%.1f, ",0
section .text
        push    rbp
        mov     rbp, rsp
        push    rcx                     ; preserved for the caller
        push    rbx                     ; rbx = address of the current element
        push    r12                     ; r12 = elements left to print
        sub     rsp, 8                  ; 3 pushes + 8 keeps rsp 16-byte aligned

        mov     rbx, rsi
        mov     r12d, 8
.loop:
        vmovss  xmm0, [rbx]
        vcvtss2sd xmm0, xmm0, xmm0      ; printf's %f expects a double
        lea     rdi, [rel .fmt1]
        mov     eax, 1                  ; variadic call: one vector register used
        call    printf
        add     rbx, 4                  ; next float
        dec     r12
        jnz     .loop

        lea     rdi, [rel .NL]
        xor     eax, eax                ; variadic call: no vector registers used
        call    printf

        add     rsp, 8
        pop     r12
        pop     rbx
        pop     rcx
        leave
        ret
;-----------------------------------------------------------------------
; printdpfpv - print 4 doubles as "%.1f, %.1f, %.1f, %.1f" plus newline
; ABI:   SysV AMD64 (NASM syntax)
; In:    rsi = address of 4 consecutive 64-bit doubles
; Clobb: rax, rdi, xmm0-xmm3 and whatever printf clobbers
;-----------------------------------------------------------------------
printdpfpv:
section .data
.NL     db      10,0
.fmt    db      "%.1f, %.1f, %.1f, %.1f",0
section .text
        push    rbp
        mov     rbp, rsp                ; rsp 16-byte aligned for the calls below

        vmovsd  xmm0, [rsi]
        vmovsd  xmm1, [rsi+8]
        vmovsd  xmm2, [rsi+16]
        vmovsd  xmm3, [rsi+24]
        lea     rdi, [rel .fmt]
        mov     eax, 4                  ; variadic call: four vector registers used
        call    printf

        lea     rdi, [rel .NL]
        xor     eax, eax                ; variadic call: no vector registers used
        call    printf

        leave
        ret
结果如下:
Single Precision Vector 1:
1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1,
Single Precision Vector 2:
1.2, 1.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2,
Sum of Single Precision Vector 1 and Vector 2:
2.3, 3.3, 6.3, 8.3, 10.3, 12.3, 14.3, 16.3,
Double Precision Vector 1:
1.1, 2.2, 3.3, 4.4
Double Precision Vector 2:
5.5, 6.6, 7.7, 8.8
Sum of Double Precision Vector 1 and Vector 2:
6.6, 8.8, 11.0, 13.2
vmovups可以将未对齐的数据拷贝到ymm寄存器中。 vextractf128 可以将ymm中的数据提取出来,每次128位(16字节)。 接下来再看一个矩阵转置的例子,对比普通指令和AVX的差异,就可以看到AVX指令的性能优势了:
; transpose.asm
extern printf

section .data
; Headings and report lines printed by main (NUL-terminated, 10 = line feed).
fmt0    db      "4x4 DOUBLE PRECISION FLOATING POINT MATRIX TRANSPOSE",10,0
fmt1    db      10,"This is the matrix:",10,0
fmt2    db      10,"This is the transpose (sequential version): ",10,0
fmt3    db      10,"This is the transpose (AVX version): ",10,0
; The loop count and the cycle counts are passed to printf as 64-bit
; values, so they are formatted with %lu rather than %d.
fmt4    db      10,"Number of loops: %lu",10,0
fmt5    db      "Sequential version elapsed cycles: %lu",10,0
fmt6    db      "AVX Shuffle version elapsed cycles: %lu",10,0

; 4x4 row-major matrix of doubles; 32-byte aligned because the AVX
; version reads it with aligned moves (vmovapd).
        align   32
matrix  dq       1.,  2.,  3.,  4.
        dq       5.,  6.,  7.,  8.
        dq       9., 10., 11., 12.
        dq      13., 14., 15., 16.

loops   dq      10000                   ; repetitions of each transpose

section .bss
; Output matrix; 32-byte aligned because the AVX version writes it
; with aligned moves (vmovapd).
        alignb  32
transpose resq  16

; Time-stamp counter samples (high / low 32 bits of edx:eax).
bahi_cy resq    1                       ; AVX version: begin, high
balo_cy resq    1                       ; AVX version: begin, low
eahi_cy resq    1                       ; AVX version: end, high
ealo_cy resq    1                       ; AVX version: end, low
bshi_cy resq    1                       ; sequential version: begin, high
bslo_cy resq    1                       ; sequential version: begin, low
eshi_cy resq    1                       ; sequential version: end, high
eslo_cy resq    1                       ; sequential version: end, low
section .text
global main
;-----------------------------------------------------------------------
; int main(void)
; ABI:   SysV AMD64 (NASM syntax)
; Prints the 4x4 matrix, transposes it [loops] times with the sequential
; routine and with the AVX routine, prints both results and the elapsed
; time-stamp-counter cycles of each run.
; Out:   eax = 0
; Notes: cpuid overwrites eax, ebx, ecx and edx, and rdtsc/rdtscp
;        overwrite edx:eax. Therefore rbx (callee-saved) is preserved
;        here, and the arguments of the transpose routines are loaded
;        only AFTER the start timestamp has been taken - loading rdx
;        before rdtsc would hand the routine a destroyed loop count.
;-----------------------------------------------------------------------
main:
        push    rbp
        mov     rbp, rsp
        push    rbx                     ; cpuid clobbers rbx
        sub     rsp, 8                  ; keep rsp 16-byte aligned for calls

        ; print title
        lea     rdi, [rel fmt0]
        xor     eax, eax                ; variadic call: no vector register args
        call    printf

        ; print matrix
        lea     rdi, [rel fmt1]
        xor     eax, eax
        call    printf
        lea     rsi, [rel matrix]
        call    printm4x4

; SEQUENTIAL VERSION
        ; start measuring the cycles
        xor     eax, eax                ; cpuid leaf 0; cpuid used as a serializing barrier
        cpuid
        rdtsc
        mov     [rel bshi_cy], edx
        mov     [rel bslo_cy], eax

        ; compute transpose
        lea     rdi, [rel matrix]
        lea     rsi, [rel transpose]
        mov     rdx, [rel loops]
        call    seq_transpose

        ; stop measuring the cycles
        rdtscp
        mov     [rel eshi_cy], edx
        mov     [rel eslo_cy], eax
        xor     eax, eax
        cpuid

        ; print the result
        lea     rdi, [rel fmt2]
        xor     eax, eax
        call    printf
        lea     rsi, [rel transpose]
        call    printm4x4

; AVX VERSION
        ; start measuring the cycles
        xor     eax, eax
        cpuid
        rdtsc
        mov     [rel bahi_cy], edx
        mov     [rel balo_cy], eax

        ; compute transpose
        lea     rdi, [rel matrix]
        lea     rsi, [rel transpose]
        mov     rdx, [rel loops]
        call    AVX_transpose

        ; stop measuring the cycles
        rdtscp
        mov     [rel eahi_cy], edx
        mov     [rel ealo_cy], eax
        xor     eax, eax
        cpuid

        ; print the result
        lea     rdi, [rel fmt3]
        xor     eax, eax
        call    printf
        lea     rsi, [rel transpose]
        call    printm4x4

        ; print the loops
        lea     rdi, [rel fmt4]
        mov     rsi, [rel loops]
        xor     eax, eax
        call    printf

; print the cycles
        ; Sequential version. The samples were stored as 32-bit values,
        ; so they are read back with 32-bit loads (which zero-extend).
        mov     edx, [rel eslo_cy]
        mov     esi, [rel eshi_cy]
        shl     rsi, 32
        or      rsi, rdx                ; rsi = end time
        mov     r8d, [rel bslo_cy]
        mov     r9d, [rel bshi_cy]
        shl     r9, 32
        or      r9, r8                  ; r9 = start time
        sub     rsi, r9                 ; rsi = elapsed cycles
        lea     rdi, [rel fmt5]
        xor     eax, eax
        call    printf

        ; AVX shuffle version
        mov     edx, [rel ealo_cy]
        mov     esi, [rel eahi_cy]
        shl     rsi, 32
        or      rsi, rdx                ; rsi = end time
        mov     r8d, [rel balo_cy]
        mov     r9d, [rel bahi_cy]
        shl     r9, 32
        or      r9, r8                  ; r9 = start time
        sub     rsi, r9                 ; rsi = elapsed cycles
        lea     rdi, [rel fmt6]
        xor     eax, eax
        call    printf

        xor     eax, eax                ; exit status 0
        mov     rbx, [rbp-8]            ; restore callee-saved rbx
        leave
        ret
;---------------------------------------------------------------
; seq_transpose - scalar transpose of a 4x4 matrix of doubles
; C-like: void seq_transpose(const double *src, double *dst,
;                            unsigned long loops)
; ABI:   SysV AMD64 (NASM syntax)
; In:    rdi = source matrix      (16 doubles, row-major)
;        rsi = destination matrix (16 doubles)
;        rdx = number of times to repeat the transpose (0 = do nothing)
; Clobb: rax, rdx, r8, r9, r10, xmm0, flags
;        (only volatile registers; nothing is left on the stack)
;---------------------------------------------------------------
seq_transpose:
        push    rbp
        mov     rbp, rsp
        test    rdx, rdx
        jz      .done                   ; a zero count must not underflow into 2^64 passes
.loopx:                                 ; one pass per requested repetition
        xor     r10d, r10d              ; r10 = source byte offset, walks 0,8,...,120
        xor     eax, eax                ; rax = destination byte offset
        mov     r8d, 4                  ; r8  = source rows left
.loopo:
        mov     r9d, 4                  ; r9  = elements left in this source row
.loopi:
        movsd   xmm0, [rdi+r10]         ; src[row][col]
        movsd   [rsi+rax], xmm0         ; dst[col][row]
        add     r10, 8                  ; next element of the source row
        add     rax, 32                 ; next row of the destination column
        dec     r9
        jnz     .loopi
        sub     rax, 120                ; back to row 0, one column right: -4*32 + 8
        dec     r8
        jnz     .loopo
        dec     rdx
        jnz     .loopx
.done:
        leave
        ret
;---------------------------------------------------------------
; AVX_transpose - transpose a 4x4 matrix of doubles with AVX
; C-like: void AVX_transpose(const double *src, double *dst,
;                            unsigned long loops)
; ABI:   SysV AMD64 (NASM syntax)
; In:    rdi = source matrix      (16 doubles, row-major, 32-byte aligned)
;        rsi = destination matrix (16 doubles, 32-byte aligned)
;        rdx = number of times to repeat the transpose (0 = do nothing)
; Clobb: rdx, ymm0-ymm3, ymm12-ymm15, flags
; Notes: vmovapd faults on addresses that are not 32-byte aligned.
;---------------------------------------------------------------
AVX_transpose:
        push    rbp
        mov     rbp, rsp
        test    rdx, rdx
        jz      .done                   ; a zero count must not underflow into 2^64 passes
.loopx:                                 ; one pass per requested repetition
        ; load the four rows
        vmovapd ymm0, [rdi]             ;  1  2  3  4
        vmovapd ymm1, [rdi+32]          ;  5  6  7  8
        vmovapd ymm2, [rdi+64]          ;  9 10 11 12
        vmovapd ymm3, [rdi+96]          ; 13 14 15 16
        ; shuffle: interleave element pairs of two rows within each 128-bit lane
        vshufpd ymm12, ymm0, ymm1, 0000b        ;  1  5  3  7
        vshufpd ymm13, ymm0, ymm1, 1111b        ;  2  6  4  8
        vshufpd ymm14, ymm2, ymm3, 0000b        ;  9 13 11 15
        vshufpd ymm15, ymm2, ymm3, 1111b        ; 10 14 12 16
        ; permute: combine 128-bit lanes (0x20 = both low lanes, 0x31 = both high lanes)
        vperm2f128 ymm0, ymm12, ymm14, 00100000b ;  1  5  9 13
        vperm2f128 ymm1, ymm13, ymm15, 00100000b ;  2  6 10 14
        vperm2f128 ymm2, ymm12, ymm14, 00110001b ;  3  7 11 15
        vperm2f128 ymm3, ymm13, ymm15, 00110001b ;  4  8 12 16
        ; write the transposed rows
        vmovapd [rsi],    ymm0
        vmovapd [rsi+32], ymm1
        vmovapd [rsi+64], ymm2
        vmovapd [rsi+96], ymm3
        dec     rdx
        jnz     .loopx
.done:
        vzeroupper                      ; caller continues with legacy SSE / libc code
        leave
        ret
;---------------------------------------------------------------
; printm4x4 - print a 4x4 matrix of doubles, one row per line
; ABI:   SysV AMD64 (NASM syntax)
; In:    rsi = address of 16 consecutive doubles (row-major)
; Keeps: rbx, r15 (saved and restored here)
; Clobb: rax, rdi, xmm0-xmm3 and whatever printf clobbers
; Notes: the loop state lives in callee-saved registers, so nothing has
;        to be pushed around the printf call and the stack alignment
;        is fixed once in the prologue.
;---------------------------------------------------------------
printm4x4:
section .data
.fmt    db      "%f",9,"%f",9, "%f",9,"%f",10,0
section .text
        push    rbp
        mov     rbp, rsp
        push    rbx                     ; rbx = address of the current row
        push    r15                     ; r15 = rows left; 2 pushes keep rsp 16-byte aligned

        mov     rbx, rsi
        mov     r15d, 4
.loop:
        movsd   xmm0, [rbx]
        movsd   xmm1, [rbx+8]
        movsd   xmm2, [rbx+16]
        movsd   xmm3, [rbx+24]
        lea     rdi, [rel .fmt]
        mov     eax, 4                  ; variadic call: four vector registers used
        call    printf
        add     rbx, 32                 ; next row (4 doubles)
        dec     r15
        jnz     .loop

        pop     r15
        pop     rbx
        leave
        ret
结果如下:
4x4 DOUBLE PRECISION FLOATING POINT MATRIX TRANSPOSE
This is the matrix:
1.000000 2.000000 3.000000 4.000000
5.000000 6.000000 7.000000 8.000000
9.000000 10.000000 11.000000 12.000000
13.000000 14.000000 15.000000 16.000000
This is the transpose (sequential version):
1.000000 5.000000 9.000000 13.000000
2.000000 6.000000 10.000000 14.000000
3.000000 7.000000 11.000000 15.000000
4.000000 8.000000 12.000000 16.000000
This is the transpose (AVX version):
1.000000 5.000000 9.000000 13.000000
2.000000 6.000000 10.000000 14.000000
3.000000 7.000000 11.000000 15.000000
4.000000 8.000000 12.000000 16.000000
Number of loops: 10000
Sequential version elapsed cycles: 8654387
AVX Shuffle version elapsed cycles: 814357
总结
本次汇编学习到此就先结束了,有了这些基础,相信接下来看汇编代码会容易很多。