]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/go/arrow/math/uint64_sse4_amd64.s
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / go / arrow / math / uint64_sse4_amd64.s
CommitLineData
1d09f67e
TL
//+build !noasm !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT

// _sum_uint64_sse4 adds up `len` consecutive 64-bit integers starting at
// `buf` and stores the 64-bit total through `res`.
//
// Syntax: Go (Plan 9) assembler, amd64. Apart from the argument loads and
// the branches, every instruction is emitted as raw little-endian machine
// code (WORD/LONG/BYTE); the trailing comment on each such line is the
// Intel-syntax mnemonic for those bytes.
//
// Frame: $0-24 -- no locals, 24 bytes of arguments:
//   buf+0(FP)   pointer to the first element (read in 8-byte units)
//   len+8(FP)   element count, compared as an unsigned value
//   res+16(FP)  pointer to the 8-byte slot that receives the sum
// The Go-side prototype is declared outside this file -- TODO confirm that
// its argument names/sizes match the offsets above.
//
// Behaviour visible in the code:
//   * len == 0 stores 0.
//   * All additions are plain 64-bit adds (add / paddq), so the total wraps
//     modulo 2^64.
//   * Vector loads are movdqu, so buf needs no 16-byte alignment.
//
// Register roles:
//   DI  buf                      SI  len (later: scalar elements remaining)
//   DX  res                      R9  len rounded down to a multiple of 4
//   R8  R9 - 4                   CX  element index, later a byte pointer
//   AX  loop counter in the vector code, running sum in the scalar code
//   X0, X1  the two vector accumulators;  X2-X7  temporaries
// Clobbers: AX, CX, SI, R8, R9, X0-X7, flags.
//
// NOTE(review): in a `+build` line a space means OR, so this file is only
// excluded when both `noasm` and `appengine` are set. Presumably that
// matches the generator's intent -- verify against the sibling Go stub's
// constraint before changing it.
TEXT ·_sum_uint64_sse4(SB), $0-24

	MOVQ buf+0(FP), DI
	MOVQ len+8(FP), SI
	MOVQ res+16(FP), DX

	// Dispatch on len: 0 -> store zero; 1..3 -> scalar loop only.
	WORD $0x8548; BYTE $0xf6 // test rsi, rsi
	JE LBB0_1
	LONG $0x03fe8348 // cmp rsi, 3
	JBE LBB0_3
	// R9 = len & ~3: number of elements the vector code will consume.
	WORD $0x8949; BYTE $0xf1 // mov r9, rsi
	LONG $0xfce18349 // and r9, -4
	JE LBB0_3
	// AX = ((R8 >> 2) + 1) & 3 = (number of 4-element groups) mod 4.
	// Those groups are peeled off in LBB0_10 so that what is left for
	// LBB0_13 is a multiple of 16 elements. Only the low two bits of the
	// result are kept, so the 32-bit mov/shr/inc lose nothing.
	LONG $0xfc418d4d // lea r8, [r9 - 4]
	WORD $0x8944; BYTE $0xc0 // mov eax, r8d
	WORD $0xe8c1; BYTE $0x02 // shr eax, 2
	WORD $0xc0ff // inc eax
	LONG $0x03e08348 // and rax, 3
	JE LBB0_8
	// Negate the peel count so the loop can count up to zero; clear the
	// element index and both accumulators.
	WORD $0xf748; BYTE $0xd8 // neg rax
	LONG $0xc0ef0f66 // pxor xmm0, xmm0
	WORD $0xc931 // xor ecx, ecx
	LONG $0xc9ef0f66 // pxor xmm1, xmm1

	// Peel loop: 4 elements (two 16-byte vectors) per iteration.
LBB0_10:
	LONG $0x146f0ff3; BYTE $0xcf // movdqu xmm2, oword [rdi + 8*rcx]
	LONG $0x5c6f0ff3; WORD $0x10cf // movdqu xmm3, oword [rdi + 8*rcx + 16]
	LONG $0xc2d40f66 // paddq xmm0, xmm2
	LONG $0xcbd40f66 // paddq xmm1, xmm3
	LONG $0x04c18348 // add rcx, 4
	WORD $0xff48; BYTE $0xc0 // inc rax
	JNE LBB0_10
	JMP LBB0_11

	// Scalar-only entry (len <= 3): start at index 0 with a zero sum.
LBB0_3:
	WORD $0x3145; BYTE $0xc9 // xor r9d, r9d
	WORD $0xc031 // xor eax, eax

	// Scalar tail setup: CX = &buf[R9], SI = len - R9 elements left.
	// AX already holds the running sum on entry.
LBB0_4:
	LONG $0xcf0c8d4a // lea rcx, [rdi + 8*r9]
	WORD $0x294c; BYTE $0xce // sub rsi, r9

	// Scalar loop: one element per iteration until SI reaches zero.
LBB0_5:
	WORD $0x0348; BYTE $0x01 // add rax, qword [rcx]
	LONG $0x08c18348 // add rcx, 8
	WORD $0xff48; BYTE $0xce // dec rsi
	JNE LBB0_5
	JMP LBB0_15

	// len == 0: the sum is zero.
LBB0_1:
	WORD $0xc031 // xor eax, eax

	// Common exit: *res = AX.
LBB0_15:
	WORD $0x8948; BYTE $0x02 // mov qword [rdx], rax
	RET

	// No peel iterations needed: clear the index and both accumulators.
LBB0_8:
	WORD $0xc931 // xor ecx, ecx
	LONG $0xc0ef0f66 // pxor xmm0, xmm0
	LONG $0xc9ef0f66 // pxor xmm1, xmm1

	// R8 < 12 means R9 < 16, i.e. the peel loop already consumed every
	// vectorised element, so go straight to the reduction. Otherwise
	// AX = R9 - CX elements remain (a multiple of 16) and CX becomes a byte
	// pointer biased by +112, which the loop addresses with displacements
	// -112..0.
LBB0_11:
	LONG $0x0cf88349 // cmp r8, 12
	JB LBB0_14
	WORD $0x894c; BYTE $0xc8 // mov rax, r9
	WORD $0x2948; BYTE $0xc8 // sub rax, rcx
	LONG $0xcf4c8d48; BYTE $0x70 // lea rcx, [rdi + 8*rcx + 112]

	// Main loop: 16 elements (eight 16-byte vectors, 128 bytes) per
	// iteration. Even-numbered vectors are folded into X0, odd-numbered
	// ones into X1.
LBB0_13:
	LONG $0x516f0ff3; BYTE $0x90 // movdqu xmm2, oword [rcx - 112]
	LONG $0x596f0ff3; BYTE $0xa0 // movdqu xmm3, oword [rcx - 96]
	LONG $0x616f0ff3; BYTE $0xb0 // movdqu xmm4, oword [rcx - 80]
	LONG $0x696f0ff3; BYTE $0xc0 // movdqu xmm5, oword [rcx - 64]
	LONG $0xd0d40f66 // paddq xmm2, xmm0
	LONG $0xd9d40f66 // paddq xmm3, xmm1
	LONG $0x716f0ff3; BYTE $0xd0 // movdqu xmm6, oword [rcx - 48]
	LONG $0x796f0ff3; BYTE $0xe0 // movdqu xmm7, oword [rcx - 32]
	LONG $0xf4d40f66 // paddq xmm6, xmm4
	LONG $0xf2d40f66 // paddq xmm6, xmm2
	LONG $0xfdd40f66 // paddq xmm7, xmm5
	LONG $0xfbd40f66 // paddq xmm7, xmm3
	LONG $0x416f0ff3; BYTE $0xf0 // movdqu xmm0, oword [rcx - 16]
	LONG $0x096f0ff3 // movdqu xmm1, oword [rcx]
	LONG $0xc6d40f66 // paddq xmm0, xmm6
	LONG $0xcfd40f66 // paddq xmm1, xmm7
	// sub -128 advances the pointer by 128 bytes; add -16 counts down the
	// remaining elements.
	LONG $0x80e98348 // sub rcx, -128
	LONG $0xf0c08348 // add rax, -16
	JNE LBB0_13

	// Horizontal reduction: merge the two accumulators, swap the 64-bit
	// halves (pshufd imm 78 = 0x4E) and add, leaving the total in the low
	// qword, then move it to AX. If R9 != len, up to three tail elements
	// remain and are added by the scalar path; otherwise store the result.
LBB0_14:
	LONG $0xc1d40f66 // paddq xmm0, xmm1
	LONG $0xc8700f66; BYTE $0x4e // pshufd xmm1, xmm0, 78
	LONG $0xc8d40f66 // paddq xmm1, xmm0
	LONG $0x7e0f4866; BYTE $0xc8 // movq rax, xmm1
	WORD $0x3949; BYTE $0xf1 // cmp r9, rsi
	JNE LBB0_4
	JMP LBB0_15