]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /* match.S -- x86 assembly version of the zlib longest_match() function. |
2 | * Optimized for the Intel 686 chips (PPro and later). | |
3 | * | |
4 | * Copyright (C) 1998, 2007 Brian Raiter <breadbox@muppetlabs.com> | |
5 | * | |
6 | * This software is provided 'as-is', without any express or implied | |
7 | * warranty. In no event will the author be held liable for any damages | |
8 | * arising from the use of this software. | |
9 | * | |
10 | * Permission is granted to anyone to use this software for any purpose, | |
11 | * including commercial applications, and to alter it and redistribute it | |
12 | * freely, subject to the following restrictions: | |
13 | * | |
14 | * 1. The origin of this software must not be misrepresented; you must not | |
15 | * claim that you wrote the original software. If you use this software | |
16 | * in a product, an acknowledgment in the product documentation would be | |
17 | * appreciated but is not required. | |
18 | * 2. Altered source versions must be plainly marked as such, and must not be | |
19 | * misrepresented as being the original software. | |
20 | * 3. This notice may not be removed or altered from any source distribution. | |
21 | */ | |
22 | ||
23 | #ifndef NO_UNDERLINE | |
24 | #define match_init _match_init | |
25 | #define longest_match _longest_match | |
26 | #endif | |
27 | ||
28 | #define MAX_MATCH (258) | |
29 | #define MIN_MATCH (3) | |
30 | #define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1) | |
31 | #define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7) | |
32 | ||
33 | /* stack frame offsets */ | |
34 | ||
35 | #define chainlenwmask 0 /* high word: current chain len */ | |
36 | /* low word: s->wmask */ | |
37 | #define window 4 /* local copy of s->window */ | |
38 | #define windowbestlen 8 /* s->window + bestlen */ | |
39 | #define scanstart 16 /* first two bytes of string */ | |
40 | #define scanend 12 /* last two bytes of string */ | |
41 | #define scanalign 20 /* dword-misalignment of string */ | |
42 | #define nicematch 24 /* a good enough match size */ | |
43 | #define bestlen 28 /* size of best match so far */ | |
44 | #define scan 32 /* ptr to string wanting match */ | |
45 | ||
46 | #define LocalVarsSize (36) | |
47 | /* saved ebx 36 */ | |
48 | /* saved edi 40 */ | |
49 | /* saved esi 44 */ | |
50 | /* saved ebp 48 */ | |
51 | /* return address 52 */ | |
52 | #define deflatestate 56 /* the function arguments */ | |
53 | #define curmatch 60 | |
54 | ||
55 | /* All the +zlib1222add offsets are due to the addition of fields | |
56 | * in zlib in the deflate_state structure since the asm code was first written | |
57 | * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). | |
58 | * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). | |
59 | * if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). | |
60 | */ | |
61 | ||
62 | #define zlib1222add (8) | |
63 | ||
64 | #define dsWSize (36+zlib1222add) | |
65 | #define dsWMask (44+zlib1222add) | |
66 | #define dsWindow (48+zlib1222add) | |
67 | #define dsPrev (56+zlib1222add) | |
68 | #define dsMatchLen (88+zlib1222add) | |
69 | #define dsPrevMatch (92+zlib1222add) | |
70 | #define dsStrStart (100+zlib1222add) | |
71 | #define dsMatchStart (104+zlib1222add) | |
72 | #define dsLookahead (108+zlib1222add) | |
73 | #define dsPrevLen (112+zlib1222add) | |
74 | #define dsMaxChainLen (116+zlib1222add) | |
75 | #define dsGoodMatch (132+zlib1222add) | |
76 | #define dsNiceMatch (136+zlib1222add) | |
77 | ||
78 | ||
79 | .file "match.S" | |
80 | ||
81 | .globl match_init, longest_match | |
82 | ||
83 | .text | |
84 | ||
85 | /* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */ | |
86 | .cfi_sections .debug_frame | |
87 | ||
88 | longest_match: | |
89 | ||
90 | .cfi_startproc | |
91 | /* Save registers that the compiler may be using, and adjust %esp to */ | |
92 | /* make room for our stack frame. */ | |
93 | ||
94 | pushl %ebp | |
95 | .cfi_def_cfa_offset 8 | |
96 | .cfi_offset ebp, -8 | |
97 | pushl %edi | |
98 | .cfi_def_cfa_offset 12 | |
99 | pushl %esi | |
100 | .cfi_def_cfa_offset 16 | |
101 | pushl %ebx | |
102 | .cfi_def_cfa_offset 20 | |
103 | subl $LocalVarsSize, %esp | |
104 | .cfi_def_cfa_offset LocalVarsSize+20 | |
105 | ||
106 | /* Retrieve the function arguments. %ecx will hold cur_match */ | |
107 | /* throughout the entire function. %edx will hold the pointer to the */ | |
108 | /* deflate_state structure during the function's setup (before */ | |
109 | /* entering the main loop). */ | |
110 | ||
111 | movl deflatestate(%esp), %edx | |
112 | movl curmatch(%esp), %ecx | |
113 | ||
114 | /* uInt wmask = s->w_mask; */ | |
115 | /* unsigned chain_length = s->max_chain_length; */ | |
116 | /* if (s->prev_length >= s->good_match) { */ | |
117 | /* chain_length >>= 2; */ | |
118 | /* } */ | |
119 | ||
120 | movl dsPrevLen(%edx), %eax | |
121 | movl dsGoodMatch(%edx), %ebx | |
122 | cmpl %ebx, %eax | |
123 | movl dsWMask(%edx), %eax | |
124 | movl dsMaxChainLen(%edx), %ebx | |
125 | jl LastMatchGood | |
126 | shrl $2, %ebx | |
127 | LastMatchGood: | |
128 | ||
129 | /* chainlen is decremented once beforehand so that the function can */ | |
130 | /* use the sign flag instead of the zero flag for the exit test. */ | |
131 | /* It is then shifted into the high word, to make room for the wmask */ | |
132 | /* value, which it will always accompany. */ | |
133 | ||
134 | decl %ebx | |
135 | shll $16, %ebx | |
136 | orl %eax, %ebx | |
137 | movl %ebx, chainlenwmask(%esp) | |
138 | ||
139 | /* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */ | |
140 | ||
141 | movl dsNiceMatch(%edx), %eax | |
142 | movl dsLookahead(%edx), %ebx | |
143 | cmpl %eax, %ebx | |
144 | jl LookaheadLess | |
145 | movl %eax, %ebx | |
146 | LookaheadLess: movl %ebx, nicematch(%esp) | |
147 | ||
148 | /* register Bytef *scan = s->window + s->strstart; */ | |
149 | ||
150 | movl dsWindow(%edx), %esi | |
151 | movl %esi, window(%esp) | |
152 | movl dsStrStart(%edx), %ebp | |
153 | lea (%esi,%ebp), %edi | |
154 | movl %edi, scan(%esp) | |
155 | ||
156 | /* Determine how many bytes the scan ptr is off from being */ | |
157 | /* dword-aligned. */ | |
158 | ||
159 | movl %edi, %eax | |
160 | negl %eax | |
161 | andl $3, %eax | |
162 | movl %eax, scanalign(%esp) | |
163 | ||
164 | /* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ | |
165 | /* s->strstart - (IPos)MAX_DIST(s) : NIL; */ | |
166 | ||
167 | movl dsWSize(%edx), %eax | |
168 | subl $MIN_LOOKAHEAD, %eax | |
169 | subl %eax, %ebp | |
170 | jg LimitPositive | |
171 | xorl %ebp, %ebp | |
172 | LimitPositive: | |
173 | ||
174 | /* int best_len = s->prev_length; */ | |
175 | ||
176 | movl dsPrevLen(%edx), %eax | |
177 | movl %eax, bestlen(%esp) | |
178 | ||
179 | /* Store the sum of s->window + best_len in %esi locally, and in %esi. */ | |
180 | ||
181 | addl %eax, %esi | |
182 | movl %esi, windowbestlen(%esp) | |
183 | ||
184 | /* register ush scan_start = *(ushf*)scan; */ | |
185 | /* register ush scan_end = *(ushf*)(scan+best_len-1); */ | |
186 | /* Posf *prev = s->prev; */ | |
187 | ||
188 | movzwl (%edi), %ebx | |
189 | movl %ebx, scanstart(%esp) | |
190 | movzwl -1(%edi,%eax), %ebx | |
191 | movl %ebx, scanend(%esp) | |
192 | movl dsPrev(%edx), %edi | |
193 | ||
194 | /* Jump into the main loop. */ | |
195 | ||
196 | movl chainlenwmask(%esp), %edx | |
197 | jmp LoopEntry | |
198 | ||
199 | .balign 16 | |
200 | ||
201 | /* do { | |
202 | * match = s->window + cur_match; | |
203 | * if (*(ushf*)(match+best_len-1) != scan_end || | |
204 | * *(ushf*)match != scan_start) continue; | |
205 | * [...] | |
206 | * } while ((cur_match = prev[cur_match & wmask]) > limit | |
207 | * && --chain_length != 0); | |
208 | * | |
209 | * Here is the inner loop of the function. The function will spend the | |
210 | * majority of its time in this loop, and majority of that time will | |
211 | * be spent in the first ten instructions. | |
212 | * | |
213 | * Within this loop: | |
214 | * %ebx = scanend | |
215 | * %ecx = curmatch | |
216 | * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) | |
217 | * %esi = windowbestlen - i.e., (window + bestlen) | |
218 | * %edi = prev | |
219 | * %ebp = limit | |
220 | */ | |
221 | LookupLoop: | |
222 | andl %edx, %ecx | |
223 | movzwl (%edi,%ecx,2), %ecx | |
224 | cmpl %ebp, %ecx | |
225 | jbe LeaveNow | |
226 | subl $0x00010000, %edx | |
227 | js LeaveNow | |
228 | LoopEntry: movzwl -1(%esi,%ecx), %eax | |
229 | cmpl %ebx, %eax | |
230 | jnz LookupLoop | |
231 | movl window(%esp), %eax | |
232 | movzwl (%eax,%ecx), %eax | |
233 | cmpl scanstart(%esp), %eax | |
234 | jnz LookupLoop | |
235 | ||
236 | /* Store the current value of chainlen. */ | |
237 | ||
238 | movl %edx, chainlenwmask(%esp) | |
239 | ||
240 | /* Point %edi to the string under scrutiny, and %esi to the string we */ | |
241 | /* are hoping to match it up with. In actuality, %esi and %edi are */ | |
242 | /* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ | |
243 | /* initialized to -(MAX_MATCH_8 - scanalign). */ | |
244 | ||
245 | movl window(%esp), %esi | |
246 | movl scan(%esp), %edi | |
247 | addl %ecx, %esi | |
248 | movl scanalign(%esp), %eax | |
249 | movl $(-MAX_MATCH_8), %edx | |
250 | lea MAX_MATCH_8(%edi,%eax), %edi | |
251 | lea MAX_MATCH_8(%esi,%eax), %esi | |
252 | ||
253 | /* Test the strings for equality, 8 bytes at a time. At the end, | |
254 | * adjust %edx so that it is offset to the exact byte that mismatched. | |
255 | * | |
256 | * We already know at this point that the first three bytes of the | |
257 | * strings match each other, and they can be safely passed over before | |
258 | * starting the compare loop. So what this code does is skip over 0-3 | |
259 | * bytes, as much as necessary in order to dword-align the %edi | |
260 | * pointer. (%esi will still be misaligned three times out of four.) | |
261 | * | |
262 | * It should be confessed that this loop usually does not represent | |
263 | * much of the total running time. Replacing it with a more | |
264 | * straightforward "rep cmpsb" would not drastically degrade | |
265 | * performance. | |
266 | */ | |
267 | LoopCmps: | |
268 | movl (%esi,%edx), %eax | |
269 | xorl (%edi,%edx), %eax | |
270 | jnz LeaveLoopCmps | |
271 | movl 4(%esi,%edx), %eax | |
272 | xorl 4(%edi,%edx), %eax | |
273 | jnz LeaveLoopCmps4 | |
274 | addl $8, %edx | |
275 | jnz LoopCmps | |
276 | jmp LenMaximum | |
277 | LeaveLoopCmps4: addl $4, %edx | |
278 | LeaveLoopCmps: testl $0x0000FFFF, %eax | |
279 | jnz LenLower | |
280 | addl $2, %edx | |
281 | shrl $16, %eax | |
282 | LenLower: subb $1, %al | |
283 | adcl $0, %edx | |
284 | ||
285 | /* Calculate the length of the match. If it is longer than MAX_MATCH, */ | |
286 | /* then automatically accept it as the best possible match and leave. */ | |
287 | ||
288 | lea (%edi,%edx), %eax | |
289 | movl scan(%esp), %edi | |
290 | subl %edi, %eax | |
291 | cmpl $MAX_MATCH, %eax | |
292 | jge LenMaximum | |
293 | ||
294 | /* If the length of the match is not longer than the best match we */ | |
295 | /* have so far, then forget it and return to the lookup loop. */ | |
296 | ||
297 | movl deflatestate(%esp), %edx | |
298 | movl bestlen(%esp), %ebx | |
299 | cmpl %ebx, %eax | |
300 | jg LongerMatch | |
301 | movl windowbestlen(%esp), %esi | |
302 | movl dsPrev(%edx), %edi | |
303 | movl scanend(%esp), %ebx | |
304 | movl chainlenwmask(%esp), %edx | |
305 | jmp LookupLoop | |
306 | ||
307 | /* s->match_start = cur_match; */ | |
308 | /* best_len = len; */ | |
309 | /* if (len >= nice_match) break; */ | |
310 | /* scan_end = *(ushf*)(scan+best_len-1); */ | |
311 | ||
312 | LongerMatch: movl nicematch(%esp), %ebx | |
313 | movl %eax, bestlen(%esp) | |
314 | movl %ecx, dsMatchStart(%edx) | |
315 | cmpl %ebx, %eax | |
316 | jge LeaveNow | |
317 | movl window(%esp), %esi | |
318 | addl %eax, %esi | |
319 | movl %esi, windowbestlen(%esp) | |
320 | movzwl -1(%edi,%eax), %ebx | |
321 | movl dsPrev(%edx), %edi | |
322 | movl %ebx, scanend(%esp) | |
323 | movl chainlenwmask(%esp), %edx | |
324 | jmp LookupLoop | |
325 | ||
326 | /* Accept the current string, with the maximum possible length. */ | |
327 | ||
328 | LenMaximum: movl deflatestate(%esp), %edx | |
329 | movl $MAX_MATCH, bestlen(%esp) | |
330 | movl %ecx, dsMatchStart(%edx) | |
331 | ||
332 | /* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ | |
333 | /* return s->lookahead; */ | |
334 | ||
335 | LeaveNow: | |
336 | movl deflatestate(%esp), %edx | |
337 | movl bestlen(%esp), %ebx | |
338 | movl dsLookahead(%edx), %eax | |
339 | cmpl %eax, %ebx | |
340 | jg LookaheadRet | |
341 | movl %ebx, %eax | |
342 | LookaheadRet: | |
343 | ||
344 | /* Restore the stack and return from whence we came. */ | |
345 | ||
346 | addl $LocalVarsSize, %esp | |
347 | .cfi_def_cfa_offset 20 | |
348 | popl %ebx | |
349 | .cfi_def_cfa_offset 16 | |
350 | popl %esi | |
351 | .cfi_def_cfa_offset 12 | |
352 | popl %edi | |
353 | .cfi_def_cfa_offset 8 | |
354 | popl %ebp | |
355 | .cfi_def_cfa_offset 4 | |
356 | .cfi_endproc | |
357 | match_init: ret |