]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // (C) Copyright John Maddock 2006. |
2 | // Use, modification and distribution are subject to the | |
3 | // Boost Software License, Version 1.0. (See accompanying file | |
4 | // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
5 | ||
6 | #ifndef BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2 | |
7 | #define BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2 | |
8 | ||
9 | #ifdef _MSC_VER | |
10 | #pragma once | |
11 | #endif | |
12 | ||
13 | #include <emmintrin.h> | |
14 | ||
15 | #if defined(__GNUC__) || defined(__PGI) || defined(__SUNPRO_CC) | |
16 | #define ALIGN16 __attribute__((__aligned__(16))) | |
17 | #else | |
18 | #define ALIGN16 __declspec(align(16)) | |
19 | #endif | |
20 | ||
21 | namespace boost{ namespace math{ namespace lanczos{ | |
22 | ||
23 | template <> | |
24 | inline double lanczos13m53::lanczos_sum<double>(const double& x) | |
25 | { | |
26 | static const ALIGN16 double coeff[26] = { | |
27 | static_cast<double>(2.506628274631000270164908177133837338626L), | |
28 | static_cast<double>(1u), | |
29 | static_cast<double>(210.8242777515793458725097339207133627117L), | |
30 | static_cast<double>(66u), | |
31 | static_cast<double>(8071.672002365816210638002902272250613822L), | |
32 | static_cast<double>(1925u), | |
33 | static_cast<double>(186056.2653952234950402949897160456992822L), | |
34 | static_cast<double>(32670u), | |
35 | static_cast<double>(2876370.628935372441225409051620849613599L), | |
36 | static_cast<double>(357423u), | |
37 | static_cast<double>(31426415.58540019438061423162831820536287L), | |
38 | static_cast<double>(2637558u), | |
39 | static_cast<double>(248874557.8620541565114603864132294232163L), | |
40 | static_cast<double>(13339535u), | |
41 | static_cast<double>(1439720407.311721673663223072794912393972L), | |
42 | static_cast<double>(45995730u), | |
43 | static_cast<double>(6039542586.35202800506429164430729792107L), | |
44 | static_cast<double>(105258076u), | |
45 | static_cast<double>(17921034426.03720969991975575445893111267L), | |
46 | static_cast<double>(150917976u), | |
47 | static_cast<double>(35711959237.35566804944018545154716670596L), | |
48 | static_cast<double>(120543840u), | |
49 | static_cast<double>(42919803642.64909876895789904700198885093L), | |
50 | static_cast<double>(39916800u), | |
51 | static_cast<double>(23531376880.41075968857200767445163675473L), | |
52 | static_cast<double>(0u) | |
53 | }; | |
54 | __m128d vx = _mm_load1_pd(&x); | |
55 | __m128d sum_even = _mm_load_pd(coeff); | |
56 | __m128d sum_odd = _mm_load_pd(coeff+2); | |
57 | __m128d nc_odd, nc_even; | |
58 | __m128d vx2 = _mm_mul_pd(vx, vx); | |
59 | ||
60 | sum_even = _mm_mul_pd(sum_even, vx2); | |
61 | nc_even = _mm_load_pd(coeff + 4); | |
62 | sum_odd = _mm_mul_pd(sum_odd, vx2); | |
63 | nc_odd = _mm_load_pd(coeff + 6); | |
64 | sum_even = _mm_add_pd(sum_even, nc_even); | |
65 | sum_odd = _mm_add_pd(sum_odd, nc_odd); | |
66 | ||
67 | sum_even = _mm_mul_pd(sum_even, vx2); | |
68 | nc_even = _mm_load_pd(coeff + 8); | |
69 | sum_odd = _mm_mul_pd(sum_odd, vx2); | |
70 | nc_odd = _mm_load_pd(coeff + 10); | |
71 | sum_even = _mm_add_pd(sum_even, nc_even); | |
72 | sum_odd = _mm_add_pd(sum_odd, nc_odd); | |
73 | ||
74 | sum_even = _mm_mul_pd(sum_even, vx2); | |
75 | nc_even = _mm_load_pd(coeff + 12); | |
76 | sum_odd = _mm_mul_pd(sum_odd, vx2); | |
77 | nc_odd = _mm_load_pd(coeff + 14); | |
78 | sum_even = _mm_add_pd(sum_even, nc_even); | |
79 | sum_odd = _mm_add_pd(sum_odd, nc_odd); | |
80 | ||
81 | sum_even = _mm_mul_pd(sum_even, vx2); | |
82 | nc_even = _mm_load_pd(coeff + 16); | |
83 | sum_odd = _mm_mul_pd(sum_odd, vx2); | |
84 | nc_odd = _mm_load_pd(coeff + 18); | |
85 | sum_even = _mm_add_pd(sum_even, nc_even); | |
86 | sum_odd = _mm_add_pd(sum_odd, nc_odd); | |
87 | ||
88 | sum_even = _mm_mul_pd(sum_even, vx2); | |
89 | nc_even = _mm_load_pd(coeff + 20); | |
90 | sum_odd = _mm_mul_pd(sum_odd, vx2); | |
91 | nc_odd = _mm_load_pd(coeff + 22); | |
92 | sum_even = _mm_add_pd(sum_even, nc_even); | |
93 | sum_odd = _mm_add_pd(sum_odd, nc_odd); | |
94 | ||
95 | sum_even = _mm_mul_pd(sum_even, vx2); | |
96 | nc_even = _mm_load_pd(coeff + 24); | |
97 | sum_odd = _mm_mul_pd(sum_odd, vx); | |
98 | sum_even = _mm_add_pd(sum_even, nc_even); | |
99 | sum_even = _mm_add_pd(sum_even, sum_odd); | |
100 | ||
101 | ||
102 | double ALIGN16 t[2]; | |
103 | _mm_store_pd(t, sum_even); | |
104 | ||
105 | return t[0] / t[1]; | |
106 | } | |
107 | ||
108 | template <> | |
109 | inline double lanczos13m53::lanczos_sum_expG_scaled<double>(const double& x) | |
110 | { | |
111 | static const ALIGN16 double coeff[26] = { | |
112 | static_cast<double>(0.006061842346248906525783753964555936883222L), | |
113 | static_cast<double>(1u), | |
114 | static_cast<double>(0.5098416655656676188125178644804694509993L), | |
115 | static_cast<double>(66u), | |
116 | static_cast<double>(19.51992788247617482847860966235652136208L), | |
117 | static_cast<double>(1925u), | |
118 | static_cast<double>(449.9445569063168119446858607650988409623L), | |
119 | static_cast<double>(32670u), | |
120 | static_cast<double>(6955.999602515376140356310115515198987526L), | |
121 | static_cast<double>(357423u), | |
122 | static_cast<double>(75999.29304014542649875303443598909137092L), | |
123 | static_cast<double>(2637558u), | |
124 | static_cast<double>(601859.6171681098786670226533699352302507L), | |
125 | static_cast<double>(13339535u), | |
126 | static_cast<double>(3481712.15498064590882071018964774556468L), | |
127 | static_cast<double>(45995730u), | |
128 | static_cast<double>(14605578.08768506808414169982791359218571L), | |
129 | static_cast<double>(105258076u), | |
130 | static_cast<double>(43338889.32467613834773723740590533316085L), | |
131 | static_cast<double>(150917976u), | |
132 | static_cast<double>(86363131.28813859145546927288977868422342L), | |
133 | static_cast<double>(120543840u), | |
134 | static_cast<double>(103794043.1163445451906271053616070238554L), | |
135 | static_cast<double>(39916800u), | |
136 | static_cast<double>(56906521.91347156388090791033559122686859L), | |
137 | static_cast<double>(0u) | |
138 | }; | |
139 | __m128d vx = _mm_load1_pd(&x); | |
140 | __m128d sum_even = _mm_load_pd(coeff); | |
141 | __m128d sum_odd = _mm_load_pd(coeff+2); | |
142 | __m128d nc_odd, nc_even; | |
143 | __m128d vx2 = _mm_mul_pd(vx, vx); | |
144 | ||
145 | sum_even = _mm_mul_pd(sum_even, vx2); | |
146 | nc_even = _mm_load_pd(coeff + 4); | |
147 | sum_odd = _mm_mul_pd(sum_odd, vx2); | |
148 | nc_odd = _mm_load_pd(coeff + 6); | |
149 | sum_even = _mm_add_pd(sum_even, nc_even); | |
150 | sum_odd = _mm_add_pd(sum_odd, nc_odd); | |
151 | ||
152 | sum_even = _mm_mul_pd(sum_even, vx2); | |
153 | nc_even = _mm_load_pd(coeff + 8); | |
154 | sum_odd = _mm_mul_pd(sum_odd, vx2); | |
155 | nc_odd = _mm_load_pd(coeff + 10); | |
156 | sum_even = _mm_add_pd(sum_even, nc_even); | |
157 | sum_odd = _mm_add_pd(sum_odd, nc_odd); | |
158 | ||
159 | sum_even = _mm_mul_pd(sum_even, vx2); | |
160 | nc_even = _mm_load_pd(coeff + 12); | |
161 | sum_odd = _mm_mul_pd(sum_odd, vx2); | |
162 | nc_odd = _mm_load_pd(coeff + 14); | |
163 | sum_even = _mm_add_pd(sum_even, nc_even); | |
164 | sum_odd = _mm_add_pd(sum_odd, nc_odd); | |
165 | ||
166 | sum_even = _mm_mul_pd(sum_even, vx2); | |
167 | nc_even = _mm_load_pd(coeff + 16); | |
168 | sum_odd = _mm_mul_pd(sum_odd, vx2); | |
169 | nc_odd = _mm_load_pd(coeff + 18); | |
170 | sum_even = _mm_add_pd(sum_even, nc_even); | |
171 | sum_odd = _mm_add_pd(sum_odd, nc_odd); | |
172 | ||
173 | sum_even = _mm_mul_pd(sum_even, vx2); | |
174 | nc_even = _mm_load_pd(coeff + 20); | |
175 | sum_odd = _mm_mul_pd(sum_odd, vx2); | |
176 | nc_odd = _mm_load_pd(coeff + 22); | |
177 | sum_even = _mm_add_pd(sum_even, nc_even); | |
178 | sum_odd = _mm_add_pd(sum_odd, nc_odd); | |
179 | ||
180 | sum_even = _mm_mul_pd(sum_even, vx2); | |
181 | nc_even = _mm_load_pd(coeff + 24); | |
182 | sum_odd = _mm_mul_pd(sum_odd, vx); | |
183 | sum_even = _mm_add_pd(sum_even, nc_even); | |
184 | sum_even = _mm_add_pd(sum_even, sum_odd); | |
185 | ||
186 | ||
187 | double ALIGN16 t[2]; | |
188 | _mm_store_pd(t, sum_even); | |
189 | ||
190 | return t[0] / t[1]; | |
191 | } | |
192 | ||
193 | #ifdef _MSC_VER | |
194 | ||
195 | BOOST_STATIC_ASSERT(sizeof(double) == sizeof(long double)); | |
196 | ||
197 | template <> | |
198 | inline long double lanczos13m53::lanczos_sum<long double>(const long double& x) | |
199 | { | |
200 | return lanczos_sum<double>(static_cast<double>(x)); | |
201 | } | |
202 | template <> | |
203 | inline long double lanczos13m53::lanczos_sum_expG_scaled<long double>(const long double& x) | |
204 | { | |
205 | return lanczos_sum_expG_scaled<double>(static_cast<double>(x)); | |
206 | } | |
207 | #endif | |
208 | ||
209 | } // namespace lanczos | |
210 | } // namespace math | |
211 | } // namespace boost | |
212 | ||
213 | #undef ALIGN16 | |
214 | ||
215 | #endif // BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS | |
216 | ||
217 | ||
218 | ||
219 | ||
220 |