forked from taschini/crlibm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdouble-extended.h
496 lines (389 loc) · 21.7 KB
/
double-extended.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
#ifndef __DOUBLE_EXTENDED_H
#define __DOUBLE_EXTENDED_H
#if (defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64))
#include <fpu_control.h>
typedef long double double_ext;
#endif
#if defined(CRLIBM_TYPECPU_ITANIUM) && defined(__INTEL_COMPILER)
typedef __fpreg double_ext;
#endif
/* For debugging */
typedef union {
int i[3];
long double d;
} db_ext_number;
#define DE_EXP 2
#define DE_MANTISSA_HI 1
#define DE_MANTISSA_LO 0
#define print_ext(_s, _y) {\
db_ext_number _yy; _yy.d=_y; \
printf("%s %04x %08x %08x \n",_s, 0xffff&_yy.i[DE_EXP], _yy.i[DE_MANTISSA_HI], _yy.i[DE_MANTISSA_LO]); \
}
/**************************************************************************************/
/*********************************Rounding tests***************************************/
/**************************************************************************************/
/* These test work by observing the bits of your double-extended after the 53rd.
mask should be 7ff if you trust your 64 bits (hum)
7fe if you trust 63 (if you have proven that maxepsilon<2^(-63) )
7fc 62
7f8 61
7f0 60 etc
*/
/* Mask constants for rounding test */
#define ACCURATE_TO_64_BITS 0x7ff
#define ACCURATE_TO_63_BITS 0x7fe
#define ACCURATE_TO_62_BITS 0x7fc
#define ACCURATE_TO_61_BITS 0x7f8
#define ACCURATE_TO_60_BITS 0x7f0
#define ACCURATE_TO_59_BITS 0x7e0
#if (defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64))
static const unsigned short RN_Double = (_FPU_DEFAULT & ~_FPU_EXTENDED)|_FPU_DOUBLE;
static const unsigned short RN_DoubleDown = (_FPU_DEFAULT & ~_FPU_EXTENDED & ~_FPU_RC_NEAREST)|_FPU_DOUBLE | _FPU_RC_DOWN;
static const unsigned short RN_DoubleUp = (_FPU_DEFAULT & ~_FPU_EXTENDED & ~_FPU_RC_NEAREST)|_FPU_DOUBLE | _FPU_RC_UP;
static const unsigned short RN_DoubleExt = _FPU_DEFAULT;
#define DOUBLE_EXTENDED_MODE _FPU_SETCW(RN_DoubleExt)
#define DOUBLE_UP_MODE _FPU_SETCW(RN_DoubleUp)
#define DOUBLE_DOWN_MODE _FPU_SETCW(RN_DoubleDown)
#define BACK_TO_DOUBLE_MODE _FPU_SETCW(RN_Double)
/*
Two rounding tests to the nearest. On Pentium 3, gcc3.3, the second
is faster by 12 cycles (and also improves the worst-case time by 60
cycles since it doesn't switch processor rounding mode in this
case). However it uses a coarser error estimation.
*/
#define DE_TEST_AND_RETURN_RN_ZIV(y,rncst) \
{ double yh, yl; \
yh = (double) y; \
yl = y-yh; \
BACK_TO_DOUBLE_MODE; \
if(yh==yh + yl*rncst) return yh; \
DOUBLE_EXTENDED_MODE; \
}
#define DE_TEST_AND_RETURN_RN(_y, _mask) \
{ \
db_ext_number _z; double _yd; \
int _lo; \
_z.d = _y; \
_yd = (double) _y; \
_lo = _z.i[DE_MANTISSA_LO] &(_mask); \
if((_lo!=(0x3ff&(_mask))) && (_lo!= (0x400&(_mask)))) { \
BACK_TO_DOUBLE_MODE; \
return _yd; \
} \
}
#define DE_TEST_AND_RETURN_RD(_y, _mask) \
{ \
double _result; int _bits; \
db_ext_number _z; \
_z.d = _y; \
DOUBLE_DOWN_MODE; \
_bits = _z.i[DE_MANTISSA_LO] &(_mask); \
_result = (double)(_y); \
if( (_bits != (0xfff&(_mask))) && (_bits != (0x000&(_mask))) ) { \
BACK_TO_DOUBLE_MODE; \
return _result; \
} \
DOUBLE_EXTENDED_MODE; \
}
#define DE_TEST_AND_RETURN_RU(_y, _mask) \
{ \
double _result; int _bits; \
db_ext_number _z; \
_z.d = _y; \
DOUBLE_UP_MODE; \
_bits = _z.i[DE_MANTISSA_LO] &(_mask); \
_result = (double)(_y); \
if( (_bits != (0xfff&(_mask))) && (_bits != (0x000&(_mask))) ) { \
BACK_TO_DOUBLE_MODE; \
return _result; \
} \
DOUBLE_EXTENDED_MODE; \
}
/* Use this one if you want a final computation step to overlap with
the rounding test. Examples: multiplication by a sign or by a power of 2 */
#define DE_TEST_AND_RETURN_RN2(_ytest, _yreturn, _mask) \
{ \
db_ext_number _z; double _y_return_d; \
int _bits; \
_z.d = _ytest; \
_y_return_d = (double) (_yreturn); \
_bits = _z.i[DE_MANTISSA_LO] &(_mask); \
if((_bits!=(0x3ff&(_mask))) && (_bits!= (0x400&(_mask)))) {\
BACK_TO_DOUBLE_MODE; \
return _y_return_d; \
} \
}
/* Slow macros with two changes of precision, but who cares, they are
used at the end of the second step */
#define RETURN_SUM_ROUNDED_DOWN(_rh, _rl) {\
double _result; \
DOUBLE_DOWN_MODE; \
_result = (_rh+_rl); \
BACK_TO_DOUBLE_MODE; \
return _result; \
}
#define RETURN_SUM_ROUNDED_UP(_rh, _rl) {\
double _result; \
DOUBLE_UP_MODE; \
_result = (_rh+_rl); \
BACK_TO_DOUBLE_MODE; \
return _result; \
}
#else /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */
#if !defined(CRLIBM_TYPECPU_ITANIUM)
#error "This file should be compiled only for IA32 or IA64 architecture "
#endif
/* TODO Add what it takes to compile under HP-UX */
#if !defined(__INTEL_COMPILER)
#error "Use icc, version 8.1 or higher to compile for IA64 architecture"
#endif
#define DOUBLE_EXTENDED_MODE {}
#define BACK_TO_DOUBLE_MODE {}
#define DE_TEST_AND_RETURN_RN(_y, _mask) \
{ uint64_t _mantissa, _bits; double _yd; \
_yd = _Asm_fma(2/*_FR_D*/, 1.0, _y, 0.0, 0/*SF0*/); \
_mantissa = _Asm_getf(4/*_FR_SIG*/, _y); \
_bits = _mantissa & (0x7ff&(_mask)); \
if(__builtin_expect( \
(_bits!=(0x3ff&(_mask))) && (_bits != (0x400&(_mask))), \
1+1==2)) \
return _yd; \
}
/* Slower by 5 cycles as of 2005... Let us keep it, you never know */
#define DE_TEST_AND_RETURN_RN_ZIV(y,rncst) \
{ double yh, yl; \
yh = (double) y; \
yl = y-yh; \
if(__builtin_expect(yh == yh + yl*rncst, 1+1==2)) return yh; \
}
/* Use this one if you want a final computation step to overlap with
the rounding test. Examples: multiplication by a sign or by a power of 2 */
#define DE_TEST_AND_RETURN_RN2(_ytest, _yreturn, _mask) \
{ uint64_t _mantissa, _bits; \
_mantissa = _Asm_getf(4/*_FR_SIG*/, _ytest); \
_bits = _mantissa & (0x7ff&(_mask)); \
if(__builtin_expect( \
(_bits!=(0x3ff&(_mask))) && (_bits != (0x400&(_mask))), \
1+1==2)) \
return _yreturn; \
}
#define DE_TEST_AND_RETURN_RD(_y, _mask) \
{ uint64_t _mantissa, _bits; double _yd; \
_yd = _Asm_fma(2/*_FR_D*/, -1.0, _y, 0.0, 3/*SF3*/); \
_mantissa = _Asm_getf(4/*_FR_SIG*/, _y); \
_bits = _mantissa & (0x7ff&(_mask)); \
if(__builtin_expect( \
(_bits!=(0x000&(_mask))) && (_bits != (0x7ff&(_mask))), \
1+1==2)) \
return -_yd; \
}
#define DE_TEST_AND_RETURN_RU(_y, _mask) \
{ uint64_t _mantissa, _bits; double _yd; \
_yd = _Asm_fma(2/*_FR_D*/, 1.0, _y, 0.0, 3/*SF3*/); \
_mantissa = _Asm_getf(4/*_FR_SIG*/, _y); \
_bits = _mantissa & (0x7ff&(_mask)); \
if(__builtin_expect( \
(_bits!=(0x000&(_mask))) && (_bits != (0x7ff&(_mask))), \
1+1==2)) \
return _yd; \
}
#define RETURN_SUM_ROUNDED_DOWN(_rh, _rl) \
return -_Asm_fma(2/*_FR_D*/, -1.0, _rh, -_rl, 3/*SF3*/);
#define RETURN_SUM_ROUNDED_UP(_rh, _rl) \
return _Asm_fma(2/*_FR_D*/, 1.0, _rh, _rl, 3/*SF3*/);
#if 0
/* This test doesn'use SF2 and SF3 Kept as a model for Pentium implementation, to erase afterwards */
#define DE_TEST_AND_RETURN_RU(_y, _mask) \
{ \
db_ext_number _z; double _yd; \
unsigned long int _mantissa, _y_double, _isNegative ,_wasRoundedUp, _bits; \
_yd=(double) _y; \
_mantissa = _Asm_getf(4/*_FR_SIG*/, _y); \
_y_double = _Asm_getf(2/*_FR_D*/, _yd); \
_bits = _mantissa & (0x7ff&(_mask)); \
_wasRoundedUp = ((_mantissa >>11) & 1) != (_y_double & 1); \
_bits = _mantissa & (0x7ff&(_mask)); \
_isNegative = _y_double >> 63; \
if(_isNegative) { \
if(_wasRoundedUp) { /* RN was RD */ \
if( _bits != (0x7ff&(_mask)) ) { \
_y_double--; \
return (double) _Asm_setf(2/*_FR_D*/, _y_double); \
} /* else launch accurate phase */ \
} \
else{ /* RN was RU, so need to check */ \
if( _bits != (0x000&(_mask)) ) { \
return _yd; \
} /* else launch accurate phase */ \
} \
} \
else{ /* Positive number */ \
if(_wasRoundedUp) { /* RN was RU */ \
if( _bits != (0x7ff&(_mask)) ) { \
return _yd; \
} /* else launch accurate phase */ \
} \
else{ /* RN was RD, */ \
if( _bits != (0x000&(_mask)) ) { \
_y_double++; /* beware, does not work for -0 */ \
return (double) _Asm_setf(2/*_FR_D*/, _y_double); \
} /* else launch accurate phase */ \
} \
} \
}
#define DE_TEST_AND_RETURN_RD(_y, _mask) \
{ \
db_ext_number _z; double _yd; \
unsigned long int _mantissa, _y_double, _isNegative ,_wasRoundedUp, _bits; \
_yd=(double) _y; \
_mantissa = _Asm_getf(4/*_FR_SIG*/, _y); \
_y_double = _Asm_getf(2/*_FR_D*/, _yd); \
_bits = _mantissa & (0x7ff&(_mask)); \
_wasRoundedUp = ((_mantissa >>11) & 1) != (_y_double & 1); \
_bits = _mantissa & (0x7ff&(_mask)); \
_isNegative = _y_double >> 63; \
if(_isNegative) { \
if(_wasRoundedUp) { /* RN was RD */ \
if( _bits != (0x7ff&(_mask)) ) { \
return (double) _y; \
} /* else launch accurate phase */ \
} \
else{ /* RN was RU */ \
if( _bits != (0x000&(_mask)) ) { \
_y_double++; \
return (double) _Asm_setf(2/*_FR_D*/, _y_double); \
} /* else launch accurate phase */ \
} \
} \
else{ /* Positive number */ \
if(_wasRoundedUp) { /* RN was RU */ \
if( _bits != (0x7ff&(_mask)) ) { \
_y_double--; \
return (double) _Asm_setf(2/*_FR_D*/, _y_double); \
} /* else launch accurate phase */ \
} \
else{ /* RN was RD, */ \
if( _bits != (0x000&(_mask)) ) { \
return (double) _y; \
} /* else launch accurate phase */ \
} \
} \
}
#endif /* 0 */
#endif /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */
/**************************************************************************************/
/************************Double double-extended arithmetic*****************************/
/**************************************************************************************/
#define Add12_ext(prh, prl, a, b) \
{ \
double_ext _z, _a, _b; \
_a = a; _b = b; \
*prh = _a + _b; \
_z = *prh - _a; \
*prl = _b - _z; \
}
#if (defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64))
#define Mul12_ext(prh,prl,u,v) \
{ \
const double_ext _c = 4294967297.L; /* 2^32 +1 */ \
double_ext _up, _u1, _u2, _vp, _v1, _v2; \
double_ext _u =u, _v=v; \
\
_up = _u*_c; _vp = _v*_c; \
_u1 = (_u-_up)+_up; _v1 = (_v-_vp)+_vp; \
_u2 = _u-_u1; _v2 = _v-_v1; \
\
*prh = _u*_v; \
*prl = _u1*_v1 - *prh; \
*prl = *prl + _u1*_v2; \
*prl = *prl + _u2*_v1; \
*prl = *prl + _u2*_v2; \
}
#define Mul22_ext(prh,prl, ah,al, bh,bl) \
{ \
double_ext mh, ml; \
Mul12_ext(&mh,&ml,(ah),(bh)); \
ml += (ah)*(bl) + (al)*(bh); \
Add12_ext(prh,prl, mh,ml); \
}
#define FMA22_ext(prh,prl, ah,al, bh,bl, ch,cl) \
{ \
Mul22_ext(prh,prl, (ah),(al), (bh),(bl)); \
Add22_ext(prh,prl, ch,cl, *prh, *prl); \
}
#else /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */
#define Mul12_ext( prh,prl, a, b ) \
{ \
*prh = (a) * (b); \
*prl = _Asm_fms( 3/*_PC_NONE*/, (a), (b), *prh, 1 ); \
}
#if 0 /* transcription of Alexey's */
#define Mul22_ext( prh,prl, ah,al, bh,bl ) \
{ \
double_ext _t1,_t2,_t3; \
*prh = (ah) * (bh); \
_t1 = (ah)*(bl); \
_t2 = _Asm_fms( 3/*_PC_NONE*/, (ah), (bh), *prh, 1 ); \
_t3 = (al) * (bh) + _t1; \
*prl = (_t2 + _t3); \
}
#else
#define Mul22_ext( prh,prl, ah,al, bh,bl ) \
{ \
double_ext ph, pl; \
ph = (ah)*(bh); \
pl = _Asm_fms( 3/*_PC_NONE*/, ah, bh, ph, 1/*_SF1*/ ); \
pl = (ah)*(bl) + pl; \
pl = (al)*(bh) + pl; \
Add12_ext(prh,prl, ph,pl); \
}
#endif
/* res = a*b + c, assume |a*b| <= |c|
* res, a and b in X format
* c in L format
*/
#if 0
#define FMA22_ext(prh,prl, ah,al, bh,bl, ch,cl) \
{ \
Mul22_ext(prh,prl, (ah),(al), (bh),(bl)); \
Add22_ext(prh,prl, ch,cl, *prh, *prl); \
}
#else
#define FMA22_ext( prh,prl, ah,al, bh,bl, ch,cl) \
{ \
double_ext __xfmagxxx_r_hi__,__xfmagxxx_r_lo__, \
__xfmagxxx_t1__,__xfmagxxx_t2__, \
__xfmagxxx_t3__,__xfmagxxx_t4__; \
__xfmagxxx_r_hi__ = ah * bh + ch; \
__xfmagxxx_t1__ = al * bh + cl; \
__xfmagxxx_t2__ = __xfmagxxx_r_hi__ - ch; \
__xfmagxxx_t3__ = ah * bl + __xfmagxxx_t1__; \
__xfmagxxx_t4__ = _Asm_fms( 3/*_PC_NONE*/, ah, bh, __xfmagxxx_t2__, 1/*_SF1*/ ); \
__xfmagxxx_r_lo__ = (__xfmagxxx_t3__ + __xfmagxxx_t4__); \
*prh = __xfmagxxx_r_hi__; *prl = __xfmagxxx_r_lo__; \
}
#endif
#endif /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */
#define Div22_ext(prh,prl,xh,xl,yh,yl) \
{ \
double_ext ch,cl,uh,ul; \
ch = (xh)/(yh); \
Mul12_ext(&uh,&ul,ch,(yh)); \
cl = (xh)-uh; \
cl = cl - ul; \
cl = cl + (xl); \
cl = cl - ch*(yl); \
cl = cl / (yh); \
Add12(prh,prl, ch, cl) ; \
}
#define Add22_ext(prh,prl,xh,xl,yh,yl) \
do { \
double_ext _r,_s; \
_r = (xh)+(yh); \
_s = (xh)-_r; \
_s = _s + (yh); \
_s = _s + (yl); \
_s = _s + (xl); \
Add12_ext(prh,prl,_r,_s); \
} while(0)
#endif /* ifndef __DOUBLE_EXTENDED_H*/