_ftol 的优化
_ftol 是什么? 当你写 C 程序的时候,(int)float_v 就会被编译器产生一个对 _ftol 这个 CRT 函数的调用。上个世纪听一个做 3d 的朋友提起过,用 x87 指令实现的 _ftol 会很慢,一般用整数指令提供。当时提在心里,2000 年的时候在 RISC 上做开发 (ARM 指令集) 曾经写过一些整数模拟浮点的函数,曾经写过这个转换函数,日子久了,现在也找不回来代码。不过对浮点的 IEEE 标准还是比较清楚的。去年写过一篇 浮点数的精度控制问题 的帖子放在流言中。当时已经被骂过了。 今天工作时又遇到关于浮点数的问题,再写篇 blog 吧,或许还是找骂贴 :)
懒得重写 _ftol 的整数指令版本了,google 搜了下,发现果然有人也做过。http://www.flipcode.com/cgi-bin/fcarticles.cgi?show=64008 就是这么一个函数:
/*
 * Truncating float-to-int conversion that uses integer instructions only.
 *
 * f: value to convert, passed by value.
 * Returns f with its fractional part dropped; the final mask forces the
 * result to 0 whenever the unbiased exponent is negative (|f| < 1).
 *
 * NOTE(review): written for a 32-bit int and an IEEE-754 single-precision
 * float. The pointer cast below breaks C's strict-aliasing rule, and the
 * shift count (31-exponent) leaves the range 0..31 when exponent is negative
 * or above 31, which is undefined behaviour in C - confirm the target
 * compiler/CPU tolerates both before reusing this.
 */
int ftol(float f)
{
/* Reinterpret the float's storage as an int to get at the raw bits. */
int a = *(int*)(&f);
/* Presumably 0 for non-negative and -1 for negative inputs (relies on >> of a negative int being an arithmetic shift). */
int sign = (a>>31);
/* Low 23 fraction bits with bit 23 (the implicit leading 1) set. */
int mantissa = (a&((1<<23)-1))|(1<<23);
/* Biased exponent field (bits 23..30) minus the bias of 127. */
int exponent = ((a&0x7fffffff)>>23)-127;
/* Move the leading 1 up to bit 31, then shift down so the integer part lands in the low bits. */
int r = ((unsigned int)(mantissa)<<8)>>(31-exponent);
/* (r ^ sign) - sign negates r when sign is -1; the mask clears everything when exponent < 0. */
return ((r ^ (sign)) - sign ) &~ (exponent>>31);
}当是效率比较高的。我想,日子已经过去这么久了。当初朋友跟我提这个事情的事情大约是 98,99 年。上面翻出来的老帖是 01 年的。我现在的机器不错,今年新买的 P4 双核的,还是测试一下比较放心。注:这个函数不能直接替换 CRT 中的 _ftol , CRT 的 _ftol 并不通过堆栈传递参数。 马上随手写了下面的测试程序:
#include <limits.h>
#include <stdio.h>
#include <string.h>
/* Emits the raw opcode bytes 0x0F 0x31 (the x86 RDTSC instruction) for
   assemblers that do not know the mnemonic. */
#define RDTSC _asm _emit 0x0f _asm _emit 0x31
#pragma warning (push)
/* Silence warning 4035 ("no return value"): the function below has no
   return statement on purpose. */
#pragma warning (disable: 4035)
/*
 * Reads the CPU time-stamp counter.
 *
 * Returns the 64-bit counter value. There is no explicit return statement:
 * RDTSC leaves its result in EDX:EAX, and the function relies on the compiler
 * treating that register pair as the 64-bit return value (MSVC, 32-bit x86 -
 * NOTE(review): confirm for any other toolchain).
 */
inline unsigned __int64 timestamp()
{
__asm RDTSC
}
#pragma warning (pop)
/*
 * Convert a float to int by truncating toward zero, using only integer
 * operations on the IEEE-754 single-precision bit pattern.
 *
 * f: value to convert.
 *
 * Returns the integer part of f. Magnitudes below 1 (zeros and denormals
 * included) give 0. Magnitudes of 2^31 and above, infinities and NaNs give
 * INT_MIN, which is the exact answer for -2^31 and the "integer indefinite"
 * value the x86 conversion instructions produce for everything else.
 *
 * Assumes a 32-bit unsigned int and a 32-bit IEEE-754 float.
 */
int int_chop (float f)
{
	unsigned int bits;
	unsigned int mantissa;
	unsigned int magnitude;
	int exponent;

	/* memcpy reads the bit pattern without violating strict aliasing. */
	memcpy(&bits, &f, sizeof bits);

	/* Biased exponent field (bits 23..30) minus the bias of 127. */
	exponent = (int)((bits >> 23) & 0xffu) - 127;

	/* |f| < 1: nothing left of the binary point. Handled up front so the
	   shift count below never reaches 32 or more. */
	if (exponent < 0) {
		return 0;
	}

	/* |f| >= 2^31, Inf or NaN: not representable (except -2^31 itself).
	   Handled up front so the shift count below never goes negative. */
	if (exponent >= 31) {
		return INT_MIN;
	}

	/* 23 fraction bits plus the implicit leading 1 at bit 23. */
	mantissa = (bits & 0x007fffffu) | 0x00800000u;

	/* Put the leading 1 at bit 31, then shift the integer part down.
	   The count is in 1..31 here, so the shift is well defined and the
	   result is below 2^31. */
	magnitude = (mantissa << 8) >> (31 - exponent);

	return (bits >> 31) ? -(int)magnitude : (int)magnitude;
}
/* Benchmark case 1: convert f with the integer-only int_chop() routine. */
int test1(float f)
{
	const int chopped = int_chop(f);
	return chopped;
}
/* Benchmark case 2: convert f with the compiler's own float-to-int cast. */
int test2(float f)
{
	int truncated;

	truncated = (int)f;
	return truncated;
}
/*
 * Benchmark case 3: convert x to int with the x87 FLD/FISTP instruction pair.
 *
 * x: value to convert.
 * Returns the integer that FISTP stored into t.
 *
 * NOTE(review): FISTP converts using the FPU's current rounding mode and this
 * function does not change that mode, so the result presumably differs from
 * the truncating (int)x cast for some inputs (e.g. -0.8f under
 * round-to-nearest) - confirm before treating the three cases as equivalent.
 */
int test3(float x)
{
int t;
/* Push x onto the x87 register stack. */
__asm fld x
/* Store the stack top into t as an integer and pop it. */
__asm fistp t
return t;
}
/*
 * Benchmark loop: calls f with the constant argument -0.8f exactly t times
 * (not at all when t is zero or negative). The value f returns is discarded.
 */
void test(int t,int (*f)(float))
{
	int remaining = t;

	while (remaining > 0) {
		f(-0.8f);
		--remaining;
	}
}
/*
 * Benchmark driver. Runs three timing rounds; in each round it measures, in
 * time-stamp-counter ticks, how long CALLS_PER_CASE calls of test1 (integer
 * routine), test2 (compiler cast) and test3 (x87 instructions) take, and
 * prints one line per case.
 *
 * Returns 0. The standard requires main to return int; the previous
 * "void main()" form is a compiler extension.
 */
int main(void)
{
	/* Number of conversions timed per case. */
	enum { CALLS_PER_CASE = 100000 };
	int i;

	for (i=0;i<3;i++) {
		__int64 t;

		printf("---timing %d---\n",i);

		t=timestamp();
		test(CALLS_PER_CASE,test1);
		t=timestamp()-t;
		printf("use int\t%I64d\n",t);

		t=timestamp();
		test(CALLS_PER_CASE,test2);
		t=timestamp()-t;
		printf("(int)\t%I64d\n",t);

		t=timestamp();
		test(CALLS_PER_CASE,test3);
		t=timestamp()-t;
		printf("use x87\t%I64d\n",t);
	}
	return 0;
}
运行结果如下:
---timing 0---
use int 1063972
(int) 6589632
use x87 6050682
---timing 1---
use int 1041530
(int) 6548604
use x87 6010210
---timing 2---
use int 1253211
(int) 6788126
use x87 5679409
测试环境:windows XP。从上面的周期数看(数值越小越快),耗时最少的是 use int;use x87 略快于 (int)。
附内置版 __ftol
0040E7E8 fld dword ptr [ebp+8]
0040E7EB call __ftol (00401048)
__ftol:
00401048 push ebp
00401049 mov ebp,esp
0040104B add esp,0F4h
0040104E wait
0040104F fnstcw word ptr [ebp-2]
00401052 wait
00401053 mov ax,word ptr [ebp-2]
00401057 or ah,0Ch
0040105A mov word ptr [ebp-4],ax
0040105E fldcw word ptr [ebp-4]
00401061 fistp qword ptr [ebp-0Ch]
00401064 fldcw word ptr [ebp-2]
00401067 mov eax,dword ptr [ebp-0Ch]
0040106A mov edx,dword ptr [ebp-8]
0040106D leave
0040106E ret
0040106F int 3
----------------以下还有各种实现的版本
/*Editor's note:
COTD Entry: Faster Float-To-Int Conversion by JCAB [[email protected]] Warning: this COTD is seriously MSVC-only. It might apply to other compilers (in fact, I'm certain it does apply to, at least, other older compilers). But the implementation doesn't. Warning: some assembly required if you want to understand how this is done. Recently, two people have asked me about the float-to-int conversion routines being very slow. This is a problem endemic to the way the Microsoft C library implements a little function called _ftol(). So, I've been trying out alternative implementations of this routine, and I've come up with several. I've been testing them on a 700 MHz PC using these programs:*/
// This first one checks the alternative implementation for correctness.
// It needs the _ftol() replacement under test (see below) rewritten as
// "int FIST(float)", the name this program calls, and changed accordingly:
// Receives the result of each compiler-generated float-to-int conversion.
volatile int k;

// Correctness check: fills a buffer with pseudo-random floats, converts each
// one both with the compiler's implicit float-to-int conversion and with
// FIST(), and reports every mismatch through OutputDebugString.
// Always returns 0.
int main(int argc, char* argv[])
{
    enum { NUMTESTS = 10000000 };
    static float rndbuf[NUMTESTS];
    char s[4096];
    int i;

    // Each sample combines three rand() integers and one rand()-based fraction.
    i = 0;
    while (i < NUMTESTS) {
        int whole1 = rand();
        int whole2 = rand();
        int whole3 = rand();
        int fraction = rand();
        rndbuf[i] = float(whole1) - float(whole2) + float(whole3) + float(fraction) / RAND_MAX;
        ++i;
    }

    // Compare the two conversions sample by sample.
    for (i = 0; i < NUMTESTS; ++i) {
        k = rndbuf[i];
        if (k == FIST(rndbuf[i])) {
            continue;
        }
        sprintf(s, "Bad: %f (%08x) -> %d != %d\n", rndbuf[i], *(int*)&rndbuf[i], int(rndbuf[i]), FIST(rndbuf[i]));
        OutputDebugString(s);
    }

    return 0;
}
// This program checks for performance:
volatile int k;int main(int argc, char* argv[])
{
char s[4096]; enum { NUMTESTS = 10000000 };
static float rndbuf[NUMTESTS]; int i;
for (i = 0; i < NUMTESTS; ++i) {
int intg1 = rand();
int intg2 = rand();
int intg3 = rand();
int frac = rand();
rndbuf[i] = float(intg1) - float(intg2) + float(intg3) + float(frac) / RAND_MAX;
} DWORD time = GetTickCount(); int j;
for (j = 0; j < 10; ++j) {
for (i = 0; i < NUMTESTS; ++i) {
k = rndbuf[i];
}
} time = GetTickCount() - time; sprintf(s, "Time = %d ms\n", time);
OutputDebugString(s); return 0;
}
The different functions I checked were:
1- The compiler's own.
2- Faster version that uses some global memory and requires the FPU to be in round-to-nearest rounding mode:
// Replacement for the CRT helper _ftol: converts the value on top of the x87
// stack to an integer and returns it in EAX.
// Method: add a bias of just under 0.5 whose sign is opposite to the input's,
// then let FISTP round the sum. The accompanying text says this depends on
// the FPU rounding mode (presumably round-to-nearest - TODO confirm).
extern "C" __declspec(naked) void __cdecl _ftol()
{
// Bit patterns of the floats -0.49999997 (index 0) and +0.49999997 (index 1).
const static int zpfp[2] = { 0xBEFFFFFF, 0x3EFFFFFF }; __asm {
// Reserve 4 bytes of stack as scratch space.
SUB ESP,4
// Store ST(0) there as a 32-bit float without popping it.
FST DWORD PTR [ESP]
MOV EAX,DWORD PTR [ESP]
// Move the sign bit (bit 31) down to bit 2: EAX = 4 if negative, else 0.
SHR EAX,29
AND EAX,4
// Byte offset 0 or 4 selects the negative or the positive bias.
FADD DWORD PTR [zpfp+EAX]
// Convert to integer, store into the scratch slot and pop the FPU stack.
FISTP DWORD PTR [ESP]
// Fetch the result and release the scratch slot.
POP EAX
RET
}
}
3- Slower version that uses no global memory and requires the FPU to be in round-to-nearest rounding mode:
// Replacement for the CRT helper _ftol: converts the value on top of the x87
// stack to an integer and returns it in EAX.
// Same bias-then-FISTP method as the table-based variant, but the bias is
// built in the stack scratch slot instead of being read from global memory.
// The accompanying text says this depends on the FPU rounding mode
// (presumably round-to-nearest - TODO confirm).
extern "C" __declspec(naked) void __cdecl _ftol()
{
__asm {
// Reserve 4 bytes of stack as scratch space.
SUB ESP,4
// Store ST(0) there as a 32-bit float without popping it.
FST DWORD PTR [ESP]
MOV EAX,DWORD PTR [ESP]
// Keep only the sign bit of the input.
AND EAX,0x80000000
// 0xBEFFFFFF is -0.49999997 as a float; XOR-ing in the input's sign bit
// yields a bias of that magnitude whose sign is opposite to the input's.
XOR EAX,0xBEFFFFFF
MOV DWORD PTR [ESP],EAX
FADD DWORD PTR [ESP]
// Convert to integer, store into the scratch slot and pop the FPU stack.
FISTP DWORD PTR [ESP]
// Fetch the result and release the scratch slot.
POP EAX
RET
}
}
4- Version independent of the rounding mode, but which only works correctly on floats:
// Replacement for the CRT helper _ftol that does not depend on the FPU
// rounding mode: the value on top of the x87 stack is stored as a 32-bit
// float and its integer part is extracted with integer instructions.
// The result is returned in EAX; ECX and EDX are clobbered.
// Only correct for values that fit a float's 24-bit mantissa.
//
// x86 masks the shift count in CL to its low 5 bits, so a right shift of 32
// or more must be handled explicitly; without the CMP/JAE guard below, an
// input such as 2^-9 (shift count 32) would return the unshifted mantissa
// 0x00800000 instead of 0.
extern "C" __declspec(naked) void __cdecl _ftol()
{
    __asm {
        SUB     ESP,4
        FSTP    DWORD PTR [ESP]     // pop ST(0) into the scratch slot as a float
        POP     EAX                 // EAX = raw bits, scratch slot released
        MOV     EDX,EAX             // EDX keeps a copy for the sign test
        MOV     ECX,EAX
        AND     EAX,0x007FFFFF      // 23 fraction bits
        OR      EAX,0x00800000      // plus the implicit leading 1 at bit 23
        AND     ECX,0x7F800000      // exponent field
        JZ      zero                // field is 0 (zero or denormal) -> 0
        SHR     ECX,23
        SUB     ECX,0x96            // 0x96 = 150 = bias 127 + 23 fraction bits
        JC      negexp              // borrow: mantissa must move right
        SHL     EAX,CL
        JMP     shifted
    negexp:
        NEG     ECX                 // ECX = right-shift distance (positive)
        CMP     ECX,32
        JAE     zero                // nothing left after shifting 32+ bits
        SHR     EAX,CL
    shifted:
        AND     EDX,0x80000000      // test the sign bit
        JNZ     negative
        RET
    negative:
        NEG     EAX
        RET
    zero:
        SUB     EAX,EAX
        RET
    }
}
5- Version independent of the rounding mode, but which works correctly on doubles:
// Replacement for the CRT helper _ftol that does not depend on the FPU
// rounding mode: the value on top of the x87 stack is stored as a 64-bit
// double and its integer part is extracted with integer instructions.
// The result is returned in EAX; ECX and EDX are preserved.
extern "C" __declspec(naked) void __cdecl _ftol()
{
__asm {
// Save EDX and ECX; both are restored on every exit path.
PUSH EDX
PUSH ECX
SUB ESP,8
// Pop ST(0) into the 8-byte scratch area as a double.
FSTP QWORD PTR [ESP]
// EAX = low dword, EDX = high dword (sign, exponent, top 20 fraction bits).
POP EAX
POP EDX
MOV ECX,EDX
// Build a 32-bit mantissa in EAX: the implicit 1 at bit 31 followed by the
// top 31 fraction bits (20 from the high dword, 11 from the low dword).
AND EDX,0x000FFFFF
OR EDX,0x00100000
SHL EDX,11
SHR EAX,21
OR EAX,EDX
// EDX = original high dword again, kept for the sign test.
MOV EDX,ECX
// Exponent field of zero (zero or denormal) gives a result of 0.
AND ECX,0x7FF00000
JZ zero
SHR ECX,20
// 0x41E = 1054; presumably the double bias 1023 plus 31, matching the
// leading 1 sitting at bit 31.
SUB ECX,0x41E
// A borrow means the mantissa has to move right.
JC negexp
SHL EAX,CL
JMP shifted
negexp: NEG ECX
// The CPU masks CL to 5 bits, so a right shift of 32 or more is handled
// explicitly: nothing is left of the integer part.
CMP ECX,32
JAE zero
SHR EAX,CL
// The zero flag set by this AND survives the two POPs (POP does not touch
// the flags) and drives the JNZ below.
shifted: AND EDX,0x80000000
POP ECX
POP EDX
JNZ negative
RET
negative: NEG EAX
RET
zero: SUB EAX,EAX
POP ECX
POP EDX
RET
}
}
6- Compiling with -QIfist, which doesn't return the correct result (it rounds, not chops). But it can be made to return the correct result if the rounding mode is changed at program startup and never modified afterwards. Note that I haven't made any efforts at optimizing the different routines. Specifically, it might be possible to convert some jumps into CMOV instructions (PPro and up only). Note also that, in order for these replacement routines to work, they must be accessed before the one in the C library. The safest way to do it is to make sure it is in the same CPP file as the main() or WinMain() function. The timing results were:
1- Time = 10805 ms
2- Time = 3916 ms
3- Time = 4227 ms
4- Time = 4707 ms
5- Time = 7531 ms
6- Time = 2343 ms
The different implementations have different trade-offs that make them all potentially viable for different developers. I'd like to see those functions optimized. If you do, please, send me a copy.