_ftol 的优化

_ftol 的优化
_ftol 是什么? 当你写 C 程序的时候,(int)float_v 就会被编译器产生一个对 _ftol 这个 CRT 函数的调用。上个世纪听一个做 3d 的朋友提起过,用 x87 指令实现的 _ftol 会很慢,一般用整数指令提供。当时提在心里,2000 年的时候在 RISC 上做开发 (ARM 指令集) 曾经写过一些整数模拟浮点的函数,曾经写过这个转换函数,日子久了,现在也找不回来代码。不过对浮点的 IEEE 标准还是比较清楚的。去年写过一篇 浮点数的精度控制问题 的帖子放在流言中。当时已经被骂过了。 今天工作时又遇到关于浮点数的问题,再写篇 blog 吧,或许还是找骂贴 :)
懒得重写 _ftol 的整数指令版本了,用 Google 搜了一下,发现果然有人也做过。http://www.flipcode.com/cgi-bin/fcarticles.cgi?show=64008 就是这么一个函数:
/*
 * Convert a float to int by truncating toward zero, using only integer
 * operations on the IEEE-754 single-precision bit pattern of f.
 *
 * NOTE(review): when exponent is negative (|f| < 1) or greater than 31 the
 * shift count (31-exponent) falls outside 0..31, which is undefined
 * behaviour in C; the code appears to rely on the final mask (and on the
 * target masking the shift count) to still produce 0 for |f| < 1.
 * NOTE(review): reading f through an int pointer and right-shifting
 * negative ints are both non-portable; confirm the target compiler.
 */
int ftol(float f)
{
    /* Raw 32-bit pattern of f, read through an int pointer. */
    int a         = *(int*)(&f);
    /* Presumably -1 (all ones) for negative f and 0 otherwise; assumes an arithmetic right shift. */
    int sign      = (a>>31);
    /* Low 23 fraction bits with bit 23 (the implicit leading one) set. */
    int mantissa  = (a&((1<<23)-1))|(1<<23);
    /* Exponent field (bits 23..30) with the bias of 127 removed. */
    int exponent  = ((a&0x7fffffff)>>23)-127;
    /* Move the leading one up to bit 31, then shift right so only the integer part remains. */
    int r         = ((unsigned int)(mantissa)<<8)>>(31-exponent);
    /* (r ^ sign) - sign negates r when sign is -1; the mask clears the result when exponent is negative. */
    return ((r ^ (sign)) - sign ) &~ (exponent>>31);      
}当是效率比较高的。我想,日子已经过去这么久了。当初朋友跟我提这个事情的事情大约是 98,99 年。上面翻出来的老帖是 01 年的。我现在的机器不错,今年新买的 P4 双核的,还是测试一下比较放心。注:这个函数不能直接替换 CRT 中的 _ftol , CRT 的 _ftol 并不通过堆栈传递参数。 马上随手写了下面的测试程序:
#include <stdio.h>

/* Emits the two raw bytes 0x0F 0x31 (the x86 RDTSC opcode) through the MSVC
   inline assembler's _emit directive. */
#define RDTSC  _asm _emit 0x0f _asm _emit 0x31

/* Warning 4035 is disabled around timestamp() because the function
   deliberately has no return statement. */
#pragma warning (push)
#pragma warning (disable: 4035)
/*
 * Return the current time-stamp counter value as an unsigned 64-bit integer.
 * The body only executes RDTSC; the value is presumably picked up from
 * EDX:EAX, where the instruction leaves it and where 32-bit MSVC expects a
 * 64-bit return value - confirm for the compiler in use.
 */
inline unsigned __int64 timestamp()
{
 __asm RDTSC
}
#pragma warning (pop)


/*
 * Convert a float to int by truncating toward zero, using only integer
 * operations on the IEEE-754 single-precision bit pattern.
 *
 * Assumes a 32-bit int/unsigned int and an IEEE-754 binary32 float.
 *
 * f        value to convert
 * returns  the integer part of f.  Inputs with |f| < 1 (including zero and
 *          denormals) give 0.  Inputs whose magnitude is 2^31 or more, as
 *          well as infinities and NaNs, give the most negative int
 *          (0x80000000), the same "integer indefinite" value the x86
 *          float-to-int instructions store for out-of-range inputs.
 */
int int_chop (float f)
{
    /* Type-pun through a union instead of casting the pointer, so the bit
       pattern is read without violating the aliasing rules. */
    union { float value; unsigned int raw; } bits;
    unsigned int mantissa;
    unsigned int magnitude;
    int exponent;

    bits.value = f;

    /* Exponent field (bits 23..30) with the bias of 127 removed. */
    exponent = (int)((bits.raw >> 23) & 0xffu) - 127;

    /* |f| < 1: the integer part is zero.  Handling this up front keeps the
       shift count below inside 1..31 instead of relying on a final mask. */
    if (exponent < 0)
        return 0;

    /* |f| >= 2^31, infinity or NaN: not representable (except -2^31 itself,
       which is exactly this value). */
    if (exponent >= 31)
        return -2147483647 - 1;

    /* 23 fraction bits plus the implicit leading one at bit 23. */
    mantissa = (bits.raw & 0x007fffffu) | 0x00800000u;

    /* Put the leading one at bit 31, then drop the fractional bits. */
    magnitude = (mantissa << 8) >> (31 - exponent);

    /* magnitude < 2^31 here, so both conversions below are exact. */
    return (bits.raw >> 31) ? -(int)magnitude : (int)magnitude;
}

/* Benchmark case 1: convert f with the integer-only int_chop() routine. */
int test1(float f)
{
 const int chopped = int_chop(f);

 return chopped;
}

/* Benchmark case 2: convert f with the compiler's own float-to-int cast. */
int test2(float f)
{
 int converted;

 converted = (int)f;
 return converted;
}

/*
 * Benchmark case 3: convert x with the x87 FLD/FISTP instruction pair
 * (MSVC inline assembly).
 *
 * FISTP rounds according to the rounding mode currently set in the FPU
 * control word, so unlike a C cast the result is not necessarily truncated
 * toward zero.
 */
int test3(float  x)
{
 int   t;
 /* Push x onto the x87 register stack. */
 __asm  fld   x 
 /* Store the top of the stack to t as an integer and pop it. */
 __asm  fistp t
 return t;
}


/*
 * Call the conversion routine f with the fixed input -0.8f, t times in a
 * row.  The results are discarded; nothing is called when t is zero or
 * negative.
 */
void test(int t,int (*f)(float))
{
 int remaining = t;

 while (remaining > 0) {
  f(-0.8f);
  --remaining;
 }
}

/*
 * Benchmark driver: runs three rounds, and in each round measures the
 * time-stamp counter difference around 100000 calls of test1, test2 and
 * test3 (in that order), printing one line per routine.
 *
 * NOTE(review): "void main()" is not a standard signature, and "%I64d" is
 * an MSVC-specific printf length modifier.
 */
void main()
{
 int i;
 /* Repeat the whole measurement three times. */
 for (i=0;i<3;i++) {
  __int64 t;
  printf("---timing %d---\n",i);
  /* Counter value before the run; replaced below by the elapsed count. */
  t=timestamp();

  /* Integer-only conversion (int_chop). */
  test(100000,test1);

  t=timestamp()-t;

  printf("use int\t%I64d\n",t);

  t=timestamp();

  /* The compiler's (int) cast. */
  test(100000,test2);

  t=timestamp()-t;

  printf("(int)\t%I64d\n",t);

  t=timestamp();

  /* x87 fld/fistp. */
  test(100000,test3);

  t=timestamp()-t;

  printf("use x87\t%I64d\n",t);
 }
}运行结果如下:
---timing 0---
use int 1063972
(int)   6589632
use x87 6050682
---timing 1---
use int 1041530
(int)   6548604
use x87 6010210
---timing 2---
use int 1253211
(int)   6788126
use x87 5679409

测试环境:Windows XP。按上面列出的时钟周期数,耗时最少(效率最高)的是 use int(约 1.0×10^6 个周期);use x87(约 6.0×10^6)只比 (int)(约 6.6×10^6)略快。

 


附内置版 __ftol
0040E7E8   fld         dword ptr [ebp+8]
0040E7EB   call        __ftol (00401048)
__ftol:
00401048   push        ebp
00401049   mov         ebp,esp
0040104B   add         esp,0F4h
0040104E   wait
0040104F   fnstcw      word ptr [ebp-2]
00401052   wait
00401053   mov         ax,word ptr [ebp-2]
00401057   or          ah,0Ch
0040105A   mov         word ptr [ebp-4],ax
0040105E   fldcw       word ptr [ebp-4]
00401061   fistp       qword ptr [ebp-0Ch]
00401064   fldcw       word ptr [ebp-2]
00401067   mov         eax,dword ptr [ebp-0Ch]
0040106A   mov         edx,dword ptr [ebp-8]
0040106D   leave
0040106E   ret
0040106F   int         3

 

----------------以下还有各种实现的版本

/*Editor's note: 
COTD Entry:  Faster Float-To-Int Conversion by JCAB [[email protected]]   Warning: this COTD is seriously MSVC-only. It might apply to other compilers (in fact, I'm certain it does apply to, at least, other older compilers). But the implementation doesn't.   Warning: some assembly required if you want to understand how this is done.   Recently, two people have asked around me about the float-to-int conversion routines being very slow. This is a problem endemic to the way the Microsoft C library implements a little function called _ftol().   So, I've been trying out alternative implementations of this routine, and I've come up with several. I've been testing them on a 700 MHz PC using these programs:*/// This first one checks the alternative implementation for correctness.
// It needs the _ftol() function used (see below) rewritten as
// "int FIST(float)" (the name the test below calls) and changed accordingly:

volatile int k;int main(int argc, char* argv[])
{
 char s[4096]; enum { NUMTESTS = 10000000 };
 static float rndbuf[NUMTESTS]; int i;
 for (i = 0; i < NUMTESTS; ++i) {
  int intg1 = rand();
  int intg2 = rand();
  int intg3 = rand();
  int frac = rand();
  rndbuf[i] = float(intg1) - float(intg2) + float(intg3) + float(frac) / RAND_MAX;
 } for (i = 0; i < NUMTESTS; ++i) {
  k = rndbuf[i];
  if (k != FIST(rndbuf[i])) {
   sprintf(s, "Bad: %f (%08x) -> %d != %d\n", rndbuf[i], *(int*)&rndbuf[i], int(rndbuf[i]), FIST(rndbuf[i]));
   OutputDebugString(s);
  }
 } return 0;
}// This program checks for performance:

volatile int k;int main(int argc, char* argv[])
{
 char s[4096]; enum { NUMTESTS = 10000000 };
 static float rndbuf[NUMTESTS]; int i;
 for (i = 0; i < NUMTESTS; ++i) {
  int intg1 = rand();
  int intg2 = rand();
  int intg3 = rand();
  int frac = rand();
  rndbuf[i] = float(intg1) - float(intg2) + float(intg3) + float(frac) / RAND_MAX;
 } DWORD time = GetTickCount(); int j;
 for (j = 0; j < 10; ++j) {
  for (i = 0; i < NUMTESTS; ++i) {
   k = rndbuf[i];
  }
 } time = GetTickCount() - time; sprintf(s, "Time = %d ms\n", time);
 OutputDebugString(s); return 0;
}
   The different functions I checked were: 1- The compiler's own. 2- Faster version that uses some global memory and requires rounding mode to be on:
// Replacement _ftol: converts the value in ST(0) to a 32-bit integer in EAX.
// It adds a bias of almost one half, directed toward zero, and then lets
// FISTP round.  FISTP uses the FPU's current rounding mode, so the result is
// a truncation only when that mode is round-to-nearest.
// The function is naked: there is no prologue or epilogue, and the code
// manages the stack and returns by itself.
extern "C" __declspec(naked) void __cdecl _ftol()
{
 // Bias table read as floats: zpfp[0] = 0xBEFFFFFF is about -0.49999997
 // (used for non-negative inputs), zpfp[1] = 0x3EFFFFFF is about
 // +0.49999997 (used for negative inputs).
 const static int zpfp[2] = { 0xBEFFFFFF, 0x3EFFFFFF }; __asm {
  // Reserve a 4-byte scratch slot on the stack.
  SUB ESP,4
  // Store ST(0) there as a float without popping it.
  FST DWORD PTR [ESP]
  // EAX = raw float bits.
  MOV EAX,DWORD PTR [ESP]
  // Move the sign bit (bit 31) down to bit 2 ...
  SHR EAX,29
  // ... and isolate it: EAX = 4 for negative input, 0 otherwise.
  AND EAX,4
  // Add the bias selected by that byte offset into zpfp.
  FADD DWORD PTR [zpfp+EAX]
  // Convert to a 32-bit integer in the scratch slot and pop the FPU stack.
  FISTP DWORD PTR [ESP]
  // Fetch the result into EAX; this also releases the scratch slot.
  POP EAX
  RET
 }
}
3- Slower version that uses no global memory and requires rounding mode to be on:
// Replacement _ftol: converts the value in ST(0) to a 32-bit integer in EAX.
// It uses the bias-then-round trick, but builds the bias constant in a
// register instead of reading it from a table in memory.  FISTP uses the
// FPU's current rounding mode, so the result is a truncation only when that
// mode is round-to-nearest.
// The function is naked: there is no prologue or epilogue, and the code
// manages the stack and returns by itself.
extern "C" __declspec(naked) void __cdecl _ftol()
{
 __asm {
  // Reserve a 4-byte scratch slot on the stack.
  SUB ESP,4
  // Store ST(0) there as a float without popping it.
  FST DWORD PTR [ESP]
  // EAX = raw float bits.
  MOV EAX,DWORD PTR [ESP]
  // Keep only the sign bit.
  AND EAX,0x80000000
  // Non-negative input gives 0xBEFFFFFF (about -0.49999997); negative
  // input gives 0x3EFFFFFF (about +0.49999997).
  XOR EAX,0xBEFFFFFF
  // Write the bias into the scratch slot and add it to ST(0).
  MOV DWORD PTR [ESP],EAX
  FADD DWORD PTR [ESP]
  // Convert to a 32-bit integer in the scratch slot and pop the FPU stack.
  FISTP DWORD PTR [ESP]
  // Fetch the result into EAX; this also releases the scratch slot.
  POP EAX
  RET
 }
}
4- Version independent of the rounding mode, but which only works correctly on floats:
// Replacement _ftol: converts the value in ST(0) to a 32-bit integer in EAX
// by decoding the single-precision bit pattern with integer instructions,
// so the result does not depend on the FPU rounding mode.  The value is
// first stored as a 32-bit float, so only float precision is honoured.
// A zero exponent field (zero or denormal) and any magnitude too small to
// leave an integer part both return 0.
// The function is naked: there is no prologue or epilogue, and the code
// manages the stack and returns by itself.  EDX and ECX are overwritten.
extern "C" __declspec(naked) void __cdecl _ftol()
{
 __asm {
  // Pop ST(0) into a 4-byte stack slot as a float, then into EAX.
  SUB ESP,4
  FSTP DWORD PTR [ESP]
  POP EAX
  // EDX keeps a copy for the sign, ECX a copy for the exponent.
  MOV EDX,EAX
  MOV ECX,EAX
  // EAX = 23 fraction bits plus the implicit leading one at bit 23.
  AND EAX,0x007FFFFF
  OR EAX,0x00800000
  // Isolate the exponent field; zero means the input is 0 or a denormal.
  AND ECX,0x7F800000
  JZ zero
  SHR ECX,23
  // 0x96 = 150 = bias 127 + 23 mantissa bits: ECX becomes the left-shift
  // count, and the borrow (carry) is set when a right shift is needed.
  SUB ECX,0x96
  JC negexp
  SHL EAX,CL
  JMP shifted
negexp: NEG ECX
  // x86 shifts use only the low 5 bits of CL, so a count of 32 or more
  // would wrap around and leave mantissa bits behind.  Such inputs have no
  // integer part, so return 0 explicitly.
  CMP ECX,32
  JAE zero
  SHR EAX,CL
  // Negate the magnitude when the sign bit of the input was set.
shifted: AND EDX,0x80000000
  JNZ negative
  RET
negative: NEG EAX
  RET
zero:  SUB EAX,EAX
  RET
 }
}
5- Version independent of the rounding mode, but which works correctly on doubles:
// Replacement _ftol: converts the value in ST(0) to a 32-bit integer in EAX
// by decoding the double-precision bit pattern with integer instructions,
// so the result does not depend on the FPU rounding mode.
// The function is naked: there is no prologue or epilogue, and the code
// manages the stack and returns by itself.  EDX and ECX are saved on entry
// and restored on every exit path.
extern "C" __declspec(naked) void __cdecl _ftol()
{
 __asm {
  // Preserve the caller's EDX and ECX.
  PUSH EDX
  PUSH ECX
  // Pop ST(0) into an 8-byte stack slot as a double, then read it back:
  // EAX = low dword, EDX = high dword (sign, exponent, top fraction bits).
  SUB ESP,8
  FSTP QWORD PTR [ESP]
  POP EAX
  POP EDX
  // ECX keeps a copy of the high dword.
  MOV ECX,EDX
  // Top 20 fraction bits plus the implicit leading one, moved up so the
  // leading one sits at bit 31.
  AND EDX,0x000FFFFF
  OR EDX,0x00100000
  SHL EDX,11
  // The next 11 fraction bits come from the top of the low dword.
  SHR EAX,21
  // EAX = leading 32 bits of the significand.
  OR EAX,EDX
  // EDX = high dword again (for the sign); ECX = exponent field only.
  MOV EDX,ECX
  AND ECX,0x7FF00000
  // A zero exponent field (zero or denormal) returns 0.
  JZ zero
  SHR ECX,20
  // 0x41E = 1054 = bias 1023 + 31: ECX becomes the left-shift count, and
  // the borrow (carry) is set when a right shift is needed.
  SUB ECX,0x41E
  JC negexp
  SHL EAX,CL
  JMP shifted
negexp: NEG ECX
  // A right shift of 32 or more leaves no integer part: return 0.
  CMP ECX,32
  JAE zero
  SHR EAX,CL
  // Test the sign bit; the POPs below do not change the flags, so JNZ
  // still sees the result of this AND.
shifted: AND EDX,0x80000000
  POP ECX
  POP EDX
  JNZ negative
  RET
negative: NEG EAX
  RET
zero:  SUB EAX,EAX
  POP ECX
  POP EDX
  RET
 }
}
6- Compiling with -QIfist, which doesn't return the correct result (it rounds, not chops). But it can be made to return the correct result if the rounding mode is changed at program startup and never modified afterwards.   Note that I haven't made any efforts at optimizing the different routines. Specifically, it might be possible to convert some jumps into CMOV instructions (PPro and up only). Note also that, in order for these replacement routines to work, they must be accessed before the one in the C library. The safest way to do it is to make sure it is in the same CPP file as the main() or WinMain() function. The timing results were:
1- Time = 10805 ms
2- Time =  3916 ms
3- Time =  4227 ms
4- Time =  4707 ms
5- Time =  7531 ms
6- Time =  2343 ms
   The different implementations have different trade-offs that make them all potentially viable for different developers.   I'd like to see those functions optimized. If you do, please, send me a copy.

 

 

 


 

你可能感兴趣的:(优化,Microsoft,assembly,performance,float,compiler)