很多年前,调研过浮点数与整数之间的双射问题:
win7 intel x64 cpu vs2013 c语言浮点数精度失真问题
最近重新学习了一下IEEE754标准,也许实际还有很多深刻问题没有被揭示。
计算机程序设计艺术,据说这本书中也有讨论。
参考:https://upimg.baike.so.com/doc/643382-681042.html
float32在线测试页面 :https://www.h-schmidt.net/FloatConverter/IEEE754.html
我用手写计算的方式,算过一遍之后,得到一个结论:
任意一个32bit位模式所对应的float(NaN和±Inf除外,它们不表示有限数值),都可以被精确地计算,转换为一个无误差的十进制字符串,且与printf %.Nf的输出一致,只要N足够大。
eg:float FLT_TRUE_MIN == 0x00000001, 0.00000000000000000000000000000000000000000000140129846432481707092372958328991613128026194187651577175706828388979108268586060148663818836212158203125 (149位小数)
所有其他float都是FLT_TRUE_MIN这个数的整数倍。
稍微计算几个值就明白为什么。fraction中第一个1代表0.5,第二个1是0.25,然后是0.125,0.0625,0.03125 …尾数一直可以被2整除。题外话,如果是前苏联的3进制计算机,可能就存在无限循环除不尽的问题。
但是,前面提过,float与u32的双射问题。
显然32bit最多只有4,294,967,296(即2^32)个不同的值,最大值为4,294,967,295。
u32最多只有10位十进制数字,而从需要149位十进制小数才能写全的FLT_TRUE_MIN,到FLT_MAX(340282346638528859811704183484516925440.000000)之间,可以写出的十进制数远远多于2^32个,二者是无法一一对应的。
那么在这个范围内,绝大多数的十进制数值都没有精确对应的32bit float值,也就是说,人工随便写的一个浮点数字符串,有99%以上的概率转换为float之后,再转换为string,是无法还原的!
但是,c基础库的printf %f和atof有一种不失真、可逆的转换实现!只是这种实现是从32bit到float string,再还原到32bit。
人为造假float值比较容易被识别,只要是不可逆的就一定是人为改过的!
个人觉得IEEE754标准的设计,c语言的实现,并不是看起来那么简单。
稍微提几个方面:硬件电路、编译器、累计误差…这些方面的坑感觉都很深。
下面是个人测试float的代码及测试结果:
#include <stdio.h>
#include <float.h>
#include <math.h>
/*
 * Overlays three views of the same storage: the raw unsigned int (u32), the
 * float value (f32), and an anonymous struct of bit-fields named after the
 * parts of a binary32 number.
 *
 * The bit-fields are declared in the order fraction (23 bits), exp (8 bits),
 * sign (1 bit), 32 bits in total.
 * NOTE(review): the order in which a compiler allocates bit-fields inside a
 * storage unit is implementation-defined. The field names only line up with
 * the float's sign/exponent/fraction bits when bit-fields are allocated from
 * the least-significant bit upward and both unsigned int and float are
 * 32 bits wide -- confirm for the target toolchain.
 */
union u32f32 {
/* Raw bit pattern. */
unsigned int u32;
/* The same storage interpreted as a float. */
float f32;
/* Bit-field view; an anonymous struct, so the fields are accessed directly on the union. */
struct {
unsigned int fraction : 23;
unsigned int exp : 8;
unsigned int sign : 1;
};
};
/*
 * File-scope scratch buffers that receive the binary text of the whole word
 * and of each bit-field. They are shared and overwritten on every call that
 * formats a value, so their contents are only valid until the next call.
 * Each starts out as an empty string (all bytes zero).
 */
static char c_u32[64] = "";
static char c_sign[64] = "";
static char c_exp[64] = "";
static char c_fraction[64] = "";
/*
 * Writes the lowest `bits` bits of `u32` into `c_temp` as '0'/'1' characters,
 * most-significant bit first, followed by a terminating NUL.
 *
 * A single space is inserted before every bit whose index is 3 modulo 4
 * (bit indices counted from 0 at the least-significant end), except before
 * the very first character. Groups are therefore aligned to the low end, and
 * a width that is not a multiple of four gets a shorter leading group.
 *
 * u32    - value whose bits are rendered.
 * bits   - number of low-order bits to render; values above 32 are clamped
 *          to 32, values of 0 or less produce an empty string.
 * c_temp - destination buffer; 32 bits need 39 characters plus the NUL.
 */
void to_binary_string_with_spaces(const unsigned int u32, const int bits, char c_temp[64]) {
    const int width = (bits > 32) ? 32 : bits;
    char *out = c_temp;

    for (int bit = width - 1; bit >= 0; --bit) {
        const int is_top_bit = (bit == width - 1);
        const int starts_group = ((bit & 3) == 3);

        if (starts_group && !is_top_bit) {
            *out++ = ' ';
        }
        *out++ = ((u32 >> bit) & 1U) ? '1' : '0';
    }
    *out = '\0';
}
void print_float_binary(const char* p_name, const unsigned int i) {
const union u32f32 u32_f32_obj = { .u32 = i };
to_binary_string_with_spaces(u32_f32_obj.u32, 32, c_u32);
to_binary_string_with_spaces(u32_f32_obj.sign, 1, c_sign);
to_binary_string_with_spaces(u32_f32_obj.exp, 8, c_exp);
to_binary_string_with_spaces(u32_f32_obj.fraction, 23, c_fraction);
printf("float %s == 0x%08x, binary(%s)\n", p_name, u32_f32_obj.u32, c_u32);
printf("float %s == 0x%08x, sign(%s)*(-1) 2^exp(%s-127) fraction(1.%s)\n",
p_name, u32_f32_obj.u32, c_sign, c_exp, c_fraction);
}
int test_float() {
union u32f32 u32_f32_max_obj = { .f32 = FLT_MAX };//3.402823466e+38F
union u32f32 u32_f32_epsilon_obj = { .f32 = FLT_EPSILON };//1.192092896e-07F
union u32f32 u32_f32_min_obj = { .f32 = FLT_MIN };//1.175494351e-38F
union u32f32 u32_f32_true_min_obj = { .f32 = FLT_TRUE_MIN };//1.401298464e-45F
union u32f32 u32_f32_NaN_obj = { .f32 = NAN };//nan
union u32f32 u32_f32_inf_obj = { .f32 = INFINITY };
union u32f32 u32_f32_ne_inf_obj = { .f32 = -INFINITY };
union u32f32 u32_f32_0_obj = { .f32 = 0 };
union u32f32 u32_f32_ne_0_obj = { .f32 = 0,.sign = 1 };
printf("float FLT_MAX == 0x%08x, %f\n", u32_f32_max_obj.u32, u32_f32_max_obj.f32);
printf("float FLT_EPSILON == 0x%08x, %.23f\n", u32_f32_epsilon_obj.u32, u32_f32_epsilon_obj.f32);
printf("float FLT_MIN == 0x%08x, %.126f\n", u32_f32_min_obj.u32, u32_f32_min_obj.f32);
printf("float FLT_TRUE_MIN == 0x%08x, %.149f\n", u32_f32_true_min_obj.u32, u32_f32_true_min_obj.f32);
printf("float NaN == 0x%08x, %f\n", u32_f32_NaN_obj.u32, u32_f32_NaN_obj.f32);
printf("float INFINITY == 0x%08x, %f\n", u32_f32_inf_obj.u32, u32_f32_inf_obj.f32);
printf("float -INFINITY == 0x%08x, %f\n", u32_f32_ne_inf_obj.u32, u32_f32_ne_inf_obj.f32);
printf("float +0 == 0x%08x, %f\n", u32_f32_0_obj.u32, u32_f32_0_obj.f32);
printf("float -0 == 0x%08x, %f\n", u32_f32_ne_0_obj.u32, u32_f32_ne_0_obj.f32);
//
print_float_binary("FLT_MAX ", u32_f32_max_obj.u32);
print_float_binary("FLT_EPSILON ", u32_f32_epsilon_obj.u32);
print_float_binary("FLT_MIN ", u32_f32_min_obj.u32);
print_float_binary("FLT_TRUE_MIN", u32_f32_true_min_obj.u32);
print_float_binary("NaN ", u32_f32_NaN_obj.u32);
print_float_binary("INFINITY ", u32_f32_inf_obj.u32);
print_float_binary("-INFINITY ", u32_f32_ne_inf_obj.u32);
print_float_binary("zero(+0) ", u32_f32_0_obj.u32);
print_float_binary("zero(-0) ", u32_f32_ne_0_obj.u32);
//
union u32f32 u32_f32_obj = { .u32 = 0xBCD3D6D8 };//0xBCD3D6D8 == -0.02585928142070770263671875
printf("float obj_test == 0x%08x, %.60f\n", u32_f32_obj.u32, u32_f32_obj.f32);
print_float_binary("obj_test ", u32_f32_obj.u32);
//
printf("float obj_test == 0x%08x, sign(%s1) 2^exp(%d) fraction(%.30f)\n", u32_f32_obj.u32,
u32_f32_obj.sign ? "-" : "+",
((int)u32_f32_obj.exp) - 127,
1.0 * u32_f32_obj.fraction / (1 << 23) + 1);
printf("float obj_test == 0x%08x, sign(%d) exp(%f) fraction(%.30f)\n", u32_f32_obj.u32,
(-1) * (int)u32_f32_obj.sign,
(pow(2, ((int)u32_f32_obj.exp) - 127)),
(1.0 * u32_f32_obj.fraction / (1 << 23) + 1));
printf("float obj_test == 0x%08x, %.30f\n", u32_f32_obj.u32,
(-1) * (int)u32_f32_obj.sign *
(pow(2, ((int)u32_f32_obj.exp) - 127)) *
(1.0 * u32_f32_obj.fraction / (1 << 23) + 1));
return 0;
}
/*
 * Program entry point: runs test_float() and returns its result as the
 * process exit status. The command-line arguments are not used.
 */
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    const int status = test_float();
    return status;
}
float FLT_MAX == 0x7f7fffff, 340282346638528859811704183484516925440.000000
float FLT_EPSILON == 0x34000000, 0.00000011920928955078125
float FLT_MIN == 0x00800000, 0.000000000000000000000000000000000000011754943508222875079687365372222456778186655567720875215087517062784172594547271728515625
float FLT_TRUE_MIN == 0x00000001, 0.00000000000000000000000000000000000000000000140129846432481707092372958328991613128026194187651577175706828388979108268586060148663818836212158203125
float NaN == 0x7fc00000, nan
float INFINITY == 0x7f800000, inf
float -INFINITY == 0xff800000, -inf
float +0 == 0x00000000, 0.000000
float -0 == 0x80000000, -0.000000
float FLT_MAX == 0x7f7fffff, binary(0111 1111 0111 1111 1111 1111 1111 1111)
float FLT_MAX == 0x7f7fffff, sign(0)*(-1) 2^exp(1111 1110-127) fraction(1.111 1111 1111 1111 1111 1111)
float FLT_EPSILON == 0x34000000, binary(0011 0100 0000 0000 0000 0000 0000 0000)
float FLT_EPSILON == 0x34000000, sign(0)*(-1) 2^exp(0110 1000-127) fraction(1.000 0000 0000 0000 0000 0000)
float FLT_MIN == 0x00800000, binary(0000 0000 1000 0000 0000 0000 0000 0000)
float FLT_MIN == 0x00800000, sign(0)*(-1) 2^exp(0000 0001-127) fraction(1.000 0000 0000 0000 0000 0000)
float FLT_TRUE_MIN == 0x00000001, binary(0000 0000 0000 0000 0000 0000 0000 0001)
float FLT_TRUE_MIN == 0x00000001, sign(0)*(-1) 2^exp(0000 0000-127) fraction(1.000 0000 0000 0000 0000 0001)
float NaN == 0x7fc00000, binary(0111 1111 1100 0000 0000 0000 0000 0000)
float NaN == 0x7fc00000, sign(0)*(-1) 2^exp(1111 1111-127) fraction(1.100 0000 0000 0000 0000 0000)
float INFINITY == 0x7f800000, binary(0111 1111 1000 0000 0000 0000 0000 0000)
float INFINITY == 0x7f800000, sign(0)*(-1) 2^exp(1111 1111-127) fraction(1.000 0000 0000 0000 0000 0000)
float -INFINITY == 0xff800000, binary(1111 1111 1000 0000 0000 0000 0000 0000)
float -INFINITY == 0xff800000, sign(1)*(-1) 2^exp(1111 1111-127) fraction(1.000 0000 0000 0000 0000 0000)
float zero(+0) == 0x00000000, binary(0000 0000 0000 0000 0000 0000 0000 0000)
float zero(+0) == 0x00000000, sign(0)*(-1) 2^exp(0000 0000-127) fraction(1.000 0000 0000 0000 0000 0000)
float zero(-0) == 0x80000000, binary(1000 0000 0000 0000 0000 0000 0000 0000)
float zero(-0) == 0x80000000, sign(1)*(-1) 2^exp(0000 0000-127) fraction(1.000 0000 0000 0000 0000 0000)
float obj_test == 0xbcd3d6d8, -0.025859281420707702636718750000000000000000000000000000000000
float obj_test == 0xbcd3d6d8, binary(1011 1100 1101 0011 1101 0110 1101 1000)
float obj_test == 0xbcd3d6d8, sign(1)*(-1) 2^exp(0111 1001-127) fraction(1.101 0011 1101 0110 1101 1000)
float obj_test == 0xbcd3d6d8, sign(-1) 2^exp(-6) fraction(1.654994010925292968750000000000)
float obj_test == 0xbcd3d6d8, sign(-1) exp(0.015625) fraction(1.654994010925292968750000000000)
float obj_test == 0xbcd3d6d8, -0.025859281420707702636718750000