Right, sorry: I was basing my answer on benchmarks done with the old version.
Here's the deal: powf_neon(x, y) misbehaves for x < 0.9 and large-ish values of y.
The good news is that powf_c(), by contrast, seems to work fine across the whole range, so there is probably some issue in the inline NEON code that should be fixable (the _neon implementation aims to be an almost line-by-line re-implementation of the approximate, optimized C code in _c). I don't have time just now, but I can look into it in the future. Also, if you want to tackle it yourself, it's a nice exercise ...
This code
for(float x = 0.5; x < 2; x += 0.2)
{
printf("x = %f\n", x);
for(unsigned int y = 2; y < 10; y++)
{
float fCc = powf_c(x, y);
float fNeon = powf_neon(x, y);
float fC = powf(x, y);
printf("[%u] = %f, %f, %f\n", y, fCc, fNeon, fC);
}
printf("\n");
}
gives
x = 0.500000
[2] = 0.250000, 0.249859, 0.250000
[3] = 0.125000, 0.120060, 0.125000
[4] = 0.062500, 0.009653, 0.062500
[5] = 0.031250, -0.284727, 0.031250
[6] = 0.015625, -1.314646, 0.015625
[7] = 0.007812, -4.416289, 0.007812
[8] = 0.003906, -12.413382, 0.003906
[9] = 0.001953, -30.662472, 0.001953
x = 0.700000
[2] = 0.490001, 0.490001, 0.490000
[3] = 0.343001, 0.342990, 0.343000
[4] = 0.240101, 0.239917, 0.240100
[5] = 0.168071, 0.166735, 0.168070
[6] = 0.117650, 0.111364, 0.117649
[7] = 0.082355, 0.059847, 0.082354
[8] = 0.057648, -0.009017, 0.057648
[9] = 0.040354, -0.131303, 0.040354
x = 0.900000
[2] = 0.810003, 0.810002, 0.810000
[3] = 0.729004, 0.729002, 0.729000
[4] = 0.656105, 0.656103, 0.656100
[5] = 0.590496, 0.590493, 0.590490
[6] = 0.531447, 0.531444, 0.531441
[7] = 0.478303, 0.478300, 0.478297
[8] = 0.430474, 0.430470, 0.430467
[9] = 0.387427, 0.387421, 0.387420
x = 1.100000
[2] = 1.210000, 1.210000, 1.210000
[3] = 1.331000, 1.331000, 1.331000
[4] = 1.464100, 1.464100, 1.464100
[5] = 1.610510, 1.610510, 1.610510
[6] = 1.771561, 1.771561, 1.771561
[7] = 1.948717, 1.948717, 1.948717
[8] = 2.143589, 2.143589, 2.143589
[9] = 2.357948, 2.357949, 2.357948
x = 1.300000
[2] = 1.690000, 1.690000, 1.690000
[3] = 2.197001, 2.197001, 2.197000
[4] = 2.856101, 2.856102, 2.856101
[5] = 3.712932, 3.712932, 3.712931
[6] = 4.826813, 4.826812, 4.826810
[7] = 6.274857, 6.274857, 6.274854
[8] = 8.157315, 8.157315, 8.157310
[9] = 10.604511, 10.604511, 10.604505
x = 1.500000
[2] = 2.250000, 2.249998, 2.250000
[3] = 3.375000, 3.374995, 3.375001
[4] = 5.062500, 5.062491, 5.062502
[5] = 7.593750, 7.593732, 7.593754
[6] = 11.390626, 11.390594, 11.390632
[7] = 17.085939, 17.085884, 17.085949
[8] = 25.628910, 25.628813, 25.628931
[9] = 38.443367, 38.443203, 38.443401
x = 1.700000
[2] = 2.890002, 2.890000, 2.890001
[3] = 4.913006, 4.912999, 4.913002
[4] = 8.352115, 8.352098, 8.352103
[5] = 14.198600, 14.198566, 14.198577
[6] = 24.137630, 24.137562, 24.137585
[7] = 41.033989, 41.033852, 41.033897
[8] = 69.757812, 69.757553, 69.757629
[9] = 118.588333, 118.587822, 118.587982
x = 1.900000
[2] = 3.609996, 3.609982, 3.610001
[3] = 6.858988, 6.858949, 6.859003
[4] = 13.032071, 13.031971, 13.032106
[5] = 24.760921, 24.760685, 24.761005
[6] = 47.045723, 47.045185, 47.045918
[7] = 89.386826, 89.385628, 89.387253
[8] = 169.834869, 169.832275, 169.835800
[9] = 322.686096, 322.680542, 322.688049