AndyCap For the 16-byte alignment I am just using posix_memalign()
Can you show your code please?
AndyCap For the 16-byte alignment I am just using posix_memalign()
Can you show your code please?
Sure:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <memory>
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
// Number of floats in the benchmark buffer (main allocates DATA_SIZE * sizeof(float) bytes).
#define DATA_SIZE (1024*64)
// When defined, main allocates the buffer with posix_memalign() using this
// alignment in bytes; when undefined, main falls back to plain malloc().
#define ALIGN_SIZE (16)
// Uncomment to make the benchmark touch pseudo-random indices (via MWC)
// instead of walking the buffer sequentially.
//#define RANDOM
// When defined, main runs the NEON-intrinsics variant of the benchmark;
// otherwise it runs the plain scalar loop.
#define USE_INTRINSICS
// Multiply-with-carry style pseudo-random generator built from two 16-bit
// lag-1 steps. These macros read AND update variables named uZ and uW, which
// must be in scope wherever MWC is expanded (only used when RANDOM is defined).
#define znew (uZ=36969*(uZ&65535)+(uZ>>16))
#define wnew (uW=18000*(uW&65535)+(uW>>16))
// Combines the two 16-bit streams into one value; each expansion advances both states.
#define MWC ((znew<<16)+wnew )
#ifdef __ARM_NEON
/*
 * Rewrites the VFP/NEON status and control register (FPSCR):
 *   FPSCR = (FPSCR & 0x04086060) | 0x03000000
 *
 * The OR mask sets bits 24 and 25 (FZ "flush-to-zero" and DN "default NaN"
 * per the ARM Architecture Reference Manual -- TODO confirm against the
 * manual for the target core); the AND mask clears every bit not listed in it.
 * Prints a one-line notice to stdout before touching the register.
 *
 * Only meaningful on 32-bit ARM targets with VFP; the caller guards it with
 * __ARM_NEON.
 */
void enableRunFast()
{
    printf("NEON enableRunFast\n");

    static const unsigned int uKeepMask = 0x04086060;
    static const unsigned int uSetBits  = 0x03000000;
    unsigned int uScratch;

    asm volatile (
        "fmrx %0, fpscr \n\t"   // scratch = FPSCR
        "and  %0, %0, %1 \n\t"  // scratch &= uKeepMask
        "orr  %0, %0, %2 \n\t"  // scratch |= uSetBits
        "fmxr fpscr, %0 \n\t"   // FPSCR = scratch
        // Early-clobber ('&'): %0 is written by the first instruction, before
        // the inputs %1/%2 are consumed, so it must not share a register with
        // either of them.
        : "=&r"(uScratch)
        : "r"(uKeepMask), "r"(uSetBits)
    );
}
#endif
/*
 * Float-throughput benchmark.
 *
 * Allocates DATA_SIZE floats (aligned to ALIGN_SIZE bytes when that macro is
 * defined), fills them with 1..DATA_SIZE, then multiplies the buffer by
 * 0.9999f for uIters passes, either with NEON intrinsics (USE_INTRINSICS on a
 * NEON-capable build) or with a plain scalar loop. With RANDOM defined the
 * passes touch pseudo-random indices instead of walking the buffer in order.
 * The final sum is printed so the optimizer cannot discard the work.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(void)
{
    float *pfData = NULL;

#ifdef RANDOM
    // State consumed and updated by the MWC macro (znew/wnew). Non-zero seeds
    // are required, otherwise the generator stays at zero.
    uint32_t uZ = 362436069u;
    uint32_t uW = 521288629u;
#endif

#ifdef ALIGN_SIZE
    printf("Aligning memory to %u\n", (unsigned int)ALIGN_SIZE);
    {
        // Go through a real void* rather than casting &pfData to void**,
        // and check the result: posix_memalign reports failure via its
        // return value and leaves the pointer unspecified.
        void *pvRaw = NULL;
        const int iErr = posix_memalign(&pvRaw, ALIGN_SIZE, DATA_SIZE * sizeof(float));
        if (iErr != 0)
        {
            fprintf(stderr, "posix_memalign failed (error %d)\n", iErr);
            return EXIT_FAILURE;
        }
        pfData = (float *)pvRaw;
    }
#else
    pfData = (float *)malloc(DATA_SIZE * sizeof(float));
    if (pfData == NULL)
    {
        fprintf(stderr, "malloc failed\n");
        return EXIT_FAILURE;
    }
#endif

    const uint32_t uIters = 100000;

#ifdef __ARM_NEON
    printf("NEON Enabled\n");
    enableRunFast();
#endif

    // Fill with 1..DATA_SIZE and report the initial sum.
    float fStart = 0.0f;
    for (uint32_t uD = 0; uD < DATA_SIZE; uD++)
    {
        const float fValue = (float)(uD + 1);
        pfData[uD] = fValue;
        fStart += fValue;
    }
    printf("Start = %f\n", fStart);

    // The intrinsics are only declared when <arm_neon.h> was included, i.e.
    // when __ARM_NEON is defined; otherwise fall back to the scalar path.
#if defined(USE_INTRINSICS) && defined(__ARM_NEON)
    printf("Using NEON intrinsics\n");

    // sanity test: scale a known 4-vector and print it
    float fTest[] = {1.0f, 2.0f, 3.0f, 4.0f};
    {
        const float32x4_t vIn = vld1q_f32(fTest);
        const float32x4_t vOut = vmulq_n_f32(vIn, 0.9f);
        vst1q_f32(fTest, vOut);
    }
    for (int i = 0; i < 4; i++)
    {
        printf("[%d] = %f\n", i, fTest[i]);
    }

    // process with intrinsics
    for (uint32_t uIter = 0; uIter < uIters; uIter++)
    {
#ifdef RANDOM
        for (uint32_t uD = 0; uD < DATA_SIZE; uD++)
        {
            // The 2-lane load reads [uIndex, uIndex+1], so keep the index
            // at least two elements away from the end of the buffer.
            const uint32_t uIndex = MWC % (DATA_SIZE - 2);
            const float32x2_t vPair = vld1_f32(&pfData[uIndex]);
            const float32x2_t vScaled = vmul_n_f32(vPair, 0.9999f);
            pfData[uIndex] = vget_lane_f32(vScaled, 0);
        }
#else
        float *pfPos = pfData;
        // Only whole groups of four are processed, so the loop never runs
        // past the buffer even if DATA_SIZE is not a multiple of 4 (this
        // matches the scalar path, which also rounds down to a multiple of 4).
        for (uint32_t uD = 0; uD + 4 <= DATA_SIZE; uD += 4)
        {
            const float32x4_t vQuad = vld1q_f32(pfPos);
            const float32x4_t vScaled = vmulq_n_f32(vQuad, 0.9999f);
            vst1q_f32(pfPos, vScaled);
            pfPos += 4;
        }
#endif
    }
#else
    printf("Using Simple C\n");

    // process simple c
    for (uint32_t uIter = 0; uIter < ((uIters >> 2) << 2); uIter++)
    {
        for (uint32_t uD = 0; uD < ((DATA_SIZE >> 2) << 2); uD++)
        {
#ifdef RANDOM
            const uint32_t uIndex = MWC % DATA_SIZE;
            pfData[uIndex] = pfData[uIndex] * 0.9999f;
#else
            pfData[uD] = pfData[uD] * 0.9999f;
#endif
        }
    }
#endif

    //result to stop optimizer removing code.
    float fResult = 0.0f;
    for (uint32_t uD = 0; uD < DATA_SIZE; uD++)
    {
        fResult += pfData[uD];
    }
    printf("Result = %f\n", fResult);

    // Memory from posix_memalign() and malloc() is released with free().
    free(pfData);
    return 0;
}
and some output:
not aligned:
root@bela:~/Development# time ./TestFloatSpeed
NEON Enabled
NEON enableRunFast
Start = 2147495168.000000
Using Simple C
Result = 97287.093750
real 1m58.713s
user 1m52.732s
sys 0m1.928s
root@bela:~/Development# time ./TestFloatSpeed
NEON Enabled
NEON enableRunFast
Start = 2147495168.000000
Using NEON intrinsics
[0] = 0.900000
[1] = 1.800000
[2] = 2.700000
[3] = 3.600000
Result = 97287.093750
real 2m3.180s
user 1m57.368s
sys 0m1.932s
Aligned to 16:
root@bela:~/Development# time ./TestFloatSpeed
Aligning memory to 16
NEON Enabled
NEON enableRunFast
Start = 2147495168.000000
Using Simple C
Result = 97287.093750
real 0m50.286s
user 0m47.476s
sys 0m0.840s
root@bela:~/Development# time ./TestFloatSpeed
Aligning memory to 16
NEON Enabled
NEON enableRunFast
Start = 2147495168.000000
Using NEON intrinsics
[0] = 0.900000
[1] = 1.800000
[2] = 2.700000
[3] = 3.600000
Result = 97287.093750
real 0m37.737s
user 0m35.948s
sys 0m0.616s
thanks
AndyCap I have looked into memory alignment in gcc and clang to see if there is a simple way of enforcing 16-byte alignment without changing code, but there doesn't seem to be a catch-all way of doing this.
I am no expert in alignment, but maybe a hack with a dedicated class `AlignedFloat` declared with `alignas()` and some operator overloads, followed by a `#define float AlignedFloat`, would make for a single-include fix. Given how includes can be added on the command line (with `-include /path/to/file.h`), this would require zero modifications of the (C++) source files. However, `#define`-ing `float` seems a horrible idea, and I am sure this could cause problems in a number of cases.
I cannot think of an equivalent way to do this in C.
I split out the conversation to a new thread to avoid hijacking the other one.
or probably just scrap all of that, as I am unlikely to have understood the problem (or its solution)
I think really we also need a way of automatically aligning malloc() (calloc, realloc etc), ::new and arrays[] on the stack and heap.
At least then we could catch code that is working on buffer type data and align the buffers.
If we set single floats to AlignedFloat I wonder what would happen if we had something like:
float fBuffer[16];
Are fBuffer[0] and fBuffer[1] 16 bytes apart, or just the first aligned to 16 bytes?
good point. I told you my approach made no sense! In my defense, I was in the garden on a sunny bank holiday when I wrote that
I guess we could start by making all the `float*` buffers we pass into `render()` 16-byte aligned. https://github.com/BelaPlatform/Bela/issues/551
Sounds like a good change.
I haven't really looked at how Pd integrates with the normal running of the Bela, is it still being done with Heavy?
Heavy remains as a CPU efficient approach. Default behaviour is with libpd, see here https://github.com/BelaPlatform/Bela/wiki/Puredata-and-C--
Thanks