• 16-byte alignment

AndyCap For the 16-byte alignment I am just using posix_memalign()

Can you show your code please?

Sure:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <memory>


#ifdef __ARM_NEON
#include <arm_neon.h>
#endif

// Number of floats in the benchmark buffer.
#define DATA_SIZE (1024*64)
// Alignment in bytes requested from posix_memalign(); when this macro is not
// defined, main() allocates the buffer with plain malloc() instead.
#define ALIGN_SIZE (16)
// Uncomment to touch the buffer at pseudo-random indices (generated by MWC)
// instead of walking it sequentially.
//#define RANDOM
// When defined, main() runs the NEON intrinsics path; otherwise the plain C loop.
#define USE_INTRINSICS


// Pseudo-random number helpers used only by the RANDOM code paths.
// znew and wnew each update a state variable in place: the low 16 bits are
// multiplied by a constant and the high 16 bits are added back in.
// MWC combines one step of each into a single value.
// NOTE(review): the macros expect variables named uZ and uW to be in scope at
// the point of use; no declaration is visible in this file, so enabling RANDOM
// as-is presumably fails to compile. The name suggests a multiply-with-carry
// generator -- confirm.
#define znew (uZ=36969*(uZ&65535)+(uZ>>16))
#define wnew (uW=18000*(uW&65535)+(uW>>16))
#define MWC ((znew<<16)+wnew )


#ifdef __ARM_NEON
/*
 * Reconfigure the floating-point status/control register (FPSCR).
 *
 * Reads FPSCR, keeps only the bits selected by uKeepMask, forces the bits in
 * uSetBits (bits 24 and 25) on, and writes the result back. Per the ARM
 * architecture reference those two bits are flush-to-zero and default-NaN,
 * i.e. the "RunFast" configuration -- confirm against the target core's manual.
 *
 * Prints a one-line notice to stdout before touching the register.
 */
void enableRunFast()
{
  printf("NEON enableRunFast\n");
  static const unsigned int uKeepMask = 0x04086060;
  static const unsigned int uSetBits  = 0x03000000;
  unsigned int uFpscr;
  asm volatile (
                "fmrx  %0, fpscr      \n\t"  // %0 = FPSCR
                "and  %0, %0, %1      \n\t"  // %0 = %0 & 0x04086060
                "orr  %0, %0, %2      \n\t"  // %0 = %0 | 0x03000000
                "fmxr  fpscr, %0      \n\t"  // FPSCR = %0
                // Early-clobber ('&'): %0 is written by the first instruction,
                // before %1 and %2 are read, so it must not share a register
                // with either input.
                : "=&r"(uFpscr)
                : "r"(uKeepMask), "r"(uSetBits)
                );
}


#endif

/*
 * Benchmark entry point.
 *
 * Allocates a buffer of DATA_SIZE floats (aligned to ALIGN_SIZE bytes when that
 * macro is defined, plain malloc() otherwise), fills it with 1..DATA_SIZE, then
 * multiplies elements by 0.9999f for uIters passes using either NEON intrinsics
 * or a plain C loop. The sum of the buffer is printed before and after so the
 * optimizer cannot discard the work.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(void)
{
  float *pfData = NULL;
  const size_t uBytes = (size_t)DATA_SIZE * sizeof(float);

#ifdef ALIGN_SIZE
  printf("Aligning memory to %u\n", (unsigned)ALIGN_SIZE);
  // posix_memalign() reports failure through its return value; go through a
  // void* so the float* is only assigned from a known-good result.
  void *pvRaw = NULL;
  if (posix_memalign(&pvRaw, ALIGN_SIZE, uBytes) != 0)
    pvRaw = NULL;
  pfData = (float *)pvRaw;
#else
  pfData = (float *)malloc(uBytes);
#endif
  if (pfData == NULL)
  {
    fprintf(stderr, "Failed to allocate %zu bytes\n", uBytes);
    return EXIT_FAILURE;
  }

  const uint32_t uIters = 100000;

#ifdef RANDOM
  // State read and updated in place by the MWC macro (arbitrary non-zero seeds).
  uint32_t uZ = 362436069u;
  uint32_t uW = 521288629u;
#endif

#ifdef __ARM_NEON
  printf("NEON Enabled\n");
  enableRunFast();
#endif

  // Fill with 1..DATA_SIZE and print the sum as a reference value.
  float fStart = 0.0f;
  for (uint32_t uD = 0; uD < DATA_SIZE; uD++)
  {
    pfData[uD] = (float)(uD+1);
    fStart += (float)(uD+1);
  }
  printf("Start = %f\n", fStart);

  // The intrinsics need <arm_neon.h>, which is only included when __ARM_NEON
  // is defined; without it fall back to the plain C loop.
#if defined(USE_INTRINSICS) && defined(__ARM_NEON)
  printf("Using NEON intrinsics\n");

  // sanity test: load four floats, scale by 0.9, store and print them
  float fTest[] = {1.0f, 2.0f, 3.0f, 4.0f};
  float32x4_t vTest = vld1q_f32(fTest);
  float32x4_t vTestScaled = vmulq_n_f32(vTest, 0.9f);
  vst1q_f32(fTest, vTestScaled);

  for (int i = 0; i < 4; i++)
    printf("[%d] = %f\n", i, fTest[i]);

  // process with intrinsics
  for (uint32_t uIter = 0; uIter < uIters; uIter++)
  {
#ifdef RANDOM
    for (uint32_t uD = 0; uD < DATA_SIZE; uD++)
    {
      // The modulus keeps the two-lane load at uIndex, uIndex+1 inside the buffer.
      uint32_t uIndex = MWC % (DATA_SIZE-2);
      float32x2_t vPair = vld1_f32(&pfData[uIndex]);
      float32x2_t vPairScaled = vmul_n_f32(vPair, 0.9999f);
      // Only lane 0 is written back, so one element changes per step.
      pfData[uIndex] = vget_lane_f32(vPairScaled, 0);
    }
#else
    float *pfPos = pfData;
    uint32_t uD = 0;
    // Four floats per step; stop before a partial vector would run off the end.
    for (; uD + 4 <= DATA_SIZE; uD += 4)
    {
      float32x4_t vIn = vld1q_f32(pfPos);
      float32x4_t vScaled = vmulq_n_f32(vIn, 0.9999f);
      vst1q_f32(pfPos, vScaled);
      pfPos += 4;
    }
    // Scalar tail; empty while DATA_SIZE is a multiple of 4.
    for (; uD < DATA_SIZE; uD++)
      pfData[uD] = pfData[uD]*0.9999f;
#endif
  }
#else
  printf("Using Simple C\n");
  // process simple c
  // Both bounds are rounded down to a multiple of 4, which leaves the current
  // values of uIters and DATA_SIZE unchanged.
  for (uint32_t uIter = 0; uIter < ((uIters>>2)<<2); uIter++)
  {
    for (uint32_t uD = 0; uD < ((DATA_SIZE>>2)<<2); uD++)
    {
#ifdef RANDOM
      uint32_t uIndex = MWC % DATA_SIZE;
      pfData[uIndex] = pfData[uIndex]*0.9999f;
#else
      pfData[uD] = pfData[uD]*0.9999f;
#endif
    }
  }
#endif

  //result to stop optimizer removing code.
  float fResult = 0.0f;
  for (uint32_t uD = 0; uD < DATA_SIZE; uD++)
    fResult += pfData[uD];

  printf("Result = %f\n", fResult);

  free(pfData);
  return 0;
}

and some output:

not aligned:

root@bela:~/Development# time ./TestFloatSpeed
NEON Enabled
NEON enableRunFast
Start = 2147495168.000000
Using Simple C
Result = 97287.093750

real	1m58.713s
user	1m52.732s
sys	0m1.928s


root@bela:~/Development# time ./TestFloatSpeed
NEON Enabled
NEON enableRunFast
Start = 2147495168.000000
Using NEON intrinsics
[0] = 0.900000
[1] = 1.800000
[2] = 2.700000
[3] = 3.600000
Result = 97287.093750

real	2m3.180s
user	1m57.368s
sys	0m1.932s

Aligned to 16:

root@bela:~/Development# time ./TestFloatSpeed
Aligning memory to 16
NEON Enabled
NEON enableRunFast
Start = 2147495168.000000
Using Simple C
Result = 97287.093750

real	0m50.286s
user	0m47.476s
sys	0m0.840s


root@bela:~/Development# time ./TestFloatSpeed
Aligning memory to 16
NEON Enabled
NEON enableRunFast
Start = 2147495168.000000
Using NEON intrinsics
[0] = 0.900000
[1] = 1.800000
[2] = 2.700000
[3] = 3.600000
Result = 97287.093750

real	0m37.737s
user	0m35.948s
sys	0m0.616s

AndyCap I have looked into memory alignment in gcc and clang to see if there is a simple way of enforcing 16-byte alignment without changing code. There doesn't seem to be a catch-all way of doing this, though.

I am no expert in alignment, but maybe a hack with a dedicated class AlignedFloat with an alignas() with some operator overloads and then a

#define float AlignedFloat

would make for a single-include fix. Since includes can be added on the command line (with -include /path/to/file.h), this would require zero modifications of the (C++) source files. However, #define-ing float seems a horrible idea, and I am sure this could cause problems in a number of cases.
I cannot think of an equivalent way to do this in C.

I split out the conversation to a new thread to avoid hijacking the other one.

or probably just scrap all of that, as I am unlikely to have understood the problem (or its solution)

I think really we also need a way of automatically aligning malloc() (calloc, realloc etc), ::new and arrays[] on the stack and heap.

At least then we could catch code that is working on buffer type data and align the buffers.

If we set single floats to AlignedFloat I wonder what would happen if we had something like:

float fBuffer[16];

Are fBuffer[0] and fBuffer[1] 16 bytes apart, or just the first aligned to 16 bytes?

good point. I told you my approach made no sense! In my defense, I was in the garden on a sunny bank holiday when I wrote that 🙂

Sounds like a good change.

I haven't really looked at how Pd integrates with the normal running of the Bela, is it still being done with Heavy?