Sure:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <memory>
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
/* Number of floats in the working buffer. */
#define DATA_SIZE (1024*64)
/* Requested buffer alignment in bytes. main() tests this with #ifdef:
 * when it is defined the buffer comes from posix_memalign(), otherwise
 * from plain malloc(). */
#define ALIGN_SIZE (16)
/* Uncomment to make the benchmark loops update pseudo-randomly chosen
 * elements instead of walking the buffer linearly. */
//#define RANDOM
/* Selects the NEON-intrinsics version of the benchmark loop. */
#define USE_INTRINSICS

/*
 * State words for the multiply-with-carry generator below. The macros read
 * and update these two variables, so they must exist at file scope or any
 * build with RANDOM defined fails with "undeclared identifier".
 * The seeds are the defaults from Marsaglia's original MWC posting; any
 * non-zero values work.
 */
static uint32_t uZ = 362436069u;
static uint32_t uW = 521288629u;

/* Each macro advances one 16-bit multiply-with-carry stream in place and
 * evaluates to the new state word. */
#define znew (uZ=36969*(uZ&65535)+(uZ>>16))
#define wnew (uW=18000*(uW&65535)+(uW>>16))
/* Combines both streams into one 32-bit pseudo-random value. Every
 * expansion has the side effect of advancing uZ and uW. */
#define MWC ((znew<<16)+wnew )
#ifdef __ARM_NEON
/*
 * Rewrites the floating-point status/control register (FPSCR).
 *
 * The current FPSCR value is read, ANDed with 0x04086060, ORed with
 * 0x03000000 (bits 24 and 25) and written back. A banner line is printed
 * first so the log shows that the call happened.
 *
 * NOTE(review): this is presumably meant to select the VFP "RunFast"
 * configuration (flush-to-zero + default-NaN); confirm the two masks
 * against the ARM Architecture Reference Manual for the target core.
 */
void enableRunFast(void)
{
    printf("NEON enableRunFast\n");
    static const unsigned int x = 0x04086060;
    static const unsigned int y = 0x03000000;
    unsigned int r;
    /*
     * %0 is written by the first instruction while %1 and %2 are still
     * needed by the following ones, so the output must be marked
     * earlyclobber ("=&r"). Without it the compiler is free to place r in
     * the same register as x or y, which would destroy the mask before it
     * is used.
     * __asm__/__volatile__ are used instead of asm/volatile so the block
     * also builds in strict ISO modes (e.g. -std=c11).
     */
    __asm__ __volatile__ (
        "fmrx %0, fpscr \n\t" //r0 = FPSCR
        "and %0, %0, %1 \n\t" //r0 = r0 & 0x04086060
        "orr %0, %0, %2 \n\t" //r0 = r0 | 0x03000000
        "fmxr fpscr, %0 \n\t" //FPSCR = r0
        : "=&r"(r)
        : "r"(x), "r"(y)
    );
}
#endif
/*
 * Benchmark driver.
 *
 * Allocates DATA_SIZE floats, fills them with 1..DATA_SIZE, then repeatedly
 * scales elements by 0.9999f, either with NEON intrinsics or with a plain C
 * loop. The sum of the buffer is printed before and after so the optimizer
 * cannot discard the work.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(void)
{
    float *pfData = NULL;
    const size_t cbData = (size_t)DATA_SIZE * sizeof(float);
#ifdef ALIGN_SIZE
    printf("Aligning memory to %u\n", (unsigned int)ALIGN_SIZE);
    /* posix_memalign() takes a void **; go through a real void * instead of
     * casting &pfData, and treat any non-zero return as "no memory". */
    void *pvRaw = NULL;
    if (posix_memalign(&pvRaw, ALIGN_SIZE, cbData) != 0)
    {
        pvRaw = NULL;
    }
    pfData = (float *)pvRaw;
#else
    pfData = (float *)malloc(cbData);
#endif
    if (pfData == NULL)
    {
        fprintf(stderr, "Failed to allocate %lu bytes\n", (unsigned long)cbData);
        return EXIT_FAILURE;
    }

    uint32_t uIters = 100000;
#ifdef __ARM_NEON
    printf("NEON Enabled\n");
    enableRunFast();
#endif

    /* Fill with 1, 2, 3, ... and keep a running total for reference. */
    float fStart = 0.0f;
    for (uint32_t uD = 0; uD < DATA_SIZE; uD++)
    {
        pfData[uD] = (float)(uD + 1);
        fStart += (float)(uD + 1);
    }
    printf("Start = %f\n", fStart);

    /* The intrinsic path needs <arm_neon.h>, which is only included when
     * __ARM_NEON is defined; on any other target fall back to the C loop
     * instead of failing to compile. */
#if defined(USE_INTRINSICS) && defined(__ARM_NEON)
    printf("Using NEON intrinsics\n");
    // sanity test
    {
        float fTest[] = {1.0f, 2.0f, 3.0f, 4.0f};
        float32x4_t vTest = vld1q_f32(fTest);
        float32x4_t vScaled = vmulq_n_f32(vTest, 0.9f);
        vst1q_f32(fTest, vScaled);
        for (int i = 0; i < 4; i++)
        {
            printf("[%d] = %f\n", i, fTest[i]);
        }
    }
    // process with intrinsics
    for (uint32_t uIter = 0; uIter < uIters; uIter++)
    {
#ifdef RANDOM
        for (uint32_t uD = 0; uD < DATA_SIZE; uD++)
        {
            /* The load reads two adjacent floats, so the index is capped
             * below DATA_SIZE-2 to stay inside the buffer. Only lane 0 of
             * the product is written back. */
            uint32_t uIndex = MWC % (DATA_SIZE-2);
            float32x2_t v = vld1_f32(&pfData[uIndex]);
            float32x2_t acc = vmul_n_f32(v, 0.9999f);
            pfData[uIndex] = vget_lane_f32(acc, 0);
        }
#else
        /* The loop below consumes four floats per step; a size that is not
         * a multiple of four would run past the end of the buffer. */
#if (DATA_SIZE % 4) != 0
#error "DATA_SIZE must be a multiple of 4 for the NEON path"
#endif
        float *pfPos = pfData;
        for (uint32_t uD = 0; uD < DATA_SIZE; uD += 4)
        {
            float32x4_t v = vld1q_f32(pfPos);
            float32x4_t acc = vmulq_n_f32(v, 0.9999f);
            vst1q_f32(pfPos, acc);
            pfPos += 4;
        }
#endif
    }
#else
    printf("Using Simple C\n");
    // process simple c
    /* Both bounds are rounded down to a multiple of four, as in the
     * original benchmark; with the current constants this is a no-op. */
    for (uint32_t uIter = 0; uIter < ((uIters>>2)<<2); uIter++)
    {
        for (uint32_t uD = 0; uD < ((DATA_SIZE>>2)<<2); uD++)
        {
#ifdef RANDOM
            uint32_t uIndex = MWC % DATA_SIZE;
            pfData[uIndex] = pfData[uIndex]*0.9999f;
#else
            pfData[uD] = pfData[uD]*0.9999f;
#endif
        }
    }
#endif

    //result to stop optimizer removing code.
    float fResult = 0.0f;
    for (uint32_t uD = 0; uD < DATA_SIZE; uD++)
    {
        fResult += pfData[uD];
    }
    printf("Result = %f\n", fResult);

    /* Memory from posix_memalign() and malloc() is both released with free(). */
    free(pfData);
    return 0;
}