Skip to content

Commit 0949ef3

Browse files
committed
clang: improve general performance with vectorization/unrolling
Clang has a tendency to *heavily* unroll loops all over the place:

    llvm/llvm-project#42332

Disable loop unrolling wherever it goes too nuts, enable vectorization where it doesn't do so automatically, etc.

Signed-off-by: Steven Noonan <[email protected]>
1 parent 65ba029 commit 0949ef3

File tree

1 file changed

+54
-30
lines changed

1 file changed

+54
-30
lines changed

miniaudio.h

Lines changed: 54 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -42903,7 +42903,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
4290342903
}
4290442904
}
4290542905

42906-
MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42906+
MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
4290742907
{
4290842908
ma_uint64 iSample;
4290942909

@@ -43198,10 +43198,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
4319843198
sampleCount = frameCount * channels;
4319943199

4320043200
if (volume == 1) {
43201+
#pragma clang loop vectorize(enable)
4320143202
for (iSample = 0; iSample < sampleCount; iSample += 1) {
4320243203
pDst[iSample] += pSrc[iSample];
4320343204
}
4320443205
} else {
43206+
#pragma clang loop vectorize(enable)
4320543207
for (iSample = 0; iSample < sampleCount; iSample += 1) {
4320643208
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
4320743209
}
@@ -45502,7 +45504,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
4550245504
const float a1 = pBQ->a1.f32;
4550345505
const float a2 = pBQ->a2.f32;
4550445506

45505-
MA_ASSUME(channels > 0);
45507+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45508+
#pragma clang loop unroll(disable)
4550645509
for (c = 0; c < channels; c += 1) {
4550745510
float r1 = pBQ->pR1[c].f32;
4550845511
float r2 = pBQ->pR2[c].f32;
@@ -45534,7 +45537,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
4553445537
const ma_int32 a1 = pBQ->a1.s32;
4553545538
const ma_int32 a2 = pBQ->a2.s32;
4553645539

45537-
MA_ASSUME(channels > 0);
45540+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45541+
#pragma clang loop unroll(disable)
4553845542
for (c = 0; c < channels; c += 1) {
4553945543
ma_int32 r1 = pBQ->pR1[c].s32;
4554045544
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45808,22 +45812,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
4580845812
return MA_SUCCESS;
4580945813
}
4581045814

45811-
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
45815+
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 *pLPF, float *pY, const float *pX)
4581245816
{
4581345817
ma_uint32 c;
4581445818
const ma_uint32 channels = pLPF->channels;
4581545819
const float a = pLPF->a.f32;
4581645820
const float b = 1 - a;
4581745821

45818-
MA_ASSUME(channels > 0);
45822+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45823+
#pragma clang loop unroll(disable)
4581945824
for (c = 0; c < channels; c += 1) {
4582045825
float r1 = pLPF->pR1[c].f32;
45821-
float x = pX[c];
45826+
float x = pX[c];
4582245827
float y;
4582345828

45824-
y = b*x + a*r1;
45829+
y = b * x + a * r1;
4582545830

45826-
pY[c] = y;
45831+
pY[c] = y;
4582745832
pLPF->pR1[c].f32 = y;
4582845833
}
4582945834
}
@@ -45835,7 +45840,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
4583545840
const ma_int32 a = pLPF->a.s32;
4583645841
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4583745842

45838-
MA_ASSUME(channels > 0);
45843+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45844+
#pragma clang loop unroll(disable)
4583945845
for (c = 0; c < channels; c += 1) {
4584045846
ma_int32 r1 = pLPF->pR1[c].s32;
4584145847
ma_int32 x = pX[c];
@@ -46688,7 +46694,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
4668846694
const float a = 1 - pHPF->a.f32;
4668946695
const float b = 1 - a;
4669046696

46691-
MA_ASSUME(channels > 0);
46697+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4669246698
for (c = 0; c < channels; c += 1) {
4669346699
float r1 = pHPF->pR1[c].f32;
4669446700
float x = pX[c];
@@ -46708,7 +46714,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
4670846714
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
4670946715
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4671046716

46711-
MA_ASSUME(channels > 0);
46717+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4671246718
for (c = 0; c < channels; c += 1) {
4671346719
ma_int32 r1 = pHPF->pR1[c].s32;
4671446720
ma_int32 x = pX[c];
@@ -48816,6 +48822,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4881648822
ma_uint64 iFrame;
4881748823
ma_uint32 iChannel;
4881848824
ma_uint64 interpolatedFrameCount;
48825+
const ma_uint32 channels = pGainer->config.channels;
4881948826

4882048827
MA_ASSERT(pGainer != NULL);
4882148828

@@ -48855,12 +48862,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4885548862
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
4885648863
float d = 1.0f / pGainer->config.smoothTimeInFrames;
4885748864

48858-
if (pGainer->config.channels <= 32) {
48865+
if (channels <= 32) {
4885948866
float pRunningGain[32];
4886048867
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
4886148868

4886248869
/* Initialize the running gain. */
48863-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48870+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4886448871
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
4886548872
pRunningGainDelta[iChannel] = t * d;
4886648873
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48869,7 +48876,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4886948876
iFrame = 0;
4887048877

4887148878
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
48872-
if (pGainer->config.channels == 2) {
48879+
if (channels == 2) {
4887348880
#if defined(MA_SUPPORT_SSE2)
4887448881
if (ma_has_sse2()) {
4887548882
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48917,6 +48924,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4891748924

4891848925
iFrame = unrolledLoopCount << 1;
4891948926
#else
48927+
#pragma clang loop vectorize(enable)
4892048928
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4892148929
for (iChannel = 0; iChannel < 2; iChannel += 1) {
4892248930
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48928,7 +48936,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4892848936
}
4892948937
#endif
4893048938
}
48931-
} else if (pGainer->config.channels == 6) {
48939+
} else if (channels == 6) {
4893248940
#if defined(MA_SUPPORT_SSE2)
4893348941
if (ma_has_sse2()) {
4893448942
/*
@@ -48961,6 +48969,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4896148969
} else
4896248970
#endif
4896348971
{
48972+
#pragma clang loop vectorize(enable)
4896448973
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4896548974
for (iChannel = 0; iChannel < 6; iChannel += 1) {
4896648975
pFramesOutF32[iFrame*6 + iChannel] = pFramesInF32[iFrame*6 + iChannel] * pRunningGain[iChannel];
@@ -48972,7 +48981,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4897248981
}
4897348982
}
4897448983
}
48975-
} else if (pGainer->config.channels == 8) {
48984+
} else if (channels == 8) {
4897648985
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
4897748986
#if defined(MA_SUPPORT_SSE2)
4897848987
if (ma_has_sse2()) {
@@ -48992,6 +49001,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4899249001
#endif
4899349002
{
4899449003
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
49004+
#pragma clang loop vectorize(enable)
4899549005
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4899649006
for (iChannel = 0; iChannel < 8; iChannel += 1) {
4899749007
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
@@ -49005,17 +49015,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4900549015
}
4900649016
}
4900749017

49018+
#pragma clang loop unroll(disable)
4900849019
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
49009-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49010-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
49020+
#pragma clang loop vectorize(enable)
49021+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
49022+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
4901149023
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
4901249024
}
4901349025
}
4901449026
} else {
4901549027
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
49028+
#pragma clang loop unroll(disable)
4901649029
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
49017-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49018-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
49030+
#pragma clang loop vectorize(enable)
49031+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
49032+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
4901949033
}
4902049034

4902149035
a += d;
@@ -49034,18 +49048,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4903449048

4903549049
/* All we need to do here is apply the new gains using an optimized path. */
4903649050
if (pFramesOut != NULL && pFramesIn != NULL) {
49037-
if (pGainer->config.channels <= 32) {
49051+
if (channels <= 32) {
4903849052
float gains[32];
49039-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49053+
#pragma clang loop unroll(disable)
49054+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4904049055
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4904149056
}
4904249057

49043-
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config.channels, gains);
49058+
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
4904449059
} else {
4904549060
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
49061+
#pragma clang loop unroll(disable)
4904649062
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
49047-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49048-
((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
49063+
#pragma clang loop vectorize(enable)
49064+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
49065+
((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4904949066
}
4905049067
}
4905149068
}
@@ -51415,7 +51432,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
5141551432

5141651433
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
5141751434

51418-
MA_ASSUME(channels > 0);
51435+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5141951436
for (c = 0; c < channels; c += 1) {
5142051437
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
5142151438
pFrameOut[c] = s;
@@ -51434,7 +51451,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
5143451451

5143551452
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
5143651453

51437-
MA_ASSUME(channels > 0);
51454+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5143851455
for (c = 0; c < channels; c += 1) {
5143951456
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
5144051457
pFrameOut[c] = s;
@@ -52669,6 +52686,7 @@ static void ma_channel_map_apply_shuffle_table_u8(ma_uint8* pFramesOut, ma_uint3
5266952686
ma_uint64 iFrame;
5267052687
ma_uint32 iChannelOut;
5267152688

52689+
#pragma clang loop unroll(disable)
5267252690
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5267352691
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5267452692
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52689,6 +52707,7 @@ static void ma_channel_map_apply_shuffle_table_s16(ma_int16* pFramesOut, ma_uint
5268952707
ma_uint64 iFrame;
5269052708
ma_uint32 iChannelOut;
5269152709

52710+
#pragma clang loop unroll(disable)
5269252711
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5269352712
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5269452713
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52731,6 +52750,7 @@ static void ma_channel_map_apply_shuffle_table_s32(ma_int32* pFramesOut, ma_uint
5273152750
ma_uint64 iFrame;
5273252751
ma_uint32 iChannelOut;
5273352752

52753+
#pragma clang loop unroll(disable)
5273452754
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5273552755
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5273652756
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52751,6 +52771,7 @@ static void ma_channel_map_apply_shuffle_table_f32(float* pFramesOut, ma_uint32
5275152771
ma_uint64 iFrame;
5275252772
ma_uint32 iChannelOut;
5275352773

52774+
#pragma clang loop unroll(disable)
5275452775
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5275552776
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5275652777
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52985,6 +53006,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5298553006
} else
5298653007
#endif
5298753008
{
53009+
#pragma clang loop vectorize(enable)
5298853010
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5298953011
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
5299053012
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
@@ -53012,6 +53034,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5301253034
} else
5301353035
#endif
5301453036
{
53037+
#pragma clang loop vectorize(enable)
5301553038
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5301653039
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
5301753040
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
@@ -53029,6 +53052,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5302953052
} else
5303053053
#endif
5303153054
{
53055+
#pragma clang loop vectorize(enable)
5303253056
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5303353057
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
5303453058
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
@@ -66352,7 +66376,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
6635266376
ma_uint64 iFrame;
6635366377
ma_uint32 iChannel;
6635466378
const ma_uint32 channels = pNoise->config.channels;
66355-
MA_ASSUME(channels > 0);
66379+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6635666380

6635766381
if (pNoise->config.format == ma_format_f32) {
6635866382
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66471,7 +66495,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
6647166495
ma_uint64 iFrame;
6647266496
ma_uint32 iChannel;
6647366497
const ma_uint32 channels = pNoise->config.channels;
66474-
MA_ASSUME(channels > 0);
66498+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6647566499

6647666500
if (pNoise->config.format == ma_format_f32) {
6647766501
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66553,7 +66577,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
6655366577
ma_uint64 iFrame;
6655466578
ma_uint32 iChannel;
6655566579
const ma_uint32 channels = pNoise->config.channels;
66556-
MA_ASSUME(channels > 0);
66580+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6655766581

6655866582
if (pNoise->config.format == ma_format_f32) {
6655966583
float* pFramesOutF32 = (float*)pFramesOut;

0 commit comments

Comments
 (0)