Skip to content

Commit eb7106d

Browse files
committed
clang: improve general performance with vectorization/unrolling
Clang has a tendency to *heavily* unroll loops all over the place (see llvm/llvm-project#42332). Disable loop unrolling where Clang applies it excessively, enable vectorization where Clang doesn't do so automatically, etc. Signed-off-by: Steven Noonan <[email protected]>
1 parent 3b5959f commit eb7106d

File tree

1 file changed

+51
-32
lines changed

1 file changed

+51
-32
lines changed

miniaudio.h

Lines changed: 51 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -42875,7 +42875,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
4287542875
}
4287642876
}
4287742877

42878-
MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42878+
MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
4287942879
{
4288042880
ma_uint64 iSample;
4288142881

@@ -43170,10 +43170,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
4317043170
sampleCount = frameCount * channels;
4317143171

4317243172
if (volume == 1) {
43173+
#pragma clang loop vectorize(enable)
4317343174
for (iSample = 0; iSample < sampleCount; iSample += 1) {
4317443175
pDst[iSample] += pSrc[iSample];
4317543176
}
4317643177
} else {
43178+
#pragma clang loop vectorize(enable)
4317743179
for (iSample = 0; iSample < sampleCount; iSample += 1) {
4317843180
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
4317943181
}
@@ -45476,7 +45478,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
4547645478
const float a1 = pBQ->a1.f32;
4547745479
const float a2 = pBQ->a2.f32;
4547845480

45479-
MA_ASSUME(channels > 0);
45481+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45482+
#pragma clang loop vectorize(assume_safety)
4548045483
for (c = 0; c < channels; c += 1) {
4548145484
float r1 = pBQ->pR1[c].f32;
4548245485
float r2 = pBQ->pR2[c].f32;
@@ -45508,7 +45511,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
4550845511
const ma_int32 a1 = pBQ->a1.s32;
4550945512
const ma_int32 a2 = pBQ->a2.s32;
4551045513

45511-
MA_ASSUME(channels > 0);
45514+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45515+
#pragma clang loop vectorize(assume_safety)
4551245516
for (c = 0; c < channels; c += 1) {
4551345517
ma_int32 r1 = pBQ->pR1[c].s32;
4551445518
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45782,22 +45786,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
4578245786
return MA_SUCCESS;
4578345787
}
4578445788

45785-
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
45789+
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 *pLPF, float *pY, const float *pX)
4578645790
{
4578745791
ma_uint32 c;
4578845792
const ma_uint32 channels = pLPF->channels;
4578945793
const float a = pLPF->a.f32;
4579045794
const float b = 1 - a;
4579145795

45792-
MA_ASSUME(channels > 0);
45796+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45797+
#pragma clang loop vectorize(assume_safety)
4579345798
for (c = 0; c < channels; c += 1) {
4579445799
float r1 = pLPF->pR1[c].f32;
45795-
float x = pX[c];
45800+
float x = pX[c];
4579645801
float y;
4579745802

45798-
y = b*x + a*r1;
45803+
y = b * x + a * r1;
4579945804

45800-
pY[c] = y;
45805+
pY[c] = y;
4580145806
pLPF->pR1[c].f32 = y;
4580245807
}
4580345808
}
@@ -45809,7 +45814,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
4580945814
const ma_int32 a = pLPF->a.s32;
4581045815
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4581145816

45812-
MA_ASSUME(channels > 0);
45817+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45818+
#pragma clang loop vectorize(assume_safety)
4581345819
for (c = 0; c < channels; c += 1) {
4581445820
ma_int32 r1 = pLPF->pR1[c].s32;
4581545821
ma_int32 x = pX[c];
@@ -46662,7 +46668,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
4666246668
const float a = 1 - pHPF->a.f32;
4666346669
const float b = 1 - a;
4666446670

46665-
MA_ASSUME(channels > 0);
46671+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4666646672
for (c = 0; c < channels; c += 1) {
4666746673
float r1 = pHPF->pR1[c].f32;
4666846674
float x = pX[c];
@@ -46682,7 +46688,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
4668246688
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
4668346689
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4668446690

46685-
MA_ASSUME(channels > 0);
46691+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4668646692
for (c = 0; c < channels; c += 1) {
4668746693
ma_int32 r1 = pHPF->pR1[c].s32;
4668846694
ma_int32 x = pX[c];
@@ -48790,6 +48796,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4879048796
ma_uint64 iFrame;
4879148797
ma_uint32 iChannel;
4879248798
ma_uint64 interpolatedFrameCount;
48799+
const ma_uint32 channels = pGainer->config.channels;
4879348800

4879448801
MA_ASSERT(pGainer != NULL);
4879548802

@@ -48829,12 +48836,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4882948836
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
4883048837
float d = 1.0f / pGainer->config.smoothTimeInFrames;
4883148838

48832-
if (pGainer->config.channels <= 32) {
48839+
if (channels <= 32) {
4883348840
float pRunningGain[32];
4883448841
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
4883548842

4883648843
/* Initialize the running gain. */
48837-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48844+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4883848845
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
4883948846
pRunningGainDelta[iChannel] = t * d;
4884048847
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48843,7 +48850,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4884348850
iFrame = 0;
4884448851

4884548852
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
48846-
if (pGainer->config.channels == 2) {
48853+
if (channels == 2) {
4884748854
#if defined(MA_SUPPORT_SSE2)
4884848855
if (ma_has_sse2()) {
4884948856
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48891,6 +48898,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4889148898

4889248899
iFrame = unrolledLoopCount << 1;
4889348900
#else
48901+
#pragma clang loop vectorize(enable)
4889448902
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4889548903
for (iChannel = 0; iChannel < 2; iChannel += 1) {
4889648904
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48902,7 +48910,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4890248910
}
4890348911
#endif
4890448912
}
48905-
} else if (pGainer->config.channels == 6) {
48913+
} else if (channels == 6) {
4890648914
#if defined(MA_SUPPORT_SSE2)
4890748915
if (ma_has_sse2()) {
4890848916
/*
@@ -48946,7 +48954,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4894648954
}
4894748955
}
4894848956
}
48949-
} else if (pGainer->config.channels == 8) {
48957+
} else if (channels == 8) {
4895048958
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
4895148959
#if defined(MA_SUPPORT_SSE2)
4895248960
if (ma_has_sse2()) {
@@ -48967,29 +48975,35 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4896748975
{
4896848976
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
4896948977
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48978+
#pragma clang loop vectorize(enable)
4897048979
for (iChannel = 0; iChannel < 8; iChannel += 1) {
4897148980
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
4897248981
}
4897348982

4897448983
/* Move the running gain forward towards the new gain. */
48984+
#pragma clang loop vectorize(enable)
4897548985
for (iChannel = 0; iChannel < 8; iChannel += 1) {
4897648986
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
4897748987
}
4897848988
}
4897948989
}
4898048990
}
4898148991

48992+
#pragma clang loop unroll(disable)
4898248993
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48983-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48984-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
48994+
#pragma clang loop vectorize(enable)
48995+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48996+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
4898548997
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
4898648998
}
4898748999
}
4898849000
} else {
4898949001
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
49002+
#pragma clang loop unroll(disable)
4899049003
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
48991-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48992-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
49004+
#pragma clang loop vectorize(enable)
49005+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
49006+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
4899349007
}
4899449008

4899549009
a += d;
@@ -49008,18 +49022,20 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4900849022

4900949023
/* All we need to do here is apply the new gains using an optimized path. */
4901049024
if (pFramesOut != NULL && pFramesIn != NULL) {
49011-
if (pGainer->config.channels <= 32) {
49025+
if (channels <= 32) {
4901249026
float gains[32];
49013-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49027+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4901449028
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4901549029
}
4901649030

49017-
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config.channels, gains);
49031+
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
4901849032
} else {
4901949033
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
49034+
#pragma clang loop unroll(disable)
4902049035
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
49021-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49022-
((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
49036+
#pragma clang loop vectorize(enable)
49037+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
49038+
((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4902349039
}
4902449040
}
4902549041
}
@@ -51421,7 +51437,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
5142151437

5142251438
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
5142351439

51424-
MA_ASSUME(channels > 0);
51440+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5142551441
for (c = 0; c < channels; c += 1) {
5142651442
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
5142751443
pFrameOut[c] = s;
@@ -51440,7 +51456,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
5144051456

5144151457
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
5144251458

51443-
MA_ASSUME(channels > 0);
51459+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5144451460
for (c = 0; c < channels; c += 1) {
5144551461
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
5144651462
pFrameOut[c] = s;
@@ -51611,7 +51627,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16(ma_linear_resampler*
5161151627
}
5161251628

5161351629

51614-
static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51630+
static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
5161551631
{
5161651632
const float* pFramesInF32;
5161751633
/* */ float* pFramesOutF32;
@@ -51687,7 +51703,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
5168751703
return MA_SUCCESS;
5168851704
}
5168951705

51690-
static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51706+
static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
5169151707
{
5169251708
const float* pFramesInF32;
5169351709
/* */ float* pFramesOutF32;
@@ -53000,6 +53016,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5300053016
#endif
5300153017
{
5300253018
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
53019+
#pragma clang loop vectorize(enable)
5300353020
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
5300453021
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
5300553022
}
@@ -53027,6 +53044,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5302753044
#endif
5302853045
{
5302953046
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
53047+
#pragma clang loop vectorize(enable)
5303053048
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
5303153049
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
5303253050
}
@@ -53044,6 +53062,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5304453062
#endif
5304553063
{
5304653064
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
53065+
#pragma clang loop vectorize(enable)
5304753066
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
5304853067
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
5304953068
}
@@ -66789,7 +66808,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
6678966808
ma_uint64 iFrame;
6679066809
ma_uint32 iChannel;
6679166810
const ma_uint32 channels = pNoise->config.channels;
66792-
MA_ASSUME(channels > 0);
66811+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6679366812

6679466813
if (pNoise->config.format == ma_format_f32) {
6679566814
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66908,7 +66927,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
6690866927
ma_uint64 iFrame;
6690966928
ma_uint32 iChannel;
6691066929
const ma_uint32 channels = pNoise->config.channels;
66911-
MA_ASSUME(channels > 0);
66930+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6691266931

6691366932
if (pNoise->config.format == ma_format_f32) {
6691466933
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66990,7 +67009,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
6699067009
ma_uint64 iFrame;
6699167010
ma_uint32 iChannel;
6699267011
const ma_uint32 channels = pNoise->config.channels;
66993-
MA_ASSUME(channels > 0);
67012+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6699467013

6699567014
if (pNoise->config.format == ma_format_f32) {
6699667015
float* pFramesOutF32 = (float*)pFramesOut;

0 commit comments

Comments
 (0)