@@ -42903,7 +42903,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
42903
42903
}
42904
42904
}
42905
42905
42906
- MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42906
+ MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
42907
42907
{
42908
42908
ma_uint64 iSample;
42909
42909
@@ -43198,10 +43198,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
43198
43198
sampleCount = frameCount * channels;
43199
43199
43200
43200
if (volume == 1) {
43201
+ #pragma clang loop vectorize(enable)
43201
43202
for (iSample = 0; iSample < sampleCount; iSample += 1) {
43202
43203
pDst[iSample] += pSrc[iSample];
43203
43204
}
43204
43205
} else {
43206
+ #pragma clang loop vectorize(enable)
43205
43207
for (iSample = 0; iSample < sampleCount; iSample += 1) {
43206
43208
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
43207
43209
}
@@ -45502,7 +45504,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
45502
45504
const float a1 = pBQ->a1.f32;
45503
45505
const float a2 = pBQ->a2.f32;
45504
45506
45505
- MA_ASSUME(channels > 0);
45507
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45508
+ #pragma clang loop unroll(disable)
45506
45509
for (c = 0; c < channels; c += 1) {
45507
45510
float r1 = pBQ->pR1[c].f32;
45508
45511
float r2 = pBQ->pR2[c].f32;
@@ -45534,7 +45537,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
45534
45537
const ma_int32 a1 = pBQ->a1.s32;
45535
45538
const ma_int32 a2 = pBQ->a2.s32;
45536
45539
45537
- MA_ASSUME(channels > 0);
45540
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45541
+ #pragma clang loop unroll(disable)
45538
45542
for (c = 0; c < channels; c += 1) {
45539
45543
ma_int32 r1 = pBQ->pR1[c].s32;
45540
45544
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45808,22 +45812,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
45808
45812
return MA_SUCCESS;
45809
45813
}
45810
45814
45811
- static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
45815
+ static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 * pLPF, float * pY, const float * pX)
45812
45816
{
45813
45817
ma_uint32 c;
45814
45818
const ma_uint32 channels = pLPF->channels;
45815
45819
const float a = pLPF->a.f32;
45816
45820
const float b = 1 - a;
45817
45821
45818
- MA_ASSUME(channels > 0);
45822
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45823
+ #pragma clang loop unroll(disable)
45819
45824
for (c = 0; c < channels; c += 1) {
45820
45825
float r1 = pLPF->pR1[c].f32;
45821
- float x = pX[c];
45826
+ float x = pX[c];
45822
45827
float y;
45823
45828
45824
- y = b* x + a* r1;
45829
+ y = b * x + a * r1;
45825
45830
45826
- pY[c] = y;
45831
+ pY[c] = y;
45827
45832
pLPF->pR1[c].f32 = y;
45828
45833
}
45829
45834
}
@@ -45835,7 +45840,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
45835
45840
const ma_int32 a = pLPF->a.s32;
45836
45841
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
45837
45842
45838
- MA_ASSUME(channels > 0);
45843
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45844
+ #pragma clang loop unroll(disable)
45839
45845
for (c = 0; c < channels; c += 1) {
45840
45846
ma_int32 r1 = pLPF->pR1[c].s32;
45841
45847
ma_int32 x = pX[c];
@@ -46688,7 +46694,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
46688
46694
const float a = 1 - pHPF->a.f32;
46689
46695
const float b = 1 - a;
46690
46696
46691
- MA_ASSUME(channels > 0 );
46697
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
46692
46698
for (c = 0; c < channels; c += 1) {
46693
46699
float r1 = pHPF->pR1[c].f32;
46694
46700
float x = pX[c];
@@ -46708,7 +46714,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
46708
46714
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
46709
46715
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
46710
46716
46711
- MA_ASSUME(channels > 0 );
46717
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
46712
46718
for (c = 0; c < channels; c += 1) {
46713
46719
ma_int32 r1 = pHPF->pR1[c].s32;
46714
46720
ma_int32 x = pX[c];
@@ -48816,6 +48822,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48816
48822
ma_uint64 iFrame;
48817
48823
ma_uint32 iChannel;
48818
48824
ma_uint64 interpolatedFrameCount;
48825
+ const ma_uint32 channels = pGainer->config.channels;
48819
48826
48820
48827
MA_ASSERT(pGainer != NULL);
48821
48828
@@ -48855,12 +48862,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48855
48862
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
48856
48863
float d = 1.0f / pGainer->config.smoothTimeInFrames;
48857
48864
48858
- if (pGainer->config. channels <= 32) {
48865
+ if (channels <= 32) {
48859
48866
float pRunningGain[32];
48860
48867
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
48861
48868
48862
48869
/* Initialize the running gain. */
48863
- for (iChannel = 0; iChannel < pGainer->config. channels; iChannel += 1) {
48870
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48864
48871
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
48865
48872
pRunningGainDelta[iChannel] = t * d;
48866
48873
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48869,7 +48876,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48869
48876
iFrame = 0;
48870
48877
48871
48878
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
48872
- if (pGainer->config. channels == 2) {
48879
+ if (channels == 2) {
48873
48880
#if defined(MA_SUPPORT_SSE2)
48874
48881
if (ma_has_sse2()) {
48875
48882
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48917,6 +48924,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48917
48924
48918
48925
iFrame = unrolledLoopCount << 1;
48919
48926
#else
48927
+ #pragma clang loop vectorize(enable)
48920
48928
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48921
48929
for (iChannel = 0; iChannel < 2; iChannel += 1) {
48922
48930
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48928,7 +48936,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48928
48936
}
48929
48937
#endif
48930
48938
}
48931
- } else if (pGainer->config. channels == 6) {
48939
+ } else if (channels == 6) {
48932
48940
#if defined(MA_SUPPORT_SSE2)
48933
48941
if (ma_has_sse2()) {
48934
48942
/*
@@ -48961,6 +48969,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48961
48969
} else
48962
48970
#endif
48963
48971
{
48972
+ #pragma clang loop vectorize(enable)
48964
48973
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48965
48974
for (iChannel = 0; iChannel < 6; iChannel += 1) {
48966
48975
pFramesOutF32[iFrame*6 + iChannel] = pFramesInF32[iFrame*6 + iChannel] * pRunningGain[iChannel];
@@ -48972,7 +48981,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48972
48981
}
48973
48982
}
48974
48983
}
48975
- } else if (pGainer->config. channels == 8) {
48984
+ } else if (channels == 8) {
48976
48985
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
48977
48986
#if defined(MA_SUPPORT_SSE2)
48978
48987
if (ma_has_sse2()) {
@@ -48992,6 +49001,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48992
49001
#endif
48993
49002
{
48994
49003
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
49004
+ #pragma clang loop vectorize(enable)
48995
49005
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48996
49006
for (iChannel = 0; iChannel < 8; iChannel += 1) {
48997
49007
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
@@ -49005,17 +49015,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
49005
49015
}
49006
49016
}
49007
49017
49018
+ #pragma clang loop unroll(disable)
49008
49019
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
49009
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49010
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
49020
+ #pragma clang loop vectorize(enable)
49021
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
49022
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
49011
49023
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
49012
49024
}
49013
49025
}
49014
49026
} else {
49015
49027
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
49028
+ #pragma clang loop unroll(disable)
49016
49029
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
49017
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49018
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
49030
+ #pragma clang loop vectorize(enable)
49031
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
49032
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
49019
49033
}
49020
49034
49021
49035
a += d;
@@ -49034,18 +49048,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
49034
49048
49035
49049
/* All we need to do here is apply the new gains using an optimized path. */
49036
49050
if (pFramesOut != NULL && pFramesIn != NULL) {
49037
- if (pGainer->config. channels <= 32) {
49051
+ if (channels <= 32) {
49038
49052
float gains[32];
49039
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49053
+ #pragma clang loop unroll(disable)
49054
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
49040
49055
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
49041
49056
}
49042
49057
49043
- ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config. channels, gains);
49058
+ ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
49044
49059
} else {
49045
49060
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
49061
+ #pragma clang loop unroll(disable)
49046
49062
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
49047
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49048
- ((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
49063
+ #pragma clang loop vectorize(enable)
49064
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
49065
+ ((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
49049
49066
}
49050
49067
}
49051
49068
}
@@ -51415,7 +51432,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
51415
51432
51416
51433
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
51417
51434
51418
- MA_ASSUME(channels > 0 );
51435
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
51419
51436
for (c = 0; c < channels; c += 1) {
51420
51437
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
51421
51438
pFrameOut[c] = s;
@@ -51434,7 +51451,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
51434
51451
51435
51452
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
51436
51453
51437
- MA_ASSUME(channels > 0 );
51454
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
51438
51455
for (c = 0; c < channels; c += 1) {
51439
51456
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
51440
51457
pFrameOut[c] = s;
@@ -52669,6 +52686,7 @@ static void ma_channel_map_apply_shuffle_table_u8(ma_uint8* pFramesOut, ma_uint3
52669
52686
ma_uint64 iFrame;
52670
52687
ma_uint32 iChannelOut;
52671
52688
52689
+ #pragma clang loop unroll(disable)
52672
52690
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52673
52691
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
52674
52692
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52689,6 +52707,7 @@ static void ma_channel_map_apply_shuffle_table_s16(ma_int16* pFramesOut, ma_uint
52689
52707
ma_uint64 iFrame;
52690
52708
ma_uint32 iChannelOut;
52691
52709
52710
+ #pragma clang loop unroll(disable)
52692
52711
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52693
52712
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
52694
52713
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52731,6 +52750,7 @@ static void ma_channel_map_apply_shuffle_table_s32(ma_int32* pFramesOut, ma_uint
52731
52750
ma_uint64 iFrame;
52732
52751
ma_uint32 iChannelOut;
52733
52752
52753
+ #pragma clang loop unroll(disable)
52734
52754
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52735
52755
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
52736
52756
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52751,6 +52771,7 @@ static void ma_channel_map_apply_shuffle_table_f32(float* pFramesOut, ma_uint32
52751
52771
ma_uint64 iFrame;
52752
52772
ma_uint32 iChannelOut;
52753
52773
52774
+ #pragma clang loop unroll(disable)
52754
52775
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52755
52776
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
52756
52777
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52985,6 +53006,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52985
53006
} else
52986
53007
#endif
52987
53008
{
53009
+ #pragma clang loop vectorize(enable)
52988
53010
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52989
53011
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
52990
53012
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
@@ -53012,6 +53034,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
53012
53034
} else
53013
53035
#endif
53014
53036
{
53037
+ #pragma clang loop vectorize(enable)
53015
53038
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
53016
53039
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
53017
53040
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
@@ -53029,6 +53052,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
53029
53052
} else
53030
53053
#endif
53031
53054
{
53055
+ #pragma clang loop vectorize(enable)
53032
53056
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
53033
53057
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
53034
53058
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
@@ -66352,7 +66376,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
66352
66376
ma_uint64 iFrame;
66353
66377
ma_uint32 iChannel;
66354
66378
const ma_uint32 channels = pNoise->config.channels;
66355
- MA_ASSUME(channels > 0 );
66379
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66356
66380
66357
66381
if (pNoise->config.format == ma_format_f32) {
66358
66382
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66471,7 +66495,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
66471
66495
ma_uint64 iFrame;
66472
66496
ma_uint32 iChannel;
66473
66497
const ma_uint32 channels = pNoise->config.channels;
66474
- MA_ASSUME(channels > 0 );
66498
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66475
66499
66476
66500
if (pNoise->config.format == ma_format_f32) {
66477
66501
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66553,7 +66577,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
66553
66577
ma_uint64 iFrame;
66554
66578
ma_uint32 iChannel;
66555
66579
const ma_uint32 channels = pNoise->config.channels;
66556
- MA_ASSUME(channels > 0 );
66580
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66557
66581
66558
66582
if (pNoise->config.format == ma_format_f32) {
66559
66583
float* pFramesOutF32 = (float*)pFramesOut;
0 commit comments