@@ -42875,7 +42875,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
42875
42875
}
42876
42876
}
42877
42877
42878
- MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42878
+ MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
42879
42879
{
42880
42880
ma_uint64 iSample;
42881
42881
@@ -43170,10 +43170,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
43170
43170
sampleCount = frameCount * channels;
43171
43171
43172
43172
if (volume == 1) {
43173
+ #pragma clang loop vectorize(enable)
43173
43174
for (iSample = 0; iSample < sampleCount; iSample += 1) {
43174
43175
pDst[iSample] += pSrc[iSample];
43175
43176
}
43176
43177
} else {
43178
+ #pragma clang loop vectorize(enable)
43177
43179
for (iSample = 0; iSample < sampleCount; iSample += 1) {
43178
43180
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
43179
43181
}
@@ -45476,7 +45478,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
45476
45478
const float a1 = pBQ->a1.f32;
45477
45479
const float a2 = pBQ->a2.f32;
45478
45480
45479
- MA_ASSUME(channels > 0);
45481
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45482
+ #pragma clang loop vectorize(assume_safety)
45480
45483
for (c = 0; c < channels; c += 1) {
45481
45484
float r1 = pBQ->pR1[c].f32;
45482
45485
float r2 = pBQ->pR2[c].f32;
@@ -45508,7 +45511,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
45508
45511
const ma_int32 a1 = pBQ->a1.s32;
45509
45512
const ma_int32 a2 = pBQ->a2.s32;
45510
45513
45511
- MA_ASSUME(channels > 0);
45514
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45515
+ #pragma clang loop vectorize(assume_safety)
45512
45516
for (c = 0; c < channels; c += 1) {
45513
45517
ma_int32 r1 = pBQ->pR1[c].s32;
45514
45518
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45782,22 +45786,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
45782
45786
return MA_SUCCESS;
45783
45787
}
45784
45788
45785
- static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
45789
+ static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 * pLPF, float * pY, const float * pX)
45786
45790
{
45787
45791
ma_uint32 c;
45788
45792
const ma_uint32 channels = pLPF->channels;
45789
45793
const float a = pLPF->a.f32;
45790
45794
const float b = 1 - a;
45791
45795
45792
- MA_ASSUME(channels > 0);
45796
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45797
+ #pragma clang loop vectorize(assume_safety)
45793
45798
for (c = 0; c < channels; c += 1) {
45794
45799
float r1 = pLPF->pR1[c].f32;
45795
- float x = pX[c];
45800
+ float x = pX[c];
45796
45801
float y;
45797
45802
45798
- y = b* x + a* r1;
45803
+ y = b * x + a * r1;
45799
45804
45800
- pY[c] = y;
45805
+ pY[c] = y;
45801
45806
pLPF->pR1[c].f32 = y;
45802
45807
}
45803
45808
}
@@ -45809,7 +45814,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
45809
45814
const ma_int32 a = pLPF->a.s32;
45810
45815
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
45811
45816
45812
- MA_ASSUME(channels > 0);
45817
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45818
+ #pragma clang loop vectorize(assume_safety)
45813
45819
for (c = 0; c < channels; c += 1) {
45814
45820
ma_int32 r1 = pLPF->pR1[c].s32;
45815
45821
ma_int32 x = pX[c];
@@ -46662,7 +46668,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
46662
46668
const float a = 1 - pHPF->a.f32;
46663
46669
const float b = 1 - a;
46664
46670
46665
- MA_ASSUME(channels > 0 );
46671
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
46666
46672
for (c = 0; c < channels; c += 1) {
46667
46673
float r1 = pHPF->pR1[c].f32;
46668
46674
float x = pX[c];
@@ -46682,7 +46688,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
46682
46688
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
46683
46689
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
46684
46690
46685
- MA_ASSUME(channels > 0 );
46691
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
46686
46692
for (c = 0; c < channels; c += 1) {
46687
46693
ma_int32 r1 = pHPF->pR1[c].s32;
46688
46694
ma_int32 x = pX[c];
@@ -48790,6 +48796,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48790
48796
ma_uint64 iFrame;
48791
48797
ma_uint32 iChannel;
48792
48798
ma_uint64 interpolatedFrameCount;
48799
+ const ma_uint32 channels = pGainer->config.channels;
48793
48800
48794
48801
MA_ASSERT(pGainer != NULL);
48795
48802
@@ -48829,12 +48836,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48829
48836
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
48830
48837
float d = 1.0f / pGainer->config.smoothTimeInFrames;
48831
48838
48832
- if (pGainer->config. channels <= 32) {
48839
+ if (channels <= 32) {
48833
48840
float pRunningGain[32];
48834
48841
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
48835
48842
48836
48843
/* Initialize the running gain. */
48837
- for (iChannel = 0; iChannel < pGainer->config. channels; iChannel += 1) {
48844
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48838
48845
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
48839
48846
pRunningGainDelta[iChannel] = t * d;
48840
48847
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48843,7 +48850,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48843
48850
iFrame = 0;
48844
48851
48845
48852
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
48846
- if (pGainer->config. channels == 2) {
48853
+ if (channels == 2) {
48847
48854
#if defined(MA_SUPPORT_SSE2)
48848
48855
if (ma_has_sse2()) {
48849
48856
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48891,6 +48898,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48891
48898
48892
48899
iFrame = unrolledLoopCount << 1;
48893
48900
#else
48901
+ #pragma clang loop vectorize(enable)
48894
48902
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48895
48903
for (iChannel = 0; iChannel < 2; iChannel += 1) {
48896
48904
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48902,7 +48910,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48902
48910
}
48903
48911
#endif
48904
48912
}
48905
- } else if (pGainer->config. channels == 6) {
48913
+ } else if (channels == 6) {
48906
48914
#if defined(MA_SUPPORT_SSE2)
48907
48915
if (ma_has_sse2()) {
48908
48916
/*
@@ -48946,7 +48954,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48946
48954
}
48947
48955
}
48948
48956
}
48949
- } else if (pGainer->config. channels == 8) {
48957
+ } else if (channels == 8) {
48950
48958
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
48951
48959
#if defined(MA_SUPPORT_SSE2)
48952
48960
if (ma_has_sse2()) {
@@ -48967,29 +48975,35 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48967
48975
{
48968
48976
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
48969
48977
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48978
+ #pragma clang loop vectorize(enable)
48970
48979
for (iChannel = 0; iChannel < 8; iChannel += 1) {
48971
48980
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
48972
48981
}
48973
48982
48974
48983
/* Move the running gain forward towards the new gain. */
48984
+ #pragma clang loop vectorize(enable)
48975
48985
for (iChannel = 0; iChannel < 8; iChannel += 1) {
48976
48986
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
48977
48987
}
48978
48988
}
48979
48989
}
48980
48990
}
48981
48991
48992
+ #pragma clang loop unroll(disable)
48982
48993
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48983
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48984
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
48994
+ #pragma clang loop vectorize(enable)
48995
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48996
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
48985
48997
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
48986
48998
}
48987
48999
}
48988
49000
} else {
48989
49001
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
49002
+ #pragma clang loop unroll(disable)
48990
49003
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
48991
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48992
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
49004
+ #pragma clang loop vectorize(enable)
49005
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
49006
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48993
49007
}
48994
49008
48995
49009
a += d;
@@ -49008,18 +49022,20 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
49008
49022
49009
49023
/* All we need to do here is apply the new gains using an optimized path. */
49010
49024
if (pFramesOut != NULL && pFramesIn != NULL) {
49011
- if (pGainer->config. channels <= 32) {
49025
+ if (channels <= 32) {
49012
49026
float gains[32];
49013
- for (iChannel = 0; iChannel < pGainer->config. channels; iChannel += 1) {
49027
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
49014
49028
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
49015
49029
}
49016
49030
49017
- ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config. channels, gains);
49031
+ ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
49018
49032
} else {
49019
49033
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
49034
+ #pragma clang loop unroll(disable)
49020
49035
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
49021
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49022
- ((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
49036
+ #pragma clang loop vectorize(enable)
49037
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
49038
+ ((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
49023
49039
}
49024
49040
}
49025
49041
}
@@ -51421,7 +51437,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
51421
51437
51422
51438
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
51423
51439
51424
- MA_ASSUME(channels > 0 );
51440
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
51425
51441
for (c = 0; c < channels; c += 1) {
51426
51442
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
51427
51443
pFrameOut[c] = s;
@@ -51440,7 +51456,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
51440
51456
51441
51457
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
51442
51458
51443
- MA_ASSUME(channels > 0 );
51459
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
51444
51460
for (c = 0; c < channels; c += 1) {
51445
51461
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
51446
51462
pFrameOut[c] = s;
@@ -51611,7 +51627,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16(ma_linear_resampler*
51611
51627
}
51612
51628
51613
51629
51614
- static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51630
+ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51615
51631
{
51616
51632
const float* pFramesInF32;
51617
51633
/* */ float* pFramesOutF32;
@@ -51687,7 +51703,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
51687
51703
return MA_SUCCESS;
51688
51704
}
51689
51705
51690
- static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51706
+ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51691
51707
{
51692
51708
const float* pFramesInF32;
51693
51709
/* */ float* pFramesOutF32;
@@ -53000,6 +53016,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
53000
53016
#endif
53001
53017
{
53002
53018
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
53019
+ #pragma clang loop vectorize(enable)
53003
53020
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
53004
53021
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
53005
53022
}
@@ -53027,6 +53044,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
53027
53044
#endif
53028
53045
{
53029
53046
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
53047
+ #pragma clang loop vectorize(enable)
53030
53048
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
53031
53049
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
53032
53050
}
@@ -53044,6 +53062,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
53044
53062
#endif
53045
53063
{
53046
53064
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
53065
+ #pragma clang loop vectorize(enable)
53047
53066
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
53048
53067
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
53049
53068
}
@@ -66789,7 +66808,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
66789
66808
ma_uint64 iFrame;
66790
66809
ma_uint32 iChannel;
66791
66810
const ma_uint32 channels = pNoise->config.channels;
66792
- MA_ASSUME(channels > 0 );
66811
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66793
66812
66794
66813
if (pNoise->config.format == ma_format_f32) {
66795
66814
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66908,7 +66927,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
66908
66927
ma_uint64 iFrame;
66909
66928
ma_uint32 iChannel;
66910
66929
const ma_uint32 channels = pNoise->config.channels;
66911
- MA_ASSUME(channels > 0 );
66930
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66912
66931
66913
66932
if (pNoise->config.format == ma_format_f32) {
66914
66933
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66990,7 +67009,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
66990
67009
ma_uint64 iFrame;
66991
67010
ma_uint32 iChannel;
66992
67011
const ma_uint32 channels = pNoise->config.channels;
66993
- MA_ASSUME(channels > 0 );
67012
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66994
67013
66995
67014
if (pNoise->config.format == ma_format_f32) {
66996
67015
float* pFramesOutF32 = (float*)pFramesOut;
0 commit comments