From e204719e44b1c6a7075ac510b064331e7114adfb Mon Sep 17 00:00:00 2001 From: maroma samsa Date: Fri, 20 Jan 2023 14:40:23 +0000 Subject: [PATCH] Using fixed-point arithmetic Optimize the performance by replacing floating arithmetic with fixed-point arithmetic counterpart, which allows the embedded microprocessors that didn't have floating-point unit (FPU) doing mechanical calculation in high speed and/or low power consumption way. This change will reduce program execution time by 14%. Reference: https://hackmd.io/@maromaSamsa/HkjefPbFs --- tests/line.c | 284 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 247 insertions(+), 37 deletions(-) diff --git a/tests/line.c b/tests/line.c index 60214497..994c7a30 100644 --- a/tests/line.c +++ b/tests/line.c @@ -150,70 +150,280 @@ void svpng(FILE *fp, unsigned w, unsigned h, const uint8_t *img, bool alpha) SVPNG_END(); /* IEND chunk {} */ } -#include // ceilf(), floorf(), fminf(), fmaxf(), sinf(), cosf(), sqrtf() #include // memset() +#define Q (20) +#define Q_PI (1686629713 >> (29 - Q)) + +/* 32-bit Q notation to specify a fixed point number format. */ +typedef int32_t qfixed_t; + +/* 64-bit Q format buffer, for handling overflow cases */ +typedef int64_t qbuf_t; + +/* format convertion: float to Q format */ +#define f2Q(x) ((qfixed_t) ((x) * (1 << Q))) + +/* format convertion: Q format to float */ +#define Q2f(x) (((float) (x)) / (1 << Q)) + +/* format convertion: Q format to int */ +#define Q2I(x) ((int) ((x) >> Q)) + +/* format convertion: int to Q format */ +#define I2Q(x) ((qfixed_t) ((x) << Q)) + +/* maximum value of Q format */ +#define QFMT_MAX 0x7FFFFFFF + +/* minimum value of Q format */ +#define QFMT_MIN 0x80000000 + +/* addition of Q format value */ +static inline qfixed_t q_add(qfixed_t a, qfixed_t b) +{ + qbuf_t tmp = (qbuf_t) a + (qbuf_t) b; + if (tmp > (qbuf_t) QFMT_MAX) + return (qfixed_t) QFMT_MAX; + if (~tmp + 1 >= (qbuf_t) QFMT_MIN) + return (qfixed_t) QFMT_MIN; + return (qfixed_t) tmp; +}; + +/* multiplication of Q format value */ +static inline qfixed_t q_mul(qfixed_t a, qfixed_t b) +{ + qbuf_t tmp = (qbuf_t) a * (qbuf_t) b; + + /* rounding and resize */ + tmp += (qbuf_t) (1 << (Q - 1)); + tmp >>= Q; + + /* check overflow */ + if (tmp > (qbuf_t) QFMT_MAX) + return (qfixed_t) QFMT_MAX; + if (tmp * -1 >= (qbuf_t) QFMT_MIN) + return (qfixed_t) QFMT_MIN; + return (qfixed_t) tmp; +} + +/* division of Q format value */ +static inline qfixed_t q_div(qfixed_t a, qfixed_t b) +{ + qbuf_t tmp = (qbuf_t) a << Q; + if ((tmp >= 0 && b >= 0) || (tmp < 0 && b < 0)) { + tmp += (b >> 1); + } else { + tmp -= (b >> 1); + } + return (qfixed_t) (tmp / b); +} + +/* return the largest integral value that is not greater than x */ +static inline qfixed_t q_floor(qfixed_t x) +{ + qfixed_t mask = (0xFFFFFFFF >> Q) << Q; + return x & mask; +} + +/* return the smallest integral value that is not less than x */ +static inline qfixed_t q_ceil(qfixed_t x) +{ + qfixed_t mask = (0xFFFFFFFF >> Q) << Q; + qfixed_t delta = x & ~mask; + x = x & mask; + return delta ? q_add(x, 1 << Q) : x; +} + +/* return the nonnegative square root of x */ +static inline qfixed_t q_sqrt(qfixed_t x) +{ + if (x <= 0) + return 0; + if (x < I2Q(1) + (1 << (Q / 2 - 1)) && x > I2Q(1) - (1 << (Q / 2 - 1))) + return I2Q(1); + + qfixed_t res = 0; + qfixed_t bit = 1 << 15; + + /* left shift x as more as possible */ + int offset = 0; + for (qfixed_t i = x; !(0x40000000 & i); i <<= 1) + ++offset; + offset = (offset & ~1); + x <<= offset; + + /* shift bit to the highest bit 1' in x */ + while (bit > x) + bit >>= 1; + + for (bit; bit > 0; bit >>= 1) { + int tmp = bit + res; + + /* check overflow: 46341^2 > 2^31 - 1, which is the maximun value */ + if (tmp > 46340) + continue; + + int square = tmp * tmp; + if (square <= x) { + res = tmp; + if (square == x) + break; + } + } /* iter: goto next lower bit to get more precise sqrt value */ + + offset >>= 1; + offset -= (Q >> 1); + return (offset >= 0) ? res >> offset : res << (-offset); +} + +/* get both sin and cos value in the same radius */ +static inline void q_sincos(qfixed_t radius, qfixed_t *sin_t, qfixed_t *cos_t) +{ + int region = (radius / (Q_PI >> 1)) & 0b11; + + /* theta must be pi/2 to 0 and start from x-axis */ + qfixed_t theta = radius % (Q_PI >> 1); + if (region & 0b1) + theta = (Q_PI >> 1) - theta; + + /* start from cos(pi/2) and sin(pi/2) */ + radius = Q_PI >> 1; + qfixed_t cos_rad = 0; + qfixed_t sin_rad = I2Q(1); + + /* start from cos(0) and sin(0) */ + *cos_t = I2Q(1); + *sin_t = 0; + + while (radius > 0) { + if (radius <= theta) { + theta -= radius; + /* + * Trigonometric Identities: + * cos(a + b) = cos(a)*cos(b) - sin(a)sin(b) + * sin(a + b) = sin(a)*cos(b) + cos(a)sin(b) + */ + qfixed_t tmp = q_mul(*cos_t, cos_rad) - q_mul(*sin_t, sin_rad); + *sin_t = q_mul(*sin_t, cos_rad) + q_mul(*cos_t, sin_rad); + *cos_t = tmp; + } + if (theta == 0) + break; + + /* Half angle formula to approach the value */ + radius >>= 1; + sin_rad = q_sqrt((I2Q(1) - cos_rad) >> 1); + cos_rad = q_sqrt((I2Q(1) + cos_rad) >> 1); + } + + if (region == 0b10 || region == 0b01) + *cos_t = ~*cos_t + 1; + if (region & 0b10) + *sin_t = ~*sin_t + 1; +} -#define PI 3.14159265359f #define W 512 #define H 512 static uint8_t img[W * H * 3]; -/* Using signed distnace field (SDF) of capsule shape to perform anti-aliasing +#define max(a, b) \ + ({ \ + typeof(a) _a = (a); \ + typeof(b) _b = (b); \ + _a > _b ? _a : _b; \ + }) +#define min(a, b) \ + ({ \ + typeof(a) _a = (a); \ + typeof(b) _b = (b); \ + _a < _b ? _a : _b; \ + }) + +/* + * Using signed distnace field (SDF) of capsule shape to perform anti-aliasing * with single sample per pixel. */ -float capsuleSDF(float px, - float py, - float ax, - float ay, - float bx, - float by, - float r) +qfixed_t capsuleSDF(qfixed_t px, + qfixed_t py, + qfixed_t ax, + qfixed_t ay, + qfixed_t bx, + qfixed_t by, + qfixed_t r) { - float pax = px - ax, pay = py - ay, bax = bx - ax, bay = by - ay; - float h = fmaxf( - fminf((pax * bax + pay * bay) / (bax * bax + bay * bay), 1.0f), 0.0f); - float dx = pax - bax * h, dy = pay - bay * h; - return sqrtf(dx * dx + dy * dy) - r; + qfixed_t pax = q_add(px, -ax); + qfixed_t pay = q_add(py, -ay); + qfixed_t bax = q_add(bx, -ax); + qfixed_t bay = q_add(by, -ay); + + qfixed_t t0 = q_add(q_mul(pax, bax), q_mul(pay, bay)); + qfixed_t t1 = q_add(q_mul(bax, bax), q_mul(bay, bay)); + qfixed_t tmp = min(q_div(t0, t1), I2Q(1)); + qfixed_t h = max(tmp, 0); + + qfixed_t dx = q_add(pax, -q_mul(bax, h)); + qfixed_t dy = q_add(pay, -q_mul(bay, h)); + + tmp = q_add(q_mul(dx, dx), q_mul(dy, dy)); + qfixed_t res = q_add(q_sqrt(tmp), -r); + return res; } +#define PUT(v) \ + p[v] = (uint8_t) Q2I( \ + q_add(p[v] * q_add((1 << Q), -alpha), q_mul(g, alpha) * 255)) + /* Render shapes into the buffer individually with alpha blending. */ -void alphablend(int x, int y, float alpha, float r, float g, float b) +void alphablend(int x, + int y, + qfixed_t alpha, + qfixed_t r, + qfixed_t g, + qfixed_t b) { uint8_t *p = img + (y * W + x) * 3; - p[0] = (uint8_t) (p[0] * (1 - alpha) + r * alpha * 255); - p[1] = (uint8_t) (p[1] * (1 - alpha) + g * alpha * 255); - p[2] = (uint8_t) (p[2] * (1 - alpha) + b * alpha * 255); + PUT(0); + PUT(1); + PUT(2); } /* Use AABB of capsule to reduce the number of samples. */ -void lineSDFAABB(float ax, float ay, float bx, float by, float r) +void lineSDFAABB(qfixed_t ax, qfixed_t ay, qfixed_t bx, qfixed_t by, qfixed_t r) { - int x0 = (int) floorf(fminf(ax, bx) - r); - int x1 = (int) ceilf(fmaxf(ax, bx) + r); - int y0 = (int) floorf(fminf(ay, by) - r); - int y1 = (int) ceilf(fmaxf(ay, by) + r); + int x0 = Q2I(q_floor(q_add(min(ax, bx), -r))); + int x1 = Q2I(q_ceil(q_add(max(ax, bx), r))); + int y0 = Q2I(q_floor(q_add(min(ay, by), -r))); + int y1 = Q2I(q_ceil(q_add(max(ay, by), r))); for (int y = y0; y <= y1; y++) { for (int x = x0; x <= x1; x++) - alphablend( - x, y, - fmaxf(fminf(0.5f - capsuleSDF(x, y, ax, ay, bx, by, r), 1.0f), - 0.0f), - 0.0f, 0.0f, 0.0f); + alphablend(x, y, + max(min((1 << (Q - 1)) - capsuleSDF(I2Q(x), I2Q(y), ax, + ay, bx, by, r), + I2Q(1)), + 0), + 0, 0, 0); } } int main() { memset(img, 255, sizeof(img)); - float cx = W * 0.5f, cy = H * 0.5f; + + /* center of the line drawing, (1 << (Q - 1)) is equal to 0.5 */ + qfixed_t cx = W * (1 << (Q - 1)), cy = H * (1 << (Q - 1)); + /* cos(theta) and sin(theta) value for each line */ + qfixed_t ct, st; + for (int j = 0; j < 5; j++) { - float r1 = fminf(W, H) * (j + 0.5f) * 0.085f; - float r2 = fminf(W, H) * (j + 1.5f) * 0.085f; - float t = j * PI / 64.0f, r = (j + 1) * 0.5f; - for (int i = 1; i <= 64; i++, t += 2.0f * PI / 64.0f) { - float ct = cosf(t), st = sinf(t); - lineSDFAABB(cx + r1 * ct, cy - r1 * st, cx + r2 * ct, cy - r2 * st, - r); + qfixed_t r1 = min(W, H) * q_mul((I2Q(j) + (1 << (Q - 1))), f2Q(0.085f)); + qfixed_t r2 = min(W, H) * q_mul((I2Q(j) + (3 << (Q - 1))), f2Q(0.085f)); + qfixed_t t = j * q_div(Q_PI, I2Q(64)); + qfixed_t r = (j + 1) * (1 << (Q - 1)); + for (int i = 1; i <= 64; i++) { + t = q_add(t, q_mul(I2Q(2), q_div(Q_PI, I2Q(64)))); + q_sincos(t, &st, &ct); + lineSDFAABB(q_add(cx, q_mul(r1, ct)), q_add(cy, -q_mul(r1, st)), + q_add(cx, q_mul(r2, ct)), q_add(cy, -q_mul(r2, st)), r); } } svpng(fopen("line.png", "wb"), W, H, img, false);