Skip to content

[AArch64] LLVM generates unaligned access with -mstrict-align on AArch64 #95811

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
vit9696 opened this issue Jun 17, 2024 · 3 comments · Fixed by #95828
Closed

[AArch64] LLVM generates unaligned access with -mstrict-align on AArch64 #95811

vit9696 opened this issue Jun 17, 2024 · 3 comments · Fixed by #95828

Comments

@vit9696
Copy link

vit9696 commented Jun 17, 2024

Consider the following command-line:

clang -o t.o -c -target aarch64-gnu-linux-eabi -mstrict-align -mcpu=cortex-a55 -O3 t.c 

(1) Example 1 ([GodBolt](https://godbolt.org/z/35s8h7Kdz)):

typedef struct
{
  void* ccc[3];
  void* mmm;
  void* ddd;
  unsigned eee[2];
  _Bool bbb[8];
  
  _Bool ggg;
  _Bool abs;
  unsigned char kkk[16];
} struct1;

typedef struct struct2
{
  void* block[3];
  unsigned char pass_flags[2];
  unsigned int index[19];

  struct1 yyy[];
} struct2;


union union1
{
  unsigned u32;
  struct
  {
    unsigned ggg : 24;
    unsigned ffffffx : 2;
    unsigned ffffffy : 2;
    unsigned ffffffz : 2;
    unsigned ffffffw : 2;
  } aaa;
};

void func2(unsigned value);

unsigned func1(struct2* instr, unsigned yyy);

static void func3(struct1* yyy, union union1 zzzz)
{
  func2(zzzz.u32);
  if (yyy->ddd)
  {
    union union1 zzzz = {0};
    func3(yyy->ddd, zzzz);
  }
}

void bug(struct2* aaa);

void bug(struct2* aaa)
{
  for (unsigned i = 0; i < 256; i++)
  {
    unsigned a = func1(aaa, i);
    union union1 yyy;

    yyy.aaa.ggg = aaa->yyy[i].ggg;

    if (a <= 4)
    {
      yyy.aaa.ffffffx = aaa->yyy[i].kkk[0];
      yyy.aaa.ffffffy = aaa->yyy[i].kkk[1];
      yyy.aaa.ffffffz = aaa->yyy[i].kkk[2];
      yyy.aaa.ffffffw = aaa->yyy[i].kkk[3];
    }

    func3(&aaa->yyy[i], yyy);
  }
}

Output:

.LCPI0_0:
        .word   24                              // 0x18
        .word   26                              // 0x1a
        .word   28                              // 0x1c
        .word   30                              // 0x1e
bug:                                    // @bug
        sub     sp, sp, #80
        stp     x29, x30, [sp, #16]             // 16-byte Folded Spill
        str     x23, [sp, #32]                  // 8-byte Folded Spill
        stp     x22, x21, [sp, #48]             // 16-byte Folded Spill
        stp     x20, x19, [sp, #64]             // 16-byte Folded Spill
        add     x29, sp, #16
        mov     x19, x0
        mov     x20, xzr
        mov     w22, #80                        // =0x50
        adrp    x8, .LCPI0_0
        ldr     q0, [x8, :lo12:.LCPI0_0]
        str     q0, [sp]                        // 16-byte Folded Spill
        b       .LBB0_2
.LBB0_1:                                //   in Loop: Header=BB0_2 Depth=1
        add     x20, x20, #1
        cmp     x20, #256
        b.eq    .LBB0_7
.LBB0_2:                                // =>This Loop Header: Depth=1
        mov     x0, x19
        mov     w1, w20
        bl      func1
        cmp     w0, #4
        b.hi    .LBB0_4
        madd    x8, x20, x22, x19
        ldr     q1, [sp]                        // 16-byte Folded Reload
        ldur    s0, [x8, #162]
        ushll   v0.8h, v0.8b, #0
        bic     v0.4h, #252
        ushll   v0.4s, v0.4h, #0
        ushl    v0.4s, v0.4s, v1.4s
        ext     v1.16b, v0.16b, v0.16b, #8
        orr     v0.8b, v0.8b, v1.8b
        fmov    x8, d0
        lsr     x9, x8, #32
        orr     w8, w8, w9
        b       .LBB0_5
.LBB0_4:                                //   in Loop: Header=BB0_2 Depth=1
        and     w8, w21, #0xff000000
.LBB0_5:                                //   in Loop: Header=BB0_2 Depth=1
        madd    x9, x20, x22, x19
        ldrb    w9, [x9, #160]
        orr     w21, w8, w9
        mov     w0, w21
        bl      func2
        madd    x8, x20, x22, x19
        ldr     x23, [x8, #136]
        cbz     x23, .LBB0_1
.LBB0_6:                                //   Parent Loop BB0_2 Depth=1
        mov     w0, wzr
        bl      func2
        ldr     x23, [x23, #32]
        cbnz    x23, .LBB0_6
        b       .LBB0_1
.LBB0_7:
        ldp     x20, x19, [sp, #64]             // 16-byte Folded Reload
        ldp     x22, x21, [sp, #48]             // 16-byte Folded Reload
        ldp     x29, x30, [sp, #16]             // 16-byte Folded Reload
        ldr     x23, [sp, #32]                  // 8-byte Folded Reload
        add     sp, sp, #80
        ret

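Note `ldur s0, [x8, #162]`: this is a single 4-byte load of `kkk[0..3]` at offset 162 from the 8-byte-aligned `struct2` base, so the address is only 2-byte aligned.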

(2) Example 2 ([GodBolt](https://godbolt.org/z/nsovb6s1f)):

void f(char p[restrict], char *q)
{
    for (int i = 0; i < 4; i++)
        p[i] = -q[i];
}

void g(char p[restrict], char *q)
{
    for (int i = 0; i < 4; i++)
        p[i] = q[i];
}

Output:

f:                                      // @f
        movi    v0.2d, #0000000000000000
        ldr     s1, [x1]
        usubw   v0.8h, v0.8h, v1.8b
        umov    w8, v0.h[3]
        umov    w9, v0.h[2]
        umov    w10, v0.h[1]
        umov    w11, v0.h[0]
        strb    w8, [x0, #3]
        strb    w9, [x0, #2]
        strb    w10, [x0, #1]
        strb    w11, [x0]
        ret
g:                                      // @g
        ldrb    w8, [x1, #3]
        ldrb    w9, [x1, #2]
        ldrb    w10, [x1, #1]
        ldrb    w11, [x1]
        strb    w8, [x0, #3]
        strb    w9, [x0, #2]
        strb    w10, [x0, #1]
        strb    w11, [x0]
        ret

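Note `ldr s1, [x1]` in `f`: this is a 4-byte load through `q`, a `char *` that is only guaranteed to be 1-byte aligned, whereas `g` correctly uses byte loads (`ldrb`).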

Tested on 18.1.8.

This results in an unaligned access exception being raised on targets that have strict alignment checking enabled in hardware.

@vit9696
Copy link
Author

vit9696 commented Jun 17, 2024

This could be a follow-up to #63258, cc @MaskRay

@vit9696 vit9696 changed the title from "LLVM generates unaligned access with -mstrict-align on AArch64" to "[AArch64] LLVM generates unaligned access with -mstrict-align on AArch64" Jun 17, 2024
@efriedma-quic
Copy link
Collaborator

Caused by b062fff, I think.

@llvmbot
Copy link
Member

llvmbot commented Jun 17, 2024

@llvm/issue-subscribers-backend-aarch64

Author: Vitaly Cheptsov (vit9696)

Consider the following command-line:
clang -o t.o -c -target aarch64-gnu-linux-eabi -mstrict-align -mcpu=cortex-a55 -O3 t.c 

(1) Example 1 ([GodBolt](https://godbolt.org/z/35s8h7Kdz)):

<cut>

<details>

typedef struct
{
  void* ccc[3];
  void* mmm;
  void* ddd;
  unsigned eee[2];
  _Bool bbb[8];
  
  _Bool ggg;
  _Bool abs;
  unsigned char kkk[16];
} struct1;

typedef struct struct2
{
  void* block[3];
  unsigned char pass_flags[2];
  unsigned int index[19];

  struct1 yyy[];
} struct2;


union union1
{
  unsigned u32;
  struct
  {
    unsigned ggg : 24;
    unsigned ffffffx : 2;
    unsigned ffffffy : 2;
    unsigned ffffffz : 2;
    unsigned ffffffw : 2;
  } aaa;
};

void func2(unsigned value);

unsigned func1(struct2* instr, unsigned yyy);

static void func3(struct1* yyy, union union1 zzzz)
{
  func2(zzzz.u32);
  if (yyy-&gt;ddd)
  {
    union union1 zzzz = {0};
    func3(yyy-&gt;ddd, zzzz);
  }
}

void bug(struct2* aaa);

void bug(struct2* aaa)
{
  for (unsigned i = 0; i &lt; 256; i++)
  {
    unsigned a = func1(aaa, i);
    union union1 yyy;

    yyy.aaa.ggg = aaa-&gt;yyy[i].ggg;

    if (a &lt;= 4)
    {
      yyy.aaa.ffffffx = aaa-&gt;yyy[i].kkk[0];
      yyy.aaa.ffffffy = aaa-&gt;yyy[i].kkk[1];
      yyy.aaa.ffffffz = aaa-&gt;yyy[i].kkk[2];
      yyy.aaa.ffffffw = aaa-&gt;yyy[i].kkk[3];
    }

    func3(&amp;aaa-&gt;yyy[i], yyy);
  }
}

</details>

</cut>

Output:

<cut>

<details>

.LCPI0_0:
        .word   24                              // 0x18
        .word   26                              // 0x1a
        .word   28                              // 0x1c
        .word   30                              // 0x1e
bug:                                    // @<!-- -->bug
        sub     sp, sp, #<!-- -->80
        stp     x29, x30, [sp, #<!-- -->16]             // 16-byte Folded Spill
        str     x23, [sp, #<!-- -->32]                  // 8-byte Folded Spill
        stp     x22, x21, [sp, #<!-- -->48]             // 16-byte Folded Spill
        stp     x20, x19, [sp, #<!-- -->64]             // 16-byte Folded Spill
        add     x29, sp, #<!-- -->16
        mov     x19, x0
        mov     x20, xzr
        mov     w22, #<!-- -->80                        // =0x50
        adrp    x8, .LCPI0_0
        ldr     q0, [x8, :lo12:.LCPI0_0]
        str     q0, [sp]                        // 16-byte Folded Spill
        b       .LBB0_2
.LBB0_1:                                //   in Loop: Header=BB0_2 Depth=1
        add     x20, x20, #<!-- -->1
        cmp     x20, #<!-- -->256
        b.eq    .LBB0_7
.LBB0_2:                                // =&gt;This Loop Header: Depth=1
        mov     x0, x19
        mov     w1, w20
        bl      func1
        cmp     w0, #<!-- -->4
        b.hi    .LBB0_4
        madd    x8, x20, x22, x19
        ldr     q1, [sp]                        // 16-byte Folded Reload
        ldur    s0, [x8, #<!-- -->162]
        ushll   v0.8h, v0.8b, #<!-- -->0
        bic     v0.4h, #<!-- -->252
        ushll   v0.4s, v0.4h, #<!-- -->0
        ushl    v0.4s, v0.4s, v1.4s
        ext     v1.16b, v0.16b, v0.16b, #<!-- -->8
        orr     v0.8b, v0.8b, v1.8b
        fmov    x8, d0
        lsr     x9, x8, #<!-- -->32
        orr     w8, w8, w9
        b       .LBB0_5
.LBB0_4:                                //   in Loop: Header=BB0_2 Depth=1
        and     w8, w21, #<!-- -->0xff000000
.LBB0_5:                                //   in Loop: Header=BB0_2 Depth=1
        madd    x9, x20, x22, x19
        ldrb    w9, [x9, #<!-- -->160]
        orr     w21, w8, w9
        mov     w0, w21
        bl      func2
        madd    x8, x20, x22, x19
        ldr     x23, [x8, #<!-- -->136]
        cbz     x23, .LBB0_1
.LBB0_6:                                //   Parent Loop BB0_2 Depth=1
        mov     w0, wzr
        bl      func2
        ldr     x23, [x23, #<!-- -->32]
        cbnz    x23, .LBB0_6
        b       .LBB0_1
.LBB0_7:
        ldp     x20, x19, [sp, #<!-- -->64]             // 16-byte Folded Reload
        ldp     x22, x21, [sp, #<!-- -->48]             // 16-byte Folded Reload
        ldp     x29, x30, [sp, #<!-- -->16]             // 16-byte Folded Reload
        ldr     x23, [sp, #<!-- -->32]                  // 8-byte Folded Reload
        add     sp, sp, #<!-- -->80
        ret

</details>

</cut>

Note ldur s0, [x8, #162].

(2) Example 2 ([GodBolt](https://godbolt.org/z/nsovb6s1f)):

<cut>

<details>

void f(char p[restrict], char *q)
{
    for (int i = 0; i &lt; 4; i++)
        p[i] = -q[i];
}

void g(char p[restrict], char *q)
{
    for (int i = 0; i &lt; 4; i++)
        p[i] = q[i];
}

</details>

</cut>

Output:

<cut>

<details>

f:                                      // @<!-- -->f
        movi    v0.2d, #<!-- -->0000000000000000
        ldr     s1, [x1]
        usubw   v0.8h, v0.8h, v1.8b
        umov    w8, v0.h[3]
        umov    w9, v0.h[2]
        umov    w10, v0.h[1]
        umov    w11, v0.h[0]
        strb    w8, [x0, #<!-- -->3]
        strb    w9, [x0, #<!-- -->2]
        strb    w10, [x0, #<!-- -->1]
        strb    w11, [x0]
        ret
g:                                      // @<!-- -->g
        ldrb    w8, [x1, #<!-- -->3]
        ldrb    w9, [x1, #<!-- -->2]
        ldrb    w10, [x1, #<!-- -->1]
        ldrb    w11, [x1]
        strb    w8, [x0, #<!-- -->3]
        strb    w9, [x0, #<!-- -->2]
        strb    w10, [x0, #<!-- -->1]
        strb    w11, [x0]
        ret

</details>
</cut>

Note ldr s1, [x1]

Tested on 18.1.8.

This results in an unaligned access exception being raised on targets that have strict alignment checking enabled in hardware.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging a pull request may close this issue.

4 participants