[Libre-soc-isa] [Bug 560] big-endian little-endian SV regfile layout idea

Tue Jan 5 18:30:56 GMT 2021

https://bugs.libre-soc.org/show_bug.cgi?id=560

Jacob Lifshay <programmerjake at gmail.com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
         Resolution|INVALID                     |---
             Status|RESOLVED                    |IN_PROGRESS

--- Comment #87 from Jacob Lifshay <programmerjake at gmail.com> ---
Reopening since this needs more evaluation:

If we copy what VSX does, then we'll have to implement the byteswapping in the
ALUs, since that is where VSX does it:

Notice how the big-endian and little-endian Power instruction sequences are
identical -- implying that the registers switch between big-endian and
little-endian. The code does change on AArch64 (64-bit Arm), however, since
AArch64 defines the registers to be little-endian only, so big-endian code
needs explicit byteswapping instructions inserted.

https://godbolt.org/z/nbM5qq

C Source:
#include <stdint.h>

/* Vector types built with the vector_size attribute; every type here is
 * 16 bytes wide and the name spells out element type x lane count. */
typedef uint8_t u8x16 __attribute__((vector_size(16)));   /* 16 x uint8_t */
typedef uint16_t u16x8 __attribute__((vector_size(16)));  /*  8 x uint16_t */
typedef uint32_t u32x4 __attribute__((vector_size(16)));  /*  4 x uint32_t */
typedef uint64_t u64x2 __attribute__((vector_size(16)));  /*  2 x uint64_t */

/* Memory-to-memory variant: loads *a and *b, adds them lane-wise as u8x16,
 * views the 16-byte sum as a u16x8, adds *c lane-wise and stores to *r. */
void ld_st(u8x16 *a, u8x16 *b, u16x8 *c, u16x8 *r) {
    /* Cast between two 16-byte vector types: the bytes of the sum are
     * reinterpreted as 16-bit lanes, not converted element by element. */
    u16x8 temp = (u16x8)(*a + *b);
    *r = temp + *c;
}

/* Same computation as ld_st, but operands and result are passed by value
 * instead of through pointers. */
u16x8 by_value(u8x16 a, u8x16 b, u16x8 c) {
    /* Byte-wise sum reinterpreted as eight 16-bit lanes (no value conversion). */
    u16x8 temp = (u16x8)(a + b);
    return temp + c;
}

/* Builds a u16x8 from eight consecutive uint16_t values starting at a,
 * copying a[i] into lane i. */
u16x8 load_array(uint16_t *a) {
    /* Promise the compiler that a is 16-byte aligned. */
    a = (uint16_t *)__builtin_assume_aligned(a, 16);
    u16x8 retval;
    /* Every one of the 8 lanes is written before retval is returned. */
    for(int i = 0; i < 8; i++)
        retval[i] = a[i];
    return retval;
}

Generated big-endian powerpc64:
# void ld_st(u8x16 *a, u8x16 *b, u16x8 *c, u16x8 *r)   -- big-endian powerpc64
# The four pointers are used as base registers r3, r4, r5 and r6 below.
# The three .quad entries under the label are the function descriptor;
# .Lfunc_begin0 is the actual code entry point.
ld_st:                                  # @ld_st
        .quad   .Lfunc_begin0
        .quad   .TOC.@tocbase
        .quad   0
.Lfunc_begin0:
        lxv 34, 0(3)                    # vs34 (same register as v2) = 16 bytes at r3
        lxv 35, 0(4)                    # vs35 (same register as v3) = 16 bytes at r4
        vaddubm 2, 3, 2                 # v2 = v3 + v2, per byte, modulo
        lxv 35, 0(5)                    # vs35 (v3) = 16 bytes at r5
        vadduhm 2, 3, 2                 # v2 = v3 + v2, per halfword; no byteswap in between
        stxv 34, 0(6)                   # store vs34 (v2) to the 16 bytes at r6
        blr
        # NOTE(review): the zero words after blr look like compiler-emitted
        # trailer data (traceback-table marker?) -- confirm.
        .long   0
        .quad   0
# u16x8 by_value(u8x16 a, u8x16 b, u16x8 c)   -- big-endian powerpc64
# Operates purely on vector registers v2, v3 and v4; the result is left in v2.
# The three .quad entries under the label are the function descriptor;
# .Lfunc_begin1 is the actual code entry point.
by_value:                               # @by_value
        .quad   .Lfunc_begin1
        .quad   .TOC.@tocbase
        .quad   0
.Lfunc_begin1:
        vaddubm 2, 3, 2                 # v2 = v3 + v2, per byte, modulo
        vadduhm 2, 2, 4                 # v2 = v2 + v4, per halfword; no byteswap in between
        blr
        # NOTE(review): the zero words after blr look like compiler-emitted
        # trailer data (traceback-table marker?) -- confirm.
        .long   0
        .quad   0
# u16x8 load_array(uint16_t *a)   -- big-endian powerpc64
# The eight-element copy loop of the C source became one 16-byte vector load.
# The three .quad entries under the label are the function descriptor;
# .Lfunc_begin2 is the actual code entry point.
load_array:                             # @load_array
        .quad   .Lfunc_begin2
        .quad   .TOC.@tocbase
        .quad   0
.Lfunc_begin2:
        lxv 34, 0(3)                    # vs34 (same register as v2) = 16 bytes at r3
        blr
        # NOTE(review): the zero words after blr look like compiler-emitted
        # trailer data (traceback-table marker?) -- confirm.
        .long   0
        .quad   0

Generated little-endian powerpc64le:
# void ld_st(u8x16 *a, u8x16 *b, u16x8 *c, u16x8 *r)   -- little-endian powerpc64le
# The four pointers are used as base registers r3, r4, r5 and r6 below.
ld_st:                                  # @ld_st
        lxv 34, 0(3)                    # vs34 (same register as v2) = 16 bytes at r3
        lxv 35, 0(4)                    # vs35 (same register as v3) = 16 bytes at r4
        vaddubm 2, 3, 2                 # v2 = v3 + v2, per byte, modulo
        lxv 35, 0(5)                    # vs35 (v3) = 16 bytes at r5
        vadduhm 2, 3, 2                 # v2 = v3 + v2, per halfword; no byteswap in between
        stxv 34, 0(6)                   # store vs34 (v2) to the 16 bytes at r6
        blr
        # NOTE(review): the zero words after blr look like compiler-emitted
        # trailer data (traceback-table marker?) -- confirm.
        .long   0
        .quad   0
# u16x8 by_value(u8x16 a, u8x16 b, u16x8 c)   -- little-endian powerpc64le
# Operates purely on vector registers v2, v3 and v4; the result is left in v2.
by_value:                               # @by_value
        vaddubm 2, 3, 2                 # v2 = v3 + v2, per byte, modulo
        vadduhm 2, 2, 4                 # v2 = v2 + v4, per halfword; no byteswap in between
        blr
        # NOTE(review): the zero words after blr look like compiler-emitted
        # trailer data (traceback-table marker?) -- confirm.
        .long   0
        .quad   0
# u16x8 load_array(uint16_t *a)   -- little-endian powerpc64le
# The eight-element copy loop of the C source became one 16-byte vector load.
load_array:                             # @load_array
        lxv 34, 0(3)                    # vs34 (same register as v2) = 16 bytes at r3
        blr
        # NOTE(review): the zero words after blr look like compiler-emitted
        # trailer data (traceback-table marker?) -- confirm.
        .long   0
        .quad   0

Generated big-endian AArch64:
// void ld_st(u8x16 *a, u8x16 *b, u16x8 *c, u16x8 *r)   -- big-endian AArch64
// The four pointers are used as base registers x0, x1, x2 and x3 below.
ld_st:                                  // @ld_st
        ld1     { v0.16b }, [x0]        // v0 = 16 byte lanes from [x0]
        ld1     { v1.16b }, [x1]        // v1 = 16 byte lanes from [x1]
        ld1     { v2.8h }, [x2]         // v2 = 8 halfword lanes from [x2]
        add     v0.16b, v1.16b, v0.16b  // v0 = v1 + v0, per byte
        rev16   v0.16b, v0.16b          // swap the two bytes inside every 16-bit lane
        add     v0.8h, v2.8h, v0.8h     // v0 = v2 + v0, per halfword
        st1     { v0.8h }, [x3]         // store 8 halfword lanes to [x3]
        ret
// u16x8 by_value(u8x16 a, u8x16 b, u16x8 c)   -- big-endian AArch64
// Operands are in v0, v1 and v2; the result is left in v0.
// Each rev64 + ext #8 pair reverses the lane order of a whole 128-bit
// register: rev64 reverses the lanes inside each 64-bit half, then ext #8
// swaps the two halves.
by_value:                               // @by_value
        rev64   v0.16b, v0.16b          // v0: reverse bytes within each 64-bit half
        rev64   v1.16b, v1.16b          // v1: reverse bytes within each 64-bit half
        ext     v0.16b, v0.16b, v0.16b, #8 // v0: swap the two 64-bit halves
        ext     v1.16b, v1.16b, v1.16b, #8 // v1: swap the two 64-bit halves
        rev64   v2.8h, v2.8h            // v2: reverse halfwords within each 64-bit half
        add     v0.16b, v1.16b, v0.16b  // v0 = v1 + v0, per byte
        ext     v2.16b, v2.16b, v2.16b, #8 // v2: swap the two 64-bit halves
        rev16   v0.16b, v0.16b          // swap the two bytes inside every 16-bit lane
        add     v0.8h, v0.8h, v2.8h     // v0 = v0 + v2, per halfword
        rev64   v0.8h, v0.8h            // result: reverse halfwords within each 64-bit half
        ext     v0.16b, v0.16b, v0.16b, #8 // result: swap the two 64-bit halves
        ret
// u16x8 load_array(uint16_t *a)   -- big-endian AArch64
// The eight-element copy loop of the C source became one 128-bit load.
load_array:                             // @load_array
        ldr     q0, [x0]                // q0 = 16 bytes at [x0]
        ret

Generated little-endian AArch64:
// void ld_st(u8x16 *a, u8x16 *b, u16x8 *c, u16x8 *r)   -- little-endian AArch64
// The four pointers are used as base registers x0, x1, x2 and x3 below.
ld_st:                                  // @ld_st
        ldr     q0, [x0]                // q0 = 16 bytes at [x0]
        ldr     q1, [x1]                // q1 = 16 bytes at [x1]
        ldr     q2, [x2]                // q2 = 16 bytes at [x2]
        add     v0.16b, v1.16b, v0.16b  // v0 = v1 + v0, per byte
        add     v0.8h, v2.8h, v0.8h     // v0 = v2 + v0, per halfword; no byteswap in between
        str     q0, [x3]                // store q0 to the 16 bytes at [x3]
        ret
// u16x8 by_value(u8x16 a, u8x16 b, u16x8 c)   -- little-endian AArch64
// Operands are in v0, v1 and v2; the result is left in v0.
by_value:                               // @by_value
        add     v0.16b, v1.16b, v0.16b  // v0 = v1 + v0, per byte
        add     v0.8h, v0.8h, v2.8h     // v0 = v0 + v2, per halfword; no byteswap in between
        ret
// u16x8 load_array(uint16_t *a)   -- little-endian AArch64
// The eight-element copy loop of the C source became one 128-bit load.
load_array:                             // @load_array
        ldr     q0, [x0]                // q0 = 16 bytes at [x0]
        ret

Generated x86_64:
# void ld_st(u8x16 *a, u8x16 *b, u16x8 *c, u16x8 *r)   -- x86_64, Intel syntax
# The four pointers are read from rdi, rsi, rdx and rcx below.
ld_st:
        movdqa  xmm0, XMMWORD PTR [rdi] # xmm0 = 16 bytes at [rdi] (aligned load)
        paddb   xmm0, XMMWORD PTR [rsi] # xmm0 += 16 bytes at [rsi], per byte
        paddw   xmm0, XMMWORD PTR [rdx] # xmm0 += 16 bytes at [rdx], per 16-bit word
        movaps  XMMWORD PTR [rcx], xmm0 # store xmm0 to [rcx] (aligned store)
        ret
# u16x8 by_value(u8x16 a, u8x16 b, u16x8 c)   -- x86_64, Intel syntax
# Operands are in xmm0, xmm1 and xmm2; the result is left in xmm0.
by_value:
        paddb   xmm0, xmm1              # xmm0 += xmm1, per byte
        paddw   xmm0, xmm2              # xmm0 += xmm2, per 16-bit word
        ret
# u16x8 load_array(uint16_t *a)   -- x86_64, Intel syntax
# The eight-element copy loop of the C source became one aligned 16-byte load.
load_array:
        movdqa  xmm0, XMMWORD PTR [rdi] # xmm0 = 16 bytes at [rdi] (aligned load)
        ret

-- 
You are receiving this mail because:
You are on the CC list for the bug.