Singularity/Library/PackageCache/com.unity.burst@1.8.4/Documentation~/optimization-skiplocalsinit.md
2024-05-06 11:45:45 -07:00

5.6 KiB

SkipLocalsInit attribute

Use SkipLocalsInitAttribute, to tell Burst that any stack allocations within a method don't have to be initialized to zero.

In C# all local variables are initialized to zero by default. This is useful because it means an entire class of bugs surrounding undefined data disappears. But this can impact runtime performance, because initializing this data to zero takes work:

static unsafe int DoSomethingWithLUT(int* data);

static unsafe int DoSomething(int size)
{
    int* data = stackalloc int[size];

    // Initialize every field of data to be an incrementing set of values.
    for (int i = 0; i < size; i++)
    {
        data[i] = i;
    }

    // Use the data elsewhere.
    return DoSomethingWithLUT(data);
}

The X86 assembly for this is:

        push    rbp
        .seh_pushreg rbp
        push    rsi
        .seh_pushreg rsi
        push    rdi
        .seh_pushreg rdi
        mov     rbp, rsp
        .seh_setframe rbp, 0
        .seh_endprologue
        mov     edi, ecx
        lea     r8d, [4*rdi]
        lea     rax, [r8 + 15]
        and     rax, -16
        movabs  r11, offset __chkstk
        call    r11
        sub     rsp, rax
        mov     rsi, rsp
        sub     rsp, 32
        movabs  rax, offset burst.memset.inline.X64_SSE4.i32@@32
        mov     rcx, rsi
        xor     edx, edx
        xor     r9d, r9d
        call    rax
        add     rsp, 32
        test    edi, edi
        jle     .LBB0_7
        mov     eax, edi
        cmp     edi, 8
        jae     .LBB0_3
        xor     ecx, ecx
        jmp     .LBB0_6
.LBB0_3:
        mov     ecx, eax
        and     ecx, -8
        movabs  rdx, offset __xmm@00000003000000020000000100000000
        movdqa  xmm0, xmmword ptr [rdx]
        mov     rdx, rsi
        add     rdx, 16
        movabs  rdi, offset __xmm@00000004000000040000000400000004
        movdqa  xmm1, xmmword ptr [rdi]
        movabs  rdi, offset __xmm@00000008000000080000000800000008
        movdqa  xmm2, xmmword ptr [rdi]
        mov     rdi, rcx
        .p2align        4, 0x90
.LBB0_4:
        movdqa  xmm3, xmm0
        paddd   xmm3, xmm1
        movdqu  xmmword ptr [rdx - 16], xmm0
        movdqu  xmmword ptr [rdx], xmm3
        paddd   xmm0, xmm2
        add     rdx, 32
        add     rdi, -8
        jne     .LBB0_4
        cmp     rcx, rax
        je      .LBB0_7
        .p2align        4, 0x90
.LBB0_6:
        mov     dword ptr [rsi + 4*rcx], ecx
        inc     rcx
        cmp     rax, rcx
        jne     .LBB0_6
.LBB0_7:
        sub     rsp, 32
        movabs  rax, offset "DoSomethingWithLUT"
        mov     rcx, rsi
        call    rax
        nop
        mov     rsp, rbp
        pop     rdi
        pop     rsi
        pop     rbp
        ret

In this example, the movabs rax, offset burst.memset.inline.X64_SSE4.i32@@32 line means that you've had to inject a memset to zero out the data. In the above example, you know that the array is entirely initialized in the following loop, but Burst doesn't know that.

To fix this problem, use Unity.Burst.CompilerServices.SkipLocalsInitAttribute, which tells Burst that any stack allocations within a method don't have to be initialized to zero.

Note

Only use this attribute if you're certain that you won't run into undefined behavior bugs.

For example:

using Unity.Burst.CompilerServices;

static unsafe int DoSomethingWithLUT(int* data);

[SkipLocalsInit]
static unsafe int DoSomething(int size)
{
    int* data = stackalloc int[size];

    // Initialize every field of data to be an incrementing set of values.
    for (int i = 0; i < size; i++)
    {
        data[i] = i;
    }

    // Use the data elsewhere.
    return DoSomethingWithLUT(data);
}

The assembly after adding the [SkipLocalsInit] on the method is:

        push    rbp
        .seh_pushreg rbp
        mov     rbp, rsp
        .seh_setframe rbp, 0
        .seh_endprologue
        mov     edx, ecx
        lea     eax, [4*rdx]
        add     rax, 15
        and     rax, -16
        movabs  r11, offset __chkstk
        call    r11
        sub     rsp, rax
        mov     rcx, rsp
        test    edx, edx
        jle     .LBB0_7
        mov     r8d, edx
        cmp     edx, 8
        jae     .LBB0_3
        xor     r10d, r10d
        jmp     .LBB0_6
.LBB0_3:
        mov     r10d, r8d
        and     r10d, -8
        movabs  rax, offset __xmm@00000003000000020000000100000000
        movdqa  xmm0, xmmword ptr [rax]
        mov     rax, rcx
        add     rax, 16
        movabs  rdx, offset __xmm@00000004000000040000000400000004
        movdqa  xmm1, xmmword ptr [rdx]
        movabs  rdx, offset __xmm@00000008000000080000000800000008
        movdqa  xmm2, xmmword ptr [rdx]
        mov     r9, r10
        .p2align        4, 0x90
.LBB0_4:
        movdqa  xmm3, xmm0
        paddd   xmm3, xmm1
        movdqu  xmmword ptr [rax - 16], xmm0
        movdqu  xmmword ptr [rax], xmm3
        paddd   xmm0, xmm2
        add     rax, 32
        add     r9, -8
        jne     .LBB0_4
        cmp     r10, r8
        je      .LBB0_7
        .p2align        4, 0x90
.LBB0_6:
        mov     dword ptr [rcx + 4*r10], r10d
        inc     r10
        cmp     r8, r10
        jne     .LBB0_6
.LBB0_7:
        sub     rsp, 32
        movabs  rax, offset "DoSomethingWithLUT"
        call    rax
        nop
        mov     rsp, rbp
        pop     rbp
        ret

The call to memset is now gone, because you've told Burst that any stack allocations within a method don't have to be initialized to zero.