Initial commit: Go 1.23 release state
238
src/runtime/memmove_arm64.s
Normal file
@@ -0,0 +1,238 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// Register map
//
// dstin  R0
// src    R1
// count  R2
// dst    R3 (same as R0, but gets modified in unaligned cases)
// srcend R4
// dstend R5
// data   R6-R17
// tmp1   R14

// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
// copies of up to 128 bytes, and large copies. The overhead of the overlap
// check is negligible since it is only required for large copies.
//
// Large copies use a software pipelined loop processing 64 bytes per iteration.
// The destination pointer is 16-byte aligned to minimize unaligned accesses.
// The loop tail is handled by always copying 64 bytes from the end.
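//
// Small and medium copies need no overlap check because every source byte is
// loaded into registers before the first store is issued. For large copies of
// at least 1024 bytes, runtime·arm64UseAlignedLoads selects whether the loop
// aligns the source (aligned loads) or the destination (aligned stores).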

// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
	CBZ	R2, copy0

	// Small copies: 1..16 bytes
	CMP	$16, R2
	BLE	copy16

	// Large copies
	CMP	$128, R2
	BHI	copy_long
	CMP	$32, R2
	BHI	copy32_128

	// Small copies: 17..32 bytes.
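	// Load 16 bytes from the start and 16 bytes from the end before storing
	// anything; for counts below 32 the two stores overlap in the middle and
	// rewrite the same bytes, so overlapping buffers are handled correctly.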
	LDP	(R1), (R6, R7)
	ADD	R1, R2, R4  // R4 points just past the last source byte
	LDP	-16(R4), (R12, R13)
	STP	(R6, R7), (R0)
	ADD	R0, R2, R5  // R5 points just past the last destination byte
	STP	(R12, R13), -16(R5)
	RET

	// Small copies: 1..16 bytes.
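	// For 8..16 bytes, copy two possibly overlapping 8-byte words, one from
	// each end. For 1..7 bytes, the set bits of the count select smaller
	// moves: bit 2 covers 4..7 bytes, bit 1 covers 2..3 bytes, and the only
	// remaining case is a single byte.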
copy16:
	ADD	R1, R2, R4  // R4 points just past the last source byte
	ADD	R0, R2, R5  // R5 points just past the last destination byte
	CMP	$8, R2
	BLT	copy7
	MOVD	(R1), R6
	MOVD	-8(R4), R7
	MOVD	R6, (R0)
	MOVD	R7, -8(R5)
	RET

copy7:
	TBZ	$2, R2, copy3
	MOVWU	(R1), R6
	MOVWU	-4(R4), R7
	MOVW	R6, (R0)
	MOVW	R7, -4(R5)
	RET

copy3:
	TBZ	$1, R2, copy1
	MOVHU	(R1), R6
	MOVHU	-2(R4), R7
	MOVH	R6, (R0)
	MOVH	R7, -2(R5)
	RET

copy1:
	MOVBU	(R1), R6
	MOVB	R6, (R0)

copy0:
	RET

	// Medium copies: 33..128 bytes.
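	// Load 32 bytes from the start and 32 bytes from the end (counts above
	// 64 additionally go through copy128, which loads more from both ends);
	// every load completes before the first store, so overlapping buffers
	// are copied correctly here as well.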
copy32_128:
	ADD	R1, R2, R4  // R4 points just past the last source byte
	ADD	R0, R2, R5  // R5 points just past the last destination byte
	LDP	(R1), (R6, R7)
	LDP	16(R1), (R8, R9)
	LDP	-32(R4), (R10, R11)
	LDP	-16(R4), (R12, R13)
	CMP	$64, R2
	BHI	copy128
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

	// Copy 65..128 bytes.
copy128:
	LDP	32(R1), (R14, R15)
	LDP	48(R1), (R16, R17)
	CMP	$96, R2
	BLS	copy96
	LDP	-64(R4), (R2, R3)
	LDP	-48(R4), (R1, R4)
	STP	(R2, R3), -64(R5)
	STP	(R1, R4), -48(R5)

copy96:
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R14, R15), 32(R0)
	STP	(R16, R17), 48(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

	// Copy more than 128 bytes.
copy_long:
	ADD	R1, R2, R4  // R4 points just past the last source byte
	ADD	R0, R2, R5  // R5 points just past the last destination byte
	MOVD	ZR, R7
	MOVD	ZR, R8

	CMP	$1024, R2
	BLT	backward_check
	// feature detect to decide how to align
	MOVBU	runtime·arm64UseAlignedLoads(SB), R6
	CBNZ	R6, use_aligned_loads
	MOVD	R0, R7
	MOVD	R5, R8
	B	backward_check
use_aligned_loads:
	MOVD	R1, R7
	MOVD	R4, R8
	// R7 and R8 are used here for the realignment calculation. In the
	// use_aligned_loads case, R7 is the src pointer and R8 is the srcend
	// pointer, which is used in the backward copy case. When doing aligned
	// stores, R7 is the dst pointer and R8 is the dstend pointer.

backward_check:
	// Use backward copy if there is an overlap.
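	// R14 = dst - src (unsigned). Zero means src and dst are identical and
	// there is nothing left to do. A value below the count means dst lies
	// inside the source range, so a forward copy would clobber source bytes
	// before they are read and the backward copy is used instead. When src
	// is above dst the subtraction wraps to a large unsigned value and the
	// forward copy is safe.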
	SUB	R1, R0, R14
	CBZ	R14, copy0
	CMP	R2, R14
	BCC	copy_long_backward

	// Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
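	// The low 4 bits of the selected pointer (src for aligned loads, dst for
	// aligned stores, zero for copies below 1024 bytes) are subtracted from
	// both pointers, so later accesses at offsets 16, 32, ... from that
	// pointer are 16-byte aligned. The first pipelined store then overlaps
	// the initial 16-byte store by the same amount, rewriting equal bytes.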
	LDP	(R1), (R12, R13)    // Load A
	AND	$15, R7, R14        // Calculate the realignment offset
	SUB	R14, R1, R1
	SUB	R14, R0, R3         // move dst back same amount as src
	ADD	R14, R2, R2
	LDP	16(R1), (R6, R7)    // Load B
	STP	(R12, R13), (R0)    // Store A
	LDP	32(R1), (R8, R9)    // Load C
	LDP	48(R1), (R10, R11)  // Load D
	LDP.W	64(R1), (R12, R13)  // Load E
	// 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end
	SUBS	$144, R2, R2
	BLS	copy64_from_end

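	// Software-pipelined main loop: each iteration stores the 64 bytes
	// loaded by the previous iteration while loading the next 64. The
	// pre-indexed LDP.W/STP.W forms advance R1 and R3 by 64 per pass.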
loop64:
	STP	(R6, R7), 16(R3)    // Store B
	LDP	16(R1), (R6, R7)    // Load B (next iteration)
	STP	(R8, R9), 32(R3)    // Store C
	LDP	32(R1), (R8, R9)    // Load C
	STP	(R10, R11), 48(R3)  // Store D
	LDP	48(R1), (R10, R11)  // Load D
	STP.W	(R12, R13), 64(R3)  // Store E
	LDP.W	64(R1), (R12, R13)  // Load E
	SUBS	$64, R2, R2
	BHI	loop64

	// Write the last iteration and copy 64 bytes from the end.
copy64_from_end:
	LDP	-64(R4), (R14, R15)  // Load F
	STP	(R6, R7), 16(R3)     // Store B
	LDP	-48(R4), (R6, R7)    // Load G
	STP	(R8, R9), 32(R3)     // Store C
	LDP	-32(R4), (R8, R9)    // Load H
	STP	(R10, R11), 48(R3)   // Store D
	LDP	-16(R4), (R10, R11)  // Load I
	STP	(R12, R13), 64(R3)   // Store E
	STP	(R14, R15), -64(R5)  // Store F
	STP	(R6, R7), -48(R5)    // Store G
	STP	(R8, R9), -32(R5)    // Store H
	STP	(R10, R11), -16(R5)  // Store I
	RET

	// Large backward copy for overlapping copies.
	// Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
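	// The last 16 bytes are copied with the unadjusted end pointers. The end
	// pointers are then rounded down by the low 4 bits of the selected end
	// pointer (srcend for aligned loads, dstend for aligned stores) and the
	// count is reduced by the same amount, so the remaining accesses at
	// offsets -16, -32, ... are 16-byte aligned.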
copy_long_backward:
	LDP	-16(R4), (R12, R13)
	AND	$15, R8, R14
	SUB	R14, R4, R4
	SUB	R14, R2, R2
	LDP	-16(R4), (R6, R7)
	STP	(R12, R13), -16(R5)
	LDP	-32(R4), (R8, R9)
	LDP	-48(R4), (R10, R11)
	LDP.W	-64(R4), (R12, R13)
	SUB	R14, R5, R5
	SUBS	$128, R2, R2
	BLS	copy64_from_start

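	// Backward software-pipelined loop: each iteration stores the previous
	// iteration's 64 bytes while loading the next 64, with the pre-indexed
	// LDP.W/STP.W forms moving R4 and R5 down by 64 per pass.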
loop64_backward:
	STP	(R6, R7), -16(R5)
	LDP	-16(R4), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	-32(R4), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	-48(R4), (R10, R11)
	STP.W	(R12, R13), -64(R5)
	LDP.W	-64(R4), (R12, R13)
	SUBS	$64, R2, R2
	BHI	loop64_backward

	// Write the last iteration and copy 64 bytes from the start.
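	// The first 64 bytes are copied last, using the original src (R1) and
	// dst (R0) pointers, so any remainder below 64 bytes is covered without
	// a separate cleanup loop.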
copy64_from_start:
	LDP	48(R1), (R2, R3)
	STP	(R6, R7), -16(R5)
	LDP	32(R1), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	16(R1), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	(R1), (R10, R11)
	STP	(R12, R13), -64(R5)
	STP	(R2, R3), 48(R0)
	STP	(R6, R7), 32(R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), (R0)
	RET