arch/mips/lib/memcpy-inatomic.S

   1 /*
   2  * This file is subject to the terms and conditions of the GNU General Public
   3  * License.  See the file "COPYING" in the main directory of this archive
   4  * for more details.
   5  *
   6  * Unified implementation of memcpy, memmove and the __copy_user backend.
   7  *
   8  * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
   9  * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
  10  * Copyright (C) 2002 Broadcom, Inc.
  11  *   memcpy/copy_user author: Mark Vandevoorde
  12  *
  13  * Mnemonic names for arguments to memcpy/__copy_user
  14  */
  15
  16 /*
  17  * Hack to resolve longstanding prefetch issue
  18  *
  19  * Prefetching may be fatal on some systems if we're prefetching beyond the
  20  * end of memory.  It's also a seriously bad idea on systems that are not
  21  * DMA-coherent.
  22  */
  23 #if !defined(CONFIG_DMA_COHERENT) || !defined(CONFIG_DMA_IP27)  /* prefetch is kept only when BOTH options are defined */
  24 #undef CONFIG_CPU_HAS_PREFETCH
  25 #endif
  26 #ifdef CONFIG_MIPS_MALTA        /* prefetch is always disabled when CONFIG_MIPS_MALTA is set */
  27 #undef CONFIG_CPU_HAS_PREFETCH
  28 #endif
  29
  30 #include <asm/asm.h>
  31 #include <asm/asm-offsets.h>
  32 #include <asm/regdef.h>
  33
  34 #define dst a0          /* destination address */
  35 #define src a1          /* source address */
  36 #define len a2          /* byte count in; number of uncopied bytes out */
  37
  38 /*
  39  * Spec
  40  *
  41  * memcpy copies len bytes from src to dst and sets v0 to dst.
  42  * It assumes that
  43  *   - src and dst don't overlap
  44  *   - src is readable
  45  *   - dst is writable
  46  * memcpy uses the standard calling convention
  47  *
  48  * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
  49  * the number of uncopied bytes due to an exception caused by a read or write.
  50  * __copy_user assumes that src and dst don't overlap, and that the call is
  51  * implementing one of the following:
  52  *   copy_to_user
  53  *     - src is readable  (no exceptions when reading src)
  54  *   copy_from_user
  55  *     - dst is writable  (no exceptions when writing dst)
  56  * __copy_user uses a non-standard calling convention; see
  57  * include/asm-mips/uaccess.h
  58  *
  59  * When an exception happens on a load, the handler must
  60  * ensure that all of the destination buffer is overwritten to prevent
  61  * leaking information to user mode programs.
  62  */
  63
  64 /*
  65  * Implementation
  66  */
  67
  68 /*
  69  * The exception handler for loads requires that:
  70  *  1- AT contain the address of the byte just past the end of the source
  71  *     of the copy,
  72  *  2- src_entry <= src < AT, and
  73  *  3- (dst - src) == (dst_entry - src_entry),
  74  * The _entry suffix denotes values when __copy_user was called.
  75  *
  76  * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
  77  * (2) is met by incrementing src by the number of bytes copied
  78  * (3) is met by not doing loads between a pair of increments of dst and src
  79  *
  80  * The exception handlers for stores adjust len (if necessary) and return.
  81  * These handlers do not need to overwrite any data.
  82  *
  83  * For __rmemcpy and memmove an exception is always a kernel bug, therefore
  84  * they're not protected.
  85  */
  86
  87 #define EXC(inst_reg,addr,handler)              /* wrap one access that may fault */ \
  88 9:      inst_reg, addr;                         /* local label 9 marks the wrapped instruction */ \
  89         .section __ex_table,"a";                /* switch to the exception table section */ \
  90         PTR     9b, handler;                    /* table entry: (instruction address, fixup handler) */ \
  91         .previous                               /* return to the section in use before */
  92
  93 /*
  94  * Only on the 64-bit kernel can we make use of 64-bit registers.
  95  */
  96 #ifdef CONFIG_64BIT
  97 #define USE_DOUBLE              /* selects the 8-byte unit definitions below */
  98 #endif
  99
 100 #ifdef USE_DOUBLE
 101
 102 #define LOAD   ld               /* load one whole unit */
 103 #define LOADL  ldl              /* partial (left/right) loads for unaligned units */
 104 #define LOADR  ldr
 105 #define STOREL sdl              /* partial (left/right) stores for unaligned units */
 106 #define STORER sdr
 107 #define STORE  sd               /* store one whole unit */
 108 #define ADD    daddu
 109 #define SUB    dsubu
 110 #define SRL    dsrl
 111 #define SRA    dsra
 112 #define SLL    dsll
 113 #define SLLV   dsllv
 114 #define SRLV   dsrlv
 115 #define NBYTES 8                /* bytes per unit */
 116 #define LOG_NBYTES 3            /* log2(NBYTES) */
 117
 118 /*
 119  * As we are sharing a code base with the mips32 tree (which uses the o32 ABI
 120  * register definitions), we need to redefine the register definitions from
 121  * the n64 ABI register naming to the o32 ABI register naming.
 122  */
 123 #undef t0
 124 #undef t1
 125 #undef t2
 126 #undef t3
 127 #define t0      $8
 128 #define t1      $9
 129 #define t2      $10
 130 #define t3      $11
 131 #define t4      $12
 132 #define t5      $13
 133 #define t6      $14
 134 #define t7      $15
 135
 136 #else
 137
 138 #define LOAD   lw               /* load one whole unit */
 139 #define LOADL  lwl              /* partial (left/right) loads for unaligned units */
 140 #define LOADR  lwr
 141 #define STOREL swl              /* partial (left/right) stores for unaligned units */
 142 #define STORER swr
 143 #define STORE  sw               /* store one whole unit */
 144 #define ADD    addu
 145 #define SUB    subu
 146 #define SRL    srl
 147 #define SLL    sll
 148 #define SRA    sra
 149 #define SLLV   sllv
 150 #define SRLV   srlv
 151 #define NBYTES 4                /* bytes per unit */
 152 #define LOG_NBYTES 2            /* log2(NBYTES) */
 153
 154 #endif /* USE_DOUBLE */
 155
 156 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 157 #define LDFIRST LOADR           /* partial load issued at the FIRST() offset of a unit */
 158 #define LDREST  LOADL           /* partial load issued at the REST() offset of a unit */
 159 #define STFIRST STORER          /* partial store issued at the FIRST() offset of a unit */
 160 #define STREST  STOREL          /* partial store issued at the REST() offset of a unit */
 161 #define SHIFT_DISCARD SLLV      /* variable shift that drops the bits not to be stored */
 162 #else
 163 #define LDFIRST LOADL           /* partial load issued at the FIRST() offset of a unit */
 164 #define LDREST  LOADR           /* partial load issued at the REST() offset of a unit */
 165 #define STFIRST STOREL          /* partial store issued at the FIRST() offset of a unit */
 166 #define STREST  STORER          /* partial store issued at the REST() offset of a unit */
 167 #define SHIFT_DISCARD SRLV      /* variable shift that drops the bits not to be stored */
 168 #endif
 169
 170 #define FIRST(unit) ((unit)*NBYTES)             /* byte offset of the first byte of unit */
 171 #define REST(unit)  (FIRST(unit)+NBYTES-1)      /* byte offset of the last byte of unit */
 172 #define UNIT(unit)  FIRST(unit)                 /* byte offset used for whole-unit accesses */
 173
 174 #define ADDRMASK (NBYTES-1)                     /* low address bits: offset within a unit */
 175
 176         .text
 177         .set    noreorder       # branch delay slots below are written out by hand
 178         .set    noat            # AT must not be written: the load fault handler reads it
 179
 180 /*
 181  * A combined memcpy/__copy_user; only the in-atomic __copy_user entry is here.
 182  * __copy_user sets len to 0 for success; else to an upper bound of
 183  * the number of uncopied bytes.
 184  * memcpy sets v0 to dst; the entry point below does not write v0.
 185  */
 186         .align  5
 187 LEAF(__copy_user_inatomic)
 188         /*
 189          * Entry: dst & src may be unaligned, len may be 0.
 190          * Temps: rem (t8) is the value of len at which the current loop stops.
 191          */
 192 #define rem t8
 193
 194         /*
 195          * The "issue break"s below are very approximate.
 196          * Issue delays for dcache fills will perturb the schedule, as will
 197          * load queue full replay traps, etc.
 198          *
 199          * If len < NBYTES use byte operations.
 200          */
 201         PREF(   0, 0(src) )
 202         PREF(   1, 0(dst) )
 203         sltu    t2, len, NBYTES         # t2 = (len < NBYTES)
 204         and     t1, dst, ADDRMASK       # t1 = offset of dst within a unit
 205         PREF(   0, 1*32(src) )
 206         PREF(   1, 1*32(dst) )
 207         bnez    t2, copy_bytes_checklen # short copy: take the byte path
 208          and    t0, src, ADDRMASK       # (delay slot) t0 = offset of src within a unit
 209         PREF(   0, 2*32(src) )
 210         PREF(   1, 2*32(dst) )
 211         bnez    t1, dst_unaligned
 212          nop
 213         bnez    t0, src_unaligned_dst_aligned
 214         /*
 215          * use delay slot for fall-through (the taken path recomputes t0)
 216          * src and dst are aligned; need to compute rem
 217          */
 218 both_aligned:
 219          SRL    t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
 220         beqz    t0, cleanup_both_aligned # len < 8*NBYTES
 221          and    rem, len, (8*NBYTES-1)   # rem = len % (8*NBYTES)
 222         PREF(   0, 3*32(src) )
 223         PREF(   1, 3*32(dst) )
 224         .align  4
 225 1:                                      # main loop: 8 units per iteration
 226 EXC(    LOAD    t0, UNIT(0)(src),       l_exc)          /* a fault here is at src itself: nothing to salvage */
 227 EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)     /* later loads: handler first copies the bytes before the fault */
 228 EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
 229 EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
 230         SUB     len, len, 8*NBYTES
 231 EXC(    LOAD    t4, UNIT(4)(src),       l_exc_copy)
 232 EXC(    LOAD    t7, UNIT(5)(src),       l_exc_copy)
 233         STORE   t0, UNIT(0)(dst)        # stores carry no fixup entry
 234         STORE   t1, UNIT(1)(dst)
 235 EXC(    LOAD    t0, UNIT(6)(src),       l_exc_copy)     /* t0/t1 are reused now that they have been stored */
 236 EXC(    LOAD    t1, UNIT(7)(src),       l_exc_copy)
 237         ADD     src, src, 8*NBYTES      # src and dst advance back to back,
 238         ADD     dst, dst, 8*NBYTES      # with no load in between
 239         STORE   t2, UNIT(-6)(dst)       # negative offsets: dst has already advanced
 240         STORE   t3, UNIT(-5)(dst)
 241         STORE   t4, UNIT(-4)(dst)
 242         STORE   t7, UNIT(-3)(dst)
 243         STORE   t0, UNIT(-2)(dst)
 244         STORE   t1, UNIT(-1)(dst)
 245         PREF(   0, 8*32(src) )
 246         PREF(   1, 8*32(dst) )
 247         bne     len, rem, 1b            # loop until only the remainder is left
 248          nop
 249
 250         /*
 251          * len == rem == the number of bytes left to copy < 8*NBYTES
 252          */
 253 cleanup_both_aligned:
 254         beqz    len, done
 255          sltu   t0, len, 4*NBYTES       # (delay slot) t0 = (len < 4*NBYTES)
 256         bnez    t0, less_than_4units
 257          and    rem, len, (NBYTES-1)    # rem = len % NBYTES
 258         /*
 259          * len >= 4*NBYTES
 260          */
 261 EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
 262 EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
 263 EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
 264 EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
 265         SUB     len, len, 4*NBYTES
 266         ADD     src, src, 4*NBYTES
 267         STORE   t0, UNIT(0)(dst)
 268         STORE   t1, UNIT(1)(dst)
 269         STORE   t2, UNIT(2)(dst)
 270         STORE   t3, UNIT(3)(dst)
 271         beqz    len, done
 272          ADD    dst, dst, 4*NBYTES      # (delay slot) executed on both paths
 273 less_than_4units:
 274         /*
 275          * rem = len % NBYTES
 276          */
 277         beq     rem, len, copy_bytes    # len < NBYTES: no whole unit left
 278          nop
 279 1:                                      # copy one whole unit per iteration
 280 EXC(    LOAD    t0, 0(src),             l_exc)
 281         ADD     src, src, NBYTES
 282         SUB     len, len, NBYTES
 283         STORE   t0, 0(dst)
 284         bne     rem, len, 1b
 285          ADD    dst, dst, NBYTES        # (delay slot) executed on both paths
 286
 287         /*
 288          * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
 289          * A loop would do only a byte at a time with possible branch
 290          * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
 291          * because can't assume read-access to dst.  Instead, use
 292          * STREST dst, which doesn't require read access to dst.
 293          *
 294          * This code should perform better than a simple loop on modern,
 295          * wide-issue mips processors because the code has fewer branches and
 296          * more instruction-level parallelism.
 297          */
 298 #define bits t2
 299         beqz    len, done
 300          ADD    t1, dst, len    # t1 is just past last byte of dst
 301         li      bits, 8*NBYTES
 302         SLL     rem, len, 3     # rem = number of bits to keep
 303 EXC(    LOAD    t0, 0(src),             l_exc)          /* loads a whole unit; the excess is shifted out below */
 304         SUB     bits, bits, rem # bits = number of bits to discard
 305         SHIFT_DISCARD t0, t0, bits
 306         STREST  t0, -1(t1)      # partial store ending at the last byte of dst
 307         jr      ra
 308          move   len, zero       # (delay slot) len = 0: everything was copied
 309 dst_unaligned:
 310         /*
 311          * dst is unaligned
 312          * t0 = src & ADDRMASK
 313          * t1 = dst & ADDRMASK; t1 > 0
 314          * len >= NBYTES
 315          *
 316          * Copy enough bytes to align dst
 317          * Set match = (src and dst have same alignment)
 318          */
 319 #define match rem               /* match shares t8 with rem */
 320 EXC(    LDFIRST t3, FIRST(0)(src),      l_exc)
 321         ADD     t2, zero, NBYTES
 322 EXC(    LDREST  t3, REST(0)(src),       l_exc_copy)
 323         SUB     t2, t2, t1      # t2 = number of bytes copied
 324         xor     match, t0, t1   # zero iff src and dst have the same offset in a unit
 325         STFIRST t3, FIRST(0)(dst)
 326         beq     len, t2, done
 327          SUB    len, len, t2    # (delay slot) len becomes 0 when the branch is taken
 328         ADD     dst, dst, t2
 329         beqz    match, both_aligned
 330          ADD    src, src, t2    # (delay slot) executed on both paths
 331
 332 src_unaligned_dst_aligned:
 333         SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
 334         PREF(   0, 3*32(src) )
 335         beqz    t0, cleanup_src_unaligned
 336          and    rem, len, (4*NBYTES-1)   # rem = len % (4*NBYTES)
 337         PREF(   1, 3*32(dst) )
 338 1:
 339 /*
 340  * Avoid consecutive LD*'s to the same register since some mips
 341  * implementations can't issue them in the same cycle.
 342  * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 343  * are to the same unit (unless src is aligned, but it's not).
 344  */
 345 EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
 346 EXC(    LDFIRST t1, FIRST(1)(src),      l_exc_copy)
 347         SUB     len, len, 4*NBYTES
 348 EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
 349 EXC(    LDREST  t1, REST(1)(src),       l_exc_copy)
 350 EXC(    LDFIRST t2, FIRST(2)(src),      l_exc_copy)
 351 EXC(    LDFIRST t3, FIRST(3)(src),      l_exc_copy)
 352 EXC(    LDREST  t2, REST(2)(src),       l_exc_copy)
 353 EXC(    LDREST  t3, REST(3)(src),       l_exc_copy)
 354         PREF(   0, 9*32(src) )          # 0 is PREF_LOAD  (not streamed)
 355         ADD     src, src, 4*NBYTES
 356 #ifdef CONFIG_CPU_SB1
 357         nop                             # improves slotting
 358 #endif
 359         STORE   t0, UNIT(0)(dst)        # dst is unit-aligned here, so whole-unit stores are used
 360         STORE   t1, UNIT(1)(dst)
 361         STORE   t2, UNIT(2)(dst)
 362         STORE   t3, UNIT(3)(dst)
 363         PREF(   1, 9*32(dst) )          # 1 is PREF_STORE (not streamed)
 364         bne     len, rem, 1b
 365          ADD    dst, dst, 4*NBYTES      # (delay slot) executed on both paths
 366
 367 cleanup_src_unaligned:
 368         beqz    len, done
 369          and    rem, len, NBYTES-1  # rem = len % NBYTES
 370         beq     rem, len, copy_bytes    # len < NBYTES: no whole unit left
 371          nop
 372 1:                                      # copy one unit per iteration, unaligned source
 373 EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
 374 EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
 375         ADD     src, src, NBYTES
 376         SUB     len, len, NBYTES
 377         STORE   t0, 0(dst)
 378         bne     len, rem, 1b
 379          ADD    dst, dst, NBYTES        # (delay slot) executed on both paths
 380
 381 copy_bytes_checklen:                    # also reached by falling out of the loop above
 382         beqz    len, done
 383          nop
 384 copy_bytes:
 385         /* 0 < len < NBYTES  */
 386 #define COPY_BYTE(N)                    /* copy byte N, return via done if it was the last */ \
 387 EXC(    lb      t0, N(src), l_exc);     \
 388         SUB     len, len, 1;            \
 389         beqz    len, done;              \
 390          sb     t0, N(dst)              /* (delay slot) stored whether or not the branch is taken */
 391
 392         COPY_BYTE(0)
 393         COPY_BYTE(1)
 394 #ifdef USE_DOUBLE
 395         COPY_BYTE(2)
 396         COPY_BYTE(3)
 397         COPY_BYTE(4)
 398         COPY_BYTE(5)
 399 #endif
 400 EXC(    lb      t0, NBYTES-2(src), l_exc)               /* last possible byte: at most NBYTES-1 are copied here */
 401         SUB     len, len, 1
 402         jr      ra                      # return unconditionally after this byte
 403          sb     t0, NBYTES-2(dst)       # (delay slot)
 404 done:
 405         jr      ra
 406          nop
 407         END(__copy_user_inatomic)
 408
 409 l_exc_copy:
 410         /*
 411          * Copy bytes from src until faulting load address (or until a
 412          * lb faults)
 413          *
 414          * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
 415          * may be more than a byte beyond the last address.
 416          * Hence, the lb below may get an exception.
 417          *
 418          * Assumes src < THREAD_BUADDR($28)
 419          */
 420         LOAD    t0, TI_TASK($28)        # t0 = value at offset TI_TASK from $28 (presumably the current task -- confirm)
 421          nop
 422         LOAD    t0, THREAD_BUADDR(t0)   # t0 = address stored at THREAD_BUADDR; the loop below stops there
 423 1:
 424 EXC(    lb      t1, 0(src),     l_exc)
 425         ADD     src, src, 1
 426         sb      t1, 0(dst)      # can't fault -- we're copy_from_user
 427         bne     src, t0, 1b
 428          ADD    dst, dst, 1     # (delay slot); after the loop, fall through into l_exc
 429 l_exc:
 430         LOAD    t0, TI_TASK($28)        # reloaded: l_exc is also entered directly from the fault table
 431          nop
 432         LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
 433          nop
 434         SUB     len, AT, t0             # len = AT - t0 = number of uncopied bytes (AT = end of source)
 435         jr      ra
 436          nop