Lines Matching +full:many +full:- +full:to +full:- +full:one
1 /* SPDX-License-Identifier: GPL-2.0 */
4 Copyright (c) 2002 Hewlett-Packard Co/CERN
15 we get to a 16B-aligned address, then loop on 128 B chunks using an
18 Since a stf.spill f0 can store 16B in one go, we use this instruction
19 to get peak speed when value = 0. */
42 // This routine uses only scratch predicate registers (p6 - p15)
43 #define p_scr p6 // default register for same-cycle branches
72 and ptr2 = -(MIN1+1), dest // aligned address
73 and tmp = MIN1, dest // prepare to check for correct alignment
78 (p_scr) br.ret.dpnt.many rp // return immediately if count = 0
83 sub bytecnt = (MIN1+1), tmp // NB: # of bytes to move is 1 higher than loopcnt
85 (p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U)
93 (p_y) add cnt = -8, cnt //
96 (p_y) st8 [ptr2] = value,-4 //
100 (p_yy) add cnt = -4, cnt //
103 (p_yy) st4 [ptr2] = value,-2 //
108 (p_y) add cnt = -2, cnt //
111 setf.sig fvalue=value // transfer value to FLP side
112 (p_y) st2 [ptr2] = value,-1 //
120 (p_yy) add cnt = -1, cnt //
121 (p_scr) br.cond.dpnt.many .fraction_of_line // go move just a few
127 (p_zr) br.cond.dptk.many .l1b // Jump to use stf.spill
130 TEXT_ALIGN(32) // --------------------- // L1A: store ahead into cache lines; fill later
132 and tmp = -(LINE_SIZE), cnt // compute end of range
134 and cnt = (LINE_SIZE-1), cnt // remainder
136 mov loopcnt = PREF_AHEAD-1 // default prefetch loop
140 (p_scr) add loopcnt = -1, linecnt //
145 add tmp = -1, linecnt // next loop count
150 stf8 [ptr9] = fvalue, 128 // Do stores one cache line apart
195 (p_scr) br.cond.dpnt.many .fraction_of_line // Branch no. 2
196 br.cond.dpnt.many .move_bytes_from_alignment // Branch no. 3
200 .l1b: // ------------------------------------ // L1B: store ahead into cache lines; fill later
202 and tmp = -(LINE_SIZE), cnt // compute end of range
204 and cnt = (LINE_SIZE-1), cnt // remainder
206 mov loopcnt = PREF_AHEAD-1 // default prefetch loop
210 (p_scr) add loopcnt = -1, linecnt
215 add tmp = -1, linecnt // next loop count
220 stf.spill [ptr9] = f0, 128 // Do stores one cache line apart
249 (p_scr) br.cond.dpnt.many .move_bytes_from_alignment //
259 add loopcnt = -1, loopcnt
260 (p_scr) br.cond.dpnt.many .store_words
267 .l2: // ------------------------------------ // L2A: store 32B in 2 cycles
274 br.cloop.dptk.many .l2
279 (p_scr) br.cond.dpnt.many .move_bytes_from_alignment // Branch
285 add cnt = -8, cnt // subtract
290 (p_y) add cnt = -8, cnt // subtract
294 (p_yy) add cnt = -8, cnt // subtract
319 br.ret.sptk.many rp
331 (p_y) st1 [ptr1] = value, 1 // fill 1 (odd-aligned) byte [15, 14 (or less) left]
332 (p_y) add cnt = -1, cnt
341 (p_yy) add cnt = -4, cnt
345 add ptr3 = -1, ptr3 // last store
350 (p_y) add cnt = -4, cnt
357 (p_yy) add cnt = -4, cnt
362 br.ret.sptk.many rp