/*
 * Flush routine for SHA512 multibuffer
 *
 *  This file is provided under a dual BSD/GPLv2 license.  When using or
 *  redistributing this file, you may do so under either license.
 *
 *  GPL LICENSE SUMMARY
 *
 *  Copyright(c) 2016 Intel Corporation.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of version 2 of the GNU General Public License as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  General Public License for more details.
 *
 *  Contact Information:
 *	Megha Dey <megha.dey@linux.intel.com>
 *
 *  BSD LICENSE
 *
 *  Copyright(c) 2016 Intel Corporation.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the
 *      distribution.
 *    * Neither the name of Intel Corporation nor the names of its
 *      contributors may be used to endorse or promote products derived
 *      from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "sha512_mb_mgr_datastruct.S"

.extern sha512_x4_avx2

# LINUX register definitions
#define arg1	%rdi
#define arg2	%rsi

# idx needs to be other than arg1, arg2, rbx, r12
#define idx	%rdx

# Common definitions
# Several of the names below alias one physical register, so aliases that
# share a register must not be live at the same time.
#define state	arg1
#define job	arg2
#define len2	arg2

#define unused_lanes	%rbx
#define lane_data	%rbx
#define tmp2		%rbx

#define job_rax		%rax
#define tmp1		%rax
#define size_offset	%rax
#define tmp		%rax
#define start_offset	%rax

#define tmp3		arg1

#define extra_blocks	arg2
#define p		arg2

#define tmp4		%r8
#define lens0		%r8

#define lens1		%r9
#define lens2		%r10
#define lens3		%r11

/* Emit the label "<prefix><n>:". */
.macro LABEL prefix n
\prefix\n\():
.endm

/* Emit "jne skip_<i>"; pairs with a label produced by "LABEL skip_ <i>". */
.macro JNE_SKIP i
jne	skip_\i
.endm

/* Assign the assembler symbol "offset"; not referenced by the code below. */
.altmacro
.macro SET_OFFSET _offset
offset = \_offset
.endm
.noaltmacro

# JOB* sha512_mb_mgr_flush_avx2(MB_MGR *state)
# arg 1 : rdi : state
#
# Returns in rax the job pointer taken from the completed lane, or 0 (NULL)
# when bit (32+7) of _unused_lanes is set.  rbx is saved and restored.
ENTRY(sha512_mb_mgr_flush_avx2)
	FRAME_BEGIN
	push	%rbx

	# If bit (32+7) is set, then all lanes are empty
	mov	_unused_lanes(state), unused_lanes
	bt	$32+7, unused_lanes
	jc	return_null

	# find a lane with a non-null job
	# idx starts at 0 and lanes 1..3 are probed in order, so the highest
	# numbered of those lanes holding a job wins; lane 0 is the fallback.
	# The lane numbers come from .rodata (one/two/three) because cmov
	# has no immediate-operand form.
	xor	idx, idx
	offset = (_ldata + 1*_LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	one(%rip), idx
	offset = (_ldata + 2*_LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	two(%rip), idx
	offset = (_ldata + 3*_LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	three(%rip), idx

	# copy idx to empty lanes
	# tmp holds the data pointer of lane idx.  Every lane whose job slot
	# is NULL receives that pointer and has the upper 32 bits of its
	# _lens entry set to 0xFFFFFFFF.
copy_lane_data:
	offset = (_args + _data_ptr)
	mov	offset(state,idx,8), tmp

	# Unrolled over lanes 0..3; under .altmacro, %I passes the current
	# numeric value of I, giving the labels skip_0 .. skip_3.
	I = 0
.rep 4
	offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
.altmacro
	JNE_SKIP %I
	offset = (_args + _data_ptr + 8*I)
	mov	tmp, offset(state)
	offset = (_lens + 8*I +4)
	movl	$0xFFFFFFFF, offset(state)
LABEL skip_ %I
	I = (I+1)
.noaltmacro
.endr

	# Find min length
	# Unsigned minimum (cmovb) over the four 64-bit _lens entries.  As
	# the masks and the shift below show, the low bits of the minimum
	# are used as the lane index and the upper 32 bits as the length
	# passed to sha512_x4_avx2.
	mov	_lens + 0*8(state),lens0
	mov	lens0,idx
	mov	_lens + 1*8(state),lens1
	cmp	idx,lens1
	cmovb	lens1,idx
	mov	_lens + 2*8(state),lens2
	cmp	idx,lens2
	cmovb	lens2,idx
	mov	_lens + 3*8(state),lens3
	cmp	idx,lens3
	cmovb	lens3,idx
	mov	idx,len2
	and	$0xF,idx
	and	$~0xFF,len2
	jz	len_is_0

	# Take the common (minimum) length off every lane, then move the
	# length down into the low 32 bits of len2 for the call.
	sub	len2, lens0
	sub	len2, lens1
	sub	len2, lens2
	sub	len2, lens3
	shr	$32,len2
	mov	lens0, _lens + 0*8(state)
	mov	lens1, _lens + 1*8(state)
	mov	lens2, _lens + 2*8(state)
	mov	lens3, _lens + 3*8(state)

	# "state" and "args" are the same address, arg1
	# len is arg2
	call	sha512_x4_avx2
	# state and idx are intact
	# NOTE(review): this relies on sha512_x4_avx2 preserving rdi and rdx,
	# which the SysV ABI treats as caller-saved; that routine is not part
	# of this file, so confirm against its source.

len_is_0:
	# process completed job "idx"
	# lane_data = state + _ldata + idx * _LANE_DATA_size
	imul	$_LANE_DATA_size, idx, lane_data
	lea	_ldata(state, lane_data), lane_data

	# Detach the job from its lane, mark it completed, and push idx onto
	# the byte-wide stack kept in _unused_lanes.
	mov	_job_in_lane(lane_data), job_rax
	movq	$0, _job_in_lane(lane_data)
	movl	$STS_COMPLETED, _status(job_rax)
	mov	_unused_lanes(state), unused_lanes
	shl	$8, unused_lanes
	or	idx, unused_lanes
	mov	unused_lanes, _unused_lanes(state)

	# Upper 32 bits of the freed lane length entry become 0xFFFFFFFF.
	movl	$0xFFFFFFFF, _lens+4(state, idx, 8)

	# Digest word k of lane idx is read from _args_digest + k*32 + idx*8.
	# Gather the eight 64-bit words pairwise into xmm0..xmm3 and store
	# them contiguously at _result_digest in the job.
	vmovq	_args_digest+0*32(state, idx, 8), %xmm0
	vpinsrq	$1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0
	vmovq	_args_digest+2*32(state, idx, 8), %xmm1
	vpinsrq	$1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1
	vmovq	_args_digest+4*32(state, idx, 8), %xmm2
	vpinsrq	$1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2
	vmovq	_args_digest+6*32(state, idx, 8), %xmm3
	vpinsrq	$1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3

	vmovdqu	%xmm0, _result_digest(job_rax)
	vmovdqu	%xmm1, _result_digest+1*16(job_rax)
	vmovdqu	%xmm2, _result_digest+2*16(job_rax)
	vmovdqu	%xmm3, _result_digest+3*16(job_rax)

return:
	pop	%rbx
	FRAME_END
	ret

return_null:
	xor	job_rax, job_rax
	jmp	return
ENDPROC(sha512_mb_mgr_flush_avx2)
.align 16

# sha512_mb_mgr_get_comp_job_avx2
# arg 1 : rdi : state
#
# presumably JOB* sha512_mb_mgr_get_comp_job_avx2(MB_MGR *state), by analogy
# with the flush routine; the C prototype is not visible in this file.
#
# Returns in rax the job of the lane with the minimum _lens entry when that
# entry has no bits set above its low four, after marking the job completed
# and copying its digest out.  Returns 0 (NULL) when bit (32+7) of
# _unused_lanes is set or when the minimum entry still has higher bits set.
# Makes no calls; rbx is saved and restored.
ENTRY(sha512_mb_mgr_get_comp_job_avx2)
	push	%rbx

	mov	_unused_lanes(state), unused_lanes
	bt	$(32+7), unused_lanes
	jc	.return_null

	# Find min length
	mov	_lens(state),lens0
	mov	lens0,idx
	mov	_lens+1*8(state),lens1
	cmp	idx,lens1
	cmovb	lens1,idx
	mov	_lens+2*8(state),lens2
	cmp	idx,lens2
	cmovb	lens2,idx
	mov	_lens+3*8(state),lens3
	cmp	idx,lens3
	cmovb	lens3,idx
	# Any bit above the low four set in the minimum: nothing to return.
	# NOTE(review): the flush routine masks the length with ~0xFF while
	# this test uses ~0xF; confirm the difference is intentional.
	test	$~0xF,idx
	jnz	.return_null
	and	$0xF,idx

	#process completed job "idx"
	imul	$_LANE_DATA_size, idx, lane_data
	lea	_ldata(state, lane_data), lane_data

	mov	_job_in_lane(lane_data), job_rax
	movq	$0, _job_in_lane(lane_data)
	movl	$STS_COMPLETED, _status(job_rax)
	mov	_unused_lanes(state), unused_lanes
	shl	$8, unused_lanes
	or	idx, unused_lanes
	mov	unused_lanes, _unused_lanes(state)

	movl	$0xFFFFFFFF, _lens+4(state, idx, 8)

	# Same digest gather as in the flush routine: eight 64-bit words at
	# stride 32 for lane idx, stored contiguously at _result_digest.
	vmovq	_args_digest(state, idx, 8), %xmm0
	vpinsrq	$1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0
	vmovq	_args_digest+2*32(state, idx, 8), %xmm1
	vpinsrq	$1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1
	vmovq	_args_digest+4*32(state, idx, 8), %xmm2
	vpinsrq	$1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2
	vmovq	_args_digest+6*32(state, idx, 8), %xmm3
	vpinsrq	$1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3

	vmovdqu	%xmm0, _result_digest+0*16(job_rax)
	vmovdqu	%xmm1, _result_digest+1*16(job_rax)
	vmovdqu	%xmm2, _result_digest+2*16(job_rax)
	vmovdqu	%xmm3, _result_digest+3*16(job_rax)

	pop	%rbx

	ret

.return_null:
	xor	job_rax, job_rax
	pop	%rbx
	ret
ENDPROC(sha512_mb_mgr_get_comp_job_avx2)

/* 64-bit lane-number constants used as cmovne memory operands above. */
.section	.rodata.cst8.one, "aM", @progbits, 8
.align 8
one:
.quad	1

.section	.rodata.cst8.two, "aM", @progbits, 8
.align 8
two:
.quad	2

.section	.rodata.cst8.three, "aM", @progbits, 8
.align 8
three:
.quad	3