#! /usr/bin/env perl
# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# ====================================================================
# Written by Andy Polyakov for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for AES instructions as per PowerISA
# specification version 2.07, first implemented by POWER8 processor.
# The module is endian-agnostic in the sense that it supports both big-
# and little-endian cases. Data alignment in parallelizable modes is
# handled with VSX loads and stores, which implies MSR.VSX flag being
# set. It should also be noted that ISA specification doesn't prohibit
# alignment exceptions for these instructions on page boundaries.
# Initially alignment was handled in a pure AltiVec/VMX way [when data
# is aligned programmatically, which in turn guarantees exception-
# free execution], but it turned out to hamper performance when vcipher
# instructions are interleaved. It's reckoned that eventual
# misalignment penalties at page boundaries are on average lower
# than the additional overhead in the pure AltiVec approach.
#
# May 2016
#
# Add XTS subroutine; a 9x improvement on little-endian and a 12x
# improvement on big-endian systems were measured.
#
######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS
# POWER8[le]	3.96/0.72	0.74	1.1
# POWER8[be]	3.75/0.65	0.66	1.0
# POWER9[le]	4.02/0.86	0.84	1.05
# POWER9[be]	3.99/0.78	0.79	0.97

# First command-line argument: the perlasm "flavour" string.  It must
# contain "64" or "32" (selects word size and the matching mnemonics) and
# may end in "le" (selects little-endian handling).
$flavour = shift;

if ($flavour =~ /64/) {
	# 64-bit target: 8-byte size_t, doubleword load/store/compare/shift.
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
	$UCMP	="cmpld";
	$SHL	="sldi";
} elsif ($flavour =~ /32/) {
	# 32-bit target: 4-byte size_t, word load/store/compare/shift.
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
	$SHL	="slwi";
} else { die "nonsense $flavour"; }

# Non-zero (equal to $SIZE_T) when the flavour name ends in "le".
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

# Directory part of this script's path, including the trailing separator,
# or "" when the script was invoked without any directory component.  The
# match result is tested explicitly so that $1 is never read after a
# failed match.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Locate ppc-xlate.pl next to this script or in ../../../perlasm/.
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe everything written to STDOUT through "$^X $xlate $flavour <arg>",
# where <arg> is the next command-line argument (presumably the output
# file name -- confirm against ppc-xlate.pl).
# The call is parenthesised and uses low-precedence "or": with "||" the
# alternative binds to the (always true) command string instead of to
# open(), which makes the die unreachable and hides a failed pipe open.
open(STDOUT, "| $^X $xlate $flavour " . shift(@ARGV))
	or die "can't call $xlate: $!";

$FRAME=8*$SIZE_T;	# base stack frame size used by the generated code
$prefix="aes_hw";	# prefix of every exported symbol
$sp="r1";		# stack pointer register
$vrsave="r12";		# GPR used to hold the saved SPR 256 value

#########################################################################
{{{	# Key setup procedures						#
# Symbolic names for the general-purpose (r3..r8) and vector (v0..v11)
# registers used by the key setup code below.
my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));

$code.=<<___;
.machine	"any"

.text

.align	7
Lrcon:
.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev
.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
.long	0,0,0,0						?asis
Lconsts:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$ptr	 #vvvvv "distance between . and rcon
	addi	$ptr,$ptr,-0x48
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by "

.globl	.${prefix}_set_encrypt_key
.align	5
.${prefix}_set_encrypt_key:
Lset_encrypt_key:
	mflr		r11
	$PUSH		r11,$LRSAVE($sp)

	li		$ptr,-1
	${UCMP}i	$inp,0
	beq-		Lenc_key_abort		# if ($inp==0) return -1;
	${UCMP}i	$out,0
	beq-		Lenc_key_abort		# if ($out==0) return -1;
	li		$ptr,-2
	cmpwi		$bits,128
	blt-		Lenc_key_abort
	cmpwi		$bits,256
	bgt-		Lenc_key_abort
	andi.
r0,$bits,0x3f bne- Lenc_key_abort lis r0,0xfff0 mfspr $vrsave,256 mtspr 256,r0 bl Lconsts mtlr r11 neg r9,$inp lvx $in0,0,$inp addi $inp,$inp,15 # 15 is not typo lvsr $key,0,r9 # borrow $key li r8,0x20 cmpwi $bits,192 lvx $in1,0,$inp le?vspltisb $mask,0x0f # borrow $mask lvx $rcon,0,$ptr le?vxor $key,$key,$mask # adjust for byte swap lvx $mask,r8,$ptr addi $ptr,$ptr,0x10 vperm $in0,$in0,$in1,$key # align [and byte swap in LE] li $cnt,8 vxor $zero,$zero,$zero mtctr $cnt ?lvsr $outperm,0,$out vspltisb $outmask,-1 lvx $outhead,0,$out ?vperm $outmask,$zero,$outmask,$outperm blt Loop128 addi $inp,$inp,8 beq L192 addi $inp,$inp,8 b L256 .align 4 Loop128: vperm $key,$in0,$in0,$mask # rotate-n-splat vsldoi $tmp,$zero,$in0,12 # >>32 vperm $outtail,$in0,$in0,$outperm # rotate vsel $stage,$outhead,$outtail,$outmask vmr $outhead,$outtail vcipherlast $key,$key,$rcon stvx $stage,0,$out addi $out,$out,16 vxor $in0,$in0,$tmp vsldoi $tmp,$zero,$tmp,12 # >>32 vxor $in0,$in0,$tmp vsldoi $tmp,$zero,$tmp,12 # >>32 vxor $in0,$in0,$tmp vadduwm $rcon,$rcon,$rcon vxor $in0,$in0,$key bdnz Loop128 lvx $rcon,0,$ptr # last two round keys vperm $key,$in0,$in0,$mask # rotate-n-splat vsldoi $tmp,$zero,$in0,12 # >>32 vperm $outtail,$in0,$in0,$outperm # rotate vsel $stage,$outhead,$outtail,$outmask vmr $outhead,$outtail vcipherlast $key,$key,$rcon stvx $stage,0,$out addi $out,$out,16 vxor $in0,$in0,$tmp vsldoi $tmp,$zero,$tmp,12 # >>32 vxor $in0,$in0,$tmp vsldoi $tmp,$zero,$tmp,12 # >>32 vxor $in0,$in0,$tmp vadduwm $rcon,$rcon,$rcon vxor $in0,$in0,$key vperm $key,$in0,$in0,$mask # rotate-n-splat vsldoi $tmp,$zero,$in0,12 # >>32 vperm $outtail,$in0,$in0,$outperm # rotate vsel $stage,$outhead,$outtail,$outmask vmr $outhead,$outtail vcipherlast $key,$key,$rcon stvx $stage,0,$out addi $out,$out,16 vxor $in0,$in0,$tmp vsldoi $tmp,$zero,$tmp,12 # >>32 vxor $in0,$in0,$tmp vsldoi $tmp,$zero,$tmp,12 # >>32 vxor $in0,$in0,$tmp vxor $in0,$in0,$key vperm $outtail,$in0,$in0,$outperm # rotate vsel 
$stage,$outhead,$outtail,$outmask vmr $outhead,$outtail stvx $stage,0,$out addi $inp,$out,15 # 15 is not typo addi $out,$out,0x50 li $rounds,10 b Ldone .align 4 L192: lvx $tmp,0,$inp li $cnt,4 vperm $outtail,$in0,$in0,$outperm # rotate vsel $stage,$outhead,$outtail,$outmask vmr $outhead,$outtail stvx $stage,0,$out addi $out,$out,16 vperm $in1,$in1,$tmp,$key # align [and byte swap in LE] vspltisb $key,8 # borrow $key mtctr $cnt vsububm $mask,$mask,$key # adjust the mask Loop192: vperm $key,$in1,$in1,$mask # roate-n-splat vsldoi $tmp,$zero,$in0,12 # >>32 vcipherlast $key,$key,$rcon vxor $in0,$in0,$tmp vsldoi $tmp,$zero,$tmp,12 # >>32 vxor $in0,$in0,$tmp vsldoi $tmp,$zero,$tmp,12 # >>32 vxor $in0,$in0,$tmp vsldoi $stage,$zero,$in1,8 vspltw $tmp,$in0,3 vxor $tmp,$tmp,$in1 vsldoi $in1,$zero,$in1,12 # >>32 vadduwm $rcon,$rcon,$rcon vxor $in1,$in1,$tmp vxor $in0,$in0,$key vxor $in1,$in1,$key vsldoi $stage,$stage,$in0,8 vperm $key,$in1,$in1,$mask # rotate-n-splat vsldoi $tmp,$zero,$in0,12 # >>32 vperm $outtail,$stage,$stage,$outperm # rotate vsel $stage,$outhead,$outtail,$outmask vmr $outhead,$outtail vcipherlast $key,$key,$rcon stvx $stage,0,$out addi $out,$out,16 vsldoi $stage,$in0,$in1,8 vxor $in0,$in0,$tmp vsldoi $tmp,$zero,$tmp,12 # >>32 vperm $outtail,$stage,$stage,$outperm # rotate vsel $stage,$outhead,$outtail,$outmask vmr $outhead,$outtail vxor $in0,$in0,$tmp vsldoi $tmp,$zero,$tmp,12 # >>32 vxor $in0,$in0,$tmp stvx $stage,0,$out addi $out,$out,16 vspltw $tmp,$in0,3 vxor $tmp,$tmp,$in1 vsldoi $in1,$zero,$in1,12 # >>32 vadduwm $rcon,$rcon,$rcon vxor $in1,$in1,$tmp vxor $in0,$in0,$key vxor $in1,$in1,$key vperm $outtail,$in0,$in0,$outperm # rotate vsel $stage,$outhead,$outtail,$outmask vmr $outhead,$outtail stvx $stage,0,$out addi $inp,$out,15 # 15 is not typo addi $out,$out,16 bdnz Loop192 li $rounds,12 addi $out,$out,0x20 b Ldone .align 4 L256: lvx $tmp,0,$inp li $cnt,7 li $rounds,14 vperm $outtail,$in0,$in0,$outperm # rotate vsel $stage,$outhead,$outtail,$outmask 
vmr $outhead,$outtail stvx $stage,0,$out addi $out,$out,16 vperm $in1,$in1,$tmp,$key # align [and byte swap in LE] mtctr $cnt Loop256: vperm $key,$in1,$in1,$mask # rotate-n-splat vsldoi $tmp,$zero,$in0,12 # >>32 vperm $outtail,$in1,$in1,$outperm # rotate vsel $stage,$outhead,$outtail,$outmask vmr $outhead,$outtail vcipherlast $key,$key,$rcon stvx $stage,0,$out addi $out,$out,16 vxor $in0,$in0,$tmp vsldoi $tmp,$zero,$tmp,12 # >>32 vxor $in0,$in0,$tmp vsldoi $tmp,$zero,$tmp,12 # >>32 vxor $in0,$in0,$tmp vadduwm $rcon,$rcon,$rcon vxor $in0,$in0,$key vperm $outtail,$in0,$in0,$outperm # rotate vsel $stage,$outhead,$outtail,$outmask vmr $outhead,$outtail stvx $stage,0,$out addi $inp,$out,15 # 15 is not typo addi $out,$out,16 bdz Ldone vspltw $key,$in0,3 # just splat vsldoi $tmp,$zero,$in1,12 # >>32 vsbox $key,$key vxor $in1,$in1,$tmp vsldoi $tmp,$zero,$tmp,12 # >>32 vxor $in1,$in1,$tmp vsldoi $tmp,$zero,$tmp,12 # >>32 vxor $in1,$in1,$tmp vxor $in1,$in1,$key b Loop256 .align 4 Ldone: lvx $in1,0,$inp # redundant in aligned case vsel $in1,$outhead,$in1,$outmask stvx $in1,0,$inp li $ptr,0 mtspr 256,$vrsave stw $rounds,0($out) Lenc_key_abort: mr r3,$ptr blr .long 0 .byte 0,12,0x14,1,0,0,3,0 .long 0 .size .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key .globl .${prefix}_set_decrypt_key .align 5 .${prefix}_set_decrypt_key: $STU $sp,-$FRAME($sp) mflr r10 $PUSH r10,`$FRAME+$LRSAVE`($sp) bl Lset_encrypt_key mtlr r10 cmpwi r3,0 bne- Ldec_key_abort slwi $cnt,$rounds,4 subi $inp,$out,240 # first round key srwi $rounds,$rounds,1 add $out,$inp,$cnt # last round key mtctr $rounds Ldeckey: lwz r0, 0($inp) lwz r6, 4($inp) lwz r7, 8($inp) lwz r8, 12($inp) addi $inp,$inp,16 lwz r9, 0($out) lwz r10,4($out) lwz r11,8($out) lwz r12,12($out) stw r0, 0($out) stw r6, 4($out) stw r7, 8($out) stw r8, 12($out) subi $out,$out,16 stw r9, -16($inp) stw r10,-12($inp) stw r11,-8($inp) stw r12,-4($inp) bdnz Ldeckey xor r3,r3,r3 # return value Ldec_key_abort: addi $sp,$sp,$FRAME blr .long 0 .byte 
0,12,4,1,0x80,0,3,0
	.long	0
.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
___
}}}
#########################################################################
{{{	# Single block en- and decrypt procedures			#
# gen_block($dir)
#
# Appends to $code the assembly for the single-block routine
# ".${prefix}_${dir}crypt".
#
#   $dir  "en" emits the routine built from vcipher/vcipherlast;
#         "de" emits the same routine built from vncipher/vncipherlast.
#
# The emitted code uses r3..r7 as inp, out, key, rounds and idx, reads the
# round count from offset 240 of the key structure, and restores SPR 256
# from $vrsave before returning.  Returns nothing useful; the only effect
# is the text appended to $code.
#
# The sub is declared without a prototype: it takes one argument, and an
# empty "()" prototype would reject every call not written as &gen_block().
sub gen_block {
my $dir = shift;
my $n   = $dir eq "de" ? "n" : "";	# selects vcipher vs. vncipher
my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));

$code.=<<___;
.globl	.${prefix}_${dir}crypt
.align	5
.${prefix}_${dir}crypt:
	lwz		$rounds,240($key)
	lis		r0,0xfc00
	mfspr		$vrsave,256
	li		$idx,15			# 15 is not typo
	mtspr		256,r0

	lvx		v0,0,$inp
	neg		r11,$out
	lvx		v1,$idx,$inp
	lvsl		v2,0,$inp		# inpperm
	le?vspltisb	v4,0x0f
	?lvsl		v3,0,r11		# outperm
	le?vxor		v2,v2,v4
	li		$idx,16
	vperm		v0,v0,v1,v2		# align [and byte swap in LE]
	lvx		v1,0,$key
	?lvsl		v5,0,$key		# keyperm
	srwi		$rounds,$rounds,1
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	subi		$rounds,$rounds,1
	?vperm		v1,v1,v2,v5		# align round key

	vxor		v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	mtctr		$rounds

Loop_${dir}c:
	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	?vperm		v1,v1,v2,v5
	v${n}cipher	v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_${dir}c

	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	?vperm		v1,v1,v2,v5
	v${n}cipherlast	v0,v0,v1

	vspltisb	v2,-1
	vxor		v1,v1,v1
	li		$idx,15			# 15 is not typo
	?vperm		v2,v1,v2,v3		# outmask
	le?vxor		v3,v3,v4
	lvx		v1,0,$out		# outhead
	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE]
	vsel		v1,v1,v0,v2
	lvx		v4,$idx,$out
	stvx		v1,0,$out
	vsel		v0,v0,v4,v2
	stvx		v0,$idx,$out

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,3,0
	.long		0
.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
___
}
# Emit the encrypt routine first, then the decrypt routine.
gen_block("en");
gen_block("de");
}}}
#########################################################################
{{{	# CBC en- and decrypt procedures				#
# Symbolic names for the general-purpose (r3..r10) and vector (v0..v10)
# registers used by the CBC code below.
my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
						map("v$_",(4..10));
$code.=<<___;
.globl	.${prefix}_cbc_encrypt
.align 5 .${prefix}_cbc_encrypt: ${UCMP}i $len,16 bltlr- cmpwi $enc,0 # test direction lis r0,0xffe0 mfspr $vrsave,256 mtspr 256,r0 li $idx,15 vxor $rndkey0,$rndkey0,$rndkey0 le?vspltisb $tmp,0x0f lvx $ivec,0,$ivp # load [unaligned] iv lvsl $inpperm,0,$ivp lvx $inptail,$idx,$ivp le?vxor $inpperm,$inpperm,$tmp vperm $ivec,$ivec,$inptail,$inpperm neg r11,$inp ?lvsl $keyperm,0,$key # prepare for unaligned key lwz $rounds,240($key) lvsr $inpperm,0,r11 # prepare for unaligned load lvx $inptail,0,$inp addi $inp,$inp,15 # 15 is not typo le?vxor $inpperm,$inpperm,$tmp ?lvsr $outperm,0,$out # prepare for unaligned store vspltisb $outmask,-1 lvx $outhead,0,$out ?vperm $outmask,$rndkey0,$outmask,$outperm le?vxor $outperm,$outperm,$tmp srwi $rounds,$rounds,1 li $idx,16 subi $rounds,$rounds,1 beq Lcbc_dec Lcbc_enc: vmr $inout,$inptail lvx $inptail,0,$inp addi $inp,$inp,16 mtctr $rounds subi $len,$len,16 # len-=16 lvx $rndkey0,0,$key vperm $inout,$inout,$inptail,$inpperm lvx $rndkey1,$idx,$key addi $idx,$idx,16 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vxor $inout,$inout,$rndkey0 lvx $rndkey0,$idx,$key addi $idx,$idx,16 vxor $inout,$inout,$ivec Loop_cbc_enc: ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm vcipher $inout,$inout,$rndkey1 lvx $rndkey1,$idx,$key addi $idx,$idx,16 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vcipher $inout,$inout,$rndkey0 lvx $rndkey0,$idx,$key addi $idx,$idx,16 bdnz Loop_cbc_enc ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm vcipher $inout,$inout,$rndkey1 lvx $rndkey1,$idx,$key li $idx,16 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vcipherlast $ivec,$inout,$rndkey0 ${UCMP}i $len,16 vperm $tmp,$ivec,$ivec,$outperm vsel $inout,$outhead,$tmp,$outmask vmr $outhead,$tmp stvx $inout,0,$out addi $out,$out,16 bge Lcbc_enc b Lcbc_done .align 4 Lcbc_dec: ${UCMP}i $len,128 bge _aesp8_cbc_decrypt8x vmr $tmp,$inptail lvx $inptail,0,$inp addi $inp,$inp,16 mtctr $rounds subi $len,$len,16 # len-=16 lvx $rndkey0,0,$key vperm $tmp,$tmp,$inptail,$inpperm lvx $rndkey1,$idx,$key 
addi $idx,$idx,16 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vxor $inout,$tmp,$rndkey0 lvx $rndkey0,$idx,$key addi $idx,$idx,16 Loop_cbc_dec: ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm vncipher $inout,$inout,$rndkey1 lvx $rndkey1,$idx,$key addi $idx,$idx,16 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vncipher $inout,$inout,$rndkey0 lvx $rndkey0,$idx,$key addi $idx,$idx,16 bdnz Loop_cbc_dec ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm vncipher $inout,$inout,$rndkey1 lvx $rndkey1,$idx,$key li $idx,16 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vncipherlast $inout,$inout,$rndkey0 ${UCMP}i $len,16 vxor $inout,$inout,$ivec vmr $ivec,$tmp vperm $tmp,$inout,$inout,$outperm vsel $inout,$outhead,$tmp,$outmask vmr $outhead,$tmp stvx $inout,0,$out addi $out,$out,16 bge Lcbc_dec Lcbc_done: addi $out,$out,-1 lvx $inout,0,$out # redundant in aligned case vsel $inout,$outhead,$inout,$outmask stvx $inout,0,$out neg $enc,$ivp # write [unaligned] iv li $idx,15 # 15 is not typo vxor $rndkey0,$rndkey0,$rndkey0 vspltisb $outmask,-1 le?vspltisb $tmp,0x0f ?lvsl $outperm,0,$enc ?vperm $outmask,$rndkey0,$outmask,$outperm le?vxor $outperm,$outperm,$tmp lvx $outhead,0,$ivp vperm $ivec,$ivec,$ivec,$outperm vsel $inout,$outhead,$ivec,$outmask lvx $inptail,$idx,$ivp stvx $inout,0,$ivp vsel $inout,$ivec,$inptail,$outmask stvx $inout,$idx,$ivp mtspr 256,$vrsave blr .long 0 .byte 0,12,0x14,0,0,0,6,0 .long 0 ___ ######################################################################### {{ # Optimized CBC decrypt procedure # my $key_="r11"; my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31)); $x00=0 if ($flavour =~ /osx/); my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13)); my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21)); my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys # v26-v31 last 6 round keys my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment $code.=<<___; .align 5 _aesp8_cbc_decrypt8x: 
$STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) li r10,`$FRAME+8*16+15` li r11,`$FRAME+8*16+31` stvx v20,r10,$sp # ABI says so addi r10,r10,32 stvx v21,r11,$sp addi r11,r11,32 stvx v22,r10,$sp addi r10,r10,32 stvx v23,r11,$sp addi r11,r11,32 stvx v24,r10,$sp addi r10,r10,32 stvx v25,r11,$sp addi r11,r11,32 stvx v26,r10,$sp addi r10,r10,32 stvx v27,r11,$sp addi r11,r11,32 stvx v28,r10,$sp addi r10,r10,32 stvx v29,r11,$sp addi r11,r11,32 stvx v30,r10,$sp stvx v31,r11,$sp li r0,-1 stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave li $x10,0x10 $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) li $x20,0x20 $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) li $x30,0x30 $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) li $x40,0x40 $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) li $x50,0x50 $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) li $x60,0x60 $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) li $x70,0x70 mtspr 256,r0 subi $rounds,$rounds,3 # -4 in total subi $len,$len,128 # bias lvx $rndkey0,$x00,$key # load key schedule lvx v30,$x10,$key addi $key,$key,0x20 lvx v31,$x00,$key ?vperm $rndkey0,$rndkey0,v30,$keyperm addi $key_,$sp,`$FRAME+15` mtctr $rounds Load_cbc_dec_key: ?vperm v24,v30,v31,$keyperm lvx v30,$x10,$key addi $key,$key,0x20 stvx v24,$x00,$key_ # off-load round[1] ?vperm v25,v31,v30,$keyperm lvx v31,$x00,$key stvx v25,$x10,$key_ # off-load round[2] addi $key_,$key_,0x20 bdnz Load_cbc_dec_key lvx v26,$x10,$key ?vperm v24,v30,v31,$keyperm lvx v27,$x20,$key stvx v24,$x00,$key_ # off-load round[3] ?vperm v25,v31,v26,$keyperm lvx v28,$x30,$key stvx v25,$x10,$key_ # off-load round[4] addi $key_,$sp,`$FRAME+15` # rewind $key_ ?vperm v26,v26,v27,$keyperm lvx v29,$x40,$key ?vperm v27,v27,v28,$keyperm lvx v30,$x50,$key ?vperm v28,v28,v29,$keyperm lvx v31,$x60,$key ?vperm v29,v29,v30,$keyperm lvx $out0,$x70,$key # borrow $out0 ?vperm v30,v30,v31,$keyperm lvx v24,$x00,$key_ # pre-load round[1] ?vperm v31,v31,$out0,$keyperm lvx v25,$x10,$key_ # pre-load round[2] #lvx $inptail,0,$inp # "caller" already did this #addi 
$inp,$inp,15 # 15 is not typo subi $inp,$inp,15 # undo "caller" le?li $idx,8 lvx_u $in0,$x00,$inp # load first 8 "words" le?lvsl $inpperm,0,$idx le?vspltisb $tmp,0x0f lvx_u $in1,$x10,$inp le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u lvx_u $in2,$x20,$inp le?vperm $in0,$in0,$in0,$inpperm lvx_u $in3,$x30,$inp le?vperm $in1,$in1,$in1,$inpperm lvx_u $in4,$x40,$inp le?vperm $in2,$in2,$in2,$inpperm vxor $out0,$in0,$rndkey0 lvx_u $in5,$x50,$inp le?vperm $in3,$in3,$in3,$inpperm vxor $out1,$in1,$rndkey0 lvx_u $in6,$x60,$inp le?vperm $in4,$in4,$in4,$inpperm vxor $out2,$in2,$rndkey0 lvx_u $in7,$x70,$inp addi $inp,$inp,0x80 le?vperm $in5,$in5,$in5,$inpperm vxor $out3,$in3,$rndkey0 le?vperm $in6,$in6,$in6,$inpperm vxor $out4,$in4,$rndkey0 le?vperm $in7,$in7,$in7,$inpperm vxor $out5,$in5,$rndkey0 vxor $out6,$in6,$rndkey0 vxor $out7,$in7,$rndkey0 mtctr $rounds b Loop_cbc_dec8x .align 5 Loop_cbc_dec8x: vncipher $out0,$out0,v24 vncipher $out1,$out1,v24 vncipher $out2,$out2,v24 vncipher $out3,$out3,v24 vncipher $out4,$out4,v24 vncipher $out5,$out5,v24 vncipher $out6,$out6,v24 vncipher $out7,$out7,v24 lvx v24,$x20,$key_ # round[3] addi $key_,$key_,0x20 vncipher $out0,$out0,v25 vncipher $out1,$out1,v25 vncipher $out2,$out2,v25 vncipher $out3,$out3,v25 vncipher $out4,$out4,v25 vncipher $out5,$out5,v25 vncipher $out6,$out6,v25 vncipher $out7,$out7,v25 lvx v25,$x10,$key_ # round[4] bdnz Loop_cbc_dec8x subic $len,$len,128 # $len-=128 vncipher $out0,$out0,v24 vncipher $out1,$out1,v24 vncipher $out2,$out2,v24 vncipher $out3,$out3,v24 vncipher $out4,$out4,v24 vncipher $out5,$out5,v24 vncipher $out6,$out6,v24 vncipher $out7,$out7,v24 subfe. 
r0,r0,r0 # borrow?-1:0 vncipher $out0,$out0,v25 vncipher $out1,$out1,v25 vncipher $out2,$out2,v25 vncipher $out3,$out3,v25 vncipher $out4,$out4,v25 vncipher $out5,$out5,v25 vncipher $out6,$out6,v25 vncipher $out7,$out7,v25 and r0,r0,$len vncipher $out0,$out0,v26 vncipher $out1,$out1,v26 vncipher $out2,$out2,v26 vncipher $out3,$out3,v26 vncipher $out4,$out4,v26 vncipher $out5,$out5,v26 vncipher $out6,$out6,v26 vncipher $out7,$out7,v26 add $inp,$inp,r0 # $inp is adjusted in such # way that at exit from the # loop inX-in7 are loaded # with last "words" vncipher $out0,$out0,v27 vncipher $out1,$out1,v27 vncipher $out2,$out2,v27 vncipher $out3,$out3,v27 vncipher $out4,$out4,v27 vncipher $out5,$out5,v27 vncipher $out6,$out6,v27 vncipher $out7,$out7,v27 addi $key_,$sp,`$FRAME+15` # rewind $key_ vncipher $out0,$out0,v28 vncipher $out1,$out1,v28 vncipher $out2,$out2,v28 vncipher $out3,$out3,v28 vncipher $out4,$out4,v28 vncipher $out5,$out5,v28 vncipher $out6,$out6,v28 vncipher $out7,$out7,v28 lvx v24,$x00,$key_ # re-pre-load round[1] vncipher $out0,$out0,v29 vncipher $out1,$out1,v29 vncipher $out2,$out2,v29 vncipher $out3,$out3,v29 vncipher $out4,$out4,v29 vncipher $out5,$out5,v29 vncipher $out6,$out6,v29 vncipher $out7,$out7,v29 lvx v25,$x10,$key_ # re-pre-load round[2] vncipher $out0,$out0,v30 vxor $ivec,$ivec,v31 # xor with last round key vncipher $out1,$out1,v30 vxor $in0,$in0,v31 vncipher $out2,$out2,v30 vxor $in1,$in1,v31 vncipher $out3,$out3,v30 vxor $in2,$in2,v31 vncipher $out4,$out4,v30 vxor $in3,$in3,v31 vncipher $out5,$out5,v30 vxor $in4,$in4,v31 vncipher $out6,$out6,v30 vxor $in5,$in5,v31 vncipher $out7,$out7,v30 vxor $in6,$in6,v31 vncipherlast $out0,$out0,$ivec vncipherlast $out1,$out1,$in0 lvx_u $in0,$x00,$inp # load next input block vncipherlast $out2,$out2,$in1 lvx_u $in1,$x10,$inp vncipherlast $out3,$out3,$in2 le?vperm $in0,$in0,$in0,$inpperm lvx_u $in2,$x20,$inp vncipherlast $out4,$out4,$in3 le?vperm $in1,$in1,$in1,$inpperm lvx_u $in3,$x30,$inp vncipherlast 
$out5,$out5,$in4 le?vperm $in2,$in2,$in2,$inpperm lvx_u $in4,$x40,$inp vncipherlast $out6,$out6,$in5 le?vperm $in3,$in3,$in3,$inpperm lvx_u $in5,$x50,$inp vncipherlast $out7,$out7,$in6 le?vperm $in4,$in4,$in4,$inpperm lvx_u $in6,$x60,$inp vmr $ivec,$in7 le?vperm $in5,$in5,$in5,$inpperm lvx_u $in7,$x70,$inp addi $inp,$inp,0x80 le?vperm $out0,$out0,$out0,$inpperm le?vperm $out1,$out1,$out1,$inpperm stvx_u $out0,$x00,$out le?vperm $in6,$in6,$in6,$inpperm vxor $out0,$in0,$rndkey0 le?vperm $out2,$out2,$out2,$inpperm stvx_u $out1,$x10,$out le?vperm $in7,$in7,$in7,$inpperm vxor $out1,$in1,$rndkey0 le?vperm $out3,$out3,$out3,$inpperm stvx_u $out2,$x20,$out vxor $out2,$in2,$rndkey0 le?vperm $out4,$out4,$out4,$inpperm stvx_u $out3,$x30,$out vxor $out3,$in3,$rndkey0 le?vperm $out5,$out5,$out5,$inpperm stvx_u $out4,$x40,$out vxor $out4,$in4,$rndkey0 le?vperm $out6,$out6,$out6,$inpperm stvx_u $out5,$x50,$out vxor $out5,$in5,$rndkey0 le?vperm $out7,$out7,$out7,$inpperm stvx_u $out6,$x60,$out vxor $out6,$in6,$rndkey0 stvx_u $out7,$x70,$out addi $out,$out,0x80 vxor $out7,$in7,$rndkey0 mtctr $rounds beq Loop_cbc_dec8x # did $len-=128 borrow? addic. $len,$len,128 beq Lcbc_dec8x_done nop nop Loop_cbc_dec8x_tail: # up to 7 "words" tail... 
vncipher $out1,$out1,v24 vncipher $out2,$out2,v24 vncipher $out3,$out3,v24 vncipher $out4,$out4,v24 vncipher $out5,$out5,v24 vncipher $out6,$out6,v24 vncipher $out7,$out7,v24 lvx v24,$x20,$key_ # round[3] addi $key_,$key_,0x20 vncipher $out1,$out1,v25 vncipher $out2,$out2,v25 vncipher $out3,$out3,v25 vncipher $out4,$out4,v25 vncipher $out5,$out5,v25 vncipher $out6,$out6,v25 vncipher $out7,$out7,v25 lvx v25,$x10,$key_ # round[4] bdnz Loop_cbc_dec8x_tail vncipher $out1,$out1,v24 vncipher $out2,$out2,v24 vncipher $out3,$out3,v24 vncipher $out4,$out4,v24 vncipher $out5,$out5,v24 vncipher $out6,$out6,v24 vncipher $out7,$out7,v24 vncipher $out1,$out1,v25 vncipher $out2,$out2,v25 vncipher $out3,$out3,v25 vncipher $out4,$out4,v25 vncipher $out5,$out5,v25 vncipher $out6,$out6,v25 vncipher $out7,$out7,v25 vncipher $out1,$out1,v26 vncipher $out2,$out2,v26 vncipher $out3,$out3,v26 vncipher $out4,$out4,v26 vncipher $out5,$out5,v26 vncipher $out6,$out6,v26 vncipher $out7,$out7,v26 vncipher $out1,$out1,v27 vncipher $out2,$out2,v27 vncipher $out3,$out3,v27 vncipher $out4,$out4,v27 vncipher $out5,$out5,v27 vncipher $out6,$out6,v27 vncipher $out7,$out7,v27 vncipher $out1,$out1,v28 vncipher $out2,$out2,v28 vncipher $out3,$out3,v28 vncipher $out4,$out4,v28 vncipher $out5,$out5,v28 vncipher $out6,$out6,v28 vncipher $out7,$out7,v28 vncipher $out1,$out1,v29 vncipher $out2,$out2,v29 vncipher $out3,$out3,v29 vncipher $out4,$out4,v29 vncipher $out5,$out5,v29 vncipher $out6,$out6,v29 vncipher $out7,$out7,v29 vncipher $out1,$out1,v30 vxor $ivec,$ivec,v31 # last round key vncipher $out2,$out2,v30 vxor $in1,$in1,v31 vncipher $out3,$out3,v30 vxor $in2,$in2,v31 vncipher $out4,$out4,v30 vxor $in3,$in3,v31 vncipher $out5,$out5,v30 vxor $in4,$in4,v31 vncipher $out6,$out6,v30 vxor $in5,$in5,v31 vncipher $out7,$out7,v30 vxor $in6,$in6,v31 cmplwi $len,32 # switch($len) blt Lcbc_dec8x_one nop beq Lcbc_dec8x_two cmplwi $len,64 blt Lcbc_dec8x_three nop beq Lcbc_dec8x_four cmplwi $len,96 blt 
Lcbc_dec8x_five nop beq Lcbc_dec8x_six Lcbc_dec8x_seven: vncipherlast $out1,$out1,$ivec vncipherlast $out2,$out2,$in1 vncipherlast $out3,$out3,$in2 vncipherlast $out4,$out4,$in3 vncipherlast $out5,$out5,$in4 vncipherlast $out6,$out6,$in5 vncipherlast $out7,$out7,$in6 vmr $ivec,$in7 le?vperm $out1,$out1,$out1,$inpperm le?vperm $out2,$out2,$out2,$inpperm stvx_u $out1,$x00,$out le?vperm $out3,$out3,$out3,$inpperm stvx_u $out2,$x10,$out le?vperm $out4,$out4,$out4,$inpperm stvx_u $out3,$x20,$out le?vperm $out5,$out5,$out5,$inpperm stvx_u $out4,$x30,$out le?vperm $out6,$out6,$out6,$inpperm stvx_u $out5,$x40,$out le?vperm $out7,$out7,$out7,$inpperm stvx_u $out6,$x50,$out stvx_u $out7,$x60,$out addi $out,$out,0x70 b Lcbc_dec8x_done .align 5 Lcbc_dec8x_six: vncipherlast $out2,$out2,$ivec vncipherlast $out3,$out3,$in2 vncipherlast $out4,$out4,$in3 vncipherlast $out5,$out5,$in4 vncipherlast $out6,$out6,$in5 vncipherlast $out7,$out7,$in6 vmr $ivec,$in7 le?vperm $out2,$out2,$out2,$inpperm le?vperm $out3,$out3,$out3,$inpperm stvx_u $out2,$x00,$out le?vperm $out4,$out4,$out4,$inpperm stvx_u $out3,$x10,$out le?vperm $out5,$out5,$out5,$inpperm stvx_u $out4,$x20,$out le?vperm $out6,$out6,$out6,$inpperm stvx_u $out5,$x30,$out le?vperm $out7,$out7,$out7,$inpperm stvx_u $out6,$x40,$out stvx_u $out7,$x50,$out addi $out,$out,0x60 b Lcbc_dec8x_done .align 5 Lcbc_dec8x_five: vncipherlast $out3,$out3,$ivec vncipherlast $out4,$out4,$in3 vncipherlast $out5,$out5,$in4 vncipherlast $out6,$out6,$in5 vncipherlast $out7,$out7,$in6 vmr $ivec,$in7 le?vperm $out3,$out3,$out3,$inpperm le?vperm $out4,$out4,$out4,$inpperm stvx_u $out3,$x00,$out le?vperm $out5,$out5,$out5,$inpperm stvx_u $out4,$x10,$out le?vperm $out6,$out6,$out6,$inpperm stvx_u $out5,$x20,$out le?vperm $out7,$out7,$out7,$inpperm stvx_u $out6,$x30,$out stvx_u $out7,$x40,$out addi $out,$out,0x50 b Lcbc_dec8x_done .align 5 Lcbc_dec8x_four: vncipherlast $out4,$out4,$ivec vncipherlast $out5,$out5,$in4 vncipherlast $out6,$out6,$in5 
vncipherlast $out7,$out7,$in6 vmr $ivec,$in7 le?vperm $out4,$out4,$out4,$inpperm le?vperm $out5,$out5,$out5,$inpperm stvx_u $out4,$x00,$out le?vperm $out6,$out6,$out6,$inpperm stvx_u $out5,$x10,$out le?vperm $out7,$out7,$out7,$inpperm stvx_u $out6,$x20,$out stvx_u $out7,$x30,$out addi $out,$out,0x40 b Lcbc_dec8x_done .align 5 Lcbc_dec8x_three: vncipherlast $out5,$out5,$ivec vncipherlast $out6,$out6,$in5 vncipherlast $out7,$out7,$in6 vmr $ivec,$in7 le?vperm $out5,$out5,$out5,$inpperm le?vperm $out6,$out6,$out6,$inpperm stvx_u $out5,$x00,$out le?vperm $out7,$out7,$out7,$inpperm stvx_u $out6,$x10,$out stvx_u $out7,$x20,$out addi $out,$out,0x30 b Lcbc_dec8x_done .align 5 Lcbc_dec8x_two: vncipherlast $out6,$out6,$ivec vncipherlast $out7,$out7,$in6 vmr $ivec,$in7 le?vperm $out6,$out6,$out6,$inpperm le?vperm $out7,$out7,$out7,$inpperm stvx_u $out6,$x00,$out stvx_u $out7,$x10,$out addi $out,$out,0x20 b Lcbc_dec8x_done .align 5 Lcbc_dec8x_one: vncipherlast $out7,$out7,$ivec vmr $ivec,$in7 le?vperm $out7,$out7,$out7,$inpperm stvx_u $out7,0,$out addi $out,$out,0x10 Lcbc_dec8x_done: le?vperm $ivec,$ivec,$ivec,$inpperm stvx_u $ivec,0,$ivp # write [unaligned] iv li r10,`$FRAME+15` li r11,`$FRAME+31` stvx $inpperm,r10,$sp # wipe copies of round keys addi r10,r10,32 stvx $inpperm,r11,$sp addi r11,r11,32 stvx $inpperm,r10,$sp addi r10,r10,32 stvx $inpperm,r11,$sp addi r11,r11,32 stvx $inpperm,r10,$sp addi r10,r10,32 stvx $inpperm,r11,$sp addi r11,r11,32 stvx $inpperm,r10,$sp addi r10,r10,32 stvx $inpperm,r11,$sp addi r11,r11,32 mtspr 256,$vrsave lvx v20,r10,$sp # ABI says so addi r10,r10,32 lvx v21,r11,$sp addi r11,r11,32 lvx v22,r10,$sp addi r10,r10,32 lvx v23,r11,$sp addi r11,r11,32 lvx v24,r10,$sp addi r10,r10,32 lvx v25,r11,$sp addi r11,r11,32 lvx v26,r10,$sp addi r10,r10,32 lvx v27,r11,$sp addi r11,r11,32 lvx v28,r10,$sp addi r10,r10,32 lvx v29,r11,$sp addi r11,r11,32 lvx v30,r10,$sp lvx v31,r11,$sp $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) 
$POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` blr .long 0 .byte 0,12,0x04,0,0x80,6,6,0 .long 0 .size .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt ___ }} }}} ######################################################################### {{{ # CTR procedure[s] # my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10)); my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3)); my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)= map("v$_",(4..11)); my $dat=$tmp; $code.=<<___; .globl .${prefix}_ctr32_encrypt_blocks .align 5 .${prefix}_ctr32_encrypt_blocks: ${UCMP}i $len,1 bltlr- lis r0,0xfff0 mfspr $vrsave,256 mtspr 256,r0 li $idx,15 vxor $rndkey0,$rndkey0,$rndkey0 le?vspltisb $tmp,0x0f lvx $ivec,0,$ivp # load [unaligned] iv lvsl $inpperm,0,$ivp lvx $inptail,$idx,$ivp vspltisb $one,1 le?vxor $inpperm,$inpperm,$tmp vperm $ivec,$ivec,$inptail,$inpperm vsldoi $one,$rndkey0,$one,1 neg r11,$inp ?lvsl $keyperm,0,$key # prepare for unaligned key lwz $rounds,240($key) lvsr $inpperm,0,r11 # prepare for unaligned load lvx $inptail,0,$inp addi $inp,$inp,15 # 15 is not typo le?vxor $inpperm,$inpperm,$tmp srwi $rounds,$rounds,1 li $idx,16 subi $rounds,$rounds,1 ${UCMP}i $len,8 bge _aesp8_ctr32_encrypt8x ?lvsr $outperm,0,$out # prepare for unaligned store vspltisb $outmask,-1 lvx $outhead,0,$out ?vperm $outmask,$rndkey0,$outmask,$outperm le?vxor $outperm,$outperm,$tmp lvx $rndkey0,0,$key mtctr $rounds lvx $rndkey1,$idx,$key addi $idx,$idx,16 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vxor $inout,$ivec,$rndkey0 lvx $rndkey0,$idx,$key addi $idx,$idx,16 b Loop_ctr32_enc .align 5 Loop_ctr32_enc: ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm vcipher $inout,$inout,$rndkey1 lvx $rndkey1,$idx,$key addi $idx,$idx,16 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vcipher $inout,$inout,$rndkey0 lvx $rndkey0,$idx,$key addi 
$idx,$idx,16 bdnz Loop_ctr32_enc vadduwm $ivec,$ivec,$one vmr $dat,$inptail lvx $inptail,0,$inp addi $inp,$inp,16 subic. $len,$len,1 # blocks-- ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm vcipher $inout,$inout,$rndkey1 lvx $rndkey1,$idx,$key vperm $dat,$dat,$inptail,$inpperm li $idx,16 ?vperm $rndkey1,$rndkey0,$rndkey1,$keyperm lvx $rndkey0,0,$key vxor $dat,$dat,$rndkey1 # last round key vcipherlast $inout,$inout,$dat lvx $rndkey1,$idx,$key addi $idx,$idx,16 vperm $inout,$inout,$inout,$outperm vsel $dat,$outhead,$inout,$outmask mtctr $rounds ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vmr $outhead,$inout vxor $inout,$ivec,$rndkey0 lvx $rndkey0,$idx,$key addi $idx,$idx,16 stvx $dat,0,$out addi $out,$out,16 bne Loop_ctr32_enc addi $out,$out,-1 lvx $inout,0,$out # redundant in aligned case vsel $inout,$outhead,$inout,$outmask stvx $inout,0,$out mtspr 256,$vrsave blr .long 0 .byte 0,12,0x14,0,0,0,6,0 .long 0 ___ ######################################################################### {{ # Optimized CTR procedure: eight blocks ($out0-$out7) per iteration # my $key_="r11"; my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31)); $x00=0 if ($flavour =~ /osx/); my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14)); my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22)); my $rndkey0="v23"; # v24-v25 rotating buffer for first round keys # v26-v31 last 6 round keys my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment my ($two,$three,$four)=($outhead,$outperm,$outmask); $code.=<<___; .align 5 _aesp8_ctr32_encrypt8x: $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) li r10,`$FRAME+8*16+15` li r11,`$FRAME+8*16+31` stvx v20,r10,$sp # ABI says so addi r10,r10,32 stvx v21,r11,$sp addi r11,r11,32 stvx v22,r10,$sp addi r10,r10,32 stvx v23,r11,$sp addi r11,r11,32 stvx v24,r10,$sp addi r10,r10,32 stvx v25,r11,$sp addi r11,r11,32 stvx v26,r10,$sp addi r10,r10,32 stvx v27,r11,$sp addi r11,r11,32 stvx v28,r10,$sp addi r10,r10,32 stvx v29,r11,$sp addi 
r11,r11,32 stvx v30,r10,$sp stvx v31,r11,$sp li r0,-1 stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave li $x10,0x10 $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) li $x20,0x20 $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) li $x30,0x30 $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) li $x40,0x40 $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) li $x50,0x50 $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) li $x60,0x60 $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) li $x70,0x70 mtspr 256,r0 subi $rounds,$rounds,3 # -4 in total lvx $rndkey0,$x00,$key # load key schedule lvx v30,$x10,$key addi $key,$key,0x20 lvx v31,$x00,$key ?vperm $rndkey0,$rndkey0,v30,$keyperm addi $key_,$sp,`$FRAME+15` mtctr $rounds Load_ctr32_enc_key: ?vperm v24,v30,v31,$keyperm lvx v30,$x10,$key addi $key,$key,0x20 stvx v24,$x00,$key_ # off-load round[1] ?vperm v25,v31,v30,$keyperm lvx v31,$x00,$key stvx v25,$x10,$key_ # off-load round[2] addi $key_,$key_,0x20 bdnz Load_ctr32_enc_key lvx v26,$x10,$key ?vperm v24,v30,v31,$keyperm lvx v27,$x20,$key stvx v24,$x00,$key_ # off-load round[3] ?vperm v25,v31,v26,$keyperm lvx v28,$x30,$key stvx v25,$x10,$key_ # off-load round[4] addi $key_,$sp,`$FRAME+15` # rewind $key_ ?vperm v26,v26,v27,$keyperm lvx v29,$x40,$key ?vperm v27,v27,v28,$keyperm lvx v30,$x50,$key ?vperm v28,v28,v29,$keyperm lvx v31,$x60,$key ?vperm v29,v29,v30,$keyperm lvx $out0,$x70,$key # borrow $out0 ?vperm v30,v30,v31,$keyperm lvx v24,$x00,$key_ # pre-load round[1] ?vperm v31,v31,$out0,$keyperm lvx v25,$x10,$key_ # pre-load round[2] vadduwm $two,$one,$one subi $inp,$inp,15 # undo "caller" $SHL $len,$len,4 vadduwm $out1,$ivec,$one # counter values ... vadduwm $out2,$ivec,$two vxor $out0,$ivec,$rndkey0 # ... 
xored with rndkey[0] le?li $idx,8 vadduwm $out3,$out1,$two vxor $out1,$out1,$rndkey0 le?lvsl $inpperm,0,$idx vadduwm $out4,$out2,$two vxor $out2,$out2,$rndkey0 le?vspltisb $tmp,0x0f vadduwm $out5,$out3,$two vxor $out3,$out3,$rndkey0 le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u vadduwm $out6,$out4,$two vxor $out4,$out4,$rndkey0 vadduwm $out7,$out5,$two vxor $out5,$out5,$rndkey0 vadduwm $ivec,$out6,$two # next counter value vxor $out6,$out6,$rndkey0 vxor $out7,$out7,$rndkey0 mtctr $rounds b Loop_ctr32_enc8x .align 5 Loop_ctr32_enc8x: vcipher $out0,$out0,v24 vcipher $out1,$out1,v24 vcipher $out2,$out2,v24 vcipher $out3,$out3,v24 vcipher $out4,$out4,v24 vcipher $out5,$out5,v24 vcipher $out6,$out6,v24 vcipher $out7,$out7,v24 Loop_ctr32_enc8x_middle: lvx v24,$x20,$key_ # round[3] addi $key_,$key_,0x20 vcipher $out0,$out0,v25 vcipher $out1,$out1,v25 vcipher $out2,$out2,v25 vcipher $out3,$out3,v25 vcipher $out4,$out4,v25 vcipher $out5,$out5,v25 vcipher $out6,$out6,v25 vcipher $out7,$out7,v25 lvx v25,$x10,$key_ # round[4] bdnz Loop_ctr32_enc8x subic r11,$len,256 # $len-256, borrow $key_ vcipher $out0,$out0,v24 vcipher $out1,$out1,v24 vcipher $out2,$out2,v24 vcipher $out3,$out3,v24 vcipher $out4,$out4,v24 vcipher $out5,$out5,v24 vcipher $out6,$out6,v24 vcipher $out7,$out7,v24 subfe r0,r0,r0 # borrow?-1:0 vcipher $out0,$out0,v25 vcipher $out1,$out1,v25 vcipher $out2,$out2,v25 vcipher $out3,$out3,v25 vcipher $out4,$out4,v25 vcipher $out5,$out5,v25 vcipher $out6,$out6,v25 vcipher $out7,$out7,v25 and r0,r0,r11 addi $key_,$sp,`$FRAME+15` # rewind $key_ vcipher $out0,$out0,v26 vcipher $out1,$out1,v26 vcipher $out2,$out2,v26 vcipher $out3,$out3,v26 vcipher $out4,$out4,v26 vcipher $out5,$out5,v26 vcipher $out6,$out6,v26 vcipher $out7,$out7,v26 lvx v24,$x00,$key_ # re-pre-load round[1] subic $len,$len,129 # $len-=129 vcipher $out0,$out0,v27 addi $len,$len,1 # $len-=128 really vcipher $out1,$out1,v27 vcipher $out2,$out2,v27 vcipher $out3,$out3,v27 vcipher 
$out4,$out4,v27 vcipher $out5,$out5,v27 vcipher $out6,$out6,v27 vcipher $out7,$out7,v27 lvx v25,$x10,$key_ # re-pre-load round[2] vcipher $out0,$out0,v28 lvx_u $in0,$x00,$inp # load input vcipher $out1,$out1,v28 lvx_u $in1,$x10,$inp vcipher $out2,$out2,v28 lvx_u $in2,$x20,$inp vcipher $out3,$out3,v28 lvx_u $in3,$x30,$inp vcipher $out4,$out4,v28 lvx_u $in4,$x40,$inp vcipher $out5,$out5,v28 lvx_u $in5,$x50,$inp vcipher $out6,$out6,v28 lvx_u $in6,$x60,$inp vcipher $out7,$out7,v28 lvx_u $in7,$x70,$inp addi $inp,$inp,0x80 vcipher $out0,$out0,v29 le?vperm $in0,$in0,$in0,$inpperm vcipher $out1,$out1,v29 le?vperm $in1,$in1,$in1,$inpperm vcipher $out2,$out2,v29 le?vperm $in2,$in2,$in2,$inpperm vcipher $out3,$out3,v29 le?vperm $in3,$in3,$in3,$inpperm vcipher $out4,$out4,v29 le?vperm $in4,$in4,$in4,$inpperm vcipher $out5,$out5,v29 le?vperm $in5,$in5,$in5,$inpperm vcipher $out6,$out6,v29 le?vperm $in6,$in6,$in6,$inpperm vcipher $out7,$out7,v29 le?vperm $in7,$in7,$in7,$inpperm add $inp,$inp,r0 # $inp is adjusted in such # way that at exit from the # loop inX-in7 are loaded # with last "words" subfe. r0,r0,r0 # borrow?-1:0 vcipher $out0,$out0,v30 vxor $in0,$in0,v31 # xor with last round key vcipher $out1,$out1,v30 vxor $in1,$in1,v31 vcipher $out2,$out2,v30 vxor $in2,$in2,v31 vcipher $out3,$out3,v30 vxor $in3,$in3,v31 vcipher $out4,$out4,v30 vxor $in4,$in4,v31 vcipher $out5,$out5,v30 vxor $in5,$in5,v31 vcipher $out6,$out6,v30 vxor $in6,$in6,v31 vcipher $out7,$out7,v30 vxor $in7,$in7,v31 bne Lctr32_enc8x_break # did $len-129 borrow? vcipherlast $in0,$out0,$in0 vcipherlast $in1,$out1,$in1 vadduwm $out1,$ivec,$one # counter values ... vcipherlast $in2,$out2,$in2 vadduwm $out2,$ivec,$two vxor $out0,$ivec,$rndkey0 # ... 
xored with rndkey[0] vcipherlast $in3,$out3,$in3 vadduwm $out3,$out1,$two vxor $out1,$out1,$rndkey0 vcipherlast $in4,$out4,$in4 vadduwm $out4,$out2,$two vxor $out2,$out2,$rndkey0 vcipherlast $in5,$out5,$in5 vadduwm $out5,$out3,$two vxor $out3,$out3,$rndkey0 vcipherlast $in6,$out6,$in6 vadduwm $out6,$out4,$two vxor $out4,$out4,$rndkey0 vcipherlast $in7,$out7,$in7 vadduwm $out7,$out5,$two vxor $out5,$out5,$rndkey0 le?vperm $in0,$in0,$in0,$inpperm vadduwm $ivec,$out6,$two # next counter value vxor $out6,$out6,$rndkey0 le?vperm $in1,$in1,$in1,$inpperm vxor $out7,$out7,$rndkey0 mtctr $rounds vcipher $out0,$out0,v24 stvx_u $in0,$x00,$out le?vperm $in2,$in2,$in2,$inpperm vcipher $out1,$out1,v24 stvx_u $in1,$x10,$out le?vperm $in3,$in3,$in3,$inpperm vcipher $out2,$out2,v24 stvx_u $in2,$x20,$out le?vperm $in4,$in4,$in4,$inpperm vcipher $out3,$out3,v24 stvx_u $in3,$x30,$out le?vperm $in5,$in5,$in5,$inpperm vcipher $out4,$out4,v24 stvx_u $in4,$x40,$out le?vperm $in6,$in6,$in6,$inpperm vcipher $out5,$out5,v24 stvx_u $in5,$x50,$out le?vperm $in7,$in7,$in7,$inpperm vcipher $out6,$out6,v24 stvx_u $in6,$x60,$out vcipher $out7,$out7,v24 stvx_u $in7,$x70,$out addi $out,$out,0x80 b Loop_ctr32_enc8x_middle .align 5 Lctr32_enc8x_break: cmpwi $len,-0x60 blt Lctr32_enc8x_one nop beq Lctr32_enc8x_two cmpwi $len,-0x40 blt Lctr32_enc8x_three nop beq Lctr32_enc8x_four cmpwi $len,-0x20 blt Lctr32_enc8x_five nop beq Lctr32_enc8x_six cmpwi $len,0x00 blt Lctr32_enc8x_seven Lctr32_enc8x_eight: vcipherlast $out0,$out0,$in0 vcipherlast $out1,$out1,$in1 vcipherlast $out2,$out2,$in2 vcipherlast $out3,$out3,$in3 vcipherlast $out4,$out4,$in4 vcipherlast $out5,$out5,$in5 vcipherlast $out6,$out6,$in6 vcipherlast $out7,$out7,$in7 le?vperm $out0,$out0,$out0,$inpperm le?vperm $out1,$out1,$out1,$inpperm stvx_u $out0,$x00,$out le?vperm $out2,$out2,$out2,$inpperm stvx_u $out1,$x10,$out le?vperm $out3,$out3,$out3,$inpperm stvx_u $out2,$x20,$out le?vperm $out4,$out4,$out4,$inpperm stvx_u $out3,$x30,$out le?vperm 
$out5,$out5,$out5,$inpperm stvx_u $out4,$x40,$out le?vperm $out6,$out6,$out6,$inpperm stvx_u $out5,$x50,$out le?vperm $out7,$out7,$out7,$inpperm stvx_u $out6,$x60,$out stvx_u $out7,$x70,$out addi $out,$out,0x80 b Lctr32_enc8x_done .align 5 Lctr32_enc8x_seven: vcipherlast $out0,$out0,$in1 vcipherlast $out1,$out1,$in2 vcipherlast $out2,$out2,$in3 vcipherlast $out3,$out3,$in4 vcipherlast $out4,$out4,$in5 vcipherlast $out5,$out5,$in6 vcipherlast $out6,$out6,$in7 le?vperm $out0,$out0,$out0,$inpperm le?vperm $out1,$out1,$out1,$inpperm stvx_u $out0,$x00,$out le?vperm $out2,$out2,$out2,$inpperm stvx_u $out1,$x10,$out le?vperm $out3,$out3,$out3,$inpperm stvx_u $out2,$x20,$out le?vperm $out4,$out4,$out4,$inpperm stvx_u $out3,$x30,$out le?vperm $out5,$out5,$out5,$inpperm stvx_u $out4,$x40,$out le?vperm $out6,$out6,$out6,$inpperm stvx_u $out5,$x50,$out stvx_u $out6,$x60,$out addi $out,$out,0x70 b Lctr32_enc8x_done .align 5 Lctr32_enc8x_six: vcipherlast $out0,$out0,$in2 vcipherlast $out1,$out1,$in3 vcipherlast $out2,$out2,$in4 vcipherlast $out3,$out3,$in5 vcipherlast $out4,$out4,$in6 vcipherlast $out5,$out5,$in7 le?vperm $out0,$out0,$out0,$inpperm le?vperm $out1,$out1,$out1,$inpperm stvx_u $out0,$x00,$out le?vperm $out2,$out2,$out2,$inpperm stvx_u $out1,$x10,$out le?vperm $out3,$out3,$out3,$inpperm stvx_u $out2,$x20,$out le?vperm $out4,$out4,$out4,$inpperm stvx_u $out3,$x30,$out le?vperm $out5,$out5,$out5,$inpperm stvx_u $out4,$x40,$out stvx_u $out5,$x50,$out addi $out,$out,0x60 b Lctr32_enc8x_done .align 5 Lctr32_enc8x_five: vcipherlast $out0,$out0,$in3 vcipherlast $out1,$out1,$in4 vcipherlast $out2,$out2,$in5 vcipherlast $out3,$out3,$in6 vcipherlast $out4,$out4,$in7 le?vperm $out0,$out0,$out0,$inpperm le?vperm $out1,$out1,$out1,$inpperm stvx_u $out0,$x00,$out le?vperm $out2,$out2,$out2,$inpperm stvx_u $out1,$x10,$out le?vperm $out3,$out3,$out3,$inpperm stvx_u $out2,$x20,$out le?vperm $out4,$out4,$out4,$inpperm stvx_u $out3,$x30,$out stvx_u $out4,$x40,$out addi $out,$out,0x50 
b Lctr32_enc8x_done .align 5 Lctr32_enc8x_four: vcipherlast $out0,$out0,$in4 vcipherlast $out1,$out1,$in5 vcipherlast $out2,$out2,$in6 vcipherlast $out3,$out3,$in7 le?vperm $out0,$out0,$out0,$inpperm le?vperm $out1,$out1,$out1,$inpperm stvx_u $out0,$x00,$out le?vperm $out2,$out2,$out2,$inpperm stvx_u $out1,$x10,$out le?vperm $out3,$out3,$out3,$inpperm stvx_u $out2,$x20,$out stvx_u $out3,$x30,$out addi $out,$out,0x40 b Lctr32_enc8x_done .align 5 Lctr32_enc8x_three: vcipherlast $out0,$out0,$in5 vcipherlast $out1,$out1,$in6 vcipherlast $out2,$out2,$in7 le?vperm $out0,$out0,$out0,$inpperm le?vperm $out1,$out1,$out1,$inpperm stvx_u $out0,$x00,$out le?vperm $out2,$out2,$out2,$inpperm stvx_u $out1,$x10,$out stvx_u $out2,$x20,$out addi $out,$out,0x30 b Lctr32_enc8x_done .align 5 Lctr32_enc8x_two: vcipherlast $out0,$out0,$in6 vcipherlast $out1,$out1,$in7 le?vperm $out0,$out0,$out0,$inpperm le?vperm $out1,$out1,$out1,$inpperm stvx_u $out0,$x00,$out stvx_u $out1,$x10,$out addi $out,$out,0x20 b Lctr32_enc8x_done .align 5 Lctr32_enc8x_one: vcipherlast $out0,$out0,$in7 le?vperm $out0,$out0,$out0,$inpperm stvx_u $out0,0,$out addi $out,$out,0x10 Lctr32_enc8x_done: li r10,`$FRAME+15` li r11,`$FRAME+31` stvx $inpperm,r10,$sp # wipe copies of round keys addi r10,r10,32 stvx $inpperm,r11,$sp addi r11,r11,32 stvx $inpperm,r10,$sp addi r10,r10,32 stvx $inpperm,r11,$sp addi r11,r11,32 stvx $inpperm,r10,$sp addi r10,r10,32 stvx $inpperm,r11,$sp addi r11,r11,32 stvx $inpperm,r10,$sp addi r10,r10,32 stvx $inpperm,r11,$sp addi r11,r11,32 mtspr 256,$vrsave lvx v20,r10,$sp # ABI says so addi r10,r10,32 lvx v21,r11,$sp addi r11,r11,32 lvx v22,r10,$sp addi r10,r10,32 lvx v23,r11,$sp addi r11,r11,32 lvx v24,r10,$sp addi r10,r10,32 lvx v25,r11,$sp addi r11,r11,32 lvx v26,r10,$sp addi r10,r10,32 lvx v27,r11,$sp addi r11,r11,32 lvx v28,r10,$sp addi r10,r10,32 lvx v29,r11,$sp addi r11,r11,32 lvx v30,r10,$sp lvx v31,r11,$sp $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) 
$POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` blr .long 0 .byte 0,12,0x04,0,0x80,6,6,0 .long 0 .size .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks ___ }} }}} ######################################################################### {{{ # XTS procedures # # int ${prefix}_xts_[en|de]crypt(const char *inp, char *out, size_t len, # # const AES_KEY *key1, const AES_KEY *key2, # # [const] unsigned char iv[16]); # # Inputs shorter than 16 bytes are rejected with r3 set to -1; inputs # # of 96 bytes or more are handed to _aesp8_xts_[en|de]crypt6x. # # If $key2 is NULL, then a "tweak chaining" mode is engaged, in which # # the input tweak value is assumed to be encrypted already, and the # # last tweak value, one suitable for a consecutive call on the same # # chunk of data, is written back to the original buffer. In addition, # # in "tweak chaining" mode only complete input blocks are processed. # my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) = map("r$_",(3..10)); my ($rndkey0,$rndkey1,$inout) = map("v$_",(0..2)); my ($output,$inptail,$inpperm,$leperm,$keyperm) = map("v$_",(3..7)); my ($tweak,$seven,$eighty7,$tmp,$tweak1) = map("v$_",(8..12)); my $taillen = $key2; ($inp,$idx) = ($idx,$inp); # reassign: $inp is r10 and $idx is r3 from here on $code.=<<___; .globl .${prefix}_xts_encrypt .align 5 .${prefix}_xts_encrypt: mr $inp,r3 # reassign li r3,-1 ${UCMP}i $len,16 bltlr- lis r0,0xfff0 mfspr r12,256 # save vrsave li r11,0 mtspr 256,r0 vspltisb $seven,0x07 # 0x070707..07 le?lvsl $leperm,r11,r11 le?vspltisb $tmp,0x0f le?vxor $leperm,$leperm,$seven li $idx,15 lvx $tweak,0,$ivp # load [unaligned] iv lvsl $inpperm,0,$ivp lvx $inptail,$idx,$ivp le?vxor $inpperm,$inpperm,$tmp vperm $tweak,$tweak,$inptail,$inpperm neg r11,$inp lvsr $inpperm,0,r11 # prepare for unaligned load lvx $inout,0,$inp addi $inp,$inp,15 # 15 is not typo le?vxor $inpperm,$inpperm,$tmp ${UCMP}i $key2,0 # key2==NULL? 
beq Lxts_enc_no_key2 ?lvsl $keyperm,0,$key2 # prepare for unaligned key lwz $rounds,240($key2) srwi $rounds,$rounds,1 subi $rounds,$rounds,1 li $idx,16 lvx $rndkey0,0,$key2 lvx $rndkey1,$idx,$key2 addi $idx,$idx,16 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vxor $tweak,$tweak,$rndkey0 lvx $rndkey0,$idx,$key2 addi $idx,$idx,16 mtctr $rounds Ltweak_xts_enc: ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm vcipher $tweak,$tweak,$rndkey1 lvx $rndkey1,$idx,$key2 addi $idx,$idx,16 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vcipher $tweak,$tweak,$rndkey0 lvx $rndkey0,$idx,$key2 addi $idx,$idx,16 bdnz Ltweak_xts_enc ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm vcipher $tweak,$tweak,$rndkey1 lvx $rndkey1,$idx,$key2 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vcipherlast $tweak,$tweak,$rndkey0 li $ivp,0 # don't chain the tweak b Lxts_enc Lxts_enc_no_key2: li $idx,-16 and $len,$len,$idx # in "tweak chaining" # mode only complete # blocks are processed Lxts_enc: lvx $inptail,0,$inp addi $inp,$inp,16 ?lvsl $keyperm,0,$key1 # prepare for unaligned key lwz $rounds,240($key1) srwi $rounds,$rounds,1 subi $rounds,$rounds,1 li $idx,16 vslb $eighty7,$seven,$seven # 0x808080..80 vor $eighty7,$eighty7,$seven # 0x878787..87 vspltisb $tmp,1 # 0x010101..01 vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01 ${UCMP}i $len,96 bge _aesp8_xts_encrypt6x andi. 
$taillen,$len,15 subic r0,$len,32 subi $taillen,$taillen,16 subfe r0,r0,r0 and r0,r0,$taillen add $inp,$inp,r0 lvx $rndkey0,0,$key1 lvx $rndkey1,$idx,$key1 addi $idx,$idx,16 vperm $inout,$inout,$inptail,$inpperm ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vxor $inout,$inout,$tweak vxor $inout,$inout,$rndkey0 lvx $rndkey0,$idx,$key1 addi $idx,$idx,16 mtctr $rounds b Loop_xts_enc .align 5 Loop_xts_enc: ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm vcipher $inout,$inout,$rndkey1 lvx $rndkey1,$idx,$key1 addi $idx,$idx,16 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vcipher $inout,$inout,$rndkey0 lvx $rndkey0,$idx,$key1 addi $idx,$idx,16 bdnz Loop_xts_enc ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm vcipher $inout,$inout,$rndkey1 lvx $rndkey1,$idx,$key1 li $idx,16 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vxor $rndkey0,$rndkey0,$tweak vcipherlast $output,$inout,$rndkey0 le?vperm $tmp,$output,$output,$leperm be?nop le?stvx_u $tmp,0,$out be?stvx_u $output,0,$out addi $out,$out,16 subic. $len,$len,16 beq Lxts_enc_done vmr $inout,$inptail lvx $inptail,0,$inp addi $inp,$inp,16 lvx $rndkey0,0,$key1 lvx $rndkey1,$idx,$key1 addi $idx,$idx,16 subic r0,$len,32 subfe r0,r0,r0 and r0,r0,$taillen add $inp,$inp,r0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 vand $tmp,$tmp,$eighty7 vxor $tweak,$tweak,$tmp vperm $inout,$inout,$inptail,$inpperm ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vxor $inout,$inout,$tweak vxor $output,$output,$rndkey0 # just in case $len<16 vxor $inout,$inout,$rndkey0 lvx $rndkey0,$idx,$key1 addi $idx,$idx,16 mtctr $rounds ${UCMP}i $len,16 bge Loop_xts_enc vxor $output,$output,$tweak lvsr $inpperm,0,$len # $inpperm is no longer needed vxor $inptail,$inptail,$inptail # $inptail is no longer needed vspltisb $tmp,-1 vperm $inptail,$inptail,$tmp,$inpperm vsel $inout,$inout,$output,$inptail subi r11,$out,17 subi $out,$out,16 mtctr $len li $len,16 Loop_xts_enc_steal: lbzu r0,1(r11) stb r0,16(r11) bdnz Loop_xts_enc_steal 
mtctr $rounds b Loop_xts_enc # one more time... Lxts_enc_done: ${UCMP}i $ivp,0 beq Lxts_enc_ret vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 vand $tmp,$tmp,$eighty7 vxor $tweak,$tweak,$tmp le?vperm $tweak,$tweak,$tweak,$leperm stvx_u $tweak,0,$ivp Lxts_enc_ret: mtspr 256,r12 # restore vrsave li r3,0 blr .long 0 .byte 0,12,0x04,0,0x80,6,6,0 .long 0 .size .${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt .globl .${prefix}_xts_decrypt .align 5 .${prefix}_xts_decrypt: mr $inp,r3 # reassign li r3,-1 ${UCMP}i $len,16 bltlr- lis r0,0xfff8 mfspr r12,256 # save vrsave li r11,0 mtspr 256,r0 andi. r0,$len,15 neg r0,r0 andi. r0,r0,16 sub $len,$len,r0 vspltisb $seven,0x07 # 0x070707..07 le?lvsl $leperm,r11,r11 le?vspltisb $tmp,0x0f le?vxor $leperm,$leperm,$seven li $idx,15 lvx $tweak,0,$ivp # load [unaligned] iv lvsl $inpperm,0,$ivp lvx $inptail,$idx,$ivp le?vxor $inpperm,$inpperm,$tmp vperm $tweak,$tweak,$inptail,$inpperm neg r11,$inp lvsr $inpperm,0,r11 # prepare for unaligned load lvx $inout,0,$inp addi $inp,$inp,15 # 15 is not typo le?vxor $inpperm,$inpperm,$tmp ${UCMP}i $key2,0 # key2==NULL? 
beq Lxts_dec_no_key2 ?lvsl $keyperm,0,$key2 # prepare for unaligned key lwz $rounds,240($key2) srwi $rounds,$rounds,1 subi $rounds,$rounds,1 li $idx,16 lvx $rndkey0,0,$key2 lvx $rndkey1,$idx,$key2 addi $idx,$idx,16 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vxor $tweak,$tweak,$rndkey0 lvx $rndkey0,$idx,$key2 addi $idx,$idx,16 mtctr $rounds Ltweak_xts_dec: ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm vcipher $tweak,$tweak,$rndkey1 lvx $rndkey1,$idx,$key2 addi $idx,$idx,16 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vcipher $tweak,$tweak,$rndkey0 lvx $rndkey0,$idx,$key2 addi $idx,$idx,16 bdnz Ltweak_xts_dec ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm vcipher $tweak,$tweak,$rndkey1 lvx $rndkey1,$idx,$key2 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vcipherlast $tweak,$tweak,$rndkey0 li $ivp,0 # don't chain the tweak b Lxts_dec Lxts_dec_no_key2: neg $idx,$len andi. $idx,$idx,15 add $len,$len,$idx # in "tweak chaining" # mode only complete # blocks are processed Lxts_dec: lvx $inptail,0,$inp addi $inp,$inp,16 ?lvsl $keyperm,0,$key1 # prepare for unaligned key lwz $rounds,240($key1) srwi $rounds,$rounds,1 subi $rounds,$rounds,1 li $idx,16 vslb $eighty7,$seven,$seven # 0x808080..80 vor $eighty7,$eighty7,$seven # 0x878787..87 vspltisb $tmp,1 # 0x010101..01 vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01 ${UCMP}i $len,96 bge _aesp8_xts_decrypt6x lvx $rndkey0,0,$key1 lvx $rndkey1,$idx,$key1 addi $idx,$idx,16 vperm $inout,$inout,$inptail,$inpperm ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vxor $inout,$inout,$tweak vxor $inout,$inout,$rndkey0 lvx $rndkey0,$idx,$key1 addi $idx,$idx,16 mtctr $rounds ${UCMP}i $len,16 blt Ltail_xts_dec be?b Loop_xts_dec .align 5 Loop_xts_dec: ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm vncipher $inout,$inout,$rndkey1 lvx $rndkey1,$idx,$key1 addi $idx,$idx,16 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vncipher $inout,$inout,$rndkey0 lvx $rndkey0,$idx,$key1 addi $idx,$idx,16 bdnz Loop_xts_dec ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm vncipher 
$inout,$inout,$rndkey1 lvx $rndkey1,$idx,$key1 li $idx,16 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vxor $rndkey0,$rndkey0,$tweak vncipherlast $output,$inout,$rndkey0 le?vperm $tmp,$output,$output,$leperm be?nop le?stvx_u $tmp,0,$out be?stvx_u $output,0,$out addi $out,$out,16 subic. $len,$len,16 beq Lxts_dec_done vmr $inout,$inptail lvx $inptail,0,$inp addi $inp,$inp,16 lvx $rndkey0,0,$key1 lvx $rndkey1,$idx,$key1 addi $idx,$idx,16 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 vand $tmp,$tmp,$eighty7 vxor $tweak,$tweak,$tmp vperm $inout,$inout,$inptail,$inpperm ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vxor $inout,$inout,$tweak vxor $inout,$inout,$rndkey0 lvx $rndkey0,$idx,$key1 addi $idx,$idx,16 mtctr $rounds ${UCMP}i $len,16 bge Loop_xts_dec Ltail_xts_dec: vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak1,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 vand $tmp,$tmp,$eighty7 vxor $tweak1,$tweak1,$tmp subi $inp,$inp,16 add $inp,$inp,$len vxor $inout,$inout,$tweak # :-( vxor $inout,$inout,$tweak1 # :-) Loop_xts_dec_short: ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm vncipher $inout,$inout,$rndkey1 lvx $rndkey1,$idx,$key1 addi $idx,$idx,16 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vncipher $inout,$inout,$rndkey0 lvx $rndkey0,$idx,$key1 addi $idx,$idx,16 bdnz Loop_xts_dec_short ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm vncipher $inout,$inout,$rndkey1 lvx $rndkey1,$idx,$key1 li $idx,16 ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm vxor $rndkey0,$rndkey0,$tweak1 vncipherlast $output,$inout,$rndkey0 le?vperm $tmp,$output,$output,$leperm be?nop le?stvx_u $tmp,0,$out be?stvx_u $output,0,$out vmr $inout,$inptail lvx $inptail,0,$inp #addi $inp,$inp,16 lvx $rndkey0,0,$key1 lvx $rndkey1,$idx,$key1 addi $idx,$idx,16 vperm $inout,$inout,$inptail,$inpperm ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm lvsr $inpperm,0,$len # $inpperm is no longer needed vxor $inptail,$inptail,$inptail # $inptail is no longer needed vspltisb $tmp,-1 
vperm $inptail,$inptail,$tmp,$inpperm vsel $inout,$inout,$output,$inptail vxor $rndkey0,$rndkey0,$tweak vxor $inout,$inout,$rndkey0 lvx $rndkey0,$idx,$key1 addi $idx,$idx,16 subi r11,$out,1 mtctr $len li $len,16 Loop_xts_dec_steal: lbzu r0,1(r11) stb r0,16(r11) bdnz Loop_xts_dec_steal mtctr $rounds b Loop_xts_dec # one more time... Lxts_dec_done: ${UCMP}i $ivp,0 beq Lxts_dec_ret vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 vand $tmp,$tmp,$eighty7 vxor $tweak,$tweak,$tmp le?vperm $tweak,$tweak,$tweak,$leperm stvx_u $tweak,0,$ivp Lxts_dec_ret: mtspr 256,r12 # restore vrsave li r3,0 blr .long 0 .byte 0,12,0x04,0,0x80,6,6,0 .long 0 .size .${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt ___ ######################################################################### {{ # Optimized XTS procedures: six blocks ($out0-$out5) per iteration # my $key_=$key2; my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31)); $x00=0 if ($flavour =~ /osx/); my ($in0, $in1, $in2, $in3, $in4, $in5 )=map("v$_",(0..5)); my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16)); my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22)); my $rndkey0="v23"; # v24-v25 rotating buffer for first round keys # v26-v31 last 6 round keys my ($keyperm)=($out0); # aliases with "caller", redundant assignment my $taillen=$x70; $code.=<<___; .align 5 _aesp8_xts_encrypt6x: $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) mflr r11 li r7,`$FRAME+8*16+15` li r3,`$FRAME+8*16+31` $PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) stvx v20,r7,$sp # ABI says so addi r7,r7,32 stvx v21,r3,$sp addi r3,r3,32 stvx v22,r7,$sp addi r7,r7,32 stvx v23,r3,$sp addi r3,r3,32 stvx v24,r7,$sp addi r7,r7,32 stvx v25,r3,$sp addi r3,r3,32 stvx v26,r7,$sp addi r7,r7,32 stvx v27,r3,$sp addi r3,r3,32 stvx v28,r7,$sp addi r7,r7,32 stvx v29,r3,$sp addi r3,r3,32 stvx v30,r7,$sp stvx v31,r3,$sp li r0,-1 stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave li $x10,0x10 $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) li 
$x20,0x20 $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) li $x30,0x30 $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) li $x40,0x40 $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) li $x50,0x50 $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) li $x60,0x60 $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) li $x70,0x70 mtspr 256,r0 subi $rounds,$rounds,3 # -4 in total lvx $rndkey0,$x00,$key1 # load key schedule lvx v30,$x10,$key1 addi $key1,$key1,0x20 lvx v31,$x00,$key1 ?vperm $rndkey0,$rndkey0,v30,$keyperm addi $key_,$sp,`$FRAME+15` mtctr $rounds Load_xts_enc_key: ?vperm v24,v30,v31,$keyperm lvx v30,$x10,$key1 addi $key1,$key1,0x20 stvx v24,$x00,$key_ # off-load round[1] ?vperm v25,v31,v30,$keyperm lvx v31,$x00,$key1 stvx v25,$x10,$key_ # off-load round[2] addi $key_,$key_,0x20 bdnz Load_xts_enc_key lvx v26,$x10,$key1 ?vperm v24,v30,v31,$keyperm lvx v27,$x20,$key1 stvx v24,$x00,$key_ # off-load round[3] ?vperm v25,v31,v26,$keyperm lvx v28,$x30,$key1 stvx v25,$x10,$key_ # off-load round[4] addi $key_,$sp,`$FRAME+15` # rewind $key_ ?vperm v26,v26,v27,$keyperm lvx v29,$x40,$key1 ?vperm v27,v27,v28,$keyperm lvx v30,$x50,$key1 ?vperm v28,v28,v29,$keyperm lvx v31,$x60,$key1 ?vperm v29,v29,v30,$keyperm lvx $twk5,$x70,$key1 # borrow $twk5 ?vperm v30,v30,v31,$keyperm lvx v24,$x00,$key_ # pre-load round[1] ?vperm v31,v31,$twk5,$keyperm lvx v25,$x10,$key_ # pre-load round[2] vperm $in0,$inout,$inptail,$inpperm subi $inp,$inp,31 # undo "caller" vxor $twk0,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 vand $tmp,$tmp,$eighty7 vxor $out0,$in0,$twk0 vxor $tweak,$tweak,$tmp lvx_u $in1,$x10,$inp vxor $twk1,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 le?vperm $in1,$in1,$in1,$leperm vand $tmp,$tmp,$eighty7 vxor $out1,$in1,$twk1 vxor $tweak,$tweak,$tmp lvx_u $in2,$x20,$inp andi. 
$taillen,$len,15 vxor $twk2,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 le?vperm $in2,$in2,$in2,$leperm vand $tmp,$tmp,$eighty7 vxor $out2,$in2,$twk2 vxor $tweak,$tweak,$tmp lvx_u $in3,$x30,$inp sub $len,$len,$taillen vxor $twk3,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 le?vperm $in3,$in3,$in3,$leperm vand $tmp,$tmp,$eighty7 vxor $out3,$in3,$twk3 vxor $tweak,$tweak,$tmp lvx_u $in4,$x40,$inp subi $len,$len,0x60 vxor $twk4,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 le?vperm $in4,$in4,$in4,$leperm vand $tmp,$tmp,$eighty7 vxor $out4,$in4,$twk4 vxor $tweak,$tweak,$tmp lvx_u $in5,$x50,$inp addi $inp,$inp,0x60 vxor $twk5,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 le?vperm $in5,$in5,$in5,$leperm vand $tmp,$tmp,$eighty7 vxor $out5,$in5,$twk5 vxor $tweak,$tweak,$tmp vxor v31,v31,$rndkey0 mtctr $rounds b Loop_xts_enc6x .align 5 Loop_xts_enc6x: vcipher $out0,$out0,v24 vcipher $out1,$out1,v24 vcipher $out2,$out2,v24 vcipher $out3,$out3,v24 vcipher $out4,$out4,v24 vcipher $out5,$out5,v24 lvx v24,$x20,$key_ # round[3] addi $key_,$key_,0x20 vcipher $out0,$out0,v25 vcipher $out1,$out1,v25 vcipher $out2,$out2,v25 vcipher $out3,$out3,v25 vcipher $out4,$out4,v25 vcipher $out5,$out5,v25 lvx v25,$x10,$key_ # round[4] bdnz Loop_xts_enc6x subic $len,$len,96 # $len-=96 vxor $in0,$twk0,v31 # xor with last round key vcipher $out0,$out0,v24 vcipher $out1,$out1,v24 vsrab $tmp,$tweak,$seven # next tweak value vxor $twk0,$tweak,$rndkey0 vaddubm $tweak,$tweak,$tweak vcipher $out2,$out2,v24 vcipher $out3,$out3,v24 vsldoi $tmp,$tmp,$tmp,15 vcipher $out4,$out4,v24 vcipher $out5,$out5,v24 subfe. 
r0,r0,r0 # borrow?-1:0 vand $tmp,$tmp,$eighty7 vcipher $out0,$out0,v25 vcipher $out1,$out1,v25 vxor $tweak,$tweak,$tmp vcipher $out2,$out2,v25 vcipher $out3,$out3,v25 vxor $in1,$twk1,v31 vsrab $tmp,$tweak,$seven # next tweak value vxor $twk1,$tweak,$rndkey0 vcipher $out4,$out4,v25 vcipher $out5,$out5,v25 and r0,r0,$len vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 vcipher $out0,$out0,v26 vcipher $out1,$out1,v26 vand $tmp,$tmp,$eighty7 vcipher $out2,$out2,v26 vcipher $out3,$out3,v26 vxor $tweak,$tweak,$tmp vcipher $out4,$out4,v26 vcipher $out5,$out5,v26 add $inp,$inp,r0 # $inp is adjusted in such # way that at exit from the # loop inX-in5 are loaded # with last "words" vxor $in2,$twk2,v31 vsrab $tmp,$tweak,$seven # next tweak value vxor $twk2,$tweak,$rndkey0 vaddubm $tweak,$tweak,$tweak vcipher $out0,$out0,v27 vcipher $out1,$out1,v27 vsldoi $tmp,$tmp,$tmp,15 vcipher $out2,$out2,v27 vcipher $out3,$out3,v27 vand $tmp,$tmp,$eighty7 vcipher $out4,$out4,v27 vcipher $out5,$out5,v27 addi $key_,$sp,`$FRAME+15` # rewind $key_ vxor $tweak,$tweak,$tmp vcipher $out0,$out0,v28 vcipher $out1,$out1,v28 vxor $in3,$twk3,v31 vsrab $tmp,$tweak,$seven # next tweak value vxor $twk3,$tweak,$rndkey0 vcipher $out2,$out2,v28 vcipher $out3,$out3,v28 vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 vcipher $out4,$out4,v28 vcipher $out5,$out5,v28 lvx v24,$x00,$key_ # re-pre-load round[1] vand $tmp,$tmp,$eighty7 vcipher $out0,$out0,v29 vcipher $out1,$out1,v29 vxor $tweak,$tweak,$tmp vcipher $out2,$out2,v29 vcipher $out3,$out3,v29 vxor $in4,$twk4,v31 vsrab $tmp,$tweak,$seven # next tweak value vxor $twk4,$tweak,$rndkey0 vcipher $out4,$out4,v29 vcipher $out5,$out5,v29 lvx v25,$x10,$key_ # re-pre-load round[2] vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 vcipher $out0,$out0,v30 vcipher $out1,$out1,v30 vand $tmp,$tmp,$eighty7 vcipher $out2,$out2,v30 vcipher $out3,$out3,v30 vxor $tweak,$tweak,$tmp vcipher $out4,$out4,v30 vcipher $out5,$out5,v30 vxor $in5,$twk5,v31 vsrab 
$tmp,$tweak,$seven # next tweak value vxor $twk5,$tweak,$rndkey0 vcipherlast $out0,$out0,$in0 lvx_u $in0,$x00,$inp # load next input block vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 vcipherlast $out1,$out1,$in1 lvx_u $in1,$x10,$inp vcipherlast $out2,$out2,$in2 le?vperm $in0,$in0,$in0,$leperm lvx_u $in2,$x20,$inp vand $tmp,$tmp,$eighty7 vcipherlast $out3,$out3,$in3 le?vperm $in1,$in1,$in1,$leperm lvx_u $in3,$x30,$inp vcipherlast $out4,$out4,$in4 le?vperm $in2,$in2,$in2,$leperm lvx_u $in4,$x40,$inp vxor $tweak,$tweak,$tmp vcipherlast $tmp,$out5,$in5 # last block might be needed # in stealing mode le?vperm $in3,$in3,$in3,$leperm lvx_u $in5,$x50,$inp addi $inp,$inp,0x60 le?vperm $in4,$in4,$in4,$leperm le?vperm $in5,$in5,$in5,$leperm le?vperm $out0,$out0,$out0,$leperm le?vperm $out1,$out1,$out1,$leperm stvx_u $out0,$x00,$out # store output vxor $out0,$in0,$twk0 le?vperm $out2,$out2,$out2,$leperm stvx_u $out1,$x10,$out vxor $out1,$in1,$twk1 le?vperm $out3,$out3,$out3,$leperm stvx_u $out2,$x20,$out vxor $out2,$in2,$twk2 le?vperm $out4,$out4,$out4,$leperm stvx_u $out3,$x30,$out vxor $out3,$in3,$twk3 le?vperm $out5,$tmp,$tmp,$leperm stvx_u $out4,$x40,$out vxor $out4,$in4,$twk4 le?stvx_u $out5,$x50,$out be?stvx_u $tmp, $x50,$out vxor $out5,$in5,$twk5 addi $out,$out,0x60 mtctr $rounds beq Loop_xts_enc6x # did $len-=96 borrow? addic. 
$len,$len,0x60 beq Lxts_enc6x_zero cmpwi $len,0x20 blt Lxts_enc6x_one nop beq Lxts_enc6x_two cmpwi $len,0x40 blt Lxts_enc6x_three nop beq Lxts_enc6x_four Lxts_enc6x_five: vxor $out0,$in1,$twk0 vxor $out1,$in2,$twk1 vxor $out2,$in3,$twk2 vxor $out3,$in4,$twk3 vxor $out4,$in5,$twk4 bl _aesp8_xts_enc5x le?vperm $out0,$out0,$out0,$leperm vmr $twk0,$twk5 # unused tweak le?vperm $out1,$out1,$out1,$leperm stvx_u $out0,$x00,$out # store output le?vperm $out2,$out2,$out2,$leperm stvx_u $out1,$x10,$out le?vperm $out3,$out3,$out3,$leperm stvx_u $out2,$x20,$out vxor $tmp,$out4,$twk5 # last block prep for stealing le?vperm $out4,$out4,$out4,$leperm stvx_u $out3,$x30,$out stvx_u $out4,$x40,$out addi $out,$out,0x50 bne Lxts_enc6x_steal b Lxts_enc6x_done .align 4 Lxts_enc6x_four: vxor $out0,$in2,$twk0 vxor $out1,$in3,$twk1 vxor $out2,$in4,$twk2 vxor $out3,$in5,$twk3 vxor $out4,$out4,$out4 bl _aesp8_xts_enc5x le?vperm $out0,$out0,$out0,$leperm vmr $twk0,$twk4 # unused tweak le?vperm $out1,$out1,$out1,$leperm stvx_u $out0,$x00,$out # store output le?vperm $out2,$out2,$out2,$leperm stvx_u $out1,$x10,$out vxor $tmp,$out3,$twk4 # last block prep for stealing le?vperm $out3,$out3,$out3,$leperm stvx_u $out2,$x20,$out stvx_u $out3,$x30,$out addi $out,$out,0x40 bne Lxts_enc6x_steal b Lxts_enc6x_done .align 4 Lxts_enc6x_three: vxor $out0,$in3,$twk0 vxor $out1,$in4,$twk1 vxor $out2,$in5,$twk2 vxor $out3,$out3,$out3 vxor $out4,$out4,$out4 bl _aesp8_xts_enc5x le?vperm $out0,$out0,$out0,$leperm vmr $twk0,$twk3 # unused tweak le?vperm $out1,$out1,$out1,$leperm stvx_u $out0,$x00,$out # store output vxor $tmp,$out2,$twk3 # last block prep for stealing le?vperm $out2,$out2,$out2,$leperm stvx_u $out1,$x10,$out stvx_u $out2,$x20,$out addi $out,$out,0x30 bne Lxts_enc6x_steal b Lxts_enc6x_done .align 4 Lxts_enc6x_two: vxor $out0,$in4,$twk0 vxor $out1,$in5,$twk1 vxor $out2,$out2,$out2 vxor $out3,$out3,$out3 vxor $out4,$out4,$out4 bl _aesp8_xts_enc5x le?vperm $out0,$out0,$out0,$leperm vmr $twk0,$twk2 # 
unused tweak vxor $tmp,$out1,$twk2 # last block prep for stealing le?vperm $out1,$out1,$out1,$leperm stvx_u $out0,$x00,$out # store output stvx_u $out1,$x10,$out addi $out,$out,0x20 bne Lxts_enc6x_steal b Lxts_enc6x_done .align 4 Lxts_enc6x_one: vxor $out0,$in5,$twk0 nop Loop_xts_enc1x: vcipher $out0,$out0,v24 lvx v24,$x20,$key_ # round[3] addi $key_,$key_,0x20 vcipher $out0,$out0,v25 lvx v25,$x10,$key_ # round[4] bdnz Loop_xts_enc1x add $inp,$inp,$taillen cmpwi $taillen,0 vcipher $out0,$out0,v24 subi $inp,$inp,16 vcipher $out0,$out0,v25 lvsr $inpperm,0,$taillen vcipher $out0,$out0,v26 lvx_u $in0,0,$inp vcipher $out0,$out0,v27 addi $key_,$sp,`$FRAME+15` # rewind $key_ vcipher $out0,$out0,v28 lvx v24,$x00,$key_ # re-pre-load round[1] vcipher $out0,$out0,v29 lvx v25,$x10,$key_ # re-pre-load round[2] vxor $twk0,$twk0,v31 le?vperm $in0,$in0,$in0,$leperm vcipher $out0,$out0,v30 vperm $in0,$in0,$in0,$inpperm vcipherlast $out0,$out0,$twk0 vmr $twk0,$twk1 # unused tweak vxor $tmp,$out0,$twk1 # last block prep for stealing le?vperm $out0,$out0,$out0,$leperm stvx_u $out0,$x00,$out # store output addi $out,$out,0x10 bne Lxts_enc6x_steal b Lxts_enc6x_done .align 4 Lxts_enc6x_zero: cmpwi $taillen,0 beq Lxts_enc6x_done add $inp,$inp,$taillen subi $inp,$inp,16 lvx_u $in0,0,$inp lvsr $inpperm,0,$taillen # $in5 is no more le?vperm $in0,$in0,$in0,$leperm vperm $in0,$in0,$in0,$inpperm vxor $tmp,$tmp,$twk0 Lxts_enc6x_steal: vxor $in0,$in0,$twk0 vxor $out0,$out0,$out0 vspltisb $out1,-1 vperm $out0,$out0,$out1,$inpperm vsel $out0,$in0,$tmp,$out0 # $tmp is last block, remember? subi r30,$out,17 subi $out,$out,16 mtctr $taillen Loop_xts_enc6x_steal: lbzu r0,1(r30) stb r0,16(r30) bdnz Loop_xts_enc6x_steal li $taillen,0 mtctr $rounds b Loop_xts_enc1x # one more time... 
.align 4 Lxts_enc6x_done: ${UCMP}i $ivp,0 beq Lxts_enc6x_ret vxor $tweak,$twk0,$rndkey0 le?vperm $tweak,$tweak,$tweak,$leperm stvx_u $tweak,0,$ivp Lxts_enc6x_ret: mtlr r11 li r10,`$FRAME+15` li r11,`$FRAME+31` stvx $seven,r10,$sp # wipe copies of round keys addi r10,r10,32 stvx $seven,r11,$sp addi r11,r11,32 stvx $seven,r10,$sp addi r10,r10,32 stvx $seven,r11,$sp addi r11,r11,32 stvx $seven,r10,$sp addi r10,r10,32 stvx $seven,r11,$sp addi r11,r11,32 stvx $seven,r10,$sp addi r10,r10,32 stvx $seven,r11,$sp addi r11,r11,32 mtspr 256,$vrsave lvx v20,r10,$sp # ABI says so addi r10,r10,32 lvx v21,r11,$sp addi r11,r11,32 lvx v22,r10,$sp addi r10,r10,32 lvx v23,r11,$sp addi r11,r11,32 lvx v24,r10,$sp addi r10,r10,32 lvx v25,r11,$sp addi r11,r11,32 lvx v26,r10,$sp addi r10,r10,32 lvx v27,r11,$sp addi r11,r11,32 lvx v28,r10,$sp addi r10,r10,32 lvx v29,r11,$sp addi r11,r11,32 lvx v30,r10,$sp lvx v31,r11,$sp $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` blr .long 0 .byte 0,12,0x04,1,0x80,6,6,0 .long 0 .align 5 _aesp8_xts_enc5x: vcipher $out0,$out0,v24 vcipher $out1,$out1,v24 vcipher $out2,$out2,v24 vcipher $out3,$out3,v24 vcipher $out4,$out4,v24 lvx v24,$x20,$key_ # round[3] addi $key_,$key_,0x20 vcipher $out0,$out0,v25 vcipher $out1,$out1,v25 vcipher $out2,$out2,v25 vcipher $out3,$out3,v25 vcipher $out4,$out4,v25 lvx v25,$x10,$key_ # round[4] bdnz _aesp8_xts_enc5x add $inp,$inp,$taillen cmpwi $taillen,0 vcipher $out0,$out0,v24 vcipher $out1,$out1,v24 vcipher $out2,$out2,v24 vcipher $out3,$out3,v24 vcipher $out4,$out4,v24 subi $inp,$inp,16 vcipher $out0,$out0,v25 vcipher $out1,$out1,v25 vcipher $out2,$out2,v25 vcipher $out3,$out3,v25 vcipher $out4,$out4,v25 vxor $twk0,$twk0,v31 vcipher $out0,$out0,v26 lvsr $inpperm,0,$taillen # $in5 is no more vcipher 
$out1,$out1,v26 vcipher $out2,$out2,v26 vcipher $out3,$out3,v26 vcipher $out4,$out4,v26 vxor $in1,$twk1,v31 vcipher $out0,$out0,v27 lvx_u $in0,0,$inp vcipher $out1,$out1,v27 vcipher $out2,$out2,v27 vcipher $out3,$out3,v27 vcipher $out4,$out4,v27 vxor $in2,$twk2,v31 addi $key_,$sp,`$FRAME+15` # rewind $key_ vcipher $out0,$out0,v28 vcipher $out1,$out1,v28 vcipher $out2,$out2,v28 vcipher $out3,$out3,v28 vcipher $out4,$out4,v28 lvx v24,$x00,$key_ # re-pre-load round[1] vxor $in3,$twk3,v31 vcipher $out0,$out0,v29 le?vperm $in0,$in0,$in0,$leperm vcipher $out1,$out1,v29 vcipher $out2,$out2,v29 vcipher $out3,$out3,v29 vcipher $out4,$out4,v29 lvx v25,$x10,$key_ # re-pre-load round[2] vxor $in4,$twk4,v31 vcipher $out0,$out0,v30 vperm $in0,$in0,$in0,$inpperm vcipher $out1,$out1,v30 vcipher $out2,$out2,v30 vcipher $out3,$out3,v30 vcipher $out4,$out4,v30 vcipherlast $out0,$out0,$twk0 vcipherlast $out1,$out1,$in1 vcipherlast $out2,$out2,$in2 vcipherlast $out3,$out3,$in3 vcipherlast $out4,$out4,$in4 blr .long 0 .byte 0,12,0x14,0,0,0,0,0 .align 5 _aesp8_xts_decrypt6x: $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) mflr r11 li r7,`$FRAME+8*16+15` li r3,`$FRAME+8*16+31` $PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) stvx v20,r7,$sp # ABI says so addi r7,r7,32 stvx v21,r3,$sp addi r3,r3,32 stvx v22,r7,$sp addi r7,r7,32 stvx v23,r3,$sp addi r3,r3,32 stvx v24,r7,$sp addi r7,r7,32 stvx v25,r3,$sp addi r3,r3,32 stvx v26,r7,$sp addi r7,r7,32 stvx v27,r3,$sp addi r3,r3,32 stvx v28,r7,$sp addi r7,r7,32 stvx v29,r3,$sp addi r3,r3,32 stvx v30,r7,$sp stvx v31,r3,$sp li r0,-1 stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave li $x10,0x10 $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) li $x20,0x20 $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) li $x30,0x30 $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) li $x40,0x40 $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) li $x50,0x50 $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) li $x60,0x60 $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) li $x70,0x70 mtspr 256,r0 subi $rounds,$rounds,3 # -4 in total 
lvx $rndkey0,$x00,$key1 # load key schedule lvx v30,$x10,$key1 addi $key1,$key1,0x20 lvx v31,$x00,$key1 ?vperm $rndkey0,$rndkey0,v30,$keyperm addi $key_,$sp,`$FRAME+15` mtctr $rounds Load_xts_dec_key: ?vperm v24,v30,v31,$keyperm lvx v30,$x10,$key1 addi $key1,$key1,0x20 stvx v24,$x00,$key_ # off-load round[1] ?vperm v25,v31,v30,$keyperm lvx v31,$x00,$key1 stvx v25,$x10,$key_ # off-load round[2] addi $key_,$key_,0x20 bdnz Load_xts_dec_key lvx v26,$x10,$key1 ?vperm v24,v30,v31,$keyperm lvx v27,$x20,$key1 stvx v24,$x00,$key_ # off-load round[3] ?vperm v25,v31,v26,$keyperm lvx v28,$x30,$key1 stvx v25,$x10,$key_ # off-load round[4] addi $key_,$sp,`$FRAME+15` # rewind $key_ ?vperm v26,v26,v27,$keyperm lvx v29,$x40,$key1 ?vperm v27,v27,v28,$keyperm lvx v30,$x50,$key1 ?vperm v28,v28,v29,$keyperm lvx v31,$x60,$key1 ?vperm v29,v29,v30,$keyperm lvx $twk5,$x70,$key1 # borrow $twk5 ?vperm v30,v30,v31,$keyperm lvx v24,$x00,$key_ # pre-load round[1] ?vperm v31,v31,$twk5,$keyperm lvx v25,$x10,$key_ # pre-load round[2] vperm $in0,$inout,$inptail,$inpperm subi $inp,$inp,31 # undo "caller" vxor $twk0,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 vand $tmp,$tmp,$eighty7 vxor $out0,$in0,$twk0 vxor $tweak,$tweak,$tmp lvx_u $in1,$x10,$inp vxor $twk1,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 le?vperm $in1,$in1,$in1,$leperm vand $tmp,$tmp,$eighty7 vxor $out1,$in1,$twk1 vxor $tweak,$tweak,$tmp lvx_u $in2,$x20,$inp andi. 
$taillen,$len,15 vxor $twk2,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 le?vperm $in2,$in2,$in2,$leperm vand $tmp,$tmp,$eighty7 vxor $out2,$in2,$twk2 vxor $tweak,$tweak,$tmp lvx_u $in3,$x30,$inp sub $len,$len,$taillen vxor $twk3,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 le?vperm $in3,$in3,$in3,$leperm vand $tmp,$tmp,$eighty7 vxor $out3,$in3,$twk3 vxor $tweak,$tweak,$tmp lvx_u $in4,$x40,$inp subi $len,$len,0x60 vxor $twk4,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 le?vperm $in4,$in4,$in4,$leperm vand $tmp,$tmp,$eighty7 vxor $out4,$in4,$twk4 vxor $tweak,$tweak,$tmp lvx_u $in5,$x50,$inp addi $inp,$inp,0x60 vxor $twk5,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 le?vperm $in5,$in5,$in5,$leperm vand $tmp,$tmp,$eighty7 vxor $out5,$in5,$twk5 vxor $tweak,$tweak,$tmp vxor v31,v31,$rndkey0 mtctr $rounds b Loop_xts_dec6x .align 5 Loop_xts_dec6x: vncipher $out0,$out0,v24 vncipher $out1,$out1,v24 vncipher $out2,$out2,v24 vncipher $out3,$out3,v24 vncipher $out4,$out4,v24 vncipher $out5,$out5,v24 lvx v24,$x20,$key_ # round[3] addi $key_,$key_,0x20 vncipher $out0,$out0,v25 vncipher $out1,$out1,v25 vncipher $out2,$out2,v25 vncipher $out3,$out3,v25 vncipher $out4,$out4,v25 vncipher $out5,$out5,v25 lvx v25,$x10,$key_ # round[4] bdnz Loop_xts_dec6x subic $len,$len,96 # $len-=96 vxor $in0,$twk0,v31 # xor with last round key vncipher $out0,$out0,v24 vncipher $out1,$out1,v24 vsrab $tmp,$tweak,$seven # next tweak value vxor $twk0,$tweak,$rndkey0 vaddubm $tweak,$tweak,$tweak vncipher $out2,$out2,v24 vncipher $out3,$out3,v24 vsldoi $tmp,$tmp,$tmp,15 vncipher $out4,$out4,v24 vncipher $out5,$out5,v24 subfe. 
r0,r0,r0 # borrow?-1:0 vand $tmp,$tmp,$eighty7 vncipher $out0,$out0,v25 vncipher $out1,$out1,v25 vxor $tweak,$tweak,$tmp vncipher $out2,$out2,v25 vncipher $out3,$out3,v25 vxor $in1,$twk1,v31 vsrab $tmp,$tweak,$seven # next tweak value vxor $twk1,$tweak,$rndkey0 vncipher $out4,$out4,v25 vncipher $out5,$out5,v25 and r0,r0,$len vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 vncipher $out0,$out0,v26 vncipher $out1,$out1,v26 vand $tmp,$tmp,$eighty7 vncipher $out2,$out2,v26 vncipher $out3,$out3,v26 vxor $tweak,$tweak,$tmp vncipher $out4,$out4,v26 vncipher $out5,$out5,v26 add $inp,$inp,r0 # $inp is adjusted in such # way that at exit from the # loop inX-in5 are loaded # with last "words" vxor $in2,$twk2,v31 vsrab $tmp,$tweak,$seven # next tweak value vxor $twk2,$tweak,$rndkey0 vaddubm $tweak,$tweak,$tweak vncipher $out0,$out0,v27 vncipher $out1,$out1,v27 vsldoi $tmp,$tmp,$tmp,15 vncipher $out2,$out2,v27 vncipher $out3,$out3,v27 vand $tmp,$tmp,$eighty7 vncipher $out4,$out4,v27 vncipher $out5,$out5,v27 addi $key_,$sp,`$FRAME+15` # rewind $key_ vxor $tweak,$tweak,$tmp vncipher $out0,$out0,v28 vncipher $out1,$out1,v28 vxor $in3,$twk3,v31 vsrab $tmp,$tweak,$seven # next tweak value vxor $twk3,$tweak,$rndkey0 vncipher $out2,$out2,v28 vncipher $out3,$out3,v28 vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 vncipher $out4,$out4,v28 vncipher $out5,$out5,v28 lvx v24,$x00,$key_ # re-pre-load round[1] vand $tmp,$tmp,$eighty7 vncipher $out0,$out0,v29 vncipher $out1,$out1,v29 vxor $tweak,$tweak,$tmp vncipher $out2,$out2,v29 vncipher $out3,$out3,v29 vxor $in4,$twk4,v31 vsrab $tmp,$tweak,$seven # next tweak value vxor $twk4,$tweak,$rndkey0 vncipher $out4,$out4,v29 vncipher $out5,$out5,v29 lvx v25,$x10,$key_ # re-pre-load round[2] vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 vncipher $out0,$out0,v30 vncipher $out1,$out1,v30 vand $tmp,$tmp,$eighty7 vncipher $out2,$out2,v30 vncipher $out3,$out3,v30 vxor $tweak,$tweak,$tmp vncipher $out4,$out4,v30 vncipher 
$out5,$out5,v30 vxor $in5,$twk5,v31 vsrab $tmp,$tweak,$seven # next tweak value vxor $twk5,$tweak,$rndkey0 vncipherlast $out0,$out0,$in0 lvx_u $in0,$x00,$inp # load next input block vaddubm $tweak,$tweak,$tweak vsldoi $tmp,$tmp,$tmp,15 vncipherlast $out1,$out1,$in1 lvx_u $in1,$x10,$inp vncipherlast $out2,$out2,$in2 le?vperm $in0,$in0,$in0,$leperm lvx_u $in2,$x20,$inp vand $tmp,$tmp,$eighty7 vncipherlast $out3,$out3,$in3 le?vperm $in1,$in1,$in1,$leperm lvx_u $in3,$x30,$inp vncipherlast $out4,$out4,$in4 le?vperm $in2,$in2,$in2,$leperm lvx_u $in4,$x40,$inp vxor $tweak,$tweak,$tmp vncipherlast $out5,$out5,$in5 le?vperm $in3,$in3,$in3,$leperm lvx_u $in5,$x50,$inp addi $inp,$inp,0x60 le?vperm $in4,$in4,$in4,$leperm le?vperm $in5,$in5,$in5,$leperm le?vperm $out0,$out0,$out0,$leperm le?vperm $out1,$out1,$out1,$leperm stvx_u $out0,$x00,$out # store output vxor $out0,$in0,$twk0 le?vperm $out2,$out2,$out2,$leperm stvx_u $out1,$x10,$out vxor $out1,$in1,$twk1 le?vperm $out3,$out3,$out3,$leperm stvx_u $out2,$x20,$out vxor $out2,$in2,$twk2 le?vperm $out4,$out4,$out4,$leperm stvx_u $out3,$x30,$out vxor $out3,$in3,$twk3 le?vperm $out5,$out5,$out5,$leperm stvx_u $out4,$x40,$out vxor $out4,$in4,$twk4 stvx_u $out5,$x50,$out vxor $out5,$in5,$twk5 addi $out,$out,0x60 mtctr $rounds beq Loop_xts_dec6x # did $len-=96 borrow? addic. 
$len,$len,0x60 beq Lxts_dec6x_zero cmpwi $len,0x20 blt Lxts_dec6x_one nop beq Lxts_dec6x_two cmpwi $len,0x40 blt Lxts_dec6x_three nop beq Lxts_dec6x_four Lxts_dec6x_five: vxor $out0,$in1,$twk0 vxor $out1,$in2,$twk1 vxor $out2,$in3,$twk2 vxor $out3,$in4,$twk3 vxor $out4,$in5,$twk4 bl _aesp8_xts_dec5x le?vperm $out0,$out0,$out0,$leperm vmr $twk0,$twk5 # unused tweak vxor $twk1,$tweak,$rndkey0 le?vperm $out1,$out1,$out1,$leperm stvx_u $out0,$x00,$out # store output vxor $out0,$in0,$twk1 le?vperm $out2,$out2,$out2,$leperm stvx_u $out1,$x10,$out le?vperm $out3,$out3,$out3,$leperm stvx_u $out2,$x20,$out le?vperm $out4,$out4,$out4,$leperm stvx_u $out3,$x30,$out stvx_u $out4,$x40,$out addi $out,$out,0x50 bne Lxts_dec6x_steal b Lxts_dec6x_done .align 4 Lxts_dec6x_four: vxor $out0,$in2,$twk0 vxor $out1,$in3,$twk1 vxor $out2,$in4,$twk2 vxor $out3,$in5,$twk3 vxor $out4,$out4,$out4 bl _aesp8_xts_dec5x le?vperm $out0,$out0,$out0,$leperm vmr $twk0,$twk4 # unused tweak vmr $twk1,$twk5 le?vperm $out1,$out1,$out1,$leperm stvx_u $out0,$x00,$out # store output vxor $out0,$in0,$twk5 le?vperm $out2,$out2,$out2,$leperm stvx_u $out1,$x10,$out le?vperm $out3,$out3,$out3,$leperm stvx_u $out2,$x20,$out stvx_u $out3,$x30,$out addi $out,$out,0x40 bne Lxts_dec6x_steal b Lxts_dec6x_done .align 4 Lxts_dec6x_three: vxor $out0,$in3,$twk0 vxor $out1,$in4,$twk1 vxor $out2,$in5,$twk2 vxor $out3,$out3,$out3 vxor $out4,$out4,$out4 bl _aesp8_xts_dec5x le?vperm $out0,$out0,$out0,$leperm vmr $twk0,$twk3 # unused tweak vmr $twk1,$twk4 le?vperm $out1,$out1,$out1,$leperm stvx_u $out0,$x00,$out # store output vxor $out0,$in0,$twk4 le?vperm $out2,$out2,$out2,$leperm stvx_u $out1,$x10,$out stvx_u $out2,$x20,$out addi $out,$out,0x30 bne Lxts_dec6x_steal b Lxts_dec6x_done .align 4 Lxts_dec6x_two: vxor $out0,$in4,$twk0 vxor $out1,$in5,$twk1 vxor $out2,$out2,$out2 vxor $out3,$out3,$out3 vxor $out4,$out4,$out4 bl _aesp8_xts_dec5x le?vperm $out0,$out0,$out0,$leperm vmr $twk0,$twk2 # unused tweak vmr $twk1,$twk3 
le?vperm $out1,$out1,$out1,$leperm stvx_u $out0,$x00,$out # store output vxor $out0,$in0,$twk3 stvx_u $out1,$x10,$out addi $out,$out,0x20 bne Lxts_dec6x_steal b Lxts_dec6x_done .align 4 Lxts_dec6x_one: vxor $out0,$in5,$twk0 nop Loop_xts_dec1x: vncipher $out0,$out0,v24 lvx v24,$x20,$key_ # round[3] addi $key_,$key_,0x20 vncipher $out0,$out0,v25 lvx v25,$x10,$key_ # round[4] bdnz Loop_xts_dec1x subi r0,$taillen,1 vncipher $out0,$out0,v24 andi. r0,r0,16 cmpwi $taillen,0 vncipher $out0,$out0,v25 sub $inp,$inp,r0 vncipher $out0,$out0,v26 lvx_u $in0,0,$inp vncipher $out0,$out0,v27 addi $key_,$sp,`$FRAME+15` # rewind $key_ vncipher $out0,$out0,v28 lvx v24,$x00,$key_ # re-pre-load round[1] vncipher $out0,$out0,v29 lvx v25,$x10,$key_ # re-pre-load round[2] vxor $twk0,$twk0,v31 le?vperm $in0,$in0,$in0,$leperm vncipher $out0,$out0,v30 mtctr $rounds vncipherlast $out0,$out0,$twk0 vmr $twk0,$twk1 # unused tweak vmr $twk1,$twk2 le?vperm $out0,$out0,$out0,$leperm stvx_u $out0,$x00,$out # store output addi $out,$out,0x10 vxor $out0,$in0,$twk2 bne Lxts_dec6x_steal b Lxts_dec6x_done .align 4 Lxts_dec6x_zero: cmpwi $taillen,0 beq Lxts_dec6x_done lvx_u $in0,0,$inp le?vperm $in0,$in0,$in0,$leperm vxor $out0,$in0,$twk1 Lxts_dec6x_steal: vncipher $out0,$out0,v24 lvx v24,$x20,$key_ # round[3] addi $key_,$key_,0x20 vncipher $out0,$out0,v25 lvx v25,$x10,$key_ # round[4] bdnz Lxts_dec6x_steal add $inp,$inp,$taillen vncipher $out0,$out0,v24 cmpwi $taillen,0 vncipher $out0,$out0,v25 lvx_u $in0,0,$inp vncipher $out0,$out0,v26 lvsr $inpperm,0,$taillen # $in5 is no more vncipher $out0,$out0,v27 addi $key_,$sp,`$FRAME+15` # rewind $key_ vncipher $out0,$out0,v28 lvx v24,$x00,$key_ # re-pre-load round[1] vncipher $out0,$out0,v29 lvx v25,$x10,$key_ # re-pre-load round[2] vxor $twk1,$twk1,v31 le?vperm $in0,$in0,$in0,$leperm vncipher $out0,$out0,v30 vperm $in0,$in0,$in0,$inpperm vncipherlast $tmp,$out0,$twk1 le?vperm $out0,$tmp,$tmp,$leperm le?stvx_u $out0,0,$out be?stvx_u $tmp,0,$out vxor 
$out0,$out0,$out0 vspltisb $out1,-1 vperm $out0,$out0,$out1,$inpperm vsel $out0,$in0,$tmp,$out0 vxor $out0,$out0,$twk0 subi r30,$out,1 mtctr $taillen Loop_xts_dec6x_steal: lbzu r0,1(r30) stb r0,16(r30) bdnz Loop_xts_dec6x_steal li $taillen,0 mtctr $rounds b Loop_xts_dec1x # one more time... .align 4 Lxts_dec6x_done: ${UCMP}i $ivp,0 beq Lxts_dec6x_ret vxor $tweak,$twk0,$rndkey0 le?vperm $tweak,$tweak,$tweak,$leperm stvx_u $tweak,0,$ivp Lxts_dec6x_ret: mtlr r11 li r10,`$FRAME+15` li r11,`$FRAME+31` stvx $seven,r10,$sp # wipe copies of round keys addi r10,r10,32 stvx $seven,r11,$sp addi r11,r11,32 stvx $seven,r10,$sp addi r10,r10,32 stvx $seven,r11,$sp addi r11,r11,32 stvx $seven,r10,$sp addi r10,r10,32 stvx $seven,r11,$sp addi r11,r11,32 stvx $seven,r10,$sp addi r10,r10,32 stvx $seven,r11,$sp addi r11,r11,32 mtspr 256,$vrsave lvx v20,r10,$sp # ABI says so addi r10,r10,32 lvx v21,r11,$sp addi r11,r11,32 lvx v22,r10,$sp addi r10,r10,32 lvx v23,r11,$sp addi r11,r11,32 lvx v24,r10,$sp addi r10,r10,32 lvx v25,r11,$sp addi r11,r11,32 lvx v26,r10,$sp addi r10,r10,32 lvx v27,r11,$sp addi r11,r11,32 lvx v28,r10,$sp addi r10,r10,32 lvx v29,r11,$sp addi r11,r11,32 lvx v30,r10,$sp lvx v31,r11,$sp $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` blr .long 0 .byte 0,12,0x04,1,0x80,6,6,0 .long 0 .align 5 _aesp8_xts_dec5x: vncipher $out0,$out0,v24 vncipher $out1,$out1,v24 vncipher $out2,$out2,v24 vncipher $out3,$out3,v24 vncipher $out4,$out4,v24 lvx v24,$x20,$key_ # round[3] addi $key_,$key_,0x20 vncipher $out0,$out0,v25 vncipher $out1,$out1,v25 vncipher $out2,$out2,v25 vncipher $out3,$out3,v25 vncipher $out4,$out4,v25 lvx v25,$x10,$key_ # round[4] bdnz _aesp8_xts_dec5x subi r0,$taillen,1 vncipher $out0,$out0,v24 vncipher $out1,$out1,v24 vncipher $out2,$out2,v24 
vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	andi.		r0,r0,16
	cmpwi		$taillen,0
	vncipher	$out0,$out0,v25
	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	 vxor		$twk0,$twk0,v31

	sub		$inp,$inp,r0
	vncipher	$out0,$out0,v26
	vncipher	$out1,$out1,v26
	vncipher	$out2,$out2,v26
	vncipher	$out3,$out3,v26
	vncipher	$out4,$out4,v26
	 vxor		$in1,$twk1,v31

	vncipher	$out0,$out0,v27
	lvx_u		$in0,0,$inp
	vncipher	$out1,$out1,v27
	vncipher	$out2,$out2,v27
	vncipher	$out3,$out3,v27
	vncipher	$out4,$out4,v27
	 vxor		$in2,$twk2,v31

	addi		$key_,$sp,`$FRAME+15`	# rewind $key_
	vncipher	$out0,$out0,v28
	vncipher	$out1,$out1,v28
	vncipher	$out2,$out2,v28
	vncipher	$out3,$out3,v28
	vncipher	$out4,$out4,v28
	lvx		v24,$x00,$key_		# re-pre-load round[1]
	 vxor		$in3,$twk3,v31

	vncipher	$out0,$out0,v29
	le?vperm	$in0,$in0,$in0,$leperm
	vncipher	$out1,$out1,v29
	vncipher	$out2,$out2,v29
	vncipher	$out3,$out3,v29
	vncipher	$out4,$out4,v29
	lvx		v25,$x10,$key_		# re-pre-load round[2]
	 vxor		$in4,$twk4,v31

	vncipher	$out0,$out0,v30
	vncipher	$out1,$out1,v30
	vncipher	$out2,$out2,v30
	vncipher	$out3,$out3,v30
	vncipher	$out4,$out4,v30

	vncipherlast	$out0,$out0,$twk0
	vncipherlast	$out1,$out1,$in1
	vncipherlast	$out2,$out2,$in2
	vncipherlast	$out3,$out3,$in3
	vncipherlast	$out4,$out4,$in4
	mtctr		$rounds
	blr
        .long   	0
        .byte   	0,12,0x14,0,0,0,0,0
___
}}	}}}

# Post-process the accumulated perlasm text line by line and write the
# result to STDOUT (a pipe into ppc-xlate.pl, see the open() above):
#
#   * every `expr` span is replaced by the value of the Perl expression;
#   * while still inside the constants table (i.e. before the Lconsts:
#     label), .long/.byte rows carrying a trailing ?tag are re-emitted as
#     plain .byte rows, with the byte order adjusted on little-endian
#     flavours according to the tag (?rev, ?inv; any other tag, e.g.
#     ?asis, leaves the bytes as they are);
#   * le?/be? prefixes and '?'-prefixed instructions are resolved for the
#     endianness selected by $flavour.
my $consts=1;				# true until Lconsts: has been seen
my $little_endian=($flavour =~ /le$/);	# loop-invariant, test it once

foreach my $line (split("\n",$code)) {
	$line =~ s/\`([^\`]*)\`/eval($1)/ge;

	# constants table endian-specific conversion
	if ($consts && $line =~ m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/) {
	    # copy the captures right away, later matches overwrite them
	    my ($directive,$operands,$conv)=($1,$2,$3);

	    # operands with a leading 0 go through oct() (which also
	    # understands the 0x prefix), everything else through int()
	    my @values=map(/^0/ ? oct($_) : int($_),split(/,\s*/,$operands));
	    my @bytes=();

	    # convert to endian-agnostic format
	    if ($directive eq "long") {
		# split every 32-bit value, most significant byte first
		foreach my $word (@values) {
		    push @bytes,($word>>24)&0xff,($word>>16)&0xff,
				($word>>8)&0xff,$word&0xff;
		}
	    } else {
		@bytes=@values;
	    }

	    # little-endian conversion
	    if ($little_endian) {
		if    ($conv =~ /\?inv/) { @bytes=map($_^0xf,@bytes); }
		elsif ($conv =~ /\?rev/) { @bytes=reverse(@bytes);    }
	    }

	    #emit
	    print ".byte\t",join(',',map(sprintf("0x%02x",$_),@bytes)),"\n";
	    next;
	}
	$consts=0 if ($line =~ m/Lconsts:/);	# end of table

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	# At most one rewrite is applied per line: the "or" chain stops at
	# the first substitution that succeeds.
	if ($little_endian) {		# little-endian
	    $line =~ s/le\?//		or
	    $line =~ s/be\?/#be#/	or
	    $line =~ s/\?lvsr/lvsl/	or
	    $line =~ s/\?lvsl/lvsr/	or
	    $line =~ s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/ or
	    $line =~ s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/ or
	    $line =~ s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/;
	} else {			# big-endian
	    $line =~ s/le\?/#le#/	or
	    $line =~ s/be\?//		or
	    $line =~ s/\?([a-z]+)/$1/;
	}

	print $line,"\n";
}

# STDOUT is a pipe to the translator, so close() also reports failure when
# the child exits non-zero; in that case $! is 0 and the wait status is in
# $?.  Report both so a failed build says why.
close STDOUT or die "error closing STDOUT: $! (wait status $?)";