diff --git a/aes/src/ni/aes128/expand.rs b/aes/src/ni/aes128/expand.rs index cc4afbce..f7b65b66 100644 --- a/aes/src/ni/aes128/expand.rs +++ b/aes/src/ni/aes128/expand.rs @@ -5,7 +5,7 @@ use core::mem; macro_rules! expand_round { ($enc_keys:expr, $dec_keys:expr, $pos:expr, $round:expr) => { - let mut t1 = _mm_load_si128($enc_keys.as_ptr().offset($pos - 1)); + let mut t1 = $enc_keys[$pos - 1]; let mut t2; let mut t3; @@ -19,9 +19,9 @@ macro_rules! expand_round { t1 = _mm_xor_si128(t1, t3); t1 = _mm_xor_si128(t1, t2); - _mm_store_si128($enc_keys.as_mut_ptr().offset($pos), t1); + $enc_keys[$pos] = t1; let t1 = if $pos != 10 { _mm_aesimc_si128(t1) } else { t1 }; - _mm_store_si128($dec_keys.as_mut_ptr().offset($pos), t1); + $dec_keys[$pos] = t1; }; } @@ -34,8 +34,8 @@ pub(super) fn expand(key: &[u8; 16]) -> (RoundKeys, RoundKeys) { // Safety: `loadu` supports unaligned loads #[allow(clippy::cast_ptr_alignment)] let k = _mm_loadu_si128(key.as_ptr() as *const __m128i); - _mm_store_si128(enc_keys.as_mut_ptr(), k); - _mm_store_si128(dec_keys.as_mut_ptr(), k); + enc_keys[0] = k; + dec_keys[0] = k; expand_round!(enc_keys, dec_keys, 1, 0x01); expand_round!(enc_keys, dec_keys, 2, 0x02); diff --git a/aes/src/ni/aes192/expand.rs b/aes/src/ni/aes192/expand.rs index 9e37798b..797b9867 100644 --- a/aes/src/ni/aes192/expand.rs +++ b/aes/src/ni/aes192/expand.rs @@ -42,8 +42,8 @@ pub(super) fn expand(key: &[u8; 24]) -> (RoundKeys, RoundKeys) { macro_rules! store { ($i:expr, $k:expr) => { - _mm_store_si128(enc_keys.as_mut_ptr().offset($i), $k); - _mm_store_si128(dec_keys.as_mut_ptr().offset($i), _mm_aesimc_si128($k)); + enc_keys[$i] = $k; + dec_keys[$i] = _mm_aesimc_si128($k); }; } @@ -63,8 +63,8 @@ pub(super) fn expand(key: &[u8; 24]) -> (RoundKeys, RoundKeys) { ) }; - _mm_store_si128(enc_keys.as_mut_ptr(), k0); - _mm_store_si128(dec_keys.as_mut_ptr(), k0); + enc_keys[0] = k0; + dec_keys[0] = k0; let (k1_2, k2r) = expand_round!(k0, k1l, 0x01); let k1 = shuffle!(k1l, k1_2, 0); @@ -100,8 +100,8 @@ pub(super) fn expand(key: &[u8; 24]) -> (RoundKeys, RoundKeys) { store!(11, k11); let (k12, _) = expand_round!(k10_11, k11r, 0x80); - _mm_store_si128(enc_keys.as_mut_ptr().offset(12), k12); - _mm_store_si128(dec_keys.as_mut_ptr().offset(12), k12); + enc_keys[12] = k12; + dec_keys[12] = k12; (enc_keys, dec_keys) } diff --git a/aes/src/ni/aes256/expand.rs b/aes/src/ni/aes256/expand.rs index 43bd0399..88b8558a 100644 --- a/aes/src/ni/aes256/expand.rs +++ b/aes/src/ni/aes256/expand.rs @@ -5,9 +5,9 @@ use core::mem; macro_rules! expand_round { ($enc_keys:expr, $dec_keys:expr, $pos:expr, $round:expr) => { - let mut t1 = _mm_load_si128($enc_keys.as_ptr().offset($pos - 2)); + let mut t1 = $enc_keys[$pos - 2]; let mut t2; - let mut t3 = _mm_load_si128($enc_keys.as_ptr().offset($pos - 1)); + let mut t3 = $enc_keys[$pos - 1]; let mut t4; t2 = _mm_aeskeygenassist_si128(t3, $round); @@ -20,9 +20,8 @@ macro_rules! expand_round { t1 = _mm_xor_si128(t1, t4); t1 = _mm_xor_si128(t1, t2); - _mm_store_si128($enc_keys.as_mut_ptr().offset($pos), t1); - let t = _mm_aesimc_si128(t1); - _mm_store_si128($dec_keys.as_mut_ptr().offset($pos), t); + $enc_keys[$pos] = t1; + $dec_keys[$pos] = _mm_aesimc_si128(t1); t4 = _mm_aeskeygenassist_si128(t1, 0x00); t2 = _mm_shuffle_epi32(t4, 0xaa); @@ -34,17 +33,16 @@ macro_rules! expand_round { t3 = _mm_xor_si128(t3, t4); t3 = _mm_xor_si128(t3, t2); - _mm_store_si128($enc_keys.as_mut_ptr().offset($pos + 1), t3); - let t = _mm_aesimc_si128(t3); - _mm_store_si128($dec_keys.as_mut_ptr().offset($pos + 1), t); + $enc_keys[$pos + 1] = t3; + $dec_keys[$pos + 1] = _mm_aesimc_si128(t3); }; } macro_rules! expand_round_last { ($enc_keys:expr, $dec_keys:expr, $pos:expr, $round:expr) => { - let mut t1 = _mm_load_si128($enc_keys.as_ptr().offset($pos - 2)); + let mut t1 = $enc_keys[$pos - 2]; let mut t2; - let t3 = _mm_load_si128($enc_keys.as_ptr().offset($pos - 1)); + let t3 = $enc_keys[$pos - 1]; let mut t4; t2 = _mm_aeskeygenassist_si128(t3, $round); @@ -57,8 +55,8 @@ macro_rules! expand_round_last { t1 = _mm_xor_si128(t1, t4); t1 = _mm_xor_si128(t1, t2); - _mm_store_si128($enc_keys.as_mut_ptr().offset($pos), t1); - _mm_store_si128($dec_keys.as_mut_ptr().offset($pos), t1); + $enc_keys[$pos] = t1; + $dec_keys[$pos] = t1; }; } @@ -73,10 +71,10 @@ pub(super) fn expand(key: &[u8; 32]) -> (RoundKeys, RoundKeys) { let kp = key.as_ptr() as *const __m128i; let k1 = _mm_loadu_si128(kp); let k2 = _mm_loadu_si128(kp.offset(1)); - _mm_store_si128(enc_keys.as_mut_ptr(), k1); - _mm_store_si128(dec_keys.as_mut_ptr(), k1); - _mm_store_si128(enc_keys.as_mut_ptr().offset(1), k2); - _mm_store_si128(dec_keys.as_mut_ptr().offset(1), _mm_aesimc_si128(k2)); + enc_keys[0] = k1; + dec_keys[0] = k1; + enc_keys[1] = k2; + dec_keys[1] = _mm_aesimc_si128(k2); expand_round!(enc_keys, dec_keys, 2, 0x01); expand_round!(enc_keys, dec_keys, 4, 0x02); diff --git a/aes/src/ni/utils.rs b/aes/src/ni/utils.rs index c1b8264c..c24d98ce 100644 --- a/aes/src/ni/utils.rs +++ b/aes/src/ni/utils.rs @@ -1,5 +1,8 @@ //! Utility functions +// TODO(tarcieri): check performance impact / generated assembly changes +#![allow(clippy::needless_range_loop)] + use super::arch::*; use cipher::{ consts::{U16, U8}, @@ -64,57 +67,29 @@ pub(crate) fn xor8(b: &mut U128x8, key: __m128i) { } #[inline(always)] -pub(crate) fn aesenc8(b: &mut U128x8, key: __m128i) { - unsafe { - b[0] = _mm_aesenc_si128(b[0], key); - b[1] = _mm_aesenc_si128(b[1], key); - b[2] = _mm_aesenc_si128(b[2], key); - b[3] = _mm_aesenc_si128(b[3], key); - b[4] = _mm_aesenc_si128(b[4], key); - b[5] = _mm_aesenc_si128(b[5], key); - b[6] = _mm_aesenc_si128(b[6], key); - b[7] = _mm_aesenc_si128(b[7], key); +pub(crate) fn aesenc8(buffer: &mut U128x8, key: __m128i) { + for i in 0..8 { + buffer[i] = unsafe { _mm_aesenc_si128(buffer[i], key) }; } } #[inline(always)] -pub(crate) fn aesenclast8(b: &mut U128x8, key: __m128i) { - unsafe { - b[0] = _mm_aesenclast_si128(b[0], key); - b[1] = _mm_aesenclast_si128(b[1], key); - b[2] = _mm_aesenclast_si128(b[2], key); - b[3] = _mm_aesenclast_si128(b[3], key); - b[4] = _mm_aesenclast_si128(b[4], key); - b[5] = _mm_aesenclast_si128(b[5], key); - b[6] = _mm_aesenclast_si128(b[6], key); - b[7] = _mm_aesenclast_si128(b[7], key); +pub(crate) fn aesenclast8(buffer: &mut U128x8, key: __m128i) { + for i in 0..8 { + buffer[i] = unsafe { _mm_aesenclast_si128(buffer[i], key) }; } } #[inline(always)] -pub(crate) fn aesdec8(b: &mut U128x8, key: __m128i) { - unsafe { - b[0] = _mm_aesdec_si128(b[0], key); - b[1] = _mm_aesdec_si128(b[1], key); - b[2] = _mm_aesdec_si128(b[2], key); - b[3] = _mm_aesdec_si128(b[3], key); - b[4] = _mm_aesdec_si128(b[4], key); - b[5] = _mm_aesdec_si128(b[5], key); - b[6] = _mm_aesdec_si128(b[6], key); - b[7] = _mm_aesdec_si128(b[7], key); +pub(crate) fn aesdec8(buffer: &mut U128x8, key: __m128i) { + for i in 0..8 { + buffer[i] = unsafe { _mm_aesdec_si128(buffer[i], key) }; } } #[inline(always)] -pub(crate) fn aesdeclast8(b: &mut U128x8, key: __m128i) { - unsafe { - b[0] = _mm_aesdeclast_si128(b[0], key); - b[1] = _mm_aesdeclast_si128(b[1], key); - b[2] = _mm_aesdeclast_si128(b[2], key); - b[3] = _mm_aesdeclast_si128(b[3], key); - b[4] = _mm_aesdeclast_si128(b[4], key); - b[5] = _mm_aesdeclast_si128(b[5], key); - b[6] = _mm_aesdeclast_si128(b[6], key); - b[7] = _mm_aesdeclast_si128(b[7], key); - }; +pub(crate) fn aesdeclast8(buffer: &mut U128x8, key: __m128i) { + for i in 0..8 { + buffer[i] = unsafe { _mm_aesdeclast_si128(buffer[i], key) }; + } }