//!
//! SSE2-optimized implementation for x86/x86-64 CPUs.
66
7- use crate :: { rounds:: Rounds , BLOCK_SIZE , CONSTANTS , IV_SIZE , KEY_SIZE } ;
7+ use crate :: { rounds:: Rounds , CONSTANTS , IV_SIZE , BLOCK_SIZE , KEY_SIZE } ;
88use core:: { convert:: TryInto , marker:: PhantomData } ;
9+ use super :: autodetect:: BUFFER_SIZE ;
910
1011#[ cfg( target_arch = "x86" ) ]
1112use core:: arch:: x86:: * ;
1213#[ cfg( target_arch = "x86_64" ) ]
1314use core:: arch:: x86_64:: * ;
1415
15- /// Size of buffers passed to `generate` and `apply_keystream` for this backend
16- pub ( crate ) const BUFFER_SIZE : usize = BLOCK_SIZE ;
17-
1816/// The ChaCha20 block function (SSE2 accelerated implementation for x86/x86_64)
1917// TODO(tarcieri): zeroize?
20- #[ derive( Clone ) ]
18+ #[ derive( Copy , Clone ) ]
2119pub ( crate ) struct State < R : Rounds > {
2220 v0 : __m128i ,
2321 v1 : __m128i ,
@@ -49,11 +47,13 @@ impl<R: Rounds> State<R> {
4947 pub ( crate ) fn generate ( & self , counter : u64 , output : & mut [ u8 ] ) {
5048 debug_assert_eq ! ( output. len( ) , BUFFER_SIZE ) ;
5149
52- unsafe {
53- let ( mut v0, mut v1, mut v2) = ( self . v0 , self . v1 , self . v2 ) ;
54- let mut v3 = iv_setup ( self . iv , counter) ;
55- self . rounds ( & mut v0, & mut v1, & mut v2, & mut v3) ;
56- store ( v0, v1, v2, v3, output)
50+ for ( i, chunk) in output. chunks_exact_mut ( BLOCK_SIZE ) . enumerate ( ) {
51+ unsafe {
52+ let ( mut v0, mut v1, mut v2) = ( self . v0 , self . v1 , self . v2 ) ;
53+ let mut v3 = iv_setup ( self . iv , counter. checked_add ( i as u64 ) . unwrap ( ) ) ;
54+ self . rounds ( & mut v0, & mut v1, & mut v2, & mut v3) ;
55+ store ( v0, v1, v2, v3, chunk)
56+ }
5757 }
5858 }
5959
@@ -63,15 +63,17 @@ impl<R: Rounds> State<R> {
6363 pub ( crate ) fn apply_keystream ( & self , counter : u64 , output : & mut [ u8 ] ) {
6464 debug_assert_eq ! ( output. len( ) , BUFFER_SIZE ) ;
6565
66- unsafe {
67- let ( mut v0, mut v1, mut v2) = ( self . v0 , self . v1 , self . v2 ) ;
68- let mut v3 = iv_setup ( self . iv , counter) ;
69- self . rounds ( & mut v0, & mut v1, & mut v2, & mut v3) ;
70-
71- for ( chunk, a) in output. chunks_mut ( 0x10 ) . zip ( & [ v0, v1, v2, v3] ) {
72- let b = _mm_loadu_si128 ( chunk. as_ptr ( ) as * const __m128i ) ;
73- let out = _mm_xor_si128 ( * a, b) ;
74- _mm_storeu_si128 ( chunk. as_mut_ptr ( ) as * mut __m128i , out) ;
66+ for ( i, chunk) in output. chunks_exact_mut ( BLOCK_SIZE ) . enumerate ( ) {
67+ unsafe {
68+ let ( mut v0, mut v1, mut v2) = ( self . v0 , self . v1 , self . v2 ) ;
69+ let mut v3 = iv_setup ( self . iv , counter. checked_add ( i as u64 ) . unwrap ( ) ) ;
70+ self . rounds ( & mut v0, & mut v1, & mut v2, & mut v3) ;
71+
72+ for ( ch, a) in chunk. chunks_exact_mut ( 0x10 ) . zip ( & [ v0, v1, v2, v3] ) {
73+ let b = _mm_loadu_si128 ( ch. as_ptr ( ) as * const __m128i ) ;
74+ let out = _mm_xor_si128 ( * a, b) ;
75+ _mm_storeu_si128 ( ch. as_mut_ptr ( ) as * mut __m128i , out) ;
76+ }
7577 }
7678 }
7779 }
@@ -263,12 +265,12 @@ mod tests {
263265
// Checks that this backend's `generate` output matches the `soft` backend
// for the same key, IV and starting counter.
#[test]
fn generate_vs_scalar_impl() {
    // Reference output from the `soft` implementation, sized to its own buffer.
    let mut soft_result = [0u8; soft::BUFFER_SIZE];
    soft::State::<R20>::new(&R_KEY, R_IV).generate(R_CNT, &mut soft_result);

    // Output from this backend, sized to the `BUFFER_SIZE` it is built with.
    let mut simd_result = [0u8; BUFFER_SIZE];
    State::<R20>::new(&R_KEY, R_IV).generate(R_CNT, &mut simd_result);

    // The two buffer sizes may differ, so only the leading
    // `soft::BUFFER_SIZE` bytes are compared. The slice index panics if
    // `BUFFER_SIZE` is smaller than `soft::BUFFER_SIZE`.
    assert_eq!(&soft_result[..], &simd_result[..soft::BUFFER_SIZE])
}
274276}
0 commit comments