Skip to content

Commit 1767c76

Browse files
committed
chacha20: AVX2 detection
Automatically detects the availability of AVX2 on x86(_64) architectures based on CPUID, and falls back to the SSE2" backend if unavailable.
1 parent 8dcd064 commit 1767c76

9 files changed

Lines changed: 156 additions & 84 deletions

File tree

.github/workflows/chacha20.yml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,14 +31,16 @@ jobs:
3131
- uses: actions/checkout@v1
3232
- uses: actions-rs/toolchain@v1
3333
with:
34-
profile: minimal
3534
toolchain: ${{ matrix.rust }}
3635
target: ${{ matrix.target }}
3736
override: true
38-
- run: cargo build --no-default-features --release --target ${{ matrix.target }}
39-
env:
40-
CARGO_INCREMENTAL: 0
41-
RUSTFLAGS: -D warnings
37+
profile: minimal
38+
- run: cargo build --target ${{ matrix.target }} --release --no-default-features --features cipher
39+
- run: cargo build --target ${{ matrix.target }} --release --no-default-features --features force-soft
40+
- run: cargo build --target ${{ matrix.target }} --release --no-default-features --features legacy
41+
- run: cargo build --target ${{ matrix.target }} --release --no-default-features --features rng
42+
- run: cargo build --target ${{ matrix.target }} --release --no-default-features --features xchacha20
43+
- run: cargo build --target ${{ matrix.target }} --release --no-default-features --features cipher,force-soft,legacy,rng,xchacha20,zeroize
4244

4345
test:
4446
runs-on: ubuntu-latest

Cargo.lock

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

chacha20/Cargo.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,19 +17,24 @@ readme = "README.md"
1717
edition = "2018"
1818

1919
[dependencies]
20+
cfg-if = "1"
2021
cipher = { version = "=0.3.0-pre", optional = true }
2122
rand_core = { version = "0.5", optional = true, default-features = false }
2223
zeroize = { version = "1", optional = true, default-features = false }
2324

25+
[target.'cfg(any(target_arch = "x86_64", target_arch = "x86"))'.dependencies]
26+
cpuid-bool = "0.2"
27+
2428
[dev-dependencies]
2529
cipher = { version = "=0.3.0-pre", features = ["dev"] }
2630
hex-literal = "0.2"
2731

2832
[features]
2933
default = ["xchacha20"]
34+
force-soft = []
3035
legacy = ["cipher"]
31-
xchacha20 = ["cipher"]
3236
rng = ["rand_core"]
37+
xchacha20 = ["cipher"]
3338

3439
[package.metadata.docs.rs]
3540
all-features = true

chacha20/src/backend.rs

Lines changed: 21 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -2,50 +2,26 @@
22
//!
33
//! <https://tools.ietf.org/html/rfc8439#section-2.3>
44
5-
// TODO(tarcieri): figure out what circumstances these occur in
6-
#![allow(unused_imports)]
7-
8-
pub(crate) mod soft;
9-
10-
use crate::rounds::Rounds;
11-
12-
#[cfg(all(
13-
any(target_arch = "x86", target_arch = "x86_64"),
14-
target_feature = "sse2",
15-
not(target_feature = "avx2")
16-
))]
17-
mod sse2;
18-
19-
#[cfg(all(
20-
any(target_arch = "x86", target_arch = "x86_64"),
21-
target_feature = "avx2"
22-
))]
23-
mod avx2;
24-
25-
#[cfg(not(all(
26-
any(target_arch = "x86", target_arch = "x86_64"),
27-
any(target_feature = "sse2", target_feature = "avx2")
28-
)))]
29-
pub(crate) use self::soft::{State, BUFFER_SIZE};
30-
31-
#[cfg(all(
32-
any(target_arch = "x86", target_arch = "x86_64"),
33-
target_feature = "sse2",
34-
not(target_feature = "avx2")
35-
))]
36-
pub(crate) use self::sse2::{State, BUFFER_SIZE};
37-
38-
#[cfg(all(
39-
any(target_arch = "x86", target_arch = "x86_64"),
40-
target_feature = "avx2"
41-
))]
42-
pub(crate) use self::avx2::{State, BUFFER_SIZE};
43-
44-
use core::fmt::{self, Debug};
45-
46-
/// Common debug impl for all blocks
47-
impl<R: Rounds> Debug for State<R> {
48-
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
49-
f.write_str("State {{ .. }}")
5+
use cfg_if::cfg_if;
6+
7+
cfg_if! {
8+
if #[cfg(all(
9+
any(target_arch = "x86", target_arch = "x86_64"),
10+
not(feature = "force-soft")
11+
))] {
12+
pub(crate) mod autodetect;
13+
pub(crate) mod avx2;
14+
pub(crate) mod sse2;
15+
16+
#[cfg(feature = "cipher")]
17+
pub(crate) use self::autodetect::{State, BUFFER_SIZE};
18+
19+
#[cfg(feature = "xchacha20")]
20+
pub(crate) mod soft;
21+
} else {
22+
pub(crate) mod soft;
23+
24+
#[cfg(feature = "cipher")]
25+
pub(crate) use self::soft::{State, BUFFER_SIZE};
5026
}
5127
}

chacha20/src/backend/autodetect.rs

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
//! Autodetection support for AVX2 CPU intrinsics on x86 CPUs, with fallback
2+
//! to the SSE2 backend when it's unavailable (the `sse2` target feature is
3+
//! enabled-by-default on all x86(_64) CPUs)
4+
5+
use crate::{rounds::Rounds, IV_SIZE, KEY_SIZE, BLOCK_SIZE};
6+
use super::{avx2, sse2};
7+
8+
/// Size of buffers passed to `generate` and `apply_keystream` for this
9+
/// backend, which operates on two blocks in parallel for optimal performance.
10+
pub(crate) const BUFFER_SIZE: usize = BLOCK_SIZE * 2;
11+
12+
cpuid_bool::new!(avx2_cpuid, "avx2");
13+
14+
pub struct State<R: Rounds> {
15+
inner: Inner<R>,
16+
token: avx2_cpuid::InitToken,
17+
}
18+
19+
union Inner<R: Rounds> {
20+
avx2: avx2::State<R>,
21+
sse2: sse2::State<R>,
22+
}
23+
24+
impl<R: Rounds> State<R> {
25+
/// Initialize ChaCha block function with the given key size, IV, and
26+
/// number of rounds.
27+
#[inline]
28+
pub(crate) fn new(key: &[u8; KEY_SIZE], iv: [u8; IV_SIZE]) -> Self {
29+
let (token, avx2_present) = avx2_cpuid::init_get();
30+
31+
let inner = if avx2_present {
32+
Inner {
33+
avx2: avx2::State::new(key, iv),
34+
}
35+
} else {
36+
Inner {
37+
sse2: sse2::State::new(key, iv),
38+
}
39+
};
40+
41+
Self { inner, token }
42+
}
43+
44+
#[inline]
45+
pub(crate) fn generate(&self, counter: u64, output: &mut [u8]) {
46+
if self.token.get() {
47+
unsafe { self.inner.avx2.generate(counter, output) }
48+
} else {
49+
unsafe { self.inner.sse2.generate(counter, output) }
50+
}
51+
}
52+
53+
#[inline]
54+
#[cfg(feature = "cipher")]
55+
pub(crate) fn apply_keystream(&self, counter: u64, output: &mut [u8]) {
56+
if self.token.get() {
57+
unsafe { self.inner.avx2.apply_keystream(counter, output) }
58+
} else {
59+
unsafe { self.inner.sse2.apply_keystream(counter, output) }
60+
}
61+
}
62+
}
63+
64+
impl<R: Rounds> Clone for State<R> {
65+
fn clone(&self) -> Self {
66+
let inner = if self.token.get() {
67+
Inner {
68+
avx2: unsafe { self.inner.avx2.clone() },
69+
}
70+
} else {
71+
Inner {
72+
sse2: unsafe { self.inner.sse2.clone() },
73+
}
74+
};
75+
76+
Self {
77+
inner,
78+
token: self.token,
79+
}
80+
}
81+
}

chacha20/src/backend/avx2.rs

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,16 @@
1010
1111
use crate::{rounds::Rounds, BLOCK_SIZE, CONSTANTS, IV_SIZE, KEY_SIZE};
1212
use core::{convert::TryInto, marker::PhantomData};
13+
use super::autodetect::BUFFER_SIZE;
1314

1415
#[cfg(target_arch = "x86")]
1516
use core::arch::x86::*;
1617
#[cfg(target_arch = "x86_64")]
1718
use core::arch::x86_64::*;
1819

19-
/// Size of buffers passed to `generate` and `apply_keystream` for this
20-
/// backend, which operates on two blocks in parallel for optimal performance.
21-
pub(crate) const BUFFER_SIZE: usize = BLOCK_SIZE * 2;
22-
2320
/// The ChaCha20 block function (AVX2 accelerated implementation for x86/x86_64)
2421
// TODO(tarcieri): zeroize?
25-
#[derive(Clone)]
22+
#[derive(Copy, Clone)]
2623
pub(crate) struct State<R: Rounds> {
2724
v0: __m256i,
2825
v1: __m256i,

chacha20/src/backend/soft.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,13 @@ use crate::{rounds::Rounds, BLOCK_SIZE, CONSTANTS, IV_SIZE, KEY_SIZE, STATE_WORD
99
use core::{convert::TryInto, marker::PhantomData};
1010

1111
/// Size of buffers passed to `generate` and `apply_keystream` for this backend
12+
#[allow(dead_code)]
1213
pub(crate) const BUFFER_SIZE: usize = BLOCK_SIZE;
1314

1415
/// The ChaCha20 block function (portable software implementation)
1516
// TODO(tarcieri): zeroize?
16-
#[allow(dead_code)]
1717
#[derive(Clone)]
18+
#[allow(dead_code)]
1819
pub(crate) struct State<R: Rounds> {
1920
/// Internal state of the block function
2021
state: [u32; STATE_WORDS],

chacha20/src/backend/sse2.rs

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,18 @@
44
//!
55
//! SSE2-optimized implementation for x86/x86-64 CPUs.
66
7-
use crate::{rounds::Rounds, BLOCK_SIZE, CONSTANTS, IV_SIZE, KEY_SIZE};
7+
use crate::{rounds::Rounds, CONSTANTS, IV_SIZE, BLOCK_SIZE, KEY_SIZE};
88
use core::{convert::TryInto, marker::PhantomData};
9+
use super::autodetect::BUFFER_SIZE;
910

1011
#[cfg(target_arch = "x86")]
1112
use core::arch::x86::*;
1213
#[cfg(target_arch = "x86_64")]
1314
use core::arch::x86_64::*;
1415

15-
/// Size of buffers passed to `generate` and `apply_keystream` for this backend
16-
pub(crate) const BUFFER_SIZE: usize = BLOCK_SIZE;
17-
1816
/// The ChaCha20 block function (SSE2 accelerated implementation for x86/x86_64)
1917
// TODO(tarcieri): zeroize?
20-
#[derive(Clone)]
18+
#[derive(Copy, Clone)]
2119
pub(crate) struct State<R: Rounds> {
2220
v0: __m128i,
2321
v1: __m128i,
@@ -49,11 +47,13 @@ impl<R: Rounds> State<R> {
4947
pub(crate) fn generate(&self, counter: u64, output: &mut [u8]) {
5048
debug_assert_eq!(output.len(), BUFFER_SIZE);
5149

52-
unsafe {
53-
let (mut v0, mut v1, mut v2) = (self.v0, self.v1, self.v2);
54-
let mut v3 = iv_setup(self.iv, counter);
55-
self.rounds(&mut v0, &mut v1, &mut v2, &mut v3);
56-
store(v0, v1, v2, v3, output)
50+
for (i, chunk) in output.chunks_exact_mut(BLOCK_SIZE).enumerate() {
51+
unsafe {
52+
let (mut v0, mut v1, mut v2) = (self.v0, self.v1, self.v2);
53+
let mut v3 = iv_setup(self.iv, counter.checked_add(i as u64).unwrap());
54+
self.rounds(&mut v0, &mut v1, &mut v2, &mut v3);
55+
store(v0, v1, v2, v3, chunk)
56+
}
5757
}
5858
}
5959

@@ -63,15 +63,17 @@ impl<R: Rounds> State<R> {
6363
pub(crate) fn apply_keystream(&self, counter: u64, output: &mut [u8]) {
6464
debug_assert_eq!(output.len(), BUFFER_SIZE);
6565

66-
unsafe {
67-
let (mut v0, mut v1, mut v2) = (self.v0, self.v1, self.v2);
68-
let mut v3 = iv_setup(self.iv, counter);
69-
self.rounds(&mut v0, &mut v1, &mut v2, &mut v3);
70-
71-
for (chunk, a) in output.chunks_mut(0x10).zip(&[v0, v1, v2, v3]) {
72-
let b = _mm_loadu_si128(chunk.as_ptr() as *const __m128i);
73-
let out = _mm_xor_si128(*a, b);
74-
_mm_storeu_si128(chunk.as_mut_ptr() as *mut __m128i, out);
66+
for (i, chunk) in output.chunks_exact_mut(BLOCK_SIZE).enumerate() {
67+
unsafe {
68+
let (mut v0, mut v1, mut v2) = (self.v0, self.v1, self.v2);
69+
let mut v3 = iv_setup(self.iv, counter.checked_add(i as u64).unwrap());
70+
self.rounds(&mut v0, &mut v1, &mut v2, &mut v3);
71+
72+
for (ch, a) in chunk.chunks_exact_mut(0x10).zip(&[v0, v1, v2, v3]) {
73+
let b = _mm_loadu_si128(ch.as_ptr() as *const __m128i);
74+
let out = _mm_xor_si128(*a, b);
75+
_mm_storeu_si128(ch.as_mut_ptr() as *mut __m128i, out);
76+
}
7577
}
7678
}
7779
}
@@ -263,12 +265,12 @@ mod tests {
263265

264266
#[test]
265267
fn generate_vs_scalar_impl() {
266-
let mut soft_result = [0u8; BLOCK_SIZE];
268+
let mut soft_result = [0u8; soft::BUFFER_SIZE];
267269
soft::State::<R20>::new(&R_KEY, R_IV).generate(R_CNT, &mut soft_result);
268270

269-
let mut simd_result = [0u8; BLOCK_SIZE];
271+
let mut simd_result = [0u8; BUFFER_SIZE];
270272
State::<R20>::new(&R_KEY, R_IV).generate(R_CNT, &mut simd_result);
271273

272-
assert_eq!(&soft_result[..], &simd_result[..])
274+
assert_eq!(&soft_result[..], &simd_result[..soft::BUFFER_SIZE])
273275
}
274276
}

chacha20/src/rng.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ use crate::{
1212
macro_rules! impl_chacha_rng {
1313
($name:ident, $core:ident, $rounds:ident, $doc:expr) => {
1414
#[doc = $doc]
15-
#[derive(Clone, Debug)]
15+
#[derive(Clone)]
1616
#[cfg_attr(docsrs, doc(cfg(feature = "rng")))]
1717
pub struct $name(BlockRng<$core>);
1818

@@ -51,10 +51,10 @@ macro_rules! impl_chacha_rng {
5151
impl CryptoRng for $name {}
5252

5353
#[doc = "Core random number generator, for use with [`rand_core::block::BlockRng`]"]
54-
#[derive(Clone, Debug)]
54+
#[derive(Clone)]
5555
#[cfg_attr(docsrs, doc(cfg(feature = "rng")))]
5656
pub struct $core {
57-
block: Block<$rounds>,
57+
block: State<$rounds>,
5858
counter: u64,
5959
}
6060

@@ -63,7 +63,7 @@ macro_rules! impl_chacha_rng {
6363

6464
#[inline]
6565
fn from_seed(seed: Self::Seed) -> Self {
66-
let block = Block::new(&seed, Default::default());
66+
let block = State::new(&seed, Default::default());
6767
Self { block, counter: 0 }
6868
}
6969
}
@@ -75,7 +75,7 @@ macro_rules! impl_chacha_rng {
7575
fn generate(&mut self, results: &mut Self::Results) {
7676
assert!(self.counter <= MAX_BLOCKS as u64, "maximum number of allowed ChaCha blocks exceeded");
7777

78-
// TODO(tarcieri): eliminate unsafety (replace w\ [u8; BLOCK_SIZE)
78+
// TODO(tarcieri): eliminate unsafety (replace w\ `[u8; BLOCK_SIZE]`)
7979
self.block.generate(self.counter, unsafe {
8080
&mut *(results.as_mut_ptr() as *mut [u8; BUFFER_SIZE])
8181
});

0 commit comments

Comments
 (0)