diff --git a/.travis.yml b/.travis.yml
index 1c6ea4cb9d..29b02cc710 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,16 +9,20 @@ matrix:
     - env: TARGET=i686-unknown-linux-gnu
     - env: TARGET=x86_64-unknown-linux-gnu NO_ADD=1
     - env: TARGET=x86_64-unknown-linux-gnu-emulated NO_ADD=1 STDSIMD_TEST_EVERYTHING=1
+    - env: TARGET=x86_64-linux-android
     - env: TARGET=arm-unknown-linux-gnueabihf
+    - env: TARGET=arm-linux-androideabi
     - env: TARGET=armv7-unknown-linux-gnueabihf
     - env: TARGET=aarch64-unknown-linux-gnu
     - env: TARGET=mips-unknown-linux-gnu NORUN=1
     - env: TARGET=mipsel-unknown-linux-gnu NORUN=1
     - env: TARGET=mips64-unknown-linux-gnuabi64 NORUN=1
     - env: TARGET=mips64el-unknown-linux-gnuabi64 NORUN=1
+    - env: TARGET=aarch64-linux-android
     - env: TARGET=powerpc-unknown-linux-gnu
     - env: TARGET=powerpc64-unknown-linux-gnu
     - env: TARGET=powerpc64le-unknown-linux-gnu
+    - env: TARGET=s390x-unknown-linux-gnu NORUN=1
     - os: osx
       env: TARGET=i686-apple-darwin
       script: ci/run.sh
diff --git a/README.md b/README.md
index d8e5b22c62..e56d4ded7a 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,30 @@
-stdsimd
+stdsimd - Rust's standard library SIMD components
 =======
 
 [![Travis-CI Status]][travis] [![Appveyor Status]][appveyor] [![Latest Version]][crates.io] [![docs]][docs.rs]
 
-> Experimental support for SIMD destined to eventually become part of Rust's
-> standard library
+# Usage
 
-This is a **work in progress**.
+`stdsimd` is now shipped with Rust's `std` library - it is part of `libcore`
+and `libstd`.
+
+The easiest way to use it is just to import it via `use std::arch`.
+
+The `std::arch` component for `x86` is available in stable Rust. The `std::arch`
+components for other architectures and the `std::simd` component require nightly
+Rust.
+
+Using `stdsimd` master branch is not recommended. It requires nightly Rust, it
+only works with particular Rust nightly versions, and it can (and does) break
+often. If you need to use `stdsimd` master branch, you can add it to your
+`Cargo.toml` as follows:
+
+```toml
+[dependencies]
+stdsimd = { git = "https://github.com/rust-lang-nursery/stdsimd.git" }
+```
+
+# Documentation
 
 * [Documentation - i686][i686]
 * [Documentation - x86\_64][x86_64]
diff --git a/ci/android-install-ndk.sh b/ci/android-install-ndk.sh
new file mode 100644
index 0000000000..873f6c52c8
--- /dev/null
+++ b/ci/android-install-ndk.sh
@@ -0,0 +1,51 @@
+#!/bin/sh
+# Copyright 2016 The Rust Project Developers. See the COPYRIGHT
+# file at the top-level directory of this distribution and at
+# http://rust-lang.org/COPYRIGHT.
+#
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+# Downloads Android NDK r15b and builds a standalone toolchain in
+# /android/ndk-<arch>.
+#
+# Usage: android-install-ndk.sh <arch>
+#   `aarch64` and `i686` are mapped to the NDK architecture names `arm64` and
+#   `x86`; any other value is passed to the NDK unchanged.
+
+set -ex
+
+# Without an argument the toolchain would be installed into `/android/ndk-`
+# with an empty `--arch`; fail early with a clear message instead.
+if [ -z "$1" ]; then
+  echo "usage: $0 <arch>" >&2
+  exit 1
+fi
+
+# -f: fail on an HTTP error instead of saving the error page as the archive.
+curl -fO https://dl.google.com/android/repository/android-ndk-r15b-linux-x86_64.zip
+unzip -q android-ndk-r15b-linux-x86_64.zip
+
+case "$1" in
+  aarch64)
+    arch=arm64
+    ;;
+
+  i686)
+    arch=x86
+    ;;
+
+  *)
+    arch=$1
+    ;;
+esac
+
+android-ndk-r15b/build/tools/make_standalone_toolchain.py \
+        --unified-headers \
+        --install-dir "/android/ndk-$1" \
+        --arch "$arch" \
+        --api 24
+
+rm -rf ./android-ndk-r15b-linux-x86_64.zip ./android-ndk-r15b
diff --git a/ci/android-install-sdk.sh b/ci/android-install-sdk.sh
new file mode 100644
index 0000000000..ab7e14d95b
--- /dev/null
+++ b/ci/android-install-sdk.sh
@@ -0,0 +1,67 @@
+#!/bin/sh
+# Copyright 2016 The Rust Project Developers. See the COPYRIGHT
+# file at the top-level directory of this distribution and at
+# http://rust-lang.org/COPYRIGHT.
+#
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+# Installs the Android SDK tools, the emulator and the android-24 platform and
+# system image into ./sdk, then creates an AVD named after the architecture.
+#
+# Usage: android-install-sdk.sh <arch>
+#   <arch> is one of: arm, armv7, aarch64, i686, x86_64
+
+set -ex
+
+# Map the architecture to an Android ABI first, so that an unsupported
+# architecture is rejected before anything is downloaded.
+case "$1" in
+  arm | armv7)
+    abi=armeabi-v7a
+    ;;
+
+  aarch64)
+    abi=arm64-v8a
+    ;;
+
+  i686)
+    abi=x86
+    ;;
+
+  x86_64)
+    abi=x86_64
+    ;;
+
+  *)
+    echo "invalid arch: $1"
+    exit 1
+    ;;
+esac
+
+# Prep the SDK and emulator
+mkdir sdk
+curl -f https://dl.google.com/android/repository/sdk-tools-linux-3859397.zip -O
+unzip -d sdk sdk-tools-linux-3859397.zip
+
+# The update process requires that we accept a bunch of licenses; this is
+# done by feeding "yes" to sdkmanager on stdin.
+#
+# --no_https avoids
+# javax.net.ssl.SSLHandshakeException: sun.security.validator.ValidatorException: No trusted certificate found
+echo "yes" | \
+    ./sdk/tools/bin/sdkmanager --no_https \
+        "emulator" \
+        "platform-tools" \
+        "platforms;android-24" \
+        "system-images;android-24;default;$abi"
+
+# Answer "no" to avdmanager's interactive prompt
+# (NOTE(review): presumably the custom hardware profile question - confirm).
+echo "no" |
+    ./sdk/tools/bin/avdmanager create avd \
+        --name "$1" \
+        --package "system-images;android-24;default;$abi"
diff --git a/ci/android-sysimage.sh b/ci/android-sysimage.sh
new file mode 100644
index 0000000000..9611dfeb0d
--- /dev/null
+++ b/ci/android-sysimage.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+# Copyright 2017 The Rust Project Developers. See the COPYRIGHT
+# file at the top-level directory of this distribution and at
+# http://rust-lang.org/COPYRIGHT.
+#
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+# Extracts the Android dynamic linker and core libraries from a system image
+# into /system so Android executables can be run directly (or with qemu).
+#
+# Usage: bash android-sysimage.sh <arch> <image-zip>
+#   <arch>       `x86_64` or `arm64` selects linker64 and /lib64;
+#                any other value selects linker and /lib.
+#   <image-zip>  file name of the system image archive below $URL.
+#
+# Requires bash (uses `local`, `pushd`/`popd` and brace expansion).
+
+set -ex
+
+URL=https://dl.google.com/android/repository/sys-img/android
+
+main() {
+    local arch=$1
+    local name=$2
+    local dest=/system
+    # Declared separately from the assignment so a `mktemp` failure is not
+    # masked by the exit status of `local`.
+    local td
+    td=$(mktemp -d)
+
+    # -y: nothing can answer a confirmation prompt when this script runs
+    # from a Dockerfile `RUN` step.
+    apt-get install -y --no-install-recommends e2tools
+
+    pushd "$td"
+    curl -fO "$URL/$name"
+    unzip -q "$name"
+
+    local system
+    system=$(find . -name system.img)
+    mkdir -p "$dest"/{bin,lib,lib64}
+
+    # Extract android linker and libraries to /system
+    # This allows android executables to be run directly (or with qemu)
+    if [ "$arch" = "x86_64" ] || [ "$arch" = "arm64" ]; then
+        e2cp -p "$system:/bin/linker64" "$dest/bin/"
+        e2cp -p "$system:/lib64/libdl.so" "$dest/lib64/"
+        e2cp -p "$system:/lib64/libc.so" "$dest/lib64/"
+        e2cp -p "$system:/lib64/libm.so" "$dest/lib64/"
+    else
+        e2cp -p "$system:/bin/linker" "$dest/bin/"
+        e2cp -p "$system:/lib/libdl.so" "$dest/lib/"
+        e2cp -p "$system:/lib/libc.so" "$dest/lib/"
+        e2cp -p "$system:/lib/libm.so" "$dest/lib/"
+    fi
+
+    # clean up
+    apt-get purge --auto-remove -y e2tools
+
+    popd
+
+    rm -rf "$td"
+}
+
+main "${@}"
diff --git a/ci/docker/aarch64-linux-android/Dockerfile b/ci/docker/aarch64-linux-android/Dockerfile
new file mode 100644
index 0000000000..27bde89c5a
--- /dev/null
+++ b/ci/docker/aarch64-linux-android/Dockerfile
@@ -0,0 +1,52 @@
+# Test image for aarch64-linux-android: installs the Android NDK and SDK,
+# boots the `aarch64` AVD in the emulator and runs test binaries through
+# /tmp/runtest (built from runtest-android.rs when the container starts).
+FROM ubuntu:16.04
+
+RUN dpkg --add-architecture i386 && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends \
+  file \
+  make \
+  curl \
+  ca-certificates \
+  python \
+  unzip \
+  expect \
+  openjdk-9-jre \
+  libstdc++6:i386 \
+  libpulse0 \
+  gcc \
+  libc6-dev
+
+WORKDIR /android/
+COPY android* /android/
+
+ENV ANDROID_ARCH=aarch64
+ENV PATH=$PATH:/android/ndk-$ANDROID_ARCH/bin:/android/sdk/tools:/android/sdk/platform-tools
+
+RUN sh /android/android-install-ndk.sh $ANDROID_ARCH
+RUN sh /android/android-install-sdk.sh $ANDROID_ARCH
+RUN mv /root/.android /tmp
+RUN chmod 777 -R /tmp/.android
+RUN chmod 755 /android/sdk/tools/* /android/sdk/emulator/qemu/linux-x86_64/*
+
+ENV PATH=$PATH:/rust/bin \
+    CARGO_TARGET_AARCH64_LINUX_ANDROID_LINKER=aarch64-linux-android-gcc \
+    CARGO_TARGET_AARCH64_LINUX_ANDROID_RUNNER=/tmp/runtest \
+    OBJDUMP=aarch64-linux-android-objdump \
+    HOME=/tmp
+
+# COPY rather than ADD: a plain local file needs none of ADD's extra
+# behaviour (URL fetching, archive extraction).
+COPY runtest-android.rs /tmp/runtest.rs
+ENTRYPOINT [ \
+  "bash", \
+  "-c", \
+  # set SHELL so android can detect a 64bits system, see
+  # http://stackoverflow.com/a/41789144
+  "SHELL=/bin/dash /android/sdk/emulator/emulator @aarch64 -no-window & \
+   rustc /tmp/runtest.rs -o /tmp/runtest && \
+   exec \"$@\"", \
+  "--" \
+]
diff --git a/ci/docker/arm-linux-androideabi/Dockerfile b/ci/docker/arm-linux-androideabi/Dockerfile
new file mode 100644
index 0000000000..995a9e30e6
--- /dev/null
+++ b/ci/docker/arm-linux-androideabi/Dockerfile
@@ -0,0 +1,52 @@
+# Test image for arm-linux-androideabi: installs the Android NDK and SDK,
+# boots the `arm` AVD in the emulator and runs test binaries through
+# /tmp/runtest (built from runtest-android.rs when the container starts).
+FROM ubuntu:16.04
+
+RUN dpkg --add-architecture i386 && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends \
+  file \
+  make \
+  curl \
+  ca-certificates \
+  python \
+  unzip \
+  expect \
+  openjdk-9-jre \
+  libstdc++6:i386 \
+  libpulse0 \
+  gcc \
+  libc6-dev
+
+WORKDIR /android/
+COPY android* /android/
+
+ENV ANDROID_ARCH=arm
+ENV PATH=$PATH:/android/ndk-$ANDROID_ARCH/bin:/android/sdk/tools:/android/sdk/platform-tools
+
+RUN sh /android/android-install-ndk.sh $ANDROID_ARCH
+RUN sh /android/android-install-sdk.sh $ANDROID_ARCH
+RUN mv /root/.android /tmp
+RUN chmod 777 -R /tmp/.android
+RUN chmod 755 /android/sdk/tools/* /android/sdk/emulator/qemu/linux-x86_64/*
+
+ENV PATH=$PATH:/rust/bin \
+    CARGO_TARGET_ARM_LINUX_ANDROIDEABI_LINKER=arm-linux-androideabi-gcc \
+    CARGO_TARGET_ARM_LINUX_ANDROIDEABI_RUNNER=/tmp/runtest \
+    OBJDUMP=arm-linux-androideabi-objdump \
+    HOME=/tmp
+
+# COPY rather than ADD: a plain local file needs none of ADD's extra
+# behaviour (URL fetching, archive extraction).
+COPY runtest-android.rs /tmp/runtest.rs
+ENTRYPOINT [ \
+  "bash", \
+  "-c", \
+  # set SHELL so android can detect a 64bits system, see
+  # http://stackoverflow.com/a/41789144
+  "SHELL=/bin/dash /android/sdk/emulator/emulator @arm -no-window & \
+   rustc /tmp/runtest.rs -o /tmp/runtest && \
+   exec \"$@\"", \
+  "--" \
+]
diff --git a/ci/docker/s390x-unknown-linux-gnu/Dockerfile b/ci/docker/s390x-unknown-linux-gnu/Dockerfile
new file mode 100644
index 0000000000..89d9d87a15
--- /dev/null
+++ b/ci/docker/s390x-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,15 @@
+# Cross-compilation image for s390x-unknown-linux-gnu: the s390x GCC toolchain
+# links the binaries and qemu-user is configured as the cargo runner.
+FROM ubuntu:17.10
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        curl ca-certificates \
+        gcc libc6-dev \
+        gcc-s390x-linux-gnu libc6-dev-s390x-cross \
+        qemu-user \
+        make \
+        file
+
+ENV CARGO_TARGET_S390X_UNKNOWN_LINUX_GNU_LINKER=s390x-linux-gnu-gcc \
+    CARGO_TARGET_S390X_UNKNOWN_LINUX_GNU_RUNNER="qemu-s390x -L /usr/s390x-linux-gnu" \
+    OBJDUMP=s390x-linux-gnu-objdump
diff --git a/ci/docker/x86_64-linux-android/Dockerfile b/ci/docker/x86_64-linux-android/Dockerfile
new file mode 100644
index 0000000000..d52dd45b12
--- /dev/null
+++ b/ci/docker/x86_64-linux-android/Dockerfile
@@ -0,0 +1,29 @@
+FROM ubuntu:16.04
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+  ca-certificates \
+  curl \
+  gcc \
+  libc-dev \
+  python \
+  unzip \
+  file \
+  make
+
+WORKDIR /android/
+ENV ANDROID_ARCH=x86_64
+COPY android-install-ndk.sh /android/
+RUN sh /android/android-install-ndk.sh $ANDROID_ARCH
+
+# We do not run x86_64-linux-android tests on an Android emulator.
+# See ci/android-sysimage.sh for information about how tests are run.
+COPY android-sysimage.sh /android/
+RUN bash /android/android-sysimage.sh x86_64 x86_64-24_r07.zip
+
+ENV PATH=$PATH:/rust/bin:/android/ndk-$ANDROID_ARCH/bin \
+    CARGO_TARGET_X86_64_LINUX_ANDROID_LINKER=x86_64-linux-android-gcc \
+    CC_x86_64_linux_android=x86_64-linux-android-gcc \
+    CXX_x86_64_linux_android=x86_64-linux-android-g++ \
+    OBJDUMP=x86_64-linux-android-objdump \
+    HOME=/tmp
diff --git a/ci/run-docker.sh b/ci/run-docker.sh
index e07e1b0dc1..0c560c825c 100755
--- a/ci/run-docker.sh
+++ b/ci/run-docker.sh
@@ -5,7 +5,7 @@ set -ex
 
 run() {
     echo "Building docker container for TARGET=${1}"
-    docker build -t stdsimd ci/docker/$1
+    docker build -t stdsimd -f ci/docker/$1/Dockerfile ci/
     mkdir -p target
     target=$(echo $1 | sed 's/-emulated//')
     echo "Running docker"
@@ -18,6 +18,7 @@ run() {
       --volume `rustc --print sysroot`:/rust:ro \
       --env TARGET=$target \
       --env STDSIMD_TEST_EVERYTHING \
+      --env STDSIMD_ASSERT_INSTR_IGNORE \
       --volume `pwd`:/checkout:ro \
       --volume `pwd`/target:/checkout/target \
       --workdir /checkout \
diff --git a/ci/run.sh b/ci/run.sh
index 708d6ba341..ae4522987d 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -38,6 +38,9 @@ case ${TARGET} in
     i686-* | i586-*)
         export RUSTFLAGS="${RUSTFLAGS} -C relocation-model=static"
         ;;
+    *android*)
+        export STDSIMD_DISABLE_ASSERT_INSTR=1
+        ;;
     *)
         ;;
 esac
@@ -46,6 +49,7 @@ echo "RUSTFLAGS=${RUSTFLAGS}"
 echo "FEATURES=${FEATURES}"
 echo "OBJDUMP=${OBJDUMP}"
 echo "STDSIMD_DISABLE_ASSERT_INSTR=${STDSIMD_DISABLE_ASSERT_INSTR}"
+echo "STDSIMD_TEST_EVERYTHING=${STDSIMD_TEST_EVERYTHING}"
 
 cargo_test() {
     cmd="cargo test --target=$TARGET $1"
diff --git a/ci/runtest-android.rs b/ci/runtest-android.rs
new file mode 100644
index 0000000000..d8968f99f4
--- /dev/null
+++ b/ci/runtest-android.rs
@@ -0,0 +1,60 @@
+use std::env;
+use std::process::Command;
+use std::path::{Path, PathBuf};
+
+/// Runs a test binary on the Android device/emulator reachable through `adb`.
+///
+/// Expects exactly one argument: the path of the test executable. The binary
+/// is pushed to `/data/local/tmp`, executed with `adb shell`, and its output
+/// is echoed. Panics if any `adb` invocation fails, if no `test result` line
+/// is printed, or if any `test result` line does not report success.
+fn main() {
+    assert_eq!(env::args_os().len(), 2);
+    let test = PathBuf::from(env::args_os().nth(1).unwrap());
+    let dst = Path::new("/data/local/tmp").join(test.file_name().unwrap());
+
+    let status = Command::new("adb")
+        .arg("wait-for-device")
+        .status()
+        .expect("failed to run: adb wait-for-device");
+    assert!(status.success());
+
+    let status = Command::new("adb")
+        .arg("push")
+        .arg(&test)
+        .arg(&dst)
+        .status()
+        .expect("failed to run: adb push");
+    assert!(status.success());
+
+    let output = Command::new("adb")
+        .arg("shell")
+        .arg(&dst)
+        .output()
+        .expect("failed to run: adb shell");
+
+    // Echo everything before checking anything so that a failing run still
+    // shows what the test binary printed.
+    println!("status: {}\nstdout ---\n{}\nstderr ---\n{}",
+             output.status,
+             String::from_utf8_lossy(&output.stdout),
+             String::from_utf8_lossy(&output.stderr));
+
+    // Check the exit status of `adb shell` itself rather than re-checking the
+    // status of the earlier `adb push`.
+    assert!(output.status.success());
+
+    // `Iterator::all` is vacuously true for an empty iterator, so require at
+    // least one `test result` line explicitly.
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    let results: Vec<&str> = stdout
+        .lines()
+        .filter(|l| l.starts_with("test result"))
+        .collect();
+    let all_ok = results
+        .iter()
+        .all(|l| l.contains("test result: ok") && l.contains("0 failed"));
+    if results.is_empty() || !all_ok {
+        panic!("failed to find successful test run");
+    }
+}
diff --git a/coresimd/aarch64/crypto.rs b/coresimd/aarch64/crypto.rs
index a71c0b460d..75f247585c 100644
--- a/coresimd/aarch64/crypto.rs
+++ b/coresimd/aarch64/crypto.rs
@@ -16,36 +16,36 @@ extern "C" {
     fn vsha1h_u32_(hash_e: u32) -> u32;
     #[link_name = "llvm.aarch64.crypto.sha1su0"]
     fn vsha1su0q_u32_(
-        w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t
+        w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t,
     ) -> uint32x4_t;
     #[link_name = "llvm.aarch64.crypto.sha1su1"]
     fn vsha1su1q_u32_(tw0_3: uint32x4_t, w12_15: uint32x4_t) -> uint32x4_t;
     #[link_name = "llvm.aarch64.crypto.sha1c"]
     fn vsha1cq_u32_(
-        hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t
+        hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t,
     ) -> uint32x4_t;
     #[link_name = "llvm.aarch64.crypto.sha1p"]
     fn vsha1pq_u32_(
-        hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t
+        hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t,
     ) -> uint32x4_t;
     #[link_name = "llvm.aarch64.crypto.sha1m"]
     fn vsha1mq_u32_(
-        hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t
+        hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t,
     ) -> uint32x4_t;
 
     #[link_name = "llvm.aarch64.crypto.sha256h"]
     fn vsha256hq_u32_(
-        hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t
+        hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t,
     ) -> uint32x4_t;
     #[link_name = "llvm.aarch64.crypto.sha256h2"]
     fn vsha256h2q_u32_(
-        hash_efgh: uint32x4_t, hash_abcd: uint32x4_t, wk: uint32x4_t
+        hash_efgh: uint32x4_t, hash_abcd: uint32x4_t, wk: uint32x4_t,
     ) -> uint32x4_t;
     #[link_name = "llvm.aarch64.crypto.sha256su0"]
     fn vsha256su0q_u32_(w0_3: uint32x4_t, w4_7: uint32x4_t) -> uint32x4_t;
     #[link_name = "llvm.aarch64.crypto.sha256su1"]
     fn vsha256su1q_u32_(
-        tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t
+        tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t,
     ) -> uint32x4_t;
 }
 
@@ -97,7 +97,7 @@ pub unsafe fn vsha1h_u32(hash_e: u32) -> u32 {
 #[target_feature(enable = "crypto")]
 #[cfg_attr(test, assert_instr(sha1c))]
 pub unsafe fn vsha1cq_u32(
-    hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t
+    hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t,
 ) -> uint32x4_t {
     vsha1cq_u32_(hash_abcd, hash_e, wk)
 }
@@ -107,7 +107,7 @@ pub unsafe fn vsha1cq_u32(
 #[target_feature(enable = "crypto")]
 #[cfg_attr(test, assert_instr(sha1m))]
 pub unsafe fn vsha1mq_u32(
-    hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t
+    hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t,
 ) -> uint32x4_t {
     vsha1mq_u32_(hash_abcd, hash_e, wk)
 }
@@ -117,7 +117,7 @@ pub unsafe fn vsha1mq_u32(
 #[target_feature(enable = "crypto")]
 #[cfg_attr(test, assert_instr(sha1p))]
 pub unsafe fn vsha1pq_u32(
-    hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t
+    hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t,
 ) -> uint32x4_t {
     vsha1pq_u32_(hash_abcd, hash_e, wk)
 }
@@ -127,7 +127,7 @@ pub unsafe fn vsha1pq_u32(
 #[target_feature(enable = "crypto")]
 #[cfg_attr(test, assert_instr(sha1su0))]
 pub unsafe fn vsha1su0q_u32(
-    w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t
+    w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t,
 ) -> uint32x4_t {
     vsha1su0q_u32_(w0_3, w4_7, w8_11)
 }
@@ -137,7 +137,7 @@ pub unsafe fn vsha1su0q_u32(
 #[target_feature(enable = "crypto")]
 #[cfg_attr(test, assert_instr(sha1su1))]
 pub unsafe fn vsha1su1q_u32(
-    tw0_3: uint32x4_t, w12_15: uint32x4_t
+    tw0_3: uint32x4_t, w12_15: uint32x4_t,
 ) -> uint32x4_t {
     vsha1su1q_u32_(tw0_3, w12_15)
 }
@@ -147,7 +147,7 @@ pub unsafe fn vsha1su1q_u32(
 #[target_feature(enable = "crypto")]
 #[cfg_attr(test, assert_instr(sha256h))]
 pub unsafe fn vsha256hq_u32(
-    hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t
+    hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t,
 ) -> uint32x4_t {
     vsha256hq_u32_(hash_abcd, hash_efgh, wk)
 }
@@ -157,7 +157,7 @@ pub unsafe fn vsha256hq_u32(
 #[target_feature(enable = "crypto")]
 #[cfg_attr(test, assert_instr(sha256h2))]
 pub unsafe fn vsha256h2q_u32(
-    hash_efgh: uint32x4_t, hash_abcd: uint32x4_t, wk: uint32x4_t
+    hash_efgh: uint32x4_t, hash_abcd: uint32x4_t, wk: uint32x4_t,
 ) -> uint32x4_t {
     vsha256h2q_u32_(hash_efgh, hash_abcd, wk)
 }
@@ -167,7 +167,7 @@ pub unsafe fn vsha256h2q_u32(
 #[target_feature(enable = "crypto")]
 #[cfg_attr(test, assert_instr(sha256su0))]
 pub unsafe fn vsha256su0q_u32(
-    w0_3: uint32x4_t, w4_7: uint32x4_t
+    w0_3: uint32x4_t, w4_7: uint32x4_t,
 ) -> uint32x4_t {
     vsha256su0q_u32_(w0_3, w4_7)
 }
@@ -177,7 +177,7 @@ pub unsafe fn vsha256su0q_u32(
 #[target_feature(enable = "crypto")]
 #[cfg_attr(test, assert_instr(sha256su1))]
 pub unsafe fn vsha256su1q_u32(
-    tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t
+    tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t,
 ) -> uint32x4_t {
     vsha256su1q_u32_(tw0_3, w8_11, w12_15)
 }
@@ -199,22 +199,8 @@ mod tests {
         assert_eq!(
             r,
             u8x16::new(
-                124,
-                123,
-                124,
-                118,
-                124,
-                123,
-                124,
-                197,
-                124,
-                123,
-                124,
-                118,
-                124,
-                123,
-                124,
-                197
+                124, 123, 124, 118, 124, 123, 124, 197, 124, 123, 124, 118,
+                124, 123, 124, 197
             )
         );
     }
@@ -229,22 +215,7 @@ mod tests {
         assert_eq!(
             r,
             u8x16::new(
-                9,
-                213,
-                9,
-                251,
-                9,
-                213,
-                9,
-                56,
-                9,
-                213,
-                9,
-                251,
-                9,
-                213,
-                9,
-                56
+                9, 213, 9, 251, 9, 213, 9, 56, 9, 213, 9, 251, 9, 213, 9, 56
             )
         );
     }
@@ -256,24 +227,7 @@ mod tests {
         let r: u8x16 = vaesmcq_u8(data).into_bits();
         assert_eq!(
             r,
-            u8x16::new(
-                3,
-                4,
-                9,
-                10,
-                15,
-                8,
-                21,
-                30,
-                3,
-                4,
-                9,
-                10,
-                15,
-                8,
-                21,
-                30
-            )
+            u8x16::new(3, 4, 9, 10, 15, 8, 21, 30, 3, 4, 9, 10, 15, 8, 21, 30)
         );
     }
 
@@ -285,22 +239,8 @@ mod tests {
         assert_eq!(
             r,
             u8x16::new(
-                43,
-                60,
-                33,
-                50,
-                103,
-                80,
-                125,
-                70,
-                43,
-                60,
-                33,
-                50,
-                103,
-                80,
-                125,
-                70
+                43, 60, 33, 50, 103, 80, 125, 70, 43, 60, 33, 50, 103, 80,
+                125, 70
             )
         );
     }
diff --git a/coresimd/aarch64/neon.rs b/coresimd/aarch64/neon.rs
index b05113056e..9656c36302 100644
--- a/coresimd/aarch64/neon.rs
+++ b/coresimd/aarch64/neon.rs
@@ -546,7 +546,6 @@ pub unsafe fn vpmaxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
     vpmaxq_f64_(a, b)
 }
 
-
 #[cfg(test)]
 mod tests {
     use coresimd::aarch64::*;
@@ -800,20 +799,11 @@ mod tests {
     #[simd_test(enable = "neon")]
     unsafe fn test_vpminq_s8() {
         #[cfg_attr(rustfmt, skip)]
-        let a = i8x16::new(
-            1, -2, 3, -4, 5, 6, 7, 8,
-            1, 2, 3, 4, 5, 6, 7, 8
-        );
+        let a = i8x16::new(1, -2, 3, -4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
         #[cfg_attr(rustfmt, skip)]
-        let b = i8x16::new(
-            0, 3, 2, 5, 4, 7, 6, 9,
-            0, 3, 2, 5, 4, 7, 6, 9
-        );
+        let b = i8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
         #[cfg_attr(rustfmt, skip)]
-        let e = i8x16::new(
-            -2, -4, 5, 7, 1, 3, 5, 7,
-            0, 2, 4, 6, 0, 2, 4, 6,
-        );
+        let e = i8x16::new(-2, -4, 5, 7, 1, 3, 5, 7, 0, 2, 4, 6, 0, 2, 4, 6);
         let r: i8x16 = vpminq_s8(a.into_bits(), b.into_bits()).into_bits();
         assert_eq!(r, e);
     }
@@ -839,20 +829,11 @@ mod tests {
     #[simd_test(enable = "neon")]
     unsafe fn test_vpminq_u8() {
         #[cfg_attr(rustfmt, skip)]
-        let a = u8x16::new(
-            1, 2, 3, 4, 5, 6, 7, 8,
-            1, 2, 3, 4, 5, 6, 7, 8
-        );
+        let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
         #[cfg_attr(rustfmt, skip)]
-        let b = u8x16::new(
-            0, 3, 2, 5, 4, 7, 6, 9,
-            0, 3, 2, 5, 4, 7, 6, 9
-        );
+        let b = u8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
         #[cfg_attr(rustfmt, skip)]
-        let e = u8x16::new(
-            1, 3, 5, 7, 1, 3, 5, 7,
-            0, 2, 4, 6, 0, 2, 4, 6,
-        );
+        let e = u8x16::new(1, 3, 5, 7, 1, 3, 5, 7, 0, 2, 4, 6, 0, 2, 4, 6);
         let r: u8x16 = vpminq_u8(a.into_bits(), b.into_bits()).into_bits();
         assert_eq!(r, e);
     }
@@ -896,20 +877,11 @@ mod tests {
     #[simd_test(enable = "neon")]
     unsafe fn test_vpmaxq_s8() {
         #[cfg_attr(rustfmt, skip)]
-        let a = i8x16::new(
-            1, -2, 3, -4, 5, 6, 7, 8,
-            1, 2, 3, 4, 5, 6, 7, 8
-        );
+        let a = i8x16::new(1, -2, 3, -4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
         #[cfg_attr(rustfmt, skip)]
-        let b = i8x16::new(
-            0, 3, 2, 5, 4, 7, 6, 9,
-            0, 3, 2, 5, 4, 7, 6, 9
-        );
+        let b = i8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
         #[cfg_attr(rustfmt, skip)]
-        let e = i8x16::new(
-            1, 3, 6, 8, 2, 4, 6, 8,
-            3, 5, 7, 9, 3, 5, 7, 9,
-        );
+        let e = i8x16::new(1, 3, 6, 8, 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9);
         let r: i8x16 = vpmaxq_s8(a.into_bits(), b.into_bits()).into_bits();
         assert_eq!(r, e);
     }
@@ -935,20 +907,11 @@ mod tests {
     #[simd_test(enable = "neon")]
     unsafe fn test_vpmaxq_u8() {
         #[cfg_attr(rustfmt, skip)]
-        let a = u8x16::new(
-            1, 2, 3, 4, 5, 6, 7, 8,
-            1, 2, 3, 4, 5, 6, 7, 8
-        );
+        let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
         #[cfg_attr(rustfmt, skip)]
-        let b = u8x16::new(
-            0, 3, 2, 5, 4, 7, 6, 9,
-            0, 3, 2, 5, 4, 7, 6, 9
-        );
+        let b = u8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9);
         #[cfg_attr(rustfmt, skip)]
-        let e = u8x16::new(
-            2, 4, 6, 8, 2, 4, 6, 8,
-            3, 5, 7, 9, 3, 5, 7, 9,
-        );
+        let e = u8x16::new(2, 4, 6, 8, 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9);
         let r: u8x16 = vpmaxq_u8(a.into_bits(), b.into_bits()).into_bits();
         assert_eq!(r, e);
     }
diff --git a/coresimd/arm/mod.rs b/coresimd/arm/mod.rs
index 9798db59bc..10648eff38 100644
--- a/coresimd/arm/mod.rs
+++ b/coresimd/arm/mod.rs
@@ -19,11 +19,19 @@ pub use self::v7::*;
 
 // NEON is supported on AArch64, and on ARM when built with the v7 and neon
 // features. Building ARM without neon produces incorrect codegen.
-#[cfg(any(target_arch = "aarch64",
-          all(target_feature = "v7", target_feature = "neon"),
-          dox))]
+#[cfg(
+    any(
+        target_arch = "aarch64",
+        all(target_feature = "v7", target_feature = "neon"),
+        dox
+    )
+)]
 mod neon;
-#[cfg(any(target_arch = "aarch64",
-          all(target_feature = "v7", target_feature = "neon"),
-          dox))]
+#[cfg(
+    any(
+        target_arch = "aarch64",
+        all(target_feature = "v7", target_feature = "neon"),
+        dox
+    )
+)]
 pub use self::neon::*;
diff --git a/coresimd/arm/neon.rs b/coresimd/arm/neon.rs
index 1d786144d9..f00096505a 100644
--- a/coresimd/arm/neon.rs
+++ b/coresimd/arm/neon.rs
@@ -366,52 +366,82 @@ impl_from_bits_!(
 
 #[allow(improper_ctypes)]
 extern "C" {
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f32")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f32"
+    )]
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v2f32")]
     fn frsqrte_v2f32(a: float32x2_t) -> float32x2_t;
 
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v8i8")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v8i8")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v8i8"
+    )]
     fn vpmins_v8i8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v4i16")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v4i16")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v4i16"
+    )]
     fn vpmins_v4i16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v2i32")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v2i32")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v2i32"
+    )]
     fn vpmins_v2i32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v8i8")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v8i8")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v8i8"
+    )]
     fn vpminu_v8i8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v4i16")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v4i16")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v4i16"
+    )]
     fn vpminu_v4i16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v2i32")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v2i32")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v2i32"
+    )]
     fn vpminu_v2i32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v2f32")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminp.v2f32")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminp.v2f32"
+    )]
     fn vpminf_v2f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
 
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v8i8")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v8i8")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v8i8"
+    )]
     fn vpmaxs_v8i8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v4i16")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v4i16")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v4i16"
+    )]
     fn vpmaxs_v4i16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v2i32")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v2i32")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v2i32"
+    )]
     fn vpmaxs_v2i32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v8i8")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v8i8")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v8i8"
+    )]
     fn vpmaxu_v8i8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v4i16")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v4i16")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v4i16"
+    )]
     fn vpmaxu_v4i16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v2i32")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v2i32")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v2i32"
+    )]
     fn vpmaxu_v2i32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v2f32")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxp.v2f32")]
+    #[cfg_attr(
+        target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxp.v2f32"
+    )]
     fn vpmaxf_v2f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
 }
 
@@ -782,7 +812,7 @@ pub unsafe fn vrsqrte_f32(a: float32x2_t) -> float32x2_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sminp))]
-pub unsafe fn vpmin_s8 (a: int8x8_t, b: int8x8_t) -> int8x8_t {
+pub unsafe fn vpmin_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
     vpmins_v8i8(a, b)
 }
 
@@ -792,7 +822,7 @@ pub unsafe fn vpmin_s8 (a: int8x8_t, b: int8x8_t) -> int8x8_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sminp))]
-pub unsafe fn vpmin_s16 (a: int16x4_t, b: int16x4_t) -> int16x4_t {
+pub unsafe fn vpmin_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
     vpmins_v4i16(a, b)
 }
 
@@ -802,7 +832,7 @@ pub unsafe fn vpmin_s16 (a: int16x4_t, b: int16x4_t) -> int16x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sminp))]
-pub unsafe fn vpmin_s32 (a: int32x2_t, b: int32x2_t) -> int32x2_t {
+pub unsafe fn vpmin_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
     vpmins_v2i32(a, b)
 }
 
@@ -812,7 +842,7 @@ pub unsafe fn vpmin_s32 (a: int32x2_t, b: int32x2_t) -> int32x2_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uminp))]
-pub unsafe fn vpmin_u8 (a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+pub unsafe fn vpmin_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
     vpminu_v8i8(a, b)
 }
 
@@ -822,7 +852,7 @@ pub unsafe fn vpmin_u8 (a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uminp))]
-pub unsafe fn vpmin_u16 (a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+pub unsafe fn vpmin_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
     vpminu_v4i16(a, b)
 }
 
@@ -832,7 +862,7 @@ pub unsafe fn vpmin_u16 (a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uminp))]
-pub unsafe fn vpmin_u32 (a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+pub unsafe fn vpmin_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
     vpminu_v2i32(a, b)
 }
 
@@ -842,7 +872,7 @@ pub unsafe fn vpmin_u32 (a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fminp))]
-pub unsafe fn vpmin_f32 (a: float32x2_t, b: float32x2_t) -> float32x2_t {
+pub unsafe fn vpmin_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
     vpminf_v2f32(a, b)
 }
 
@@ -852,7 +882,7 @@ pub unsafe fn vpmin_f32 (a: float32x2_t, b: float32x2_t) -> float32x2_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smaxp))]
-pub unsafe fn vpmax_s8 (a: int8x8_t, b: int8x8_t) -> int8x8_t {
+pub unsafe fn vpmax_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
     vpmaxs_v8i8(a, b)
 }
 
@@ -862,7 +892,7 @@ pub unsafe fn vpmax_s8 (a: int8x8_t, b: int8x8_t) -> int8x8_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smaxp))]
-pub unsafe fn vpmax_s16 (a: int16x4_t, b: int16x4_t) -> int16x4_t {
+pub unsafe fn vpmax_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
     vpmaxs_v4i16(a, b)
 }
 
@@ -872,7 +902,7 @@ pub unsafe fn vpmax_s16 (a: int16x4_t, b: int16x4_t) -> int16x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smaxp))]
-pub unsafe fn vpmax_s32 (a: int32x2_t, b: int32x2_t) -> int32x2_t {
+pub unsafe fn vpmax_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
     vpmaxs_v2i32(a, b)
 }
 
@@ -882,7 +912,7 @@ pub unsafe fn vpmax_s32 (a: int32x2_t, b: int32x2_t) -> int32x2_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umaxp))]
-pub unsafe fn vpmax_u8 (a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+pub unsafe fn vpmax_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
     vpmaxu_v8i8(a, b)
 }
 
@@ -892,7 +922,7 @@ pub unsafe fn vpmax_u8 (a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umaxp))]
-pub unsafe fn vpmax_u16 (a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+pub unsafe fn vpmax_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
     vpmaxu_v4i16(a, b)
 }
 
@@ -902,7 +932,7 @@ pub unsafe fn vpmax_u16 (a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umaxp))]
-pub unsafe fn vpmax_u32 (a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+pub unsafe fn vpmax_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
     vpmaxu_v2i32(a, b)
 }
 
@@ -912,11 +942,10 @@ pub unsafe fn vpmax_u32 (a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmaxp))]
-pub unsafe fn vpmax_f32 (a: float32x2_t, b: float32x2_t) -> float32x2_t {
+pub unsafe fn vpmax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
     vpmaxf_v2f32(a, b)
 }
 
-
 #[cfg(test)]
 mod tests {
     use coresimd::arm::*;
diff --git a/coresimd/mod.rs b/coresimd/mod.rs
index 5007ac30de..6fc312f420 100644
--- a/coresimd/mod.rs
+++ b/coresimd/mod.rs
@@ -134,7 +134,6 @@ pub mod arch {
     /// Platform-specific intrinsics for the `PowerPC64` platform.
     ///
     /// See the [module documentation](../index.html) for more details.
-    #[cfg(target_arch = "powerpc64")]
     #[cfg(any(target_arch = "powerpc64", dox))]
     #[doc(cfg(target_arch = "powerpc64"))]
     #[unstable(feature = "stdsimd", issue = "27731")]
diff --git a/coresimd/powerpc/altivec.rs b/coresimd/powerpc/altivec.rs
index 1765f79bb5..0790474b2b 100644
--- a/coresimd/powerpc/altivec.rs
+++ b/coresimd/powerpc/altivec.rs
@@ -75,8 +75,7 @@ impl_from_bits_!(
     vector_bool_int
 );
 impl_from_bits_!(
-    i8x16:
-    vector_signed_char,
+    i8x16: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -114,8 +113,7 @@ impl_from_bits_!(
     vector_bool_int
 );
 impl_from_bits_!(
-    u8x16:
-    vector_signed_char,
+    u8x16: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -135,11 +133,7 @@ impl_from_bits_!(
     vector_bool_short,
     vector_bool_int
 );
-impl_from_bits_!(
-    m8x16: vector_bool_char,
-    vector_bool_short,
-    vector_bool_int
-);
+impl_from_bits_!(m8x16: vector_bool_char, vector_bool_short, vector_bool_int);
 
 impl_from_bits_!(
     vector_signed_short: u64x2,
@@ -166,8 +160,7 @@ impl_from_bits_!(
     vector_bool_int
 );
 impl_from_bits_!(
-    i16x8:
-    vector_signed_char,
+    i16x8: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -204,8 +197,7 @@ impl_from_bits_!(
     vector_bool_int
 );
 impl_from_bits_!(
-    u16x8:
-    vector_signed_char,
+    u16x8: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -251,8 +243,7 @@ impl_from_bits_!(
     vector_bool_int
 );
 impl_from_bits_!(
-    i32x4:
-    vector_signed_char,
+    i32x4: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -289,8 +280,7 @@ impl_from_bits_!(
     vector_bool_int
 );
 impl_from_bits_!(
-    u32x4:
-    vector_signed_char,
+    u32x4: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -345,8 +335,7 @@ impl_from_bits_!(
     vector_bool_int
 );
 impl_from_bits_!(
-    f32x4:
-    vector_signed_char,
+    f32x4: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -360,10 +349,18 @@ impl_from_bits_!(
 
 #[allow(improper_ctypes)]
 extern "C" {
-#[ link_name = "llvm.ppc.altivec.vperm" ]
-fn vperm(a: vector_signed_int, b: vector_signed_int, c: vector_unsigned_char) -> vector_signed_int;
-#[ link_name = "llvm.ppc.altivec.vmhaddshs" ]
-fn vmhaddshs(a: vector_signed_short, b: vector_signed_short, c: vector_signed_short) -> vector_signed_short;
+    #[link_name = "llvm.ppc.altivec.vperm"]
+    fn vperm(
+        a: vector_signed_int, b: vector_signed_int, c: vector_unsigned_char,
+    ) -> vector_signed_int;
+    #[link_name = "llvm.ppc.altivec.vmhaddshs"]
+    fn vmhaddshs(
+        a: vector_signed_short, b: vector_signed_short, c: vector_signed_short,
+    ) -> vector_signed_short;
+    #[link_name = "llvm.ppc.altivec.vmhraddshs"]
+    fn vmhraddshs(
+        a: vector_signed_short, b: vector_signed_short, c: vector_signed_short,
+    ) -> vector_signed_short;
 }
 
 mod sealed {
@@ -373,7 +370,9 @@ mod sealed {
     #[inline]
     #[target_feature(enable = "altivec")]
     #[cfg_attr(test, assert_instr(vperm))]
-    unsafe fn vec_vperm(a: vector_signed_int, b: vector_signed_int, c: vector_unsigned_char) -> vector_signed_int {
+    unsafe fn vec_vperm(
+        a: vector_signed_int, b: vector_signed_int, c: vector_unsigned_char,
+    ) -> vector_signed_int {
         vperm(a, b, c)
     }
 
@@ -703,7 +702,6 @@ where
     a.vec_add(b)
 }
 
-
 /// Endian-biased intrinsics
 #[cfg(target_endian = "little")]
 mod endian {
@@ -718,8 +716,10 @@ mod endian {
         // vperm has big-endian bias
         //
         // Xor the mask and flip the arguments
-        let d = u8x16::new(255, 255, 255, 255, 255, 255, 255, 255,
-                           255, 255, 255, 255, 255, 255, 255, 255).into_bits();
+        let d = u8x16::new(
+            255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+            255, 255, 255,
+        ).into_bits();
         let c = simd_xor(c, d);
 
         b.vec_vperm(a, c)
@@ -730,10 +730,22 @@ mod endian {
 #[inline]
 #[target_feature(enable = "altivec")]
 #[cfg_attr(test, assert_instr(vmhaddshs))]
-pub unsafe fn vec_madds(a: vector_signed_short, b: vector_signed_short, c: vector_signed_short) -> vector_signed_short {
+pub unsafe fn vec_madds(
+    a: vector_signed_short, b: vector_signed_short, c: vector_signed_short,
+) -> vector_signed_short {
     vmhaddshs(a, b, c)
 }
 
+/// Vector Multiply Round and Add Saturated
+#[inline]
+#[target_feature(enable = "altivec")]
+#[cfg_attr(test, assert_instr(vmhraddshs))]
+pub unsafe fn vec_mradds(
+    a: vector_signed_short, b: vector_signed_short, c: vector_signed_short,
+) -> vector_signed_short {
+    vmhraddshs(a, b, c)
+}
+
 #[cfg(target_endian = "big")]
 mod endian {
     use super::*;
@@ -776,89 +788,122 @@ mod tests {
     }
 
     test_vec_perm!{test_vec_perm_u8x16,
-                   u8x16, vector_unsigned_char,
-                   [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
-                   [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115],
-                   [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
-                    0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
-                   [0, 1, 100, 101, 2, 3, 102, 103, 4, 5, 104, 105, 6, 7, 106, 107]}
+    u8x16, vector_unsigned_char,
+    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+    [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115],
+    [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
+    [0, 1, 100, 101, 2, 3, 102, 103, 4, 5, 104, 105, 6, 7, 106, 107]}
     test_vec_perm!{test_vec_perm_i8x16,
-                   i8x16, vector_signed_char,
-                   [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
-                   [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115],
-                   [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
-                    0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
-                   [0, 1, 100, 101, 2, 3, 102, 103, 4, 5, 104, 105, 6, 7, 106, 107]}
+    i8x16, vector_signed_char,
+    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+    [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115],
+    [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
+    [0, 1, 100, 101, 2, 3, 102, 103, 4, 5, 104, 105, 6, 7, 106, 107]}
     test_vec_perm!{test_vec_perm_m8x16,
-                   m8x16, vector_bool_char,
-                   [false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false],
-                   [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true],
-                   [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
-                    0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
-                   [false, false, true, true, false, false, true, true, false, false, true, true, false, false, true, true]}
+    m8x16, vector_bool_char,
+    [false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false],
+    [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true],
+    [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
+    [false, false, true, true, false, false, true, true, false, false, true, true, false, false, true, true]}
 
     test_vec_perm!{test_vec_perm_u16x8,
-                   u16x8, vector_unsigned_short,
-                   [0, 1, 2, 3, 4, 5, 6, 7],
-                   [10, 11, 12, 13, 14, 15, 16, 17],
-                   [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
-                    0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
-                   [0, 10, 1, 11, 2, 12, 3, 13]}
+    u16x8, vector_unsigned_short,
+    [0, 1, 2, 3, 4, 5, 6, 7],
+    [10, 11, 12, 13, 14, 15, 16, 17],
+    [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
+    [0, 10, 1, 11, 2, 12, 3, 13]}
     test_vec_perm!{test_vec_perm_i16x8,
-                   i16x8, vector_signed_short,
-                   [0, 1, 2, 3, 4, 5, 6, 7],
-                   [10, 11, 12, 13, 14, 15, 16, 17],
-                   [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
-                    0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
-                   [0, 10, 1, 11, 2, 12, 3, 13]}
+    i16x8, vector_signed_short,
+    [0, 1, 2, 3, 4, 5, 6, 7],
+    [10, 11, 12, 13, 14, 15, 16, 17],
+    [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
+    [0, 10, 1, 11, 2, 12, 3, 13]}
     test_vec_perm!{test_vec_perm_m16x8,
-                   m16x8, vector_bool_short,
-                   [false, false, false, false, false, false, false, false],
-                   [true, true, true, true, true, true, true, true],
-                   [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
-                    0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
-                   [false, true, false, true, false, true, false, true]}
+    m16x8, vector_bool_short,
+    [false, false, false, false, false, false, false, false],
+    [true, true, true, true, true, true, true, true],
+    [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17],
+    [false, true, false, true, false, true, false, true]}
 
     test_vec_perm!{test_vec_perm_u32x4,
-                   u32x4, vector_unsigned_int,
-                   [0, 1, 2, 3],
-                   [10, 11, 12, 13],
-                   [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
-                    0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17],
-                   [0, 10, 1, 11]}
+    u32x4, vector_unsigned_int,
+    [0, 1, 2, 3],
+    [10, 11, 12, 13],
+    [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17],
+    [0, 10, 1, 11]}
     test_vec_perm!{test_vec_perm_i32x4,
-                   i32x4, vector_signed_int,
-                   [0, 1, 2, 3],
-                   [10, 11, 12, 13],
-                   [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
-                    0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17],
-                   [0, 10, 1, 11]}
+    i32x4, vector_signed_int,
+    [0, 1, 2, 3],
+    [10, 11, 12, 13],
+    [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17],
+    [0, 10, 1, 11]}
     test_vec_perm!{test_vec_perm_m32x4,
-                   m32x4, vector_bool_int,
-                   [false, false, false, false],
-                   [true, true, true, true],
-                   [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
-                    0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17],
-                   [false, true, false, true]}
+    m32x4, vector_bool_int,
+    [false, false, false, false],
+    [true, true, true, true],
+    [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17],
+    [false, true, false, true]}
     test_vec_perm!{test_vec_perm_f32x4,
-                   f32x4, vector_float,
-                   [0.0, 1.0, 2.0, 3.0],
-                   [1.0, 1.1, 1.2, 1.3],
-                   [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
-                    0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17],
-                   [0.0, 1.0, 1.0, 1.1]}
+    f32x4, vector_float,
+    [0.0, 1.0, 2.0, 3.0],
+    [1.0, 1.1, 1.2, 1.3],
+    [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17],
+    [0.0, 1.0, 1.0, 1.1]}
 
     #[simd_test(enable = "altivec")]
     unsafe fn test_vec_madds() {
-        let a: vector_signed_short = i16x8::new(0 * 256, 1 * 256, 2 * 256, 3 * 256, 4 * 256, 5 * 256, 6 * 256, 7 * 256).into_bits();
-        let b: vector_signed_short = i16x8::new(256, 256, 256, 256, 256, 256, 256, 256).into_bits();
-        let c: vector_signed_short = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7).into_bits();
+        let a: vector_signed_short = i16x8::new(
+            0 * 256,
+            1 * 256,
+            2 * 256,
+            3 * 256,
+            4 * 256,
+            5 * 256,
+            6 * 256,
+            7 * 256,
+        ).into_bits();
+        let b: vector_signed_short =
+            i16x8::new(256, 256, 256, 256, 256, 256, 256, 256).into_bits();
+        let c: vector_signed_short =
+            i16x8::new(0, 1, 2, 3, 4, 5, 6, 7).into_bits();
 
         let d = i16x8::new(0, 3, 6, 9, 12, 15, 18, 21);
 
         assert_eq!(d, vec_madds(a, b, c).into_bits());
     }
 
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_mradds() {
+        let a: vector_signed_short = i16x8::new(
+            0 * 256,
+            1 * 256,
+            2 * 256,
+            3 * 256,
+            4 * 256,
+            5 * 256,
+            6 * 256,
+            7 * 256,
+        ).into_bits();
+        let b: vector_signed_short =
+            i16x8::new(256, 256, 256, 256, 256, 256, 256, 256).into_bits();
+        let c: vector_signed_short =
+            i16x8::new(0, 1, 2, 3, 4, 5, 6, i16::max_value() - 1).into_bits();
+
+        let d = i16x8::new(0, 3, 6, 9, 12, 15, 18, i16::max_value());
+
+        assert_eq!(d, vec_mradds(a, b, c).into_bits());
+    }
+
     #[simd_test(enable = "altivec")]
     unsafe fn vec_add_i32x4_i32x4() {
         let x = i32x4::new(1, 2, 3, 4);
diff --git a/coresimd/powerpc64/mod.rs b/coresimd/powerpc64/mod.rs
index 9049b294d0..4d7d9076fb 100644
--- a/coresimd/powerpc64/mod.rs
+++ b/coresimd/powerpc64/mod.rs
@@ -1,6 +1,7 @@
 //! PowerPC 64
 //!
-//! The reference is the [64-Bit ELF V2 ABI Specification - Power Architecture].
+//! The reference is the [64-Bit ELF V2 ABI Specification - Power
+//! Architecture].
 //!
 //! [64-Bit ELF V2 ABI Specification - Power Architecture]: http://openpowerfoundation.org/wp-content/uploads/resources/leabi/leabi-20170510.pdf
 
diff --git a/coresimd/powerpc64/vsx.rs b/coresimd/powerpc64/vsx.rs
index 1953e1501a..51a8e824c1 100644
--- a/coresimd/powerpc64/vsx.rs
+++ b/coresimd/powerpc64/vsx.rs
@@ -63,8 +63,7 @@ impl_from_bits_!(
     vector_double
 );
 impl_from_bits_!(
-    i64x2:
-    vector_signed_char,
+    i64x2: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -109,8 +108,7 @@ impl_from_bits_!(
     vector_double
 );
 impl_from_bits_!(
-    u64x2:
-    vector_signed_char,
+    u64x2: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -155,8 +153,7 @@ impl_from_bits_!(
     vector_bool_long
 );
 impl_from_bits_!(
-    f64x2:
-    vector_signed_char,
+    f64x2: vector_signed_char,
     vector_unsigned_char,
     vector_bool_char,
     vector_signed_short,
@@ -234,8 +231,12 @@ mod sealed {
-    // xxpermdi has an big-endian bias and extended mnemonics
+    // xxpermdi has a big-endian bias and extended mnemonics
     #[inline]
     #[target_feature(enable = "vsx")]
-    #[cfg_attr(all(test, target_endian="little"), assert_instr(xxmrgld, dm = 0x0))]
-    #[cfg_attr(all(test, target_endian="big"), assert_instr(xxspltd, dm = 0x0))]
+    #[cfg_attr(
+        all(test, target_endian = "little"), assert_instr(xxmrgld, dm = 0x0)
+    )]
+    #[cfg_attr(
+        all(test, target_endian = "big"), assert_instr(xxspltd, dm = 0x0)
+    )]
     unsafe fn xxpermdi(a: i64x2, b: i64x2, dm: u8) -> i64x2 {
         match dm & 0b11 {
             0 => simd_shuffle2(a, b, [0b00, 0b10]),
diff --git a/coresimd/ppsv/api/arithmetic_reductions.rs b/coresimd/ppsv/api/arithmetic_reductions.rs
index c8d4d23566..7b324a7bab 100644
--- a/coresimd/ppsv/api/arithmetic_reductions.rs
+++ b/coresimd/ppsv/api/arithmetic_reductions.rs
@@ -165,7 +165,6 @@ macro_rules! impl_float_arithmetic_reductions {
     };
 }
 
-
 #[cfg(test)]
 macro_rules! test_int_arithmetic_reductions {
     ($id:ident, $elem_ty:ident) => {
@@ -237,10 +236,7 @@ macro_rules! test_float_arithmetic_reductions {
             let v = $id::splat(1 as $elem_ty);
             assert_eq!(v.sum(), $id::lanes() as $elem_ty);
             let v = alternating(2);
-            assert_eq!(
-                v.sum(),
-                ($id::lanes() / 2 + $id::lanes()) as $elem_ty
-            );
+            assert_eq!(v.sum(), ($id::lanes() / 2 + $id::lanes()) as $elem_ty);
         }
         #[test]
         fn product() {
diff --git a/coresimd/ppsv/api/float_math.rs b/coresimd/ppsv/api/float_math.rs
index 32de85eb55..9092460a76 100644
--- a/coresimd/ppsv/api/float_math.rs
+++ b/coresimd/ppsv/api/float_math.rs
@@ -59,7 +59,6 @@ macro_rules! impl_float_math {
 
 macro_rules! test_float_math {
     ($id:ident, $elem_ty:ident) => {
-
         fn sqrt2() -> $elem_ty {
             match ::mem::size_of::<$elem_ty>() {
                 4 => 1.4142135 as $elem_ty,
diff --git a/coresimd/ppsv/api/load_store.rs b/coresimd/ppsv/api/load_store.rs
index 11ea10d30c..59749da0e1 100644
--- a/coresimd/ppsv/api/load_store.rs
+++ b/coresimd/ppsv/api/load_store.rs
@@ -46,7 +46,7 @@ macro_rules! impl_load_store {
             /// undefined.
             #[inline]
             pub unsafe fn store_aligned_unchecked(
-                self, slice: &mut [$elem_ty]
+                self, slice: &mut [$elem_ty],
             ) {
                 *(slice.get_unchecked_mut(0) as *mut $elem_ty as *mut Self) =
                     self;
@@ -59,7 +59,7 @@ macro_rules! impl_load_store {
             /// If `slice.len() < Self::lanes()` the behavior is undefined.
             #[inline]
             pub unsafe fn store_unaligned_unchecked(
-                self, slice: &mut [$elem_ty]
+                self, slice: &mut [$elem_ty],
             ) {
                 let target_ptr =
                     slice.get_unchecked_mut(0) as *mut $elem_ty as *mut u8;
@@ -121,7 +121,7 @@ macro_rules! impl_load_store {
             /// If `slice.len() < Self::lanes()` the behavior is undefined.
             #[inline]
             pub unsafe fn load_unaligned_unchecked(
-                slice: &[$elem_ty]
+                slice: &[$elem_ty],
             ) -> Self {
                 use mem::size_of;
                 let target_ptr =
@@ -238,7 +238,8 @@ macro_rules! test_load_store {
                     data: [0 as $elem_ty; 2 * $id::lanes()],
                 };
                 // offset the aligned data by one byte:
-                let s: &mut [u8; 2 * $id::lanes()
+                let s: &mut [u8; 2
+                                * $id::lanes()
                                 * mem::size_of::<$elem_ty>()] =
                     mem::transmute(&mut aligned.data);
                 let s: &mut [$elem_ty] = slice::from_raw_parts_mut(
@@ -296,7 +297,8 @@ macro_rules! test_load_store {
                     data: [0 as $elem_ty; 2 * $id::lanes()],
                 };
                 // offset the aligned data by one byte:
-                let s: &[u8; 2 * $id::lanes()
+                let s: &[u8; 2
+                            * $id::lanes()
                             * mem::size_of::<$elem_ty>()] =
                     mem::transmute(&aligned.data);
                 let s: &[$elem_ty] = slice::from_raw_parts(
diff --git a/coresimd/ppsv/api/minmax.rs b/coresimd/ppsv/api/minmax.rs
old mode 100644
new mode 100755
index 7ba93b22f3..c1c7499c06
--- a/coresimd/ppsv/api/minmax.rs
+++ b/coresimd/ppsv/api/minmax.rs
@@ -27,7 +27,7 @@ macro_rules! impl_int_minmax_ops {
 
             /// Maximum of two vectors.
             ///
-            /// Returns a new vector containing the minimum value of each of
+            /// Returns a new vector containing the maximum value of each of
             /// the input vector lanes.
             #[inline]
             pub fn max(self, x: Self) -> Self {
@@ -86,7 +86,7 @@ macro_rules! impl_float_minmax_ops {
 
             /// Maximum of two vectors.
             ///
-            /// Returns a new vector containing the minimum value of each of the
+            /// Returns a new vector containing the maximum value of each of the
             /// input vector lanes. The lane-wise semantics are the same as that
             /// of `max` for the primitive floating-point types.
             #[inline]
diff --git a/coresimd/ppsv/api/mod.rs b/coresimd/ppsv/api/mod.rs
index 857264b9f3..4379e1c713 100644
--- a/coresimd/ppsv/api/mod.rs
+++ b/coresimd/ppsv/api/mod.rs
@@ -1,57 +1,4 @@
 //! This module defines the API of portable vector types.
-//!
-//! # API
-//!
-//! ## Traits
-//!
-//! All portable vector types implement the following traits:
-//!
-//! * [x] `Copy`,
-//! * [x] `Clone`,
-//! * [x] `Debug`,
-//! * [x] `Default`
-//! * [x] `PartialEq`
-//! * [x] `PartialOrd` (TODO: tests)
-//!
-//! Non-floating-point vector types also implement:
-//!
-//! * [x] `Hash`,
-//! * [x] `Eq`, and
-//! * [x] `Ord`.
-//!
-//! Integer vector types also implement:
-//!
-//! * [x] `fmt::LowerHex`.
-//!
-//! ## Conversions
-//!
-//! * [x]: `FromBits/IntoBits`: bitwise lossless transmutes between vectors of
-//!        the same size (i.e., same `mem::size_of`).
-//! * [x]: `From/Into`: casts between vectors with the same number of lanes
-//!        (potentially lossy).
-//!
-//! ## Inherent methods
-//!
-//! * [x] minimal API: implemented by all vector types except for boolean
-//!       vectors.
-//! * [x] minimal boolean vector API: implemented by boolean vectors.
-//! * [x] load/store API: aligned and unaligned memory loads and
-//!       stores - implemented by all vectors.
-//! * [x] comparison API: vector lane-wise comparison producing
-//!       boolean vectors - implemented by all vectors.
-//! * [x] arithmetic operations: implemented by all non-boolean vectors.
-//! * [x] `std::ops::Neg`: implemented by signed-integer and floating-point
-//!       vectors.
-//! * [x] bitwise operations: implemented by integer and boolean
-//!       vectors.
-//! * [x] shift operations: implemented by integer vectors.
-//! * [x] arithmetic reductions: implemented by integer and floating-point
-//!       vectors.
-//! * [x] bitwise reductions: implemented by integer and boolean
-//!       vectors.
-//! * [x] boolean reductions: implemented by boolean vectors.
-//! * [ ] portable shuffles: `shufflevector`.
-//! * [ ] portable `gather`/`scatter`:
 #![allow(unused)]
 
 /// Adds the vector type `$id`, with elements of types `$elem_tys`.
@@ -59,7 +6,8 @@ macro_rules! define_ty {
     ($id:ident, $($elem_tys:ident),+ | $(#[$doc:meta])*) => {
         $(#[$doc])*
         #[repr(simd)]
-        #[derive(Copy, Clone, Debug, /*FIXME:*/ PartialOrd)]
+        #[derive(Copy, Clone, Debug,
+                 /*FIXME: manually implement and add tests*/ PartialOrd)]
         #[allow(non_camel_case_types)]
         pub struct $id($($elem_tys),*);
     }
diff --git a/coresimd/ppsv/api/scalar_shifts.rs b/coresimd/ppsv/api/scalar_shifts.rs
index dac89e6bbb..586d909c32 100644
--- a/coresimd/ppsv/api/scalar_shifts.rs
+++ b/coresimd/ppsv/api/scalar_shifts.rs
@@ -41,18 +41,7 @@ macro_rules! impl_shifts {
 macro_rules! impl_all_scalar_shifts {
     ($id:ident, $elem_ty:ident) => {
         impl_shifts!(
-            $id,
-            $elem_ty,
-            u8,
-            u16,
-            u32,
-            u64,
-            usize,
-            i8,
-            i16,
-            i32,
-            i64,
-            isize
+            $id, $elem_ty, u8, u16, u32, u64, usize, i8, i16, i32, i64, isize
         );
     };
 }
@@ -125,18 +114,7 @@ macro_rules! test_shift_ops {
 macro_rules! test_all_scalar_shift_ops {
     ($id:ident, $elem_ty:ident) => {
         test_shift_ops!(
-            $id,
-            $elem_ty,
-            u8,
-            u16,
-            u32,
-            u64,
-            usize,
-            i8,
-            i16,
-            i32,
-            i64,
-            isize
+            $id, $elem_ty, u8, u16, u32, u64, usize, i8, i16, i32, i64, isize
         );
     };
 }
diff --git a/coresimd/ppsv/codegen/abs.rs b/coresimd/ppsv/codegen/abs.rs
index edca549c24..c829ff8c5b 100644
--- a/coresimd/ppsv/codegen/abs.rs
+++ b/coresimd/ppsv/codegen/abs.rs
@@ -1,9 +1,14 @@
 //! Vector absolute value
-
+#![allow(dead_code)]
 use coresimd::simd::*;
 
 #[allow(improper_ctypes)]
 extern "C" {
+    #[link_name = "llvm.fabs.f32"]
+    fn abs_f32(x: f32) -> f32;
+    #[link_name = "llvm.fabs.f64"]
+    fn abs_f64(x: f64) -> f64;
+
     #[link_name = "llvm.fabs.v2f32"]
     fn abs_v2f32(x: f32x2) -> f32x2;
     #[link_name = "llvm.fabs.v4f32"]
@@ -24,14 +29,43 @@ pub(crate) trait FloatAbs {
     fn abs(self) -> Self;
 }
 
+trait RawAbs {
+    fn raw_abs(self) -> Self;
+}
+
+impl RawAbs for f32 {
+    fn raw_abs(self) -> Self {
+        unsafe { abs_f32(self) }
+    }
+}
+
+impl RawAbs for f64 {
+    fn raw_abs(self) -> Self {
+        unsafe { abs_f64(self) }
+    }
+}
+
+
 macro_rules! impl_fabs {
-    ($id:ident: $fn:ident) => {
+    ($id:ident : $fn:ident) => {
+        #[cfg(not(target_arch = "s390x"))]
         impl FloatAbs for $id {
             fn abs(self) -> Self {
                 unsafe { $fn(self) }
             }
         }
-    }
+        // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/501
+        #[cfg(target_arch = "s390x")]
+        impl FloatAbs for $id {
+            fn abs(self) -> Self {
+                let mut v = $id::splat(0.);
+                for i in 0..$id::lanes() {
+                    v = v.replace(i, self.extract(i).raw_abs())
+                }
+                v
+            }
+        }
+    };
 }
 
 impl_fabs!(f32x2: abs_v2f32);
diff --git a/coresimd/ppsv/codegen/cos.rs b/coresimd/ppsv/codegen/cos.rs
index fdc61ea464..38dce584f8 100644
--- a/coresimd/ppsv/codegen/cos.rs
+++ b/coresimd/ppsv/codegen/cos.rs
@@ -1,9 +1,14 @@
 //! Exact vector cos
-
+#![allow(dead_code)]
 use coresimd::simd::*;
 
 #[allow(improper_ctypes)]
 extern "C" {
+    #[link_name = "llvm.cos.f32"]
+    fn cos_f32(x: f32) -> f32;
+    #[link_name = "llvm.cos.f64"]
+    fn cos_f64(x: f64) -> f64;
+
     #[link_name = "llvm.cos.v2f32"]
     fn cos_v2f32(x: f32x2) -> f32x2;
     #[link_name = "llvm.cos.v4f32"]
@@ -24,14 +29,44 @@ pub(crate) trait FloatCos {
     fn cos(self) -> Self;
 }
 
+trait RawCos {
+    fn raw_cos(self) -> Self;
+}
+
+impl RawCos for f32 {
+    fn raw_cos(self) -> Self {
+        unsafe { cos_f32(self) }
+    }
+}
+
+impl RawCos for f64 {
+    fn raw_cos(self) -> Self {
+        unsafe { cos_f64(self) }
+    }
+}
+
+
 macro_rules! impl_fcos {
-    ($id:ident: $fn:ident) => {
+    ($id:ident : $fn:ident) => {
+        #[cfg(not(target_arch = "s390x"))]
         impl FloatCos for $id {
             fn cos(self) -> Self {
                 unsafe { $fn(self) }
             }
         }
-    }
+
+        // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/501
+        #[cfg(target_arch = "s390x")]
+        impl FloatCos for $id {
+            fn cos(self) -> Self {
+                let mut v = $id::splat(0.);
+                for i in 0..$id::lanes() {
+                    v = v.replace(i, self.extract(i).raw_cos())
+                }
+                v
+            }
+        }
+    };
 }
 
 impl_fcos!(f32x2: cos_v2f32);
diff --git a/coresimd/ppsv/codegen/fma.rs b/coresimd/ppsv/codegen/fma.rs
index 9d63ac6bee..a0f0e8f729 100644
--- a/coresimd/ppsv/codegen/fma.rs
+++ b/coresimd/ppsv/codegen/fma.rs
@@ -1,5 +1,5 @@
 //! Vector fused multiply add
-
+#![allow(dead_code)]
 use coresimd::simd::*;
 
 #[allow(improper_ctypes)]
@@ -25,13 +25,21 @@ pub(crate) trait FloatFma {
 }
 
 macro_rules! impl_fma {
-    ($id:ident: $fn:ident) => {
+    ($id:ident : $fn:ident) => {
+        #[cfg(not(target_arch = "s390x"))]
         impl FloatFma for $id {
             fn fma(self, y: Self, z: Self) -> Self {
                 unsafe { $fn(self, y, z) }
             }
         }
-    }
+        // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/501
+        #[cfg(target_arch = "s390x")]
+        impl FloatFma for $id {
+            fn fma(self, y: Self, z: Self) -> Self {
+                self * y + z
+            }
+        }
+    };
 }
 
 impl_fma!(f32x2: fma_v2f32);
diff --git a/coresimd/ppsv/codegen/masks_reductions.rs b/coresimd/ppsv/codegen/masks_reductions.rs
index b06c2d0a29..617f1fd300 100644
--- a/coresimd/ppsv/codegen/masks_reductions.rs
+++ b/coresimd/ppsv/codegen/masks_reductions.rs
@@ -25,11 +25,13 @@ macro_rules! default_impl {
         impl All for $id {
             #[inline]
             unsafe fn all(self) -> bool {
-                #[cfg(not(target_arch = "aarch64"))] {
+                #[cfg(not(target_arch = "aarch64"))]
+                {
                     use coresimd::simd_llvm::simd_reduce_all;
                     simd_reduce_all(self)
                 }
-                #[cfg(target_arch = "aarch64")] {
+                #[cfg(target_arch = "aarch64")]
+                {
                     // FIXME: Broken on AArch64
                     // https://bugs.llvm.org/show_bug.cgi?id=36796
                     self.and()
@@ -40,11 +42,13 @@ macro_rules! default_impl {
         impl Any for $id {
             #[inline]
             unsafe fn any(self) -> bool {
-                #[cfg(not(target_arch = "aarch64"))] {
+                #[cfg(not(target_arch = "aarch64"))]
+                {
                     use coresimd::simd_llvm::simd_reduce_any;
                     simd_reduce_any(self)
                 }
-                #[cfg(target_arch = "aarch64")] {
+                #[cfg(target_arch = "aarch64")]
+                {
                     // FIXME: Broken on AArch64
                     // https://bugs.llvm.org/show_bug.cgi?id=36796
                     self.or()
@@ -63,7 +67,12 @@ macro_rules! default_impl {
 // or floating point vectors, we can't currently work around this yet. The
 // performance impact for this shouldn't be large, but this is filled as:
 // https://bugs.llvm.org/show_bug.cgi?id=37087
-#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2"))]
+#[cfg(
+    all(
+        any(target_arch = "x86", target_arch = "x86_64"),
+        target_feature = "sse2"
+    )
+)]
 macro_rules! x86_128_sse2_movemask_impl {
     ($id:ident) => {
         impl All for $id {
@@ -71,13 +80,15 @@ macro_rules! x86_128_sse2_movemask_impl {
             #[target_feature(enable = "sse2")]
             unsafe fn all(self) -> bool {
                 #[cfg(target_arch = "x86")]
-                use ::coresimd::arch::x86::_mm_movemask_epi8;
+                use coresimd::arch::x86::_mm_movemask_epi8;
                 #[cfg(target_arch = "x86_64")]
-                use ::coresimd::arch::x86_64::_mm_movemask_epi8;
-                // _mm_movemask_epi8(a) creates a 16bit mask containing the most
-                // significant bit of each byte of `a`. If all bits are set,
-                // then all 16 lanes of the mask are true.
-                _mm_movemask_epi8(::mem::transmute(self)) == u16::max_value() as i32
+                use coresimd::arch::x86_64::_mm_movemask_epi8;
+                // _mm_movemask_epi8(a) creates a 16bit mask containing the
+                // most significant bit of each byte of `a`. If all
+                // bits are set, then all 16 lanes of the mask are
+                // true.
+                _mm_movemask_epi8(::mem::transmute(self))
+                    == u16::max_value() as i32
             }
         }
         impl Any for $id {
@@ -85,14 +96,14 @@ macro_rules! x86_128_sse2_movemask_impl {
             #[target_feature(enable = "sse2")]
             unsafe fn any(self) -> bool {
                 #[cfg(target_arch = "x86")]
-                use ::coresimd::arch::x86::_mm_movemask_epi8;
+                use coresimd::arch::x86::_mm_movemask_epi8;
                 #[cfg(target_arch = "x86_64")]
-                use ::coresimd::arch::x86_64::_mm_movemask_epi8;
+                use coresimd::arch::x86_64::_mm_movemask_epi8;
 
                 _mm_movemask_epi8(::mem::transmute(self)) != 0
             }
         }
-    }
+    };
 }
 
 // On x86 with AVX we use _mm256_testc_si256 and _mm256_testz_si256.
@@ -103,7 +114,12 @@ macro_rules! x86_128_sse2_movemask_impl {
 // integer or floating point vectors, we can't currently work around this yet.
 //
 // TODO: investigate perf impact and fill LLVM bugs as necessary.
-#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx"))]
+#[cfg(
+    all(
+        any(target_arch = "x86", target_arch = "x86_64"),
+        target_feature = "avx"
+    )
+)]
 macro_rules! x86_256_avx_test_impl {
     ($id:ident) => {
         impl All for $id {
@@ -111,11 +127,13 @@ macro_rules! x86_256_avx_test_impl {
             #[target_feature(enable = "avx")]
             unsafe fn all(self) -> bool {
                 #[cfg(target_arch = "x86")]
-                use ::coresimd::arch::x86::_mm256_testc_si256;
+                use coresimd::arch::x86::_mm256_testc_si256;
                 #[cfg(target_arch = "x86_64")]
-                use ::coresimd::arch::x86_64::_mm256_testc_si256;
-                _mm256_testc_si256(::mem::transmute(self),
-                                   ::mem::transmute($id::splat(true))) != 0
+                use coresimd::arch::x86_64::_mm256_testc_si256;
+                _mm256_testc_si256(
+                    ::mem::transmute(self),
+                    ::mem::transmute($id::splat(true)),
+                ) != 0
             }
         }
         impl Any for $id {
@@ -123,20 +141,27 @@ macro_rules! x86_256_avx_test_impl {
             #[target_feature(enable = "avx")]
             unsafe fn any(self) -> bool {
                 #[cfg(target_arch = "x86")]
-                use ::coresimd::arch::x86::_mm256_testz_si256;
+                use coresimd::arch::x86::_mm256_testz_si256;
                 #[cfg(target_arch = "x86_64")]
-                use ::coresimd::arch::x86_64::_mm256_testz_si256;
-                _mm256_testz_si256(::mem::transmute(self),
-                                   ::mem::transmute(self)) == 0
+                use coresimd::arch::x86_64::_mm256_testz_si256;
+                _mm256_testz_si256(
+                    ::mem::transmute(self),
+                    ::mem::transmute(self),
+                ) == 0
             }
         }
-    }
+    };
 }
 
-// On x86 with SSE2 all/any for 256-bit wide vectors is implemented by executing
-// the algorithm for 128-bit on the higher and lower elements of the vector
-// independently.
-#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2"))]
+// On x86 with SSE2, all/any for 256-bit wide vectors is implemented by
+// executing the 128-bit algorithm on the upper and lower halves of the
+// vector independently.
+#[cfg(
+    all(
+        any(target_arch = "x86", target_arch = "x86_64"),
+        target_feature = "sse2"
+    )
+)]
 macro_rules! x86_256_sse2_impl {
     ($id:ident, $v128:ident) => {
         impl All for $id {
@@ -146,9 +171,9 @@ macro_rules! x86_256_sse2_impl {
                 unsafe {
                     union U {
                         halves: ($v128, $v128),
-                        vec: $id
+                        vec: $id,
                     }
-                    let halves = U {vec: self}.halves;
+                    let halves = U { vec: self }.halves;
                     halves.0.all() && halves.1.all()
                 }
             }
@@ -160,14 +185,14 @@ macro_rules! x86_256_sse2_impl {
                 unsafe {
                     union U {
                         halves: ($v128, $v128),
-                        vec: $id
+                        vec: $id,
                     }
-                    let halves = U {vec: self}.halves;
+                    let halves = U { vec: self }.halves;
                     halves.0.any() || halves.1.any()
                 }
             }
         }
-    }
+    };
 }
 
 // Implementation for 64-bit wide masks on x86.
@@ -179,13 +204,14 @@ macro_rules! x86_64_mmx_movemask_impl {
             #[target_feature(enable = "mmx")]
             unsafe fn all(self) -> bool {
                 #[cfg(target_arch = "x86")]
-                use ::coresimd::arch::x86::_mm_movemask_pi8;
+                use coresimd::arch::x86::_mm_movemask_pi8;
                 #[cfg(target_arch = "x86_64")]
-                use ::coresimd::arch::x86_64::_mm_movemask_pi8;
+                use coresimd::arch::x86_64::_mm_movemask_pi8;
                 // _mm_movemask_pi8(a) creates an 8bit mask containing the most
                 // significant bit of each byte of `a`. If all bits are set,
                 // then all 8 lanes of the mask are true.
-                 _mm_movemask_pi8(::mem::transmute(self)) == u8::max_value() as i32
+                _mm_movemask_pi8(::mem::transmute(self))
+                    == u8::max_value() as i32
             }
         }
         impl Any for $id {
@@ -193,14 +219,14 @@ macro_rules! x86_64_mmx_movemask_impl {
             #[target_feature(enable = "mmx")]
             unsafe fn any(self) -> bool {
                 #[cfg(target_arch = "x86")]
-                use ::coresimd::arch::x86::_mm_movemask_pi8;
+                use coresimd::arch::x86::_mm_movemask_pi8;
                 #[cfg(target_arch = "x86_64")]
-                use ::coresimd::arch::x86_64::_mm_movemask_pi8;
+                use coresimd::arch::x86_64::_mm_movemask_pi8;
 
                 _mm_movemask_pi8(::mem::transmute(self)) != 0
             }
         }
-    }
+    };
 }
 
 // Implementation for 128-bit wide masks on x86
@@ -214,7 +240,7 @@ macro_rules! x86_128_impl {
                 default_impl!($id);
             }
         }
-    }
+    };
 }
 
 // Implementation for 256-bit wide masks on x86
@@ -230,22 +256,25 @@ macro_rules! x86_256_impl {
                 default_impl!($id);
             }
         }
-    }
+    };
 }
 
 // Implementation for ARM + v7 + NEON using vpmin and vpmax (folding
 // minimum/maximum of adjacent pairs) for 64-bit wide two-element vectors.
-#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+#[cfg(
+    all(target_arch = "arm", target_feature = "v7", target_feature = "neon")
+)]
 macro_rules! arm_64_x2_v7_neon_impl {
     ($id:ident, $vpmin:ident, $vpmax:ident) => {
         impl All for $id {
             #[inline]
             #[target_feature(enable = "v7,neon")]
             unsafe fn all(self) -> bool {
-                use ::coresimd::arch::arm::$vpmin;
-                use ::mem::transmute;
+                use coresimd::arch::arm::$vpmin;
+                use mem::transmute;
                 // pmin((a, b), (-,-)) => (b, -).0 => b
-                let tmp: $id = transmute($vpmin(transmute(self), ::mem::uninitialized()));
+                let tmp: $id =
+                    transmute($vpmin(transmute(self), ::mem::uninitialized()));
                 tmp.extract(0)
             }
         }
@@ -253,27 +282,30 @@ macro_rules! arm_64_x2_v7_neon_impl {
             #[inline]
             #[target_feature(enable = "v7,neon")]
             unsafe fn any(self) -> bool {
-                use ::coresimd::arch::arm::$vpmax;
-                use ::mem::transmute;
+                use coresimd::arch::arm::$vpmax;
+                use mem::transmute;
                 // pmax((a, b), (-,-)) => (b, -).0 => b
-                let tmp: $id = transmute($vpmax(transmute(self), ::mem::uninitialized()));
+                let tmp: $id =
+                    transmute($vpmax(transmute(self), ::mem::uninitialized()));
                 tmp.extract(0)
             }
         }
-    }
+    };
 }
 
 // Implementation for ARM + v7 + NEON using vpmin and vpmax (folding
 // minimum/maximum of adjacent pairs) for 64-bit wide four-element vectors.
-#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+#[cfg(
+    all(target_arch = "arm", target_feature = "v7", target_feature = "neon")
+)]
 macro_rules! arm_64_x4_v7_neon_impl {
     ($id:ident, $vpmin:ident, $vpmax:ident) => {
         impl All for $id {
             #[inline]
             #[target_feature(enable = "v7,neon")]
             unsafe fn all(self) -> bool {
-                use ::coresimd::arch::arm::$vpmin;
-                use ::mem::transmute;
+                use coresimd::arch::arm::$vpmin;
+                use mem::transmute;
                 // tmp = pmin((a, b, c, d), (-,-,-,-)) => (a, c, -, -)
                 let tmp = $vpmin(transmute(self), ::mem::uninitialized());
                 // tmp = pmin((a, b, -, -), (-,-,-,-)) => (c, -, -, -).0 => c
@@ -285,29 +317,31 @@ macro_rules! arm_64_x4_v7_neon_impl {
             #[inline]
             #[target_feature(enable = "v7,neon")]
             unsafe fn any(self) -> bool {
-                use ::coresimd::arch::arm::$vpmax;
-                use ::mem::transmute;
+                use coresimd::arch::arm::$vpmax;
+                use mem::transmute;
                 // tmp = pmax((a, b, c, d), (-,-,-,-)) => (a, c, -, -)
-                let tmp =  $vpmax(transmute(self), ::mem::uninitialized());
+                let tmp = $vpmax(transmute(self), ::mem::uninitialized());
                 // tmp = pmax((a, b, -, -), (-,-,-,-)) => (c, -, -, -).0 => c
                 let tmp: $id = transmute($vpmax(tmp, ::mem::uninitialized()));
                 tmp.extract(0)
             }
         }
-    }
+    };
 }
 
 // Implementation for ARM + v7 + NEON using vpmin and vpmax (folding
 // minimum/maximum of adjacent pairs) for 64-bit wide eight-element vectors.
-#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+#[cfg(
+    all(target_arch = "arm", target_feature = "v7", target_feature = "neon")
+)]
 macro_rules! arm_64_x8_v7_neon_impl {
     ($id:ident, $vpmin:ident, $vpmax:ident) => {
         impl All for $id {
             #[inline]
             #[target_feature(enable = "v7,neon")]
             unsafe fn all(self) -> bool {
-                use ::coresimd::arch::arm::$vpmin;
-                use ::mem::transmute;
+                use coresimd::arch::arm::$vpmin;
+                use mem::transmute;
                 // tmp = pmin(
                 //     (a, b, c, d, e, f, g, h),
                 //     (-, -, -, -, -, -, -, -)
@@ -330,8 +364,8 @@ macro_rules! arm_64_x8_v7_neon_impl {
             #[inline]
             #[target_feature(enable = "v7,neon")]
             unsafe fn any(self) -> bool {
-                use ::coresimd::arch::arm::$vpmax;
-                use ::mem::transmute;
+                use coresimd::arch::arm::$vpmax;
+                use mem::transmute;
                 // tmp = pmax(
                 //     (a, b, c, d, e, f, g, h),
                 //     (-, -, -, -, -, -, -, -)
@@ -350,28 +384,32 @@ macro_rules! arm_64_x8_v7_neon_impl {
                 tmp.extract(0)
             }
         }
-    }
+    };
 }
 
-
 // Implementation for ARM + v7 + NEON using vpmin and vpmax (folding
 // minimum/maximum of adjacent pairs) for 64-bit or 128-bit wide vectors with
 // more than two elements.
-#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+#[cfg(
+    all(target_arch = "arm", target_feature = "v7", target_feature = "neon")
+)]
 macro_rules! arm_128_v7_neon_impl {
     ($id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => {
         impl All for $id {
             #[inline]
             #[target_feature(enable = "v7,neon")]
             unsafe fn all(self) -> bool {
-                use ::coresimd::arch::arm::$vpmin;
-                use ::mem::transmute;
+                use coresimd::arch::arm::$vpmin;
+                use mem::transmute;
                 union U {
                     halves: ($half, $half),
-                    vec: $id
+                    vec: $id,
                 }
                 let halves = U { vec: self }.halves;
-                let h: $half = transmute($vpmin(transmute(halves.0), transmute(halves.1)));
+                let h: $half = transmute($vpmin(
+                    transmute(halves.0),
+                    transmute(halves.1),
+                ));
                 h.all()
             }
         }
@@ -379,18 +417,21 @@ macro_rules! arm_128_v7_neon_impl {
             #[inline]
             #[target_feature(enable = "v7,neon")]
             unsafe fn any(self) -> bool {
-                use ::coresimd::arch::arm::$vpmax;
-                use ::mem::transmute;
+                use coresimd::arch::arm::$vpmax;
+                use mem::transmute;
                 union U {
                     halves: ($half, $half),
-                    vec: $id
+                    vec: $id,
                 }
                 let halves = U { vec: self }.halves;
-                let h: $half = transmute($vpmax(transmute(halves.0), transmute(halves.1)));
+                let h: $half = transmute($vpmax(
+                    transmute(halves.0),
+                    transmute(halves.1),
+                ));
                 h.any()
             }
         }
-    }
+    };
 }
 
 // Implementation for AArch64 + NEON using vmin and vmax (horizontal vector
@@ -402,7 +443,7 @@ macro_rules! aarch64_128_neon_impl {
             #[inline]
             #[target_feature(enable = "neon")]
             unsafe fn all(self) -> bool {
-                use ::coresimd::arch::aarch64::$vmin;
+                use coresimd::arch::aarch64::$vmin;
                 $vmin(::mem::transmute(self)) != 0
             }
         }
@@ -410,11 +451,11 @@ macro_rules! aarch64_128_neon_impl {
             #[inline]
             #[target_feature(enable = "neon")]
             unsafe fn any(self) -> bool {
-                use ::coresimd::arch::aarch64::$vmax;
+                use coresimd::arch::aarch64::$vmax;
                 $vmax(::mem::transmute(self)) != 0
             }
         }
-    }
+    };
 }
 
 // Implementation for AArch64 + NEON using vmin and vmax (horizontal vector
@@ -431,9 +472,12 @@ macro_rules! aarch64_64_neon_impl {
             unsafe fn all(self) -> bool {
                 union U {
                     halves: ($id, $id),
-                    vec: $vec128
+                    vec: $vec128,
                 }
-                U { halves: (self, self) }.vec.all()
+                U {
+                    halves: (self, self),
+                }.vec
+                    .all()
             }
         }
         impl Any for $id {
@@ -442,12 +486,15 @@ macro_rules! aarch64_64_neon_impl {
             unsafe fn any(self) -> bool {
                 union U {
                     halves: ($id, $id),
-                    vec: $vec128
+                    vec: $vec128,
                 }
-                U { halves: (self, self) }.vec.any()
+                U {
+                    halves: (self, self),
+                }.vec
+                    .any()
             }
         }
-    }
+    };
 }
 
 macro_rules! impl_mask_all_any {
diff --git a/coresimd/ppsv/codegen/mod.rs b/coresimd/ppsv/codegen/mod.rs
index a1e8c24f6b..004f7b66f5 100644
--- a/coresimd/ppsv/codegen/mod.rs
+++ b/coresimd/ppsv/codegen/mod.rs
@@ -5,8 +5,10 @@ pub mod wrapping;
 
 pub mod masks_reductions;
 
-pub mod sqrt;
 pub mod abs;
+pub mod cos;
 pub mod fma;
 pub mod sin;
-pub mod cos;
+pub mod sqrt;
+
+pub mod swap_bytes;
diff --git a/coresimd/ppsv/codegen/sin.rs b/coresimd/ppsv/codegen/sin.rs
index cf7f3dea20..c13ae31d34 100644
--- a/coresimd/ppsv/codegen/sin.rs
+++ b/coresimd/ppsv/codegen/sin.rs
@@ -1,9 +1,14 @@
 //! Exact vector sin
-
+#![allow(dead_code)]
 use coresimd::simd::*;
 
 #[allow(improper_ctypes)]
 extern "C" {
+    #[link_name = "llvm.sin.f32"]
+    fn sin_f32(x: f32) -> f32;
+    #[link_name = "llvm.sin.f64"]
+    fn sin_f64(x: f64) -> f64;
+
     #[link_name = "llvm.sin.v2f32"]
     fn sin_v2f32(x: f32x2) -> f32x2;
     #[link_name = "llvm.sin.v4f32"]
@@ -24,14 +29,44 @@ pub(crate) trait FloatSin {
     fn sin(self) -> Self;
 }
 
+trait RawSin {
+    fn raw_sin(self) -> Self;
+}
+
+impl RawSin for f32 {
+    fn raw_sin(self) -> Self {
+        unsafe { sin_f32(self) }
+    }
+}
+
+impl RawSin for f64 {
+    fn raw_sin(self) -> Self {
+        unsafe { sin_f64(self) }
+    }
+}
+
 macro_rules! impl_fsin {
-    ($id:ident: $fn:ident) => {
+    ($id:ident : $fn:ident) => {
+        #[cfg(not(target_arch = "s390x"))]
         impl FloatSin for $id {
             fn sin(self) -> Self {
                 unsafe { $fn(self) }
             }
         }
-    }
+
+        // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/501
+        #[cfg(target_arch = "s390x")]
+        impl FloatSin for $id {
+            fn sin(self) -> Self {
+                let mut v = $id::splat(0.);
+                for i in 0..$id::lanes() {
+                    v = v.replace(i, self.extract(i).raw_sin())
+                }
+                v
+            }
+        }
+
+    };
 }
 
 impl_fsin!(f32x2: sin_v2f32);
diff --git a/coresimd/ppsv/codegen/sqrt.rs b/coresimd/ppsv/codegen/sqrt.rs
index 8e86650555..6a18589e71 100644
--- a/coresimd/ppsv/codegen/sqrt.rs
+++ b/coresimd/ppsv/codegen/sqrt.rs
@@ -1,9 +1,14 @@
 //! Exact vector square-root
-
+#![allow(dead_code)]
 use coresimd::simd::*;
 
 #[allow(improper_ctypes)]
 extern "C" {
+    #[link_name = "llvm.sqrt.f32"]
+    fn sqrt_f32(x: f32) -> f32;
+    #[link_name = "llvm.sqrt.f64"]
+    fn sqrt_f64(x: f64) -> f64;
+
     #[link_name = "llvm.sqrt.v2f32"]
     fn sqrt_v2f32(x: f32x2) -> f32x2;
     #[link_name = "llvm.sqrt.v4f32"]
@@ -24,14 +29,43 @@ pub(crate) trait FloatSqrt {
     fn sqrt(self) -> Self;
 }
 
+trait RawSqrt {
+    fn raw_sqrt(self) -> Self;
+}
+
+impl RawSqrt for f32 {
+    fn raw_sqrt(self) -> Self {
+        unsafe { sqrt_f32(self) }
+    }
+}
+
+impl RawSqrt for f64 {
+    fn raw_sqrt(self) -> Self {
+        unsafe { sqrt_f64(self) }
+    }
+}
+
 macro_rules! impl_fsqrt {
-    ($id:ident: $fn:ident) => {
+    ($id:ident : $fn:ident) => {
+        #[cfg(not(target_arch = "s390x"))]
         impl FloatSqrt for $id {
             fn sqrt(self) -> Self {
                 unsafe { $fn(self) }
             }
         }
-    }
+        // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/501
+        #[cfg(target_arch = "s390x")]
+        impl FloatSqrt for $id {
+            fn sqrt(self) -> Self {
+                let mut v = $id::splat(0.);
+                for i in 0..$id::lanes() {
+                    v = v.replace(i, self.extract(i).raw_sqrt());
+                }
+                v
+            }
+        }
+
+    };
 }
 
 impl_fsqrt!(f32x2: sqrt_v2f32);
diff --git a/coresimd/ppsv/codegen/swap_bytes.rs b/coresimd/ppsv/codegen/swap_bytes.rs
new file mode 100644
index 0000000000..a9df5c1fa2
--- /dev/null
+++ b/coresimd/ppsv/codegen/swap_bytes.rs
@@ -0,0 +1,141 @@
+//! Horizontal byte swap: reverses the order of all bytes in a vector.
+
+use coresimd::simd::*;
+
+pub(crate) trait SwapBytes {
+    fn swap_bytes(self) -> Self;
+}
+
+macro_rules! impl_swap_bytes {
+    ($vec8:ident, $shuf:ident, $indices:expr, $id:ident) => (
+        impl SwapBytes for $id {
+            fn swap_bytes(self) -> Self {
+                let vec8 = $vec8::from_bits(self);
+                let shuffled: $vec8 = unsafe { $shuf(vec8, vec8, $indices) };
+                $id::from_bits(shuffled)
+            }
+        }
+    );
+
+    // bulk impl for a vector width
+    ($vec8:ident, $shuf:ident, $indices:expr, $($id:ident,)+) => ($(
+        impl_swap_bytes! { $vec8, $shuf, $indices, $id }
+    )+);
+}
+
+impl_swap_bytes! {
+    u8x2,
+    simd_shuffle2,
+    [1, 0],
+    u8x2, i8x2,
+}
+
+impl_swap_bytes! {
+    u8x4,
+    simd_shuffle4,
+    [3, 2, 1, 0],
+    u8x4, i8x4,
+    u16x2, i16x2,
+}
+
+impl_swap_bytes! {
+    u8x8,
+    simd_shuffle8,
+    [7, 6, 5, 4, 3, 2, 1, 0],
+    u8x8, i8x8,
+    u16x4, i16x4,
+    u32x2, i32x2,
+}
+
+impl_swap_bytes! {
+    u8x16,
+    simd_shuffle16,
+    [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
+    u8x16, i8x16,
+    u16x8, i16x8,
+    u32x4, i32x4,
+    u64x2, i64x2,
+}
+
+impl_swap_bytes! {
+    u8x32,
+    simd_shuffle32,
+    [
+        31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
+        15, 14, 13, 12, 11, 10, 9,  8,  7,  6,  5,  4,  3,  2,  1,  0,
+    ],
+    u8x32, i8x32,
+    u16x16, i16x16,
+    u32x8, i32x8,
+    u64x4, i64x4,
+}
+
+impl_swap_bytes! {
+    u8x64,
+    simd_shuffle64,
+    [
+        63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48,
+        47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32,
+        31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
+        15, 14, 13, 12, 11, 10, 9,  8,  7,  6,  5,  4,  3,  2,  1,  0,
+    ],
+    u8x64, i8x64,
+    u16x32, i16x32,
+    u32x16, i32x16,
+    u64x8, i64x8,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::mem;
+
+    // Tests cover widths up to 128 bits; larger vectors are harder to test.
+    #[test]
+    #[cfg(feature = "simd_support")]
+    fn swap_bytes_128() {
+        let x: u128 = 0x2d99787926d46932a4c1f32680f70c55;
+        let expected = x.swap_bytes();
+
+        let vec: u8x16 = unsafe { mem::transmute(x) };
+        let actual = unsafe { mem::transmute(vec.swap_bytes()) };
+
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    #[cfg(feature = "simd_support")]
+    fn swap_bytes_64() {
+        let x: u64 = 0x2d99787926d46932;
+        let expected = x.swap_bytes();
+
+        let vec: u8x8 = unsafe { mem::transmute(x) };
+        let actual = unsafe { mem::transmute(vec.swap_bytes()) };
+
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    #[cfg(feature = "simd_support")]
+    fn swap_bytes_32() {
+        let x: u32 = 0x2d997872;
+        let expected = x.swap_bytes();
+
+        let vec: u8x4 = unsafe { mem::transmute(x) };
+        let actual = unsafe { mem::transmute(vec.swap_bytes()) };
+
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    #[cfg(feature = "simd_support")]
+    fn swap_bytes_16() {
+        let x: u16 = 0x2d99;
+        let expected = x.swap_bytes();
+
+        let vec: u8x2 = unsafe { mem::transmute(x) };
+        let actual = unsafe { mem::transmute(vec.swap_bytes()) };
+
+        assert_eq!(expected, actual);
+    }
+}
diff --git a/coresimd/ppsv/mod.rs b/coresimd/ppsv/mod.rs
index 4d5c92dad0..eb2ba49541 100644
--- a/coresimd/ppsv/mod.rs
+++ b/coresimd/ppsv/mod.rs
@@ -66,8 +66,12 @@ where
     U: FromBits<T>,
 {
     // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/449
-    #[cfg_attr(any(target_arch = "powerpc", target_arch = "powerpc64"), inline(always))]
-    #[cfg_attr(not(any(target_arch = "powerpc", target_arch = "powerpc64")), inline)]
+    #[cfg_attr(
+        any(target_arch = "powerpc", target_arch = "powerpc64"), inline(always)
+    )]
+    #[cfg_attr(
+        not(any(target_arch = "powerpc", target_arch = "powerpc64")), inline
+    )]
     fn into_bits(self) -> U {
         debug_assert!(::mem::size_of::<Self>() == ::mem::size_of::<U>());
         U::from_bits(self)
@@ -77,8 +81,12 @@ where
 // FromBits (and thus IntoBits) is reflexive.
 impl<T> FromBits<T> for T {
     // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/449
-    #[cfg_attr(any(target_arch = "powerpc", target_arch = "powerpc64"), inline(always))]
-    #[cfg_attr(not(any(target_arch = "powerpc", target_arch = "powerpc64")), inline)]
+    #[cfg_attr(
+        any(target_arch = "powerpc", target_arch = "powerpc64"), inline(always)
+    )]
+    #[cfg_attr(
+        not(any(target_arch = "powerpc", target_arch = "powerpc64")), inline
+    )]
     fn from_bits(t: Self) -> Self {
         t
     }
diff --git a/coresimd/ppsv/v128.rs b/coresimd/ppsv/v128.rs
index 85a18f7863..08593bcef7 100644
--- a/coresimd/ppsv/v128.rs
+++ b/coresimd/ppsv/v128.rs
@@ -110,41 +110,53 @@ macro_rules! from_bits_x86 {
     };
 }
 
-#[cfg(all(target_arch = "arm", target_feature = "neon",
-          target_feature = "v7"))]
-use coresimd::arch::arm::{// FIXME: float16x8_t,
-                          float32x4_t,
-                          int16x8_t,
-                          int32x4_t,
-                          int64x2_t,
-                          int8x16_t,
-                          poly16x8_t,
-                          poly8x16_t,
-                          uint16x8_t,
-                          uint32x4_t,
-                          uint64x2_t,
-                          uint8x16_t};
+#[cfg(
+    all(target_arch = "arm", target_feature = "neon", target_feature = "v7")
+)]
+use coresimd::arch::arm::{
+    // FIXME: float16x8_t,
+    float32x4_t,
+    int16x8_t,
+    int32x4_t,
+    int64x2_t,
+    int8x16_t,
+    poly16x8_t,
+    poly8x16_t,
+    uint16x8_t,
+    uint32x4_t,
+    uint64x2_t,
+    uint8x16_t,
+};
 
 #[cfg(target_arch = "aarch64")]
-use coresimd::arch::aarch64::{// FIXME: float16x8_t,
-                              float32x4_t,
-                              float64x2_t,
-                              int16x8_t,
-                              int32x4_t,
-                              int64x2_t,
-                              int8x16_t,
-                              poly16x8_t,
-                              poly8x16_t,
-                              uint16x8_t,
-                              uint32x4_t,
-                              uint64x2_t,
-                              uint8x16_t};
+use coresimd::arch::aarch64::{
+    // FIXME: float16x8_t,
+    float32x4_t,
+    float64x2_t,
+    int16x8_t,
+    int32x4_t,
+    int64x2_t,
+    int8x16_t,
+    poly16x8_t,
+    poly8x16_t,
+    uint16x8_t,
+    uint32x4_t,
+    uint64x2_t,
+    uint8x16_t,
+};
 
 macro_rules! from_bits_arm {
     ($id:ident, $elem_ty:ident, $test_mod_arm:ident, $test_mod_a64:ident) => {
-        #[cfg(any(all(target_arch = "arm", target_feature = "neon",
-                      target_feature = "v7"),
-                  target_arch = "aarch64"))]
+        #[cfg(
+            any(
+                all(
+                    target_arch = "arm",
+                    target_feature = "neon",
+                    target_feature = "v7"
+                ),
+                target_arch = "aarch64"
+            )
+        )]
         impl_from_bits_!(
             $id: int8x16_t,
             uint8x16_t,
@@ -182,12 +194,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(u64x2, u64, u64x2_from_bits_x86);
-from_bits_arm!(
-    u64x2,
-    u64,
-    u64x2_from_bits_arm,
-    u64x2_from_bits_aarch64
-);
+from_bits_arm!(u64x2, u64, u64x2_from_bits_arm, u64x2_from_bits_aarch64);
 
 impl_from_bits!(
     i64x2: i64,
@@ -207,12 +214,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(i64x2, i64, i64x2_from_bits_x86);
-from_bits_arm!(
-    i64x2,
-    i64,
-    i64x2_from_bits_arm,
-    i64x2_from_bits_aarch64
-);
+from_bits_arm!(i64x2, i64, i64x2_from_bits_arm, i64x2_from_bits_aarch64);
 
 impl_from_bits!(
     f64x2: f64,
@@ -232,12 +234,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(f64x2, f64, f64x2_from_bits_x86);
-from_bits_arm!(
-    f64x2,
-    f64,
-    f64x2_from_bits_arm,
-    f64x2_from_bits_aarch64
-);
+from_bits_arm!(f64x2, f64, f64x2_from_bits_arm, f64x2_from_bits_aarch64);
 
 impl_from_bits!(
     u32x4: u32,
@@ -257,12 +254,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(u32x4, u32, u32x4_from_bits_x86);
-from_bits_arm!(
-    u32x4,
-    u32,
-    u32x4_from_bits_arm,
-    u32x4_from_bits_aarch64
-);
+from_bits_arm!(u32x4, u32, u32x4_from_bits_arm, u32x4_from_bits_aarch64);
 
 impl_from_bits!(
     i32x4: i32,
@@ -282,12 +274,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(i32x4, i32, i32x4_from_bits_x86);
-from_bits_arm!(
-    i32x4,
-    i32,
-    i32x4_from_bits_arm,
-    i32x4_from_bits_aarch64
-);
+from_bits_arm!(i32x4, i32, i32x4_from_bits_arm, i32x4_from_bits_aarch64);
 
 impl_from_bits!(
     f32x4: f32,
@@ -307,12 +294,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(f32x4, f32, f32x4_from_bits_x86);
-from_bits_arm!(
-    f32x4,
-    f32,
-    f32x4_from_bits_arm,
-    f32x4_from_bits_aarch64
-);
+from_bits_arm!(f32x4, f32, f32x4_from_bits_arm, f32x4_from_bits_aarch64);
 
 impl_from_bits!(
     u16x8: u16,
@@ -332,12 +314,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(u16x8, u16, u16x8_from_bits_x86);
-from_bits_arm!(
-    u16x8,
-    u16,
-    u16x8_from_bits_arm,
-    u16x8_from_bits_aarch64
-);
+from_bits_arm!(u16x8, u16, u16x8_from_bits_arm, u16x8_from_bits_aarch64);
 
 impl_from_bits!(
     i16x8: i16,
@@ -357,12 +334,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(i16x8, i16, i16x8_from_bits_x86);
-from_bits_arm!(
-    i16x8,
-    i16,
-    i16x8_from_bits_arm,
-    i16x8_from_bits_aarch64
-);
+from_bits_arm!(i16x8, i16, i16x8_from_bits_arm, i16x8_from_bits_aarch64);
 
 impl_from_bits!(
     u8x16: u8,
@@ -382,12 +354,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(u8x16, u8, u8x16_from_bits_x86);
-from_bits_arm!(
-    u8x16,
-    u8,
-    u8x16_from_bits_arm,
-    u8x16_from_bits_aarch64
-);
+from_bits_arm!(u8x16, u8, u8x16_from_bits_arm, u8x16_from_bits_aarch64);
 
 impl_from_bits!(
     i8x16: i8,
@@ -407,12 +374,7 @@ impl_from_bits!(
     m8x16
 );
 from_bits_x86!(i8x16, i8, i8x16_from_bits_x86);
-from_bits_arm!(
-    i8x16,
-    i8,
-    i8x16_from_bits_arm,
-    i8x16_from_bits_aarch64
-);
+from_bits_arm!(i8x16, i8, i8x16_from_bits_arm, i8x16_from_bits_aarch64);
 
 impl_from!(
     f64x2: f64,
@@ -552,31 +514,37 @@ impl_from!(
     m8x8
 );
 
-impl_from!(u8x16: u8, u8x16_from, test_v128 | i32x16, u32x16, f32x16, m1x16, i16x16, u16x16, m16x16, i8x16, m8x16);
-impl_from!(i8x16: i8, i8x16_from, test_v128 | i32x16, u32x16, f32x16, m1x16, i16x16, u16x16, m16x16, u8x16, m8x16);
-
-impl_from!(m8x16: i8, m8x16_from, test_v128 | m1x16, m16x16);
-
 impl_from!(
-    m16x8: i16,
-    m16x8_from,
-    test_v128 | m1x8,
-    m32x8,
-    m8x8
+    u8x16: u8,
+    u8x16_from,
+    test_v128 | i32x16,
+    u32x16,
+    f32x16,
+    m1x16,
+    i16x16,
+    u16x16,
+    m16x16,
+    i8x16,
+    m8x16
 );
-
 impl_from!(
-    m32x4: i32,
-    m32x4_from,
-    test_v128 | m64x4,
-    m16x4,
-    m8x4
+    i8x16: i8,
+    i8x16_from,
+    test_v128 | i32x16,
+    u32x16,
+    f32x16,
+    m1x16,
+    i16x16,
+    u16x16,
+    m16x16,
+    u8x16,
+    m8x16
 );
 
-impl_from!(
-    m64x2: i64,
-    m64x2_from,
-    test_v128 | m32x2,
-    m16x2,
-    m8x2
-);
+impl_from!(m8x16: i8, m8x16_from, test_v128 | m1x16, m16x16);
+
+impl_from!(m16x8: i16, m16x8_from, test_v128 | m1x8, m32x8, m8x8);
+
+impl_from!(m32x4: i32, m32x4_from, test_v128 | m64x4, m16x4, m8x4);
+
+impl_from!(m64x2: i64, m64x2_from, test_v128 | m32x2, m16x2, m8x2);
diff --git a/coresimd/ppsv/v16.rs b/coresimd/ppsv/v16.rs
index 8bc08452c4..a2baf8dfc5 100644
--- a/coresimd/ppsv/v16.rs
+++ b/coresimd/ppsv/v16.rs
@@ -57,10 +57,4 @@ impl_from!(
     m8x2
 );
 
-impl_from!(
-    m8x2: i8,
-    m8x2_from,
-    test_v16 | m64x2,
-    m32x2,
-    m16x2
-);
+impl_from!(m8x2: i8, m8x2_from, test_v16 | m64x2, m32x2, m16x2);
diff --git a/coresimd/ppsv/v256.rs b/coresimd/ppsv/v256.rs
index 849897d4ea..c68ec9118e 100644
--- a/coresimd/ppsv/v256.rs
+++ b/coresimd/ppsv/v256.rs
@@ -465,25 +465,8 @@ impl_from!(
 
 impl_from!(m8x32: i8, m8x32_from, test_v256 | m1x32);
 
-impl_from!(
-    m16x16: i16,
-    m16x16_from,
-    test_v256 | m1x16,
-    m8x16
-);
+impl_from!(m16x16: i16, m16x16_from, test_v256 | m1x16, m8x16);
 
-impl_from!(
-    m32x8: i32,
-    m32x8_from,
-    test_v256 | m1x8,
-    m16x8,
-    m8x8
-);
+impl_from!(m32x8: i32, m32x8_from, test_v256 | m1x8, m16x8, m8x8);
 
-impl_from!(
-    m64x4: i64,
-    m64x4_from,
-    test_v256 | m32x4,
-    m16x4,
-    m8x4
-);
+impl_from!(m64x4: i64, m64x4_from, test_v256 | m32x4, m16x4, m8x4);
diff --git a/coresimd/ppsv/v32.rs b/coresimd/ppsv/v32.rs
index 854837e9ba..ab56b5ad80 100644
--- a/coresimd/ppsv/v32.rs
+++ b/coresimd/ppsv/v32.rs
@@ -151,18 +151,6 @@ impl_from!(
     m8x4
 );
 
-impl_from!(
-    m8x4: i8,
-    m8x4_from,
-    test_v32 | m64x4,
-    m32x4,
-    m16x4
-);
+impl_from!(m8x4: i8, m8x4_from, test_v32 | m64x4, m32x4, m16x4);
 
-impl_from!(
-    m16x2: i16,
-    m16x2_from,
-    test_v32 | m64x2,
-    m32x2,
-    m8x2
-);
+impl_from!(m16x2: i16, m16x2_from, test_v32 | m64x2, m32x2, m8x2);
diff --git a/coresimd/ppsv/v512.rs b/coresimd/ppsv/v512.rs
index 7fd42175a9..6bea72c73b 100644
--- a/coresimd/ppsv/v512.rs
+++ b/coresimd/ppsv/v512.rs
@@ -446,17 +446,6 @@ impl_from!(u8x64: u8, u8x64_from, test_v512 | i8x64, m1x64);
 
 impl_from!(m1x32: i16, m1x32_from, test_v512 | m8x32);
 
-impl_from!(
-    m1x16: i32,
-    m1x16_from,
-    test_v512 | m16x16,
-    m8x16
-);
+impl_from!(m1x16: i32, m1x16_from, test_v512 | m16x16, m8x16);
 
-impl_from!(
-    m1x8: i64,
-    m1x8_from,
-    test_v512 | m32x8,
-    m16x8,
-    m8x8
-);
+impl_from!(m1x8: i64, m1x8_from, test_v512 | m32x8, m16x8, m8x8);
diff --git a/coresimd/ppsv/v64.rs b/coresimd/ppsv/v64.rs
index cfa56a234a..64a86b601d 100644
--- a/coresimd/ppsv/v64.rs
+++ b/coresimd/ppsv/v64.rs
@@ -83,41 +83,53 @@ macro_rules! from_bits_x86 {
     };
 }
 
-#[cfg(all(target_arch = "arm", target_feature = "neon",
-          target_feature = "v7"))]
-use coresimd::arch::arm::{// FIXME: float16x4_t,
-                          float32x2_t,
-                          int16x4_t,
-                          int32x2_t,
-                          int64x1_t,
-                          int8x8_t,
-                          poly16x4_t,
-                          poly8x8_t,
-                          uint16x4_t,
-                          uint32x2_t,
-                          uint64x1_t,
-                          uint8x8_t};
+#[cfg(
+    all(target_arch = "arm", target_feature = "neon", target_feature = "v7")
+)]
+use coresimd::arch::arm::{
+    // FIXME: float16x4_t,
+    float32x2_t,
+    int16x4_t,
+    int32x2_t,
+    int64x1_t,
+    int8x8_t,
+    poly16x4_t,
+    poly8x8_t,
+    uint16x4_t,
+    uint32x2_t,
+    uint64x1_t,
+    uint8x8_t,
+};
 
 #[cfg(target_arch = "aarch64")]
-use coresimd::arch::aarch64::{// FIXME: float16x4_t,
-                              float32x2_t,
-                              float64x1_t,
-                              int16x4_t,
-                              int32x2_t,
-                              int64x1_t,
-                              int8x8_t,
-                              poly16x4_t,
-                              poly8x8_t,
-                              uint16x4_t,
-                              uint32x2_t,
-                              uint64x1_t,
-                              uint8x8_t};
+use coresimd::arch::aarch64::{
+    // FIXME: float16x4_t,
+    float32x2_t,
+    float64x1_t,
+    int16x4_t,
+    int32x2_t,
+    int64x1_t,
+    int8x8_t,
+    poly16x4_t,
+    poly8x8_t,
+    uint16x4_t,
+    uint32x2_t,
+    uint64x1_t,
+    uint8x8_t,
+};
 
 macro_rules! from_bits_arm {
     ($id:ident, $elem_ty:ident, $test_mod_arm:ident, $test_mod_a64:ident) => {
-        #[cfg(any(all(target_arch = "arm", target_feature = "neon",
-                      target_feature = "v7"),
-                  target_arch = "aarch64"))]
+        #[cfg(
+            any(
+                all(
+                    target_arch = "arm",
+                    target_feature = "neon",
+                    target_feature = "v7"
+                ),
+                target_arch = "aarch64"
+            )
+        )]
         impl_from_bits_!(
             $id: int64x1_t,
             uint64x1_t,
@@ -151,12 +163,7 @@ impl_from_bits!(
     m8x8
 );
 from_bits_x86!(u32x2, u32, u32x2_from_bits_x86);
-from_bits_arm!(
-    u32x2,
-    u32,
-    u32x2_from_bits_arm,
-    u32x2_from_bits_aarch64
-);
+from_bits_arm!(u32x2, u32, u32x2_from_bits_arm, u32x2_from_bits_aarch64);
 
 impl_from_bits!(
     i32x2: i32,
@@ -172,12 +179,7 @@ impl_from_bits!(
     m8x8
 );
 from_bits_x86!(i32x2, i32, i32x2_from_bits_x86);
-from_bits_arm!(
-    i32x2,
-    i32,
-    i32x2_from_bits_arm,
-    i32x2_from_bits_aarch64
-);
+from_bits_arm!(i32x2, i32, i32x2_from_bits_arm, i32x2_from_bits_aarch64);
 
 impl_from_bits!(
     f32x2: f32,
@@ -193,12 +195,7 @@ impl_from_bits!(
     m8x8
 );
 from_bits_x86!(f32x2, f32, f32x2_from_bits_x86);
-from_bits_arm!(
-    f32x2,
-    f32,
-    f32x2_from_bits_arm,
-    f32x2_from_bits_aarch64
-);
+from_bits_arm!(f32x2, f32, f32x2_from_bits_arm, f32x2_from_bits_aarch64);
 
 impl_from_bits!(
     u16x4: u16,
@@ -213,12 +210,7 @@ impl_from_bits!(
     m8x8
 );
 from_bits_x86!(u16x4, u16, u16x4_from_bits_x86);
-from_bits_arm!(
-    u16x4,
-    u16,
-    u16x4_from_bits_arm,
-    u16x4_from_bits_aarch64
-);
+from_bits_arm!(u16x4, u16, u16x4_from_bits_arm, u16x4_from_bits_aarch64);
 
 impl_from_bits!(
     i16x4: i16,
@@ -233,12 +225,7 @@ impl_from_bits!(
     m8x8
 );
 from_bits_x86!(i16x4, i16, i16x4_from_bits_x86);
-from_bits_arm!(
-    i16x4,
-    i16,
-    i16x4_from_bits_arm,
-    i16x4_from_bits_aarch64
-);
+from_bits_arm!(i16x4, i16, i16x4_from_bits_arm, i16x4_from_bits_aarch64);
 
 impl_from_bits!(
     u8x8: u8,
@@ -253,12 +240,7 @@ impl_from_bits!(
     m8x8
 );
 from_bits_x86!(u8x8, u8, u8x8_from_bits_x86);
-from_bits_arm!(
-    u8x8,
-    u8,
-    u8x8_from_bits_arm,
-    u8x8_from_bits_aarch64
-);
+from_bits_arm!(u8x8, u8, u8x8_from_bits_arm, u8x8_from_bits_aarch64);
 
 impl_from_bits!(
     i8x8: i8,
@@ -273,12 +255,7 @@ impl_from_bits!(
     m8x8
 );
 from_bits_x86!(i8x8, i8, i8x8_from_bits_x86);
-from_bits_arm!(
-    i8x8,
-    i8,
-    i8x8_from_bits_arm,
-    i8x8_from_bits_aarch64
-);
+from_bits_arm!(i8x8, i8, i8x8_from_bits_arm, i8x8_from_bits_aarch64);
 
 impl_from!(
     f32x2: f32,
@@ -404,26 +381,8 @@ impl_from!(
     m8x8
 );
 
-impl_from!(
-    m8x8: i8,
-    m8x8_from,
-    test_v64 | m1x8,
-    m32x8,
-    m16x8
-);
+impl_from!(m8x8: i8, m8x8_from, test_v64 | m1x8, m32x8, m16x8);
 
-impl_from!(
-    m16x4: i16,
-    m16x4_from,
-    test_v64 | m64x4,
-    m32x4,
-    m8x4
-);
+impl_from!(m16x4: i16, m16x4_from, test_v64 | m64x4, m32x4, m8x4);
 
-impl_from!(
-    m32x2: i32,
-    m32x2_from,
-    test_v64 | m64x2,
-    m16x2,
-    m8x2
-);
+impl_from!(m32x2: i32, m32x2_from, test_v64 | m64x2, m16x2, m8x2);
diff --git a/coresimd/x86/avx.rs b/coresimd/x86/avx.rs
index 7fe4c0a51d..f41ebb8974 100644
--- a/coresimd/x86/avx.rs
+++ b/coresimd/x86/avx.rs
@@ -1387,7 +1387,7 @@ pub unsafe fn _mm_permute_pd(a: __m128d, imm8: i32) -> __m128d {
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_permute2f128_ps(
-    a: __m256, b: __m256, imm8: i32
+    a: __m256, b: __m256, imm8: i32,
 ) -> __m256 {
     macro_rules! call {
         ($imm8:expr) => {
@@ -1407,7 +1407,7 @@ pub unsafe fn _mm256_permute2f128_ps(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_permute2f128_pd(
-    a: __m256d, b: __m256d, imm8: i32
+    a: __m256d, b: __m256d, imm8: i32,
 ) -> __m256d {
     macro_rules! call {
         ($imm8:expr) => {
@@ -1427,7 +1427,7 @@ pub unsafe fn _mm256_permute2f128_pd(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_permute2f128_si256(
-    a: __m256i, b: __m256i, imm8: i32
+    a: __m256i, b: __m256i, imm8: i32,
 ) -> __m256i {
     let a = a.as_i32x8();
     let b = b.as_i32x8();
@@ -1529,7 +1529,7 @@ pub unsafe fn _mm256_insertf128_ps(a: __m256, b: __m128, imm8: i32) -> __m256 {
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_insertf128_pd(
-    a: __m256d, b: __m128d, imm8: i32
+    a: __m256d, b: __m128d, imm8: i32,
 ) -> __m256d {
     match imm8 & 1 {
         0 => simd_shuffle4(a, _mm256_castpd128_pd256(b), [4, 5, 2, 3]),
@@ -1547,7 +1547,7 @@ pub unsafe fn _mm256_insertf128_pd(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_insertf128_si256(
-    a: __m256i, b: __m128i, imm8: i32
+    a: __m256i, b: __m128i, imm8: i32,
 ) -> __m256i {
     let b = _mm256_castsi128_si256(b).as_i64x4();
     let dst: i64x4 = match imm8 & 1 {
@@ -1567,11 +1567,7 @@ pub unsafe fn _mm256_insertf128_si256(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8, index: i32) -> __m256i {
-    mem::transmute(simd_insert(
-        a.as_i8x32(),
-        (index as u32) & 31,
-        i,
-    ))
+    mem::transmute(simd_insert(a.as_i8x32(), (index as u32) & 31, i))
 }
 
 /// Copy `a` to result, and insert the 16-bit integer `i` into result
@@ -1584,11 +1580,7 @@ pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8, index: i32) -> __m256i {
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_insert_epi16(a: __m256i, i: i16, index: i32) -> __m256i {
-    mem::transmute(simd_insert(
-        a.as_i16x16(),
-        (index as u32) & 15,
-        i,
-    ))
+    mem::transmute(simd_insert(a.as_i16x16(), (index as u32) & 15, i))
 }
 
 /// Copy `a` to result, and insert the 32-bit integer `i` into result
@@ -1790,7 +1782,7 @@ pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
 #[cfg_attr(test, assert_instr(vmaskmovpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskload_pd(
-    mem_addr: *const f64, mask: __m256i
+    mem_addr: *const f64, mask: __m256i,
 ) -> __m256d {
     maskloadpd256(mem_addr as *const i8, mask.as_i64x4())
 }
@@ -1804,7 +1796,7 @@ pub unsafe fn _mm256_maskload_pd(
 #[cfg_attr(test, assert_instr(vmaskmovpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskstore_pd(
-    mem_addr: *mut f64, mask: __m256i, a: __m256d
+    mem_addr: *mut f64, mask: __m256i, a: __m256d,
 ) {
     maskstorepd256(mem_addr as *mut i8, mask.as_i64x4(), a);
 }
@@ -1844,7 +1836,7 @@ pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
 #[cfg_attr(test, assert_instr(vmaskmovps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskload_ps(
-    mem_addr: *const f32, mask: __m256i
+    mem_addr: *const f32, mask: __m256i,
 ) -> __m256 {
     maskloadps256(mem_addr as *const i8, mask.as_i32x8())
 }
@@ -1858,7 +1850,7 @@ pub unsafe fn _mm256_maskload_ps(
 #[cfg_attr(test, assert_instr(vmaskmovps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskstore_ps(
-    mem_addr: *mut f32, mask: __m256i, a: __m256
+    mem_addr: *mut f32, mask: __m256i, a: __m256,
 ) {
     maskstoreps256(mem_addr as *mut i8, mask.as_i32x8(), a);
 }
@@ -2383,7 +2375,7 @@ pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_set_ps(
-    a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32
+    a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32,
 ) -> __m256 {
     _mm256_setr_ps(h, g, f, e, d, c, b, a)
 }
@@ -2440,7 +2432,7 @@ pub unsafe fn _mm256_set_epi16(
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_set_epi32(
-    e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32
+    e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32,
 ) -> __m256i {
     _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
 }
@@ -2477,7 +2469,7 @@ pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_setr_ps(
-    a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32
+    a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32,
 ) -> __m256 {
     __m256(a, b, c, d, e, f, g, h)
 }
@@ -2536,7 +2528,7 @@ pub unsafe fn _mm256_setr_epi16(
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_setr_epi32(
-    e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32
+    e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32,
 ) -> __m256i {
     mem::transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
 }
@@ -2950,7 +2942,7 @@ pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_loadu2_m128(
-    hiaddr: *const f32, loaddr: *const f32
+    hiaddr: *const f32, loaddr: *const f32,
 ) -> __m256 {
     let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
     _mm256_insertf128_ps(a, _mm_loadu_ps(hiaddr), 1)
@@ -2967,7 +2959,7 @@ pub unsafe fn _mm256_loadu2_m128(
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_loadu2_m128d(
-    hiaddr: *const f64, loaddr: *const f64
+    hiaddr: *const f64, loaddr: *const f64,
 ) -> __m256d {
     let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
     _mm256_insertf128_pd(a, _mm_loadu_pd(hiaddr), 1)
@@ -2983,7 +2975,7 @@ pub unsafe fn _mm256_loadu2_m128d(
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_loadu2_m128i(
-    hiaddr: *const __m128i, loaddr: *const __m128i
+    hiaddr: *const __m128i, loaddr: *const __m128i,
 ) -> __m256i {
     let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
     _mm256_insertf128_si256(a, _mm_loadu_si128(hiaddr), 1)
@@ -3000,7 +2992,7 @@ pub unsafe fn _mm256_loadu2_m128i(
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_storeu2_m128(
-    hiaddr: *mut f32, loaddr: *mut f32, a: __m256
+    hiaddr: *mut f32, loaddr: *mut f32, a: __m256,
 ) {
     let lo = _mm256_castps256_ps128(a);
     _mm_storeu_ps(loaddr, lo);
@@ -3019,7 +3011,7 @@ pub unsafe fn _mm256_storeu2_m128(
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_storeu2_m128d(
-    hiaddr: *mut f64, loaddr: *mut f64, a: __m256d
+    hiaddr: *mut f64, loaddr: *mut f64, a: __m256d,
 ) {
     let lo = _mm256_castpd256_pd128(a);
     _mm_storeu_pd(loaddr, lo);
@@ -3037,7 +3029,7 @@ pub unsafe fn _mm256_storeu2_m128d(
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_storeu2_m128i(
-    hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i
+    hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i,
 ) {
     let lo = _mm256_castsi256_si128(a);
     _mm_storeu_si128(loaddr, lo);
@@ -3500,20 +3492,11 @@ mod tests {
         let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
         let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
         let r = _mm256_blend_ps(a, b, 0x0);
-        assert_eq_m256(
-            r,
-            _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.),
-        );
+        assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
         let r = _mm256_blend_ps(a, b, 0x3);
-        assert_eq_m256(
-            r,
-            _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.),
-        );
+        assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
         let r = _mm256_blend_ps(a, b, 0xF);
-        assert_eq_m256(
-            r,
-            _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.),
-        );
+        assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
     }
 
     #[simd_test(enable = "avx")]
@@ -3544,16 +3527,8 @@ mod tests {
         let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
         let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
         let r = _mm256_dp_ps(a, b, 0xFF);
-        let e = _mm256_setr_ps(
-            200.,
-            200.,
-            200.,
-            200.,
-            2387.,
-            2387.,
-            2387.,
-            2387.,
-        );
+        let e =
+            _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
         assert_eq_m256(r, e);
     }
 
@@ -4234,9 +4209,7 @@ mod tests {
             pub data: [f64; 4],
         }
         let a = _mm256_set1_pd(7.0);
-        let mut mem = Memory {
-            data: [-1.0; 4],
-        };
+        let mut mem = Memory { data: [-1.0; 4] };
 
         _mm256_stream_pd(&mut mem.data[0] as *mut f64, a);
         for i in 0..4 {
@@ -4251,9 +4224,7 @@ mod tests {
             pub data: [f32; 8],
         }
         let a = _mm256_set1_ps(7.0);
-        let mut mem = Memory {
-            data: [-1.0; 8],
-        };
+        let mut mem = Memory { data: [-1.0; 8] };
 
         _mm256_stream_ps(&mut mem.data[0] as *mut f32, a);
         for i in 0..8 {
@@ -4534,10 +4505,7 @@ mod tests {
     #[simd_test(enable = "avx")]
     unsafe fn test_mm256_set_ps() {
         let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
-        assert_eq_m256(
-            r,
-            _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.),
-        );
+        assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
     }
 
     #[simd_test(enable = "avx")]
@@ -4595,10 +4563,7 @@ mod tests {
     #[simd_test(enable = "avx")]
     unsafe fn test_mm256_setr_ps() {
         let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
-        assert_eq_m256(
-            r,
-            _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.),
-        );
+        assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.));
     }
 
     #[simd_test(enable = "avx")]
diff --git a/coresimd/x86/avx2.rs b/coresimd/x86/avx2.rs
index 982b293b88..c4ead715ae 100644
--- a/coresimd/x86/avx2.rs
+++ b/coresimd/x86/avx2.rs
@@ -413,7 +413,7 @@ pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_blend_epi32(
-    a: __m256i, b: __m256i, imm8: i32
+    a: __m256i, b: __m256i, imm8: i32,
 ) -> __m256i {
     let imm8 = (imm8 & 0xFF) as u8;
     let a = a.as_i32x8();
@@ -480,7 +480,7 @@ pub unsafe fn _mm256_blend_epi32(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_blend_epi16(
-    a: __m256i, b: __m256i, imm8: i32
+    a: __m256i, b: __m256i, imm8: i32,
 ) -> __m256i {
     let imm8 = (imm8 & 0xFF) as u8;
     let a = a.as_i16x16();
@@ -531,76 +531,20 @@ pub unsafe fn _mm256_blend_epi16(
         ) => {
             match (imm8 >> 6) & 0b11 {
                 0b00 => blend4!(
-                    $a,
-                    $b,
-                    $c,
-                    $d,
-                    $e,
-                    $f,
-                    6,
-                    7,
-                    $a2,
-                    $b2,
-                    $c2,
-                    $d2,
-                    $e2,
-                    $f2,
-                    14,
-                    15
+                    $a, $b, $c, $d, $e, $f, 6, 7, $a2, $b2, $c2, $d2, $e2,
+                    $f2, 14, 15
                 ),
                 0b01 => blend4!(
-                    $a,
-                    $b,
-                    $c,
-                    $d,
-                    $e,
-                    $f,
-                    22,
-                    7,
-                    $a2,
-                    $b2,
-                    $c2,
-                    $d2,
-                    $e2,
-                    $f2,
-                    30,
-                    15
+                    $a, $b, $c, $d, $e, $f, 22, 7, $a2, $b2, $c2, $d2,
+                    $e2, $f2, 30, 15
                 ),
                 0b10 => blend4!(
-                    $a,
-                    $b,
-                    $c,
-                    $d,
-                    $e,
-                    $f,
-                    6,
-                    23,
-                    $a2,
-                    $b2,
-                    $c2,
-                    $d2,
-                    $e2,
-                    $f2,
-                    14,
-                    31
+                    $a, $b, $c, $d, $e, $f, 6, 23, $a2, $b2, $c2, $d2,
+                    $e2, $f2, 14, 31
                 ),
                 _ => blend4!(
-                    $a,
-                    $b,
-                    $c,
-                    $d,
-                    $e,
-                    $f,
-                    22,
-                    23,
-                    $a2,
-                    $b2,
-                    $c2,
-                    $d2,
-                    $e2,
-                    $f2,
-                    30,
-                    31
+                    $a, $b, $c, $d, $e, $f, 22, 23, $a2, $b2, $c2, $d2,
+                    $e2, $f2, 30, 31
                 ),
             }
         };
@@ -618,60 +562,16 @@ pub unsafe fn _mm256_blend_epi16(
         ) => {
             match (imm8 >> 4) & 0b11 {
                 0b00 => blend3!(
-                    $a,
-                    $b,
-                    $c,
-                    $d,
-                    4,
-                    5,
-                    $a2,
-                    $b2,
-                    $c2,
-                    $d2,
-                    12,
-                    13
+                    $a, $b, $c, $d, 4, 5, $a2, $b2, $c2, $d2, 12, 13
                 ),
                 0b01 => blend3!(
-                    $a,
-                    $b,
-                    $c,
-                    $d,
-                    20,
-                    5,
-                    $a2,
-                    $b2,
-                    $c2,
-                    $d2,
-                    28,
-                    13
+                    $a, $b, $c, $d, 20, 5, $a2, $b2, $c2, $d2, 28, 13
                 ),
                 0b10 => blend3!(
-                    $a,
-                    $b,
-                    $c,
-                    $d,
-                    4,
-                    21,
-                    $a2,
-                    $b2,
-                    $c2,
-                    $d2,
-                    12,
-                    29
+                    $a, $b, $c, $d, 4, 21, $a2, $b2, $c2, $d2, 12, 29
                 ),
                 _ => blend3!(
-                    $a,
-                    $b,
-                    $c,
-                    $d,
-                    20,
-                    21,
-                    $a2,
-                    $b2,
-                    $c2,
-                    $d2,
-                    28,
-                    29
+                    $a, $b, $c, $d, 20, 21, $a2, $b2, $c2, $d2, 28, 29
                 ),
             }
         };
@@ -703,13 +603,9 @@ pub unsafe fn _mm256_blend_epi16(
 #[cfg_attr(test, assert_instr(vpblendvb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_blendv_epi8(
-    a: __m256i, b: __m256i, mask: __m256i
+    a: __m256i, b: __m256i, mask: __m256i,
 ) -> __m256i {
-    mem::transmute(pblendvb(
-        a.as_i8x32(),
-        b.as_i8x32(),
-        mask.as_i8x32(),
-    ))
+    mem::transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32()))
 }
 
 /// Broadcast the low packed 8-bit integer from `a` to all elements of
@@ -1226,7 +1122,7 @@ pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_i32gather_epi32(
-    slice: *const i32, offsets: __m128i, scale: i32
+    slice: *const i32, offsets: __m128i, scale: i32,
 ) -> __m128i {
     let zero = _mm_setzero_si128().as_i32x4();
     let neg_one = _mm_set1_epi32(-1).as_i32x4();
@@ -1280,7 +1176,7 @@ pub unsafe fn _mm_mask_i32gather_epi32(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_i32gather_epi32(
-    slice: *const i32, offsets: __m256i, scale: i32
+    slice: *const i32, offsets: __m256i, scale: i32,
 ) -> __m256i {
     let zero = _mm256_setzero_si256().as_i32x8();
     let neg_one = _mm256_set1_epi32(-1).as_i32x8();
@@ -1334,7 +1230,7 @@ pub unsafe fn _mm256_mask_i32gather_epi32(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_i32gather_ps(
-    slice: *const f32, offsets: __m128i, scale: i32
+    slice: *const f32, offsets: __m128i, scale: i32,
 ) -> __m128 {
     let zero = _mm_setzero_ps();
     let neg_one = _mm_set1_ps(-1.0);
@@ -1360,7 +1256,7 @@ pub unsafe fn _mm_i32gather_ps(
 #[rustc_args_required_const(4)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_mask_i32gather_ps(
-    src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32
+    src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32,
 ) -> __m128 {
     let offsets = offsets.as_i32x4();
     let slice = slice as *const i8;
@@ -1383,7 +1279,7 @@ pub unsafe fn _mm_mask_i32gather_ps(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_i32gather_ps(
-    slice: *const f32, offsets: __m256i, scale: i32
+    slice: *const f32, offsets: __m256i, scale: i32,
 ) -> __m256 {
     let zero = _mm256_setzero_ps();
     let neg_one = _mm256_set1_ps(-1.0);
@@ -1409,7 +1305,7 @@ pub unsafe fn _mm256_i32gather_ps(
 #[rustc_args_required_const(4)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_mask_i32gather_ps(
-    src: __m256, slice: *const f32, offsets: __m256i, mask: __m256, scale: i32
+    src: __m256, slice: *const f32, offsets: __m256i, mask: __m256, scale: i32,
 ) -> __m256 {
     let offsets = offsets.as_i32x8();
     let slice = slice as *const i8;
@@ -1432,7 +1328,7 @@ pub unsafe fn _mm256_mask_i32gather_ps(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_i32gather_epi64(
-    slice: *const i64, offsets: __m128i, scale: i32
+    slice: *const i64, offsets: __m128i, scale: i32,
 ) -> __m128i {
     let zero = _mm_setzero_si128().as_i64x2();
     let neg_one = _mm_set1_epi64x(-1).as_i64x2();
@@ -1486,7 +1382,7 @@ pub unsafe fn _mm_mask_i32gather_epi64(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_i32gather_epi64(
-    slice: *const i64, offsets: __m128i, scale: i32
+    slice: *const i64, offsets: __m128i, scale: i32,
 ) -> __m256i {
     let zero = _mm256_setzero_si256().as_i64x4();
     let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
@@ -1540,7 +1436,7 @@ pub unsafe fn _mm256_mask_i32gather_epi64(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_i32gather_pd(
-    slice: *const f64, offsets: __m128i, scale: i32
+    slice: *const f64, offsets: __m128i, scale: i32,
 ) -> __m128d {
     let zero = _mm_setzero_pd();
     let neg_one = _mm_set1_pd(-1.0);
@@ -1590,7 +1486,7 @@ pub unsafe fn _mm_mask_i32gather_pd(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_i32gather_pd(
-    slice: *const f64, offsets: __m128i, scale: i32
+    slice: *const f64, offsets: __m128i, scale: i32,
 ) -> __m256d {
     let zero = _mm256_setzero_pd();
     let neg_one = _mm256_set1_pd(-1.0);
@@ -1640,7 +1536,7 @@ pub unsafe fn _mm256_mask_i32gather_pd(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_i64gather_epi32(
-    slice: *const i32, offsets: __m128i, scale: i32
+    slice: *const i32, offsets: __m128i, scale: i32,
 ) -> __m128i {
     let zero = _mm_setzero_si128().as_i32x4();
     let neg_one = _mm_set1_epi64x(-1).as_i32x4();
@@ -1694,7 +1590,7 @@ pub unsafe fn _mm_mask_i64gather_epi32(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_i64gather_epi32(
-    slice: *const i32, offsets: __m256i, scale: i32
+    slice: *const i32, offsets: __m256i, scale: i32,
 ) -> __m128i {
     let zero = _mm_setzero_si128().as_i32x4();
     let neg_one = _mm_set1_epi64x(-1).as_i32x4();
@@ -1748,7 +1644,7 @@ pub unsafe fn _mm256_mask_i64gather_epi32(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_i64gather_ps(
-    slice: *const f32, offsets: __m128i, scale: i32
+    slice: *const f32, offsets: __m128i, scale: i32,
 ) -> __m128 {
     let zero = _mm_setzero_ps();
     let neg_one = _mm_set1_ps(-1.0);
@@ -1774,7 +1670,7 @@ pub unsafe fn _mm_i64gather_ps(
 #[rustc_args_required_const(4)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_mask_i64gather_ps(
-    src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32
+    src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32,
 ) -> __m128 {
     let offsets = offsets.as_i64x2();
     let slice = slice as *const i8;
@@ -1797,7 +1693,7 @@ pub unsafe fn _mm_mask_i64gather_ps(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_i64gather_ps(
-    slice: *const f32, offsets: __m256i, scale: i32
+    slice: *const f32, offsets: __m256i, scale: i32,
 ) -> __m128 {
     let zero = _mm_setzero_ps();
     let neg_one = _mm_set1_ps(-1.0);
@@ -1823,7 +1719,7 @@ pub unsafe fn _mm256_i64gather_ps(
 #[rustc_args_required_const(4)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_mask_i64gather_ps(
-    src: __m128, slice: *const f32, offsets: __m256i, mask: __m128, scale: i32
+    src: __m128, slice: *const f32, offsets: __m256i, mask: __m128, scale: i32,
 ) -> __m128 {
     let offsets = offsets.as_i64x4();
     let slice = slice as *const i8;
@@ -1846,7 +1742,7 @@ pub unsafe fn _mm256_mask_i64gather_ps(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_i64gather_epi64(
-    slice: *const i64, offsets: __m128i, scale: i32
+    slice: *const i64, offsets: __m128i, scale: i32,
 ) -> __m128i {
     let zero = _mm_setzero_si128().as_i64x2();
     let neg_one = _mm_set1_epi64x(-1).as_i64x2();
@@ -1900,7 +1796,7 @@ pub unsafe fn _mm_mask_i64gather_epi64(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_i64gather_epi64(
-    slice: *const i64, offsets: __m256i, scale: i32
+    slice: *const i64, offsets: __m256i, scale: i32,
 ) -> __m256i {
     let zero = _mm256_setzero_si256().as_i64x4();
     let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
@@ -1954,7 +1850,7 @@ pub unsafe fn _mm256_mask_i64gather_epi64(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_i64gather_pd(
-    slice: *const f64, offsets: __m128i, scale: i32
+    slice: *const f64, offsets: __m128i, scale: i32,
 ) -> __m128d {
     let zero = _mm_setzero_pd();
     let neg_one = _mm_set1_pd(-1.0);
@@ -2004,7 +1900,7 @@ pub unsafe fn _mm_mask_i64gather_pd(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_i64gather_pd(
-    slice: *const f64, offsets: __m256i, scale: i32
+    slice: *const f64, offsets: __m256i, scale: i32,
 ) -> __m256d {
     let zero = _mm256_setzero_pd();
     let neg_one = _mm256_set1_pd(-1.0);
@@ -2053,7 +1949,7 @@ pub unsafe fn _mm256_mask_i64gather_pd(
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_inserti128_si256(
-    a: __m256i, b: __m128i, imm8: i32
+    a: __m256i, b: __m128i, imm8: i32,
 ) -> __m256i {
     let a = a.as_i64x4();
     let b = _mm256_castsi128_si256(b).as_i64x4();
@@ -2101,12 +1997,9 @@ pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpmaskmovd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_maskload_epi32(
-    mem_addr: *const i32, mask: __m128i
+    mem_addr: *const i32, mask: __m128i,
 ) -> __m128i {
-    mem::transmute(maskloadd(
-        mem_addr as *const i8,
-        mask.as_i32x4(),
-    ))
+    mem::transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4()))
 }
 
 /// Load packed 32-bit integers from memory pointed by `mem_addr` using `mask`
@@ -2119,12 +2012,9 @@ pub unsafe fn _mm_maskload_epi32(
 #[cfg_attr(test, assert_instr(vpmaskmovd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskload_epi32(
-    mem_addr: *const i32, mask: __m256i
+    mem_addr: *const i32, mask: __m256i,
 ) -> __m256i {
-    mem::transmute(maskloadd256(
-        mem_addr as *const i8,
-        mask.as_i32x8(),
-    ))
+    mem::transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8()))
 }
 
 /// Load packed 64-bit integers from memory pointed by `mem_addr` using `mask`
@@ -2137,12 +2027,9 @@ pub unsafe fn _mm256_maskload_epi32(
 #[cfg_attr(test, assert_instr(vpmaskmovq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_maskload_epi64(
-    mem_addr: *const i64, mask: __m128i
+    mem_addr: *const i64, mask: __m128i,
 ) -> __m128i {
-    mem::transmute(maskloadq(
-        mem_addr as *const i8,
-        mask.as_i64x2(),
-    ))
+    mem::transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2()))
 }
 
 /// Load packed 64-bit integers from memory pointed by `mem_addr` using `mask`
@@ -2155,12 +2042,9 @@ pub unsafe fn _mm_maskload_epi64(
 #[cfg_attr(test, assert_instr(vpmaskmovq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskload_epi64(
-    mem_addr: *const i64, mask: __m256i
+    mem_addr: *const i64, mask: __m256i,
 ) -> __m256i {
-    mem::transmute(maskloadq256(
-        mem_addr as *const i8,
-        mask.as_i64x4(),
-    ))
+    mem::transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4()))
 }
 
 /// Store packed 32-bit integers from `a` into memory pointed by `mem_addr`
@@ -2173,13 +2057,9 @@ pub unsafe fn _mm256_maskload_epi64(
 #[cfg_attr(test, assert_instr(vpmaskmovd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_maskstore_epi32(
-    mem_addr: *mut i32, mask: __m128i, a: __m128i
+    mem_addr: *mut i32, mask: __m128i, a: __m128i,
 ) {
-    maskstored(
-        mem_addr as *mut i8,
-        mask.as_i32x4(),
-        a.as_i32x4(),
-    )
+    maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4())
 }
 
 /// Store packed 32-bit integers from `a` into memory pointed by `mem_addr`
@@ -2192,13 +2072,9 @@ pub unsafe fn _mm_maskstore_epi32(
 #[cfg_attr(test, assert_instr(vpmaskmovd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskstore_epi32(
-    mem_addr: *mut i32, mask: __m256i, a: __m256i
+    mem_addr: *mut i32, mask: __m256i, a: __m256i,
 ) {
-    maskstored256(
-        mem_addr as *mut i8,
-        mask.as_i32x8(),
-        a.as_i32x8(),
-    )
+    maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8())
 }
 
 /// Store packed 64-bit integers from `a` into memory pointed by `mem_addr`
@@ -2211,13 +2087,9 @@ pub unsafe fn _mm256_maskstore_epi32(
 #[cfg_attr(test, assert_instr(vpmaskmovq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_maskstore_epi64(
-    mem_addr: *mut i64, mask: __m128i, a: __m128i
+    mem_addr: *mut i64, mask: __m128i, a: __m128i,
 ) {
-    maskstoreq(
-        mem_addr as *mut i8,
-        mask.as_i64x2(),
-        a.as_i64x2(),
-    )
+    maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2())
 }
 
 /// Store packed 64-bit integers from `a` into memory pointed by `mem_addr`
@@ -2230,13 +2102,9 @@ pub unsafe fn _mm_maskstore_epi64(
 #[cfg_attr(test, assert_instr(vpmaskmovq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskstore_epi64(
-    mem_addr: *mut i64, mask: __m256i, a: __m256i
+    mem_addr: *mut i64, mask: __m256i, a: __m256i,
 ) {
-    maskstoreq256(
-        mem_addr as *mut i8,
-        mask.as_i64x4(),
-        a.as_i64x4(),
-    )
+    maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4())
 }
 
 /// Compare packed 16-bit integers in `a` and `b`, and return the packed
@@ -2410,7 +2278,7 @@ pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 {
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_mpsadbw_epu8(
-    a: __m256i, b: __m256i, imm8: i32
+    a: __m256i, b: __m256i, imm8: i32,
 ) -> __m256i {
     let a = a.as_u8x32();
     let b = b.as_u8x32();
@@ -2656,7 +2524,7 @@ pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i {
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_permute2x128_si256(
-    a: __m256i, b: __m256i, imm8: i32
+    a: __m256i, b: __m256i, imm8: i32,
 ) -> __m256i {
     let a = a.as_i64x4();
     let b = b.as_i64x4();
@@ -3559,16 +3427,23 @@ pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
 /// #     if is_x86_feature_detected!("avx2") {
 /// #         #[target_feature(enable = "avx2")]
 /// #         unsafe fn worker() {
-/// let a = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-/// 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
-/// let b = _mm256_setr_epi8(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,
-/// -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
+/// let a = _mm256_setr_epi8(
+///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+/// );
+/// let b = _mm256_setr_epi8(
+///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
+///     -30, -31,
+/// );
 ///
 /// let c = _mm256_unpackhi_epi8(a, b);
 ///
-/// let expected = _mm256_setr_epi8(8,-8, 9,-9, 10,-10, 11,-11, 12,-12, 13,-13,
-/// 14,-14, 15,-15, 24,-24, 25,-25, 26,-26, 27,-27, 28,-28, 29,-29, 30,-30,
-/// 31,-31);
+/// let expected = _mm256_setr_epi8(
+///     8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
+///     24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
+///     -31,
+/// );
 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
 ///
 /// #         }
@@ -3612,15 +3487,22 @@ pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
 /// #     if is_x86_feature_detected!("avx2") {
 /// #         #[target_feature(enable = "avx2")]
 /// #         unsafe fn worker() {
-/// let a = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-/// 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
-/// let b = _mm256_setr_epi8(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,
-/// -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
+/// let a = _mm256_setr_epi8(
+///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+/// );
+/// let b = _mm256_setr_epi8(
+///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
+///     -30, -31,
+/// );
 ///
 /// let c = _mm256_unpacklo_epi8(a, b);
 ///
-/// let expected = _mm256_setr_epi8(0, 0, 1,-1, 2,-2, 3,-3, 4,-4, 5,-5, 6,-6, 7,-7,
-/// 16,-16, 17,-17, 18,-18, 19,-19, 20,-20, 21,-21, 22,-22, 23,-23);
+/// let expected = _mm256_setr_epi8(
+///     0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
+///     -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
+/// );
 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
 ///
 /// #         }
@@ -3664,13 +3546,18 @@ pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
 /// #     if is_x86_feature_detected!("avx2") {
 /// #         #[target_feature(enable = "avx2")]
 /// #         unsafe fn worker() {
-/// let a = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-/// let b = _mm256_setr_epi16(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15);
+/// let a = _mm256_setr_epi16(
+///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+/// );
+/// let b = _mm256_setr_epi16(
+///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+/// );
 ///
 /// let c = _mm256_unpackhi_epi16(a, b);
 ///
-/// let expected = _mm256_setr_epi16(4,-4, 5,-5, 6,-6, 7,-7, 12,-12, 13,-13, 14,-14,
-/// 15,-15);
+/// let expected = _mm256_setr_epi16(
+///     4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
+/// );
 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
 ///
 /// #         }
@@ -3688,9 +3575,7 @@ pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
     let r: i16x16 = simd_shuffle16(
         a.as_i16x16(),
         b.as_i16x16(),
-        [
-            4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31
-        ],
+        [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
     );
     mem::transmute(r)
 }
@@ -3715,13 +3600,18 @@ pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
 /// #         #[target_feature(enable = "avx2")]
 /// #         unsafe fn worker() {
 ///
-/// let a = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-/// let b = _mm256_setr_epi16(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15);
+/// let a = _mm256_setr_epi16(
+///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+/// );
+/// let b = _mm256_setr_epi16(
+///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+/// );
 ///
 /// let c = _mm256_unpacklo_epi16(a, b);
 ///
-/// let expected = _mm256_setr_epi16(0, 0, 1,-1, 2,-2, 3,-3, 8,-8, 9,-9, 10,-10,
-/// 11,-11);
+/// let expected = _mm256_setr_epi16(
+///     0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
+/// );
 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
 ///
 /// #         }
@@ -3739,9 +3629,7 @@ pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
     let r: i16x16 = simd_shuffle16(
         a.as_i16x16(),
         b.as_i16x16(),
-        [
-            0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27
-        ],
+        [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
     );
     mem::transmute(r)
 }
@@ -3766,11 +3654,11 @@ pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
 /// #         #[target_feature(enable = "avx2")]
 /// #         unsafe fn worker() {
 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
-/// let b = _mm256_setr_epi32(0,-1,-2,-3,-4,-5,-6,-7);
+/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
 ///
 /// let c = _mm256_unpackhi_epi32(a, b);
 ///
-/// let expected = _mm256_setr_epi32(2,-2, 3,-3, 6,-6, 7,-7);
+/// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
 ///
 /// #         }
@@ -3813,11 +3701,11 @@ pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
 /// #         #[target_feature(enable = "avx2")]
 /// #         unsafe fn worker() {
 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
-/// let b = _mm256_setr_epi32(0,-1,-2,-3,-4,-5,-6,-7);
+/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
 ///
 /// let c = _mm256_unpacklo_epi32(a, b);
 ///
-/// let expected = _mm256_setr_epi32(0, 0, 1,-1, 4,-4, 5,-5);
+/// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
 ///
 /// #         }
@@ -3832,11 +3720,8 @@ pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vunpcklps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
-    let r: i32x8 = simd_shuffle8(
-        a.as_i32x8(),
-        b.as_i32x8(),
-        [0, 8, 1, 9, 4, 12, 5, 13],
-    );
+    let r: i32x8 =
+        simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
     mem::transmute(r)
 }
 
@@ -3860,11 +3745,11 @@ pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
 /// #         #[target_feature(enable = "avx2")]
 /// #         unsafe fn worker() {
 /// let a = _mm256_setr_epi64x(0, 1, 2, 3);
-/// let b = _mm256_setr_epi64x(0,-1,-2,-3);
+/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
 ///
 /// let c = _mm256_unpackhi_epi64(a, b);
 ///
-/// let expected = _mm256_setr_epi64x(1,-1, 3,-3);
+/// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
 ///
 /// #         }
@@ -3903,11 +3788,11 @@ pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
 /// #         #[target_feature(enable = "avx2")]
 /// #         unsafe fn worker() {
 /// let a = _mm256_setr_epi64x(0, 1, 2, 3);
-/// let b = _mm256_setr_epi64x(0,-1,-2,-3);
+/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
 ///
 /// let c = _mm256_unpacklo_epi64(a, b);
 ///
-/// let expected = _mm256_setr_epi64x(0, 0, 2,-2);
+/// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
 ///
 /// #         }
@@ -4183,35 +4068,35 @@ extern "C" {
     fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
     #[link_name = "llvm.x86.avx2.gather.d.d"]
     fn pgatherdd(
-        src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8
+        src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8,
     ) -> i32x4;
     #[link_name = "llvm.x86.avx2.gather.d.d.256"]
     fn vpgatherdd(
-        src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8
+        src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8,
     ) -> i32x8;
     #[link_name = "llvm.x86.avx2.gather.d.q"]
     fn pgatherdq(
-        src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8
+        src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8,
     ) -> i64x2;
     #[link_name = "llvm.x86.avx2.gather.d.q.256"]
     fn vpgatherdq(
-        src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8
+        src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8,
     ) -> i64x4;
     #[link_name = "llvm.x86.avx2.gather.q.d"]
     fn pgatherqd(
-        src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8
+        src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8,
     ) -> i32x4;
     #[link_name = "llvm.x86.avx2.gather.q.d.256"]
     fn vpgatherqd(
-        src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8
+        src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8,
     ) -> i32x4;
     #[link_name = "llvm.x86.avx2.gather.q.q"]
     fn pgatherqq(
-        src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8
+        src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8,
     ) -> i64x2;
     #[link_name = "llvm.x86.avx2.gather.q.q.256"]
     fn vpgatherqq(
-        src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8
+        src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8,
     ) -> i64x4;
     #[link_name = "llvm.x86.avx2.gather.d.pd"]
     fn pgatherdpd(
@@ -4235,19 +4120,19 @@ extern "C" {
     ) -> __m256d;
     #[link_name = "llvm.x86.avx2.gather.d.ps"]
     fn pgatherdps(
-        src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8
+        src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8,
     ) -> __m128;
     #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
     fn vpgatherdps(
-        src: __m256, slice: *const i8, offsets: i32x8, mask: __m256, scale: i8
+        src: __m256, slice: *const i8, offsets: i32x8, mask: __m256, scale: i8,
     ) -> __m256;
     #[link_name = "llvm.x86.avx2.gather.q.ps"]
     fn pgatherqps(
-        src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8
+        src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8,
     ) -> __m128;
     #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
     fn vpgatherqps(
-        src: __m128, slice: *const i8, offsets: i64x4, mask: __m128, scale: i8
+        src: __m128, slice: *const i8, offsets: i64x4, mask: __m128, scale: i8,
     ) -> __m128;
     #[link_name = "llvm.x86.avx2.psll.dq"]
     fn vpslldq(a: i64x4, b: i32) -> i64x4;
@@ -4718,10 +4603,7 @@ mod tests {
             7, 6, 5, 4, 3, 2, 1, 0,
         );
         let r = _mm256_cmpeq_epi8(a, b);
-        assert_eq_m256i(
-            r,
-            _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 2),
-        );
+        assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 2));
     }
 
     #[simd_test(enable = "avx2")]
@@ -4737,10 +4619,7 @@ mod tests {
             7, 6, 5, 4, 3, 2, 1, 0,
         );
         let r = _mm256_cmpeq_epi16(a, b);
-        assert_eq_m256i(
-            r,
-            _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 2),
-        );
+        assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 2));
     }
 
     #[simd_test(enable = "avx2")]
@@ -4758,10 +4637,7 @@ mod tests {
         let a = _mm256_setr_epi64x(0, 1, 2, 3);
         let b = _mm256_setr_epi64x(3, 2, 2, 0);
         let r = _mm256_cmpeq_epi64(a, b);
-        assert_eq_m256i(
-            r,
-            _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 2),
-        );
+        assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 2));
     }
 
     #[simd_test(enable = "avx2")]
@@ -4769,10 +4645,7 @@ mod tests {
         let a = _mm256_insert_epi8(_mm256_set1_epi8(0), 5, 0);
         let b = _mm256_set1_epi8(0);
         let r = _mm256_cmpgt_epi8(a, b);
-        assert_eq_m256i(
-            r,
-            _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 0),
-        );
+        assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 0));
     }
 
     #[simd_test(enable = "avx2")]
@@ -4780,10 +4653,7 @@ mod tests {
         let a = _mm256_insert_epi16(_mm256_set1_epi16(0), 5, 0);
         let b = _mm256_set1_epi16(0);
         let r = _mm256_cmpgt_epi16(a, b);
-        assert_eq_m256i(
-            r,
-            _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 0),
-        );
+        assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 0));
     }
 
     #[simd_test(enable = "avx2")]
@@ -4791,10 +4661,7 @@ mod tests {
         let a = _mm256_insert_epi32(_mm256_set1_epi32(0), 5, 0);
         let b = _mm256_set1_epi32(0);
         let r = _mm256_cmpgt_epi32(a, b);
-        assert_eq_m256i(
-            r,
-            _mm256_insert_epi32(_mm256_set1_epi32(0), !0, 0),
-        );
+        assert_eq_m256i(r, _mm256_insert_epi32(_mm256_set1_epi32(0), !0, 0));
     }
 
     #[simd_test(enable = "avx2")]
@@ -4802,10 +4669,7 @@ mod tests {
         let a = _mm256_insert_epi64(_mm256_set1_epi64x(0), 5, 0);
         let b = _mm256_set1_epi64x(0);
         let r = _mm256_cmpgt_epi64(a, b);
-        assert_eq_m256i(
-            r,
-            _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 0),
-        );
+        assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 0));
     }
 
     #[simd_test(enable = "avx2")]
@@ -5997,16 +5861,7 @@ mod tests {
         );
         assert_eq_m256(
             r,
-            _mm256_setr_ps(
-                0.0,
-                16.0,
-                64.0,
-                256.0,
-                256.0,
-                256.0,
-                256.0,
-                256.0,
-            ),
+            _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
         );
     }
 
diff --git a/coresimd/x86/bmi1.rs b/coresimd/x86/bmi1.rs
index 953e3d9135..a84763b7e8 100644
--- a/coresimd/x86/bmi1.rs
+++ b/coresimd/x86/bmi1.rs
@@ -21,17 +21,14 @@ use stdsimd_test::assert_instr;
 #[cfg_attr(test, assert_instr(bextr))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
-    _bextr2_u32(
-        a,
-        (start & 0xff_u32) | ((len & 0xff_u32) << 8_u32),
-    )
+    _bextr2_u32(a, (start & 0xff_u32) | ((len & 0xff_u32) << 8_u32))
 }
 
 /// Extracts bits of `a` specified by `control` into
 /// the least significant bits of the result.
 ///
-/// Bits `[7,0]` of `control` specify the index to the first bit in the range to
-/// be extracted, and bits `[15,8]` specify the length of the range.
+/// Bits `[7,0]` of `control` specify the index to the first bit in the range
+/// to be extracted, and bits `[15,8]` specify the length of the range.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr2_u32)
 #[inline]
diff --git a/coresimd/x86/cpuid.rs b/coresimd/x86/cpuid.rs
index 7e000625ce..6217d8824a 100644
--- a/coresimd/x86/cpuid.rs
+++ b/coresimd/x86/cpuid.rs
@@ -86,26 +86,58 @@ pub fn has_cpuid() -> bool {
     }
     #[cfg(target_arch = "x86")]
     {
-        use coresimd::x86::{__readeflags, __writeeflags};
+        // Optimization for i586 and i686 Rust targets that have SSE enabled
+        // and support cpuid:
+        #[cfg(target_feature = "sse")] {
+            true
+        }
 
-        // On `x86` the `cpuid` instruction is not always available.
-        // This follows the approach indicated in:
-        // http://wiki.osdev.org/CPUID#Checking_CPUID_availability
+        // If SSE is not enabled, detect whether cpuid is available:
+        #[cfg(not(target_feature = "sse"))]
         unsafe {
-            // Read EFLAGS:
-            let eflags: u32 = __readeflags();
-
-            // Invert the ID bit in EFLAGS:
-            let eflags_mod: u32 = eflags | 0x0020_0000;
-
-            // Store the modified EFLAGS (ID bit may or may not be inverted)
-            __writeeflags(eflags_mod);
-
-            // Read EFLAGS again:
-            let eflags_after: u32 = __readeflags();
-
-            // Check if the ID bit changed:
-            eflags_after != eflags
+            // On `x86` the `cpuid` instruction is not always available.
+            // This follows the approach indicated in:
+            // http://wiki.osdev.org/CPUID#Checking_CPUID_availability
+            // https://software.intel.com/en-us/articles/using-cpuid-to-detect-the-presence-of-sse-41-and-sse-42-instruction-sets/
+            // which detects whether `cpuid` is available by checking whether bit 21 (the ID flag) of the EFLAGS register is modifiable or not.
+            // If it is, then `cpuid` is available.
+            let result: u32;
+            let _temp: u32;
+            asm!(r#"
+                 # Read eflags into $0 and copy it into $1:
+                 pushfd
+                 pop     $0
+                 mov     $1, $0
+                 # Flip 21st bit of $0.
+                 xor     $0, 0x200000
+                 # Set eflags to the value of $0
+                 #
+                 # Bit 21st can only be modified if cpuid is available
+                 push    $0
+                 popfd          # A
+                 # Read eflags into $0:
+                 pushfd         # B
+                 pop     $0
+                 # xor with the original eflags sets the bits that
+                 # have been modified:
+                 xor     $0, $1
+                 "#
+                 : "=r"(result), "=r"(_temp)
+                 :
+                 : "cc", "memory"
+                 : "intel");
+            // There is a race between popfd (A) and pushfd (B)
+            // where other bits beyond 21st may have been modified due to
+            // interrupts, a debugger stepping through the asm, etc.
+            //
+            // Therefore, explicitly check whether the 21st bit
+            // was modified or not.
+            //
+            // If the result is zero, the cpuid bit was not modified.
+            // If the result is 0x200000 (non-zero), then the cpuid bit
+            // was correctly modified and the CPU supports the cpuid
+            // instruction:
+            (result & 0x200000) != 0
         }
     }
 }
@@ -138,17 +170,8 @@ mod tests {
         assert!(cpuid::has_cpuid());
     }
 
-    #[cfg(target_arch = "x86")]
     #[test]
-    fn test_has_cpuid() {
-        unsafe {
-            let before = __readeflags();
-
-            if cpuid::has_cpuid() {
-                assert!(before != __readeflags());
-            } else {
-                assert!(before == __readeflags());
-            }
-        }
+    fn test_has_cpuid_idempotent() {
+        assert_eq!(cpuid::has_cpuid(), cpuid::has_cpuid());
     }
 }
diff --git a/coresimd/x86/eflags.rs b/coresimd/x86/eflags.rs
index 0a7ba919a7..42380e6e38 100644
--- a/coresimd/x86/eflags.rs
+++ b/coresimd/x86/eflags.rs
@@ -6,6 +6,8 @@
 #[cfg(target_arch = "x86")]
 #[inline(always)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
+#[rustc_deprecated(since = "1.29.0", reason = "See issue #51810 - use inline assembly instead")]
+#[doc(hidden)]
 pub unsafe fn __readeflags() -> u32 {
     let eflags: u32;
     asm!("pushfd; popl $0" : "=r"(eflags) : : : "volatile");
@@ -18,6 +20,8 @@ pub unsafe fn __readeflags() -> u32 {
 #[cfg(target_arch = "x86_64")]
 #[inline(always)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
+#[rustc_deprecated(since = "1.29.0", reason = "See issue #51810 - use inline assembly instead")]
+#[doc(hidden)]
 pub unsafe fn __readeflags() -> u64 {
     let eflags: u64;
     asm!("pushfq; popq $0" : "=r"(eflags) : : : "volatile");
@@ -30,6 +34,8 @@ pub unsafe fn __readeflags() -> u64 {
 #[cfg(target_arch = "x86")]
 #[inline(always)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
+#[rustc_deprecated(since = "1.29.0", reason = "See issue #51810 - use inline assembly instead")]
+#[doc(hidden)]
 pub unsafe fn __writeeflags(eflags: u32) {
     asm!("pushl $0; popfd" : : "r"(eflags) : "cc", "flags" : "volatile");
 }
@@ -40,6 +46,8 @@ pub unsafe fn __writeeflags(eflags: u32) {
 #[cfg(target_arch = "x86_64")]
 #[inline(always)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
+#[rustc_deprecated(since = "1.29.0", reason = "See issue #51810 - use inline assembly instead")]
+#[doc(hidden)]
 pub unsafe fn __writeeflags(eflags: u64) {
     asm!("pushq $0; popfq" : : "r"(eflags) : "cc", "flags" : "volatile");
 }
@@ -49,6 +57,7 @@ mod tests {
     use coresimd::x86::*;
 
     #[test]
+    #[allow(deprecated)]
     fn test_eflags() {
         unsafe {
             // reads eflags, writes them back, reads them again,
diff --git a/coresimd/x86/fxsr.rs b/coresimd/x86/fxsr.rs
index b70e84eaf9..91261c721c 100644
--- a/coresimd/x86/fxsr.rs
+++ b/coresimd/x86/fxsr.rs
@@ -58,7 +58,7 @@ pub unsafe fn _fxrstor(mem_addr: *const u8) {
 #[cfg(test)]
 mod tests {
     use coresimd::x86::*;
-    use std::{fmt, cmp::PartialEq};
+    use std::{cmp::PartialEq, fmt};
     use stdsimd_test::simd_test;
 
     #[repr(align(16))]
diff --git a/coresimd/x86/mmx.rs b/coresimd/x86/mmx.rs
index 6e6b0b7d4f..c58a97c3cd 100644
--- a/coresimd/x86/mmx.rs
+++ b/coresimd/x86/mmx.rs
@@ -380,7 +380,7 @@ pub unsafe fn _mm_set_pi32(e1: i32, e0: i32) -> __m64 {
 #[inline]
 #[target_feature(enable = "mmx")]
 pub unsafe fn _mm_set_pi8(
-    e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8
+    e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8,
 ) -> __m64 {
     _mm_setr_pi8(e0, e1, e2, e3, e4, e5, e6, e7)
 }
@@ -426,7 +426,7 @@ pub unsafe fn _mm_setr_pi32(e0: i32, e1: i32) -> __m64 {
 #[inline]
 #[target_feature(enable = "mmx")]
 pub unsafe fn _mm_setr_pi8(
-    e0: i8, e1: i8, e2: i8, e3: i8, e4: i8, e5: i8, e6: i8, e7: i8
+    e0: i8, e1: i8, e2: i8, e3: i8, e4: i8, e5: i8, e6: i8, e7: i8,
 ) -> __m64 {
     mem::transmute(i8x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
 }
@@ -514,12 +514,8 @@ mod tests {
             -30001,
             i16::max_value() - 1,
         );
-        let e = _mm_setr_pi16(
-            i16::min_value(),
-            30000,
-            -30000,
-            i16::max_value(),
-        );
+        let e =
+            _mm_setr_pi16(i16::min_value(), 30000, -30000, i16::max_value());
         assert_eq_m64(e, _mm_add_pi16(a, b));
         assert_eq_m64(e, _m_paddw(a, b));
     }
@@ -537,16 +533,8 @@ mod tests {
     unsafe fn test_mm_adds_pi8() {
         let a = _mm_setr_pi8(-100, -1, 1, 100, -1, 0, 1, 0);
         let b = _mm_setr_pi8(-100, 1, -1, 100, 0, -1, 0, 1);
-        let e = _mm_setr_pi8(
-            i8::min_value(),
-            0,
-            0,
-            i8::max_value(),
-            -1,
-            -1,
-            1,
-            1,
-        );
+        let e =
+            _mm_setr_pi8(i8::min_value(), 0, 0, i8::max_value(), -1, -1, 1, 1);
         assert_eq_m64(e, _mm_adds_pi8(a, b));
         assert_eq_m64(e, _m_paddsb(a, b));
     }
diff --git a/coresimd/x86/mod.rs b/coresimd/x86/mod.rs
index aba3d137c0..31d950a2c6 100644
--- a/coresimd/x86/mod.rs
+++ b/coresimd/x86/mod.rs
@@ -276,13 +276,13 @@ types! {
     /// use std::arch::x86_64::*;
     ///
     /// # fn main() {
-    /// # #[target_feature(enable = "sse")]
+    /// # #[target_feature(enable = "avx")]
     /// # unsafe fn foo() {
     /// let eight_zeros = _mm256_setzero_ps();
     /// let eight_ones = _mm256_set1_ps(1.0);
     /// let eight_floats = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
     /// # }
-    /// # if is_x86_feature_detected!("sse") { unsafe { foo() } }
+    /// # if is_x86_feature_detected!("avx") { unsafe { foo() } }
     /// # }
     /// ```
     #[stable(feature = "simd_x86", since = "1.27.0")]
@@ -444,11 +444,12 @@ impl m256iExt for __m256i {
     }
 }
 
-use coresimd::simd::{f32x2, f32x4, f32x8, f64x2, f64x4, i16x16, i16x4, i16x8,
-                     i32x2, i32x4, i32x8, i64x2, i64x4, i8x16, i8x32, i8x8,
-                     m16x16, m16x4, m16x8, m32x2, m32x4, m32x8, m64x2, m64x4,
-                     m8x16, m8x32, m8x8, u16x16, u16x4, u16x8, u32x2, u32x4,
-                     u32x8, u64x2, u64x4, u8x16, u8x32, u8x8};
+use coresimd::simd::{
+    f32x2, f32x4, f32x8, f64x2, f64x4, i16x16, i16x4, i16x8, i32x2, i32x4,
+    i32x8, i64x2, i64x4, i8x16, i8x32, i8x8, m16x16, m16x4, m16x8, m32x2,
+    m32x4, m32x8, m64x2, m64x4, m8x16, m8x32, m8x8, u16x16, u16x4, u16x8,
+    u32x2, u32x4, u32x8, u64x2, u64x4, u8x16, u8x32, u8x8,
+};
 
 impl_from_bits_!(
     __m64: u32x2,
diff --git a/coresimd/x86/pclmulqdq.rs b/coresimd/x86/pclmulqdq.rs
index e33928f06d..987ac89d79 100644
--- a/coresimd/x86/pclmulqdq.rs
+++ b/coresimd/x86/pclmulqdq.rs
@@ -25,20 +25,25 @@ extern "C" {
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128)
 #[inline]
 #[target_feature(enable = "pclmulqdq")]
-#[cfg_attr(all(test, not(target_os = "linux")),
-           assert_instr(pclmulqdq, imm8 = 0))]
-#[cfg_attr(all(test, target_os = "linux"),
-           assert_instr(pclmullqlqdq, imm8 = 0))]
-#[cfg_attr(all(test, target_os = "linux"),
-           assert_instr(pclmulhqlqdq, imm8 = 1))]
-#[cfg_attr(all(test, target_os = "linux"),
-           assert_instr(pclmullqhqdq, imm8 = 16))]
-#[cfg_attr(all(test, target_os = "linux"),
-           assert_instr(pclmulhqhqdq, imm8 = 17))]
+#[cfg_attr(
+    all(test, not(target_os = "linux")), assert_instr(pclmulqdq, imm8 = 0)
+)]
+#[cfg_attr(
+    all(test, target_os = "linux"), assert_instr(pclmullqlqdq, imm8 = 0)
+)]
+#[cfg_attr(
+    all(test, target_os = "linux"), assert_instr(pclmulhqlqdq, imm8 = 1)
+)]
+#[cfg_attr(
+    all(test, target_os = "linux"), assert_instr(pclmullqhqdq, imm8 = 16)
+)]
+#[cfg_attr(
+    all(test, target_os = "linux"), assert_instr(pclmulhqhqdq, imm8 = 17)
+)]
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_clmulepi64_si128(
-    a: __m128i, b: __m128i, imm8: i32
+    a: __m128i, b: __m128i, imm8: i32,
 ) -> __m128i {
     macro_rules! call {
         ($imm8:expr) => {
diff --git a/coresimd/x86/rdrand.rs b/coresimd/x86/rdrand.rs
index a20cee0747..2b900837fd 100644
--- a/coresimd/x86/rdrand.rs
+++ b/coresimd/x86/rdrand.rs
@@ -1,4 +1,3 @@
-
 //! RDRAND and RDSEED instructions for returning random numbers from an Intel
 //! on-chip hardware random number generator which has been seeded by an
 //! on-chip entropy source.
diff --git a/coresimd/x86/sha.rs b/coresimd/x86/sha.rs
index 344cb43991..f6546fa1b4 100644
--- a/coresimd/x86/sha.rs
+++ b/coresimd/x86/sha.rs
@@ -75,7 +75,7 @@ pub unsafe fn _mm_sha1nexte_epu32(a: __m128i, b: __m128i) -> __m128i {
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_sha1rnds4_epu32(
-    a: __m128i, b: __m128i, func: i32
+    a: __m128i, b: __m128i, func: i32,
 ) -> __m128i {
     let a = a.as_i32x4();
     let b = b.as_i32x4();
@@ -126,13 +126,9 @@ pub unsafe fn _mm_sha256msg2_epu32(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(sha256rnds2))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_sha256rnds2_epu32(
-    a: __m128i, b: __m128i, k: __m128i
+    a: __m128i, b: __m128i, k: __m128i,
 ) -> __m128i {
-    mem::transmute(sha256rnds2(
-        a.as_i32x4(),
-        b.as_i32x4(),
-        k.as_i32x4(),
-    ))
+    mem::transmute(sha256rnds2(a.as_i32x4(), b.as_i32x4(), k.as_i32x4()))
 }
 
 #[cfg(test)]
diff --git a/coresimd/x86/sse.rs b/coresimd/x86/sse.rs
index a51f3f1423..c53b46a774 100644
--- a/coresimd/x86/sse.rs
+++ b/coresimd/x86/sse.rs
@@ -230,8 +230,10 @@ pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
 #[inline]
 #[target_feature(enable = "sse")]
 // i586 only seems to generate plain `and` instructions, so ignore it.
-#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")),
-           assert_instr(andps))]
+#[cfg_attr(
+    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
+    assert_instr(andps)
+)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
     let a: __m128i = mem::transmute(a);
@@ -249,8 +251,10 @@ pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
 #[target_feature(enable = "sse")]
 // i586 only seems to generate plain `not` and `and` instructions, so ignore
 // it.
-#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")),
-           assert_instr(andnps))]
+#[cfg_attr(
+    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
+    assert_instr(andnps)
+)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
     let a: __m128i = mem::transmute(a);
@@ -265,8 +269,10 @@ pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
 #[inline]
 #[target_feature(enable = "sse")]
 // i586 only seems to generate plain `or` instructions, so we ignore it.
-#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")),
-           assert_instr(orps))]
+#[cfg_attr(
+    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
+    assert_instr(orps)
+)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
     let a: __m128i = mem::transmute(a);
@@ -281,8 +287,10 @@ pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
 #[inline]
 #[target_feature(enable = "sse")]
 // i586 only seems to generate plain `xor` instructions, so we ignore it.
-#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")),
-           assert_instr(xorps))]
+#[cfg_attr(
+    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
+    assert_instr(xorps)
+)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
     let a: __m128i = mem::transmute(a);
@@ -968,6 +976,14 @@ pub unsafe fn _mm_setzero_ps() -> __m128 {
     __m128(0.0, 0.0, 0.0, 0.0)
 }
 
+/// Packs four 2-bit selectors into a mask for shuffle/permute intrinsics.
+#[inline]
+#[allow(non_snake_case)]
+#[stable(feature = "simd_x86", since = "1.28.0")]
+pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> u32 {
+    (z << 6) | (y << 4) | (x << 2) | w // z: bits 7:6, y: 5:4, x: 3:2, w: 1:0
+}
+
 /// Shuffle packed single-precision (32-bit) floating-point elements in `a` and
 /// `b` using `mask`.
 ///
@@ -1117,7 +1133,7 @@ pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
 /// #
 /// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
 /// let data: [f32; 4] = [5.0, 6.0, 7.0, 8.0];
-/// let r = _mm_loadh_pi(a, data[..].as_ptr() as *const _) ;
+/// let r = _mm_loadh_pi(a, data[..].as_ptr() as *const _);
 /// // assert_eq!(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0));
 /// #
 /// #         }
@@ -1132,10 +1148,14 @@ pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
 #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movhpd))]
 // 32-bit codegen does not generate `movhps` or `movhpd`, but instead
 // `movsd` followed by `unpcklpd` (or `movss'/`unpcklps` if there's no SSE2).
-#[cfg_attr(all(test, target_arch = "x86", target_feature = "sse2"),
-           assert_instr(movlhps))]
-#[cfg_attr(all(test, target_arch = "x86", not(target_feature = "sse2")),
-           assert_instr(unpcklps))]
+#[cfg_attr(
+    all(test, target_arch = "x86", target_feature = "sse2"),
+    assert_instr(movlhps)
+)]
+#[cfg_attr(
+    all(test, target_arch = "x86", not(target_feature = "sse2")),
+    assert_instr(unpcklps)
+)]
 // TODO: This function is actually not limited to floats, but that's what
 // what matches the C type most closely: (__m128, *const __m64) -> __m128
 pub unsafe fn _mm_loadh_pi(a: __m128, p: *const __m64) -> __m128 {
@@ -1171,7 +1191,7 @@ pub unsafe fn _mm_loadh_pi(a: __m128, p: *const __m64) -> __m128 {
 /// #
 /// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
 /// let data: [f32; 4] = [5.0, 6.0, 7.0, 8.0];
-/// let r = _mm_loadh_pi(a, data[..].as_ptr() as *const _) ;
+/// let r = _mm_loadh_pi(a, data[..].as_ptr() as *const _);
 /// // assert_eq!(r, _mm_setr_ps(5.0, 6.0, 3.0, 4.0));
 /// #
 /// #         }
@@ -1185,11 +1205,15 @@ pub unsafe fn _mm_loadh_pi(a: __m128, p: *const __m64) -> __m128 {
 // #[cfg_attr(test, assert_instr(movlps))]
 #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movlpd))]
 // On 32-bit targets with SSE2, it just generates two `movsd`.
-#[cfg_attr(all(test, target_arch = "x86", target_feature = "sse2"),
-           assert_instr(movsd))]
+#[cfg_attr(
+    all(test, target_arch = "x86", target_feature = "sse2"),
+    assert_instr(movsd)
+)]
 // It should really generate "movlps", but oh well...
-#[cfg_attr(all(test, target_arch = "x86", not(target_feature = "sse2")),
-           assert_instr(movss))]
+#[cfg_attr(
+    all(test, target_arch = "x86", not(target_feature = "sse2")),
+    assert_instr(movss)
+)]
 // TODO: Like _mm_loadh_pi, this also isn't limited to floats.
 pub unsafe fn _mm_loadl_pi(a: __m128, p: *const __m64) -> __m128 {
     let q = p as *const f32x2;
@@ -1321,8 +1345,10 @@ pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
 // On i686 and up LLVM actually generates MOVHPD instead of MOVHPS, that's
 // fine.
 // On i586 (no SSE2) it just generates plain MOV instructions.
-#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")),
-           assert_instr(movhpd))]
+#[cfg_attr(
+    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
+    assert_instr(movhpd)
+)]
 pub unsafe fn _mm_storeh_pi(p: *mut __m64, a: __m128) {
     #[cfg(target_arch = "x86")]
     {
@@ -1349,8 +1375,10 @@ pub unsafe fn _mm_storeh_pi(p: *mut __m64, a: __m128) {
 #[inline]
 #[target_feature(enable = "sse")]
 // On i586 the codegen just generates plane MOVs. No need to test for that.
-#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")),
-           assert_instr(movlps))]
+#[cfg_attr(
+    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
+    assert_instr(movlps)
+)]
 pub unsafe fn _mm_storel_pi(p: *mut __m64, a: __m128) {
     #[cfg(target_arch = "x86")]
     {
@@ -1929,7 +1957,7 @@ pub unsafe fn _mm_undefined_ps() -> __m128 {
 #[target_feature(enable = "sse")]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _MM_TRANSPOSE4_PS(
-    row0: &mut __m128, row1: &mut __m128, row2: &mut __m128, row3: &mut __m128
+    row0: &mut __m128, row1: &mut __m128, row2: &mut __m128, row3: &mut __m128,
 ) {
     let tmp0 = _mm_unpacklo_ps(*row0, *row1);
     let tmp2 = _mm_unpacklo_ps(*row2, *row3);
@@ -2040,6 +2068,8 @@ extern "C" {
     fn pminub(a: __m64, b: __m64) -> __m64;
     #[link_name = "llvm.x86.mmx.pmulhu.w"]
     fn pmulhuw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pmull.w"]
+    fn pmullw(a: __m64, b: __m64) -> __m64;
     #[link_name = "llvm.x86.mmx.pavg.b"]
     fn pavgb(a: __m64, b: __m64) -> __m64;
     #[link_name = "llvm.x86.mmx.pavg.w"]
@@ -2157,6 +2187,16 @@ pub unsafe fn _mm_mulhi_pu16(a: __m64, b: __m64) -> __m64 {
     pmulhuw(a, b)
 }
 
+/// Multiplies the packed 16-bit integers in `a` by those in `b` and
+/// returns the low-order 16 bits of each 32-bit product in the
+/// corresponding element of the result.
+#[inline]
+#[target_feature(enable = "sse,mmx")]
+#[cfg_attr(test, assert_instr(pmullw))]
+pub unsafe fn _mm_mullo_pi16(a: __m64, b: __m64) -> __m64 {
+    pmullw(a, b)
+}
+
 /// Multiplies packed 16-bit unsigned integer values and writes the
 /// high-order 16 bits of each 32-bit product to the corresponding bits in
 /// the destination.
@@ -2722,12 +2762,8 @@ mod tests {
 
         let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
         let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2));
-        let e2: u32x4 = transmute(_mm_setr_ps(
-            transmute(0xffffffffu32),
-            2.0,
-            3.0,
-            4.0,
-        ));
+        let e2: u32x4 =
+            transmute(_mm_setr_ps(transmute(0xffffffffu32), 2.0, 3.0, 4.0));
         assert_eq!(r2, e2);
     }
 
@@ -3441,22 +3477,9 @@ mod tests {
 
     #[simd_test(enable = "sse")]
     unsafe fn test_mm_cvtss_si32() {
-        let inputs = &[
-            42.0f32,
-            -3.1,
-            4.0e10,
-            4.0e-20,
-            NAN,
-            2147483500.1,
-        ];
-        let result = &[
-            42i32,
-            -3,
-            i32::min_value(),
-            0,
-            i32::min_value(),
-            2147483520,
-        ];
+        let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1];
+        let result =
+            &[42i32, -3, i32::min_value(), 0, i32::min_value(), 2147483520];
         for i in 0..inputs.len() {
             let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0);
             let e = result[i];
@@ -3570,6 +3593,13 @@ mod tests {
         assert_eq_m128(r, _mm_set1_ps(0.0));
     }
 
+    #[simd_test(enable = "sse")]
+    unsafe fn test_mm_shuffle() {
+        assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11); // zz_yy_xx_ww
+        assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00);
+        assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01);
+    }
+
     #[simd_test(enable = "sse")]
     unsafe fn test_mm_shuffle_ps() {
         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
@@ -3660,10 +3690,8 @@ mod tests {
         }
 
         let r = _mm_load_ps(p);
-        let e = _mm_add_ps(
-            _mm_setr_ps(1.0, 2.0, 3.0, 4.0),
-            _mm_set1_ps(fixup),
-        );
+        let e =
+            _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup));
         assert_eq_m128(r, e);
     }
 
@@ -3693,10 +3721,8 @@ mod tests {
         }
 
         let r = _mm_loadr_ps(p);
-        let e = _mm_add_ps(
-            _mm_setr_ps(4.0, 3.0, 2.0, 1.0),
-            _mm_set1_ps(fixup),
-        );
+        let e =
+            _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup));
         assert_eq_m128(r, e);
     }
 
@@ -3935,9 +3961,7 @@ mod tests {
     #[simd_test(enable = "sse")]
     unsafe fn test_mm_stream_ps() {
         let a = _mm_set1_ps(7.0);
-        let mut mem = Memory {
-            data: [-1.0; 4],
-        };
+        let mut mem = Memory { data: [-1.0; 4] };
 
         _mm_stream_ps(&mut mem.data[0] as *mut f32, a);
         for i in 0..4 {
@@ -4001,6 +4025,13 @@ mod tests {
         assert_eq_m64(r, _mm_set1_pi16(15));
     }
 
+    #[simd_test(enable = "sse,mmx")]
+    unsafe fn test_mm_mullo_pi16() {
+        let (a, b) = (_mm_set1_pi16(1000), _mm_set1_pi16(1001));
+        let r = _mm_mullo_pi16(a, b);
+        assert_eq_m64(r, _mm_set1_pi16(17960)); // 1_001_000 & 0xffff
+    }
+
     #[simd_test(enable = "sse,mmx")]
     unsafe fn test_m_pmulhuw() {
         let (a, b) = (_mm_set1_pi16(1000), _mm_set1_pi16(1001));
@@ -4138,12 +4169,8 @@ mod tests {
 
     #[simd_test(enable = "sse,mmx")]
     unsafe fn test_mm_movemask_pi8() {
-        let a = _mm_setr_pi16(
-            0b1000_0000,
-            0b0100_0000,
-            0b1000_0000,
-            0b0100_0000,
-        );
+        let a =
+            _mm_setr_pi16(0b1000_0000, 0b0100_0000, 0b1000_0000, 0b0100_0000);
         let r = _mm_movemask_pi8(a);
         assert_eq!(r, 0b10001);
 
diff --git a/coresimd/x86/sse2.rs b/coresimd/x86/sse2.rs
index 7f7af002a4..d7d2a12fc5 100644
--- a/coresimd/x86/sse2.rs
+++ b/coresimd/x86/sse2.rs
@@ -1010,7 +1010,7 @@ pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
 // no particular instruction to test
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_set_epi16(
-    e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16
+    e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16,
 ) -> __m128i {
     mem::transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
 }
@@ -1095,7 +1095,7 @@ pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
 // no particular instruction to test
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_setr_epi16(
-    e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16
+    e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16,
 ) -> __m128i {
     _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7)
 }
@@ -1134,10 +1134,15 @@ pub unsafe fn _mm_setzero_si128() -> __m128i {
 #[inline]
 #[target_feature(enable = "sse2")]
 // FIXME movsd on windows
-#[cfg_attr(all(test, not(windows),
-               not(all(target_os = "linux", target_arch = "x86_64")),
-               target_arch = "x86_64"),
-           assert_instr(movq))]
+#[cfg_attr(
+    all(
+        test,
+        not(windows),
+        not(all(target_os = "linux", target_arch = "x86_64")),
+        target_arch = "x86_64"
+    ),
+    assert_instr(movq)
+)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i {
     _mm_set_epi64x(0, simd_extract((*mem_addr).as_i64x2(), 0))
@@ -1190,7 +1195,7 @@ pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(maskmovdqu))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_maskmoveu_si128(
-    a: __m128i, mask: __m128i, mem_addr: *mut i8
+    a: __m128i, mask: __m128i, mem_addr: *mut i8,
 ) {
     maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr)
 }
@@ -1229,10 +1234,15 @@ pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
 #[inline]
 #[target_feature(enable = "sse2")]
 // FIXME mov on windows, movlps on i686
-#[cfg_attr(all(test, not(windows),
-               not(all(target_os = "linux", target_arch = "x86_64")),
-               target_arch = "x86_64"),
-           assert_instr(movq))]
+#[cfg_attr(
+    all(
+        test,
+        not(windows),
+        not(all(target_os = "linux", target_arch = "x86_64")),
+        target_arch = "x86_64"
+    ),
+    assert_instr(movq)
+)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
     ptr::copy_nonoverlapping(
@@ -1275,8 +1285,9 @@ pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
 #[inline]
 #[target_feature(enable = "sse2")]
 // FIXME movd on windows, movd on i686
-#[cfg_attr(all(test, not(windows), target_arch = "x86_64"),
-           assert_instr(movq))]
+#[cfg_attr(
+    all(test, not(windows), target_arch = "x86_64"), assert_instr(movq)
+)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_move_epi64(a: __m128i) -> __m128i {
     let zero = _mm_setzero_si128();
@@ -1341,11 +1352,7 @@ pub unsafe fn _mm_extract_epi16(a: __m128i, imm8: i32) -> i32 {
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_insert_epi16(a: __m128i, i: i32, imm8: i32) -> __m128i {
-    mem::transmute(simd_insert(
-        a.as_i16x8(),
-        (imm8 & 7) as u32,
-        i as i16,
-    ))
+    mem::transmute(simd_insert(a.as_i16x8(), (imm8 & 7) as u32, i as i16))
 }
 
 /// Return a mask of the most significant bit of each element in `a`.
@@ -1443,16 +1450,7 @@ pub unsafe fn _mm_shufflehi_epi16(a: __m128i, imm8: i32) -> __m128i {
             simd_shuffle8(
                 a,
                 a,
-                [
-                    0,
-                    1,
-                    2,
-                    3,
-                    $x01 + 4,
-                    $x23 + 4,
-                    $x45 + 4,
-                    $x67 + 4,
-                ],
+                [0, 1, 2, 3, $x01 + 4, $x23 + 4, $x45 + 4, $x67 + 4],
             )
         };
     }
@@ -1567,9 +1565,7 @@ pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
     mem::transmute::<i8x16, _>(simd_shuffle16(
         a.as_i8x16(),
         b.as_i8x16(),
-        [
-            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
-        ],
+        [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
     ))
 }
 
@@ -1630,9 +1626,7 @@ pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
     mem::transmute::<i8x16, _>(simd_shuffle16(
         a.as_i8x16(),
         b.as_i8x16(),
-        [
-            0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
-        ],
+        [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
     ))
 }
 
@@ -1644,11 +1638,8 @@ pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(punpcklwd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
-    let x = simd_shuffle8(
-        a.as_i16x8(),
-        b.as_i16x8(),
-        [0, 8, 1, 9, 2, 10, 3, 11],
-    );
+    let x =
+        simd_shuffle8(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
     mem::transmute::<i16x8, _>(x)
 }
 
@@ -1947,11 +1938,7 @@ pub unsafe fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d {
 #[cfg_attr(test, assert_instr(cmpltsd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
-    simd_insert(
-        _mm_cmplt_sd(b, a),
-        1,
-        simd_extract::<_, f64>(a, 1),
-    )
+    simd_insert(_mm_cmplt_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
 }
 
 /// Return a new vector with the low element of `a` replaced by the
@@ -1963,11 +1950,7 @@ pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
 #[cfg_attr(test, assert_instr(cmplesd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d {
-    simd_insert(
-        _mm_cmple_sd(b, a),
-        1,
-        simd_extract::<_, f64>(a, 1),
-    )
+    simd_insert(_mm_cmple_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
 }
 
 /// Return a new vector with the low element of `a` replaced by the result
@@ -2042,11 +2025,7 @@ pub unsafe fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d {
 #[cfg_attr(test, assert_instr(cmpnltsd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
-    simd_insert(
-        _mm_cmpnlt_sd(b, a),
-        1,
-        simd_extract::<_, f64>(a, 1),
-    )
+    simd_insert(_mm_cmpnlt_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
 }
 
 /// Return a new vector with the low element of `a` replaced by the
@@ -2058,11 +2037,7 @@ pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
 #[cfg_attr(test, assert_instr(cmpnlesd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d {
-    simd_insert(
-        _mm_cmpnle_sd(b, a),
-        1,
-        simd_extract::<_, f64>(a, 1),
-    )
+    simd_insert(_mm_cmpnle_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
 }
 
 /// Compare corresponding elements in `a` and `b` for equality.
@@ -2881,8 +2856,9 @@ pub unsafe fn _mm_undefined_si128() -> __m128i {
 /// The resulting `__m128d` element is composed by the low-order values of
 /// the two `__m128d` interleaved input elements, i.e.:
 ///
-/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input
-/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input
+/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second
+///   input
+/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd)
 #[inline]
@@ -3223,22 +3199,7 @@ mod tests {
     #[simd_test(enable = "sse2")]
     unsafe fn test_mm_add_epi8() {
         let a = _mm_setr_epi8(
-            0,
-            1,
-            2,
-            3,
-            4,
-            5,
-            6,
-            7,
-            8,
-            9,
-            10,
-            11,
-            12,
-            13,
-            14,
-            15,
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
         );
         #[cfg_attr(rustfmt, rustfmt_skip)]
         let b = _mm_setr_epi8(
@@ -3290,22 +3251,7 @@ mod tests {
     #[simd_test(enable = "sse2")]
     unsafe fn test_mm_adds_epi8() {
         let a = _mm_setr_epi8(
-            0,
-            1,
-            2,
-            3,
-            4,
-            5,
-            6,
-            7,
-            8,
-            9,
-            10,
-            11,
-            12,
-            13,
-            14,
-            15,
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
         );
         #[cfg_attr(rustfmt, rustfmt_skip)]
         let b = _mm_setr_epi8(
@@ -3363,22 +3309,7 @@ mod tests {
     #[simd_test(enable = "sse2")]
     unsafe fn test_mm_adds_epu8() {
         let a = _mm_setr_epi8(
-            0,
-            1,
-            2,
-            3,
-            4,
-            5,
-            6,
-            7,
-            8,
-            9,
-            10,
-            11,
-            12,
-            13,
-            14,
-            15,
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
         );
         #[cfg_attr(rustfmt, rustfmt_skip)]
         let b = _mm_setr_epi8(
@@ -3629,22 +3560,7 @@ mod tests {
         );
         let r = _mm_slli_si128(a, 1);
         let e = _mm_setr_epi8(
-            0,
-            1,
-            2,
-            3,
-            4,
-            5,
-            6,
-            7,
-            8,
-            9,
-            10,
-            11,
-            12,
-            13,
-            14,
-            15,
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
         );
         assert_eq_m128i(r, e);
 
@@ -3888,41 +3804,10 @@ mod tests {
     #[simd_test(enable = "sse2")]
     unsafe fn test_mm_cmpeq_epi8() {
         let a = _mm_setr_epi8(
-            0,
-            1,
-            2,
-            3,
-            4,
-            5,
-            6,
-            7,
-            8,
-            9,
-            10,
-            11,
-            12,
-            13,
-            14,
-            15,
-        );
-        let b = _mm_setr_epi8(
-            15,
-            14,
-            2,
-            12,
-            11,
-            10,
-            9,
-            8,
-            7,
-            6,
-            5,
-            4,
-            3,
-            2,
-            1,
-            0,
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
         );
+        let b =
+            _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
         let r = _mm_cmpeq_epi8(a, b);
         #[cfg_attr(rustfmt, rustfmt_skip)]
         assert_eq_m128i(
@@ -4869,9 +4754,7 @@ mod tests {
             pub data: [f64; 2],
         }
         let a = _mm_set1_pd(7.0);
-        let mut mem = Memory {
-            data: [-1.0; 2],
-        };
+        let mut mem = Memory { data: [-1.0; 2] };
 
         _mm_stream_pd(&mut mem.data[0] as *mut f64, a);
         for i in 0..2 {
@@ -4889,9 +4772,7 @@ mod tests {
 
     #[simd_test(enable = "sse2")]
     unsafe fn test_mm_store_pd() {
-        let mut mem = Memory {
-            data: [0.0f64; 4],
-        };
+        let mut mem = Memory { data: [0.0f64; 4] };
         let vals = &mut mem.data;
         let a = _mm_setr_pd(1.0, 2.0);
         let d = vals.as_mut_ptr();
@@ -4903,9 +4784,7 @@ mod tests {
 
     #[simd_test(enable = "sse")]
     unsafe fn test_mm_storeu_pd() {
-        let mut mem = Memory {
-            data: [0.0f64; 4],
-        };
+        let mut mem = Memory { data: [0.0f64; 4] };
         let vals = &mut mem.data;
         let a = _mm_setr_pd(1.0, 2.0);
 
@@ -4929,9 +4808,7 @@ mod tests {
 
     #[simd_test(enable = "sse2")]
     unsafe fn test_mm_store1_pd() {
-        let mut mem = Memory {
-            data: [0.0f64; 4],
-        };
+        let mut mem = Memory { data: [0.0f64; 4] };
         let vals = &mut mem.data;
         let a = _mm_setr_pd(1.0, 2.0);
         let d = vals.as_mut_ptr();
@@ -4943,9 +4820,7 @@ mod tests {
 
     #[simd_test(enable = "sse2")]
     unsafe fn test_mm_store_pd1() {
-        let mut mem = Memory {
-            data: [0.0f64; 4],
-        };
+        let mut mem = Memory { data: [0.0f64; 4] };
         let vals = &mut mem.data;
         let a = _mm_setr_pd(1.0, 2.0);
         let d = vals.as_mut_ptr();
@@ -4957,9 +4832,7 @@ mod tests {
 
     #[simd_test(enable = "sse2")]
     unsafe fn test_mm_storer_pd() {
-        let mut mem = Memory {
-            data: [0.0f64; 4],
-        };
+        let mut mem = Memory { data: [0.0f64; 4] };
         let vals = &mut mem.data;
         let a = _mm_setr_pd(1.0, 2.0);
         let d = vals.as_mut_ptr();
@@ -5013,10 +4886,7 @@ mod tests {
         }
 
         let r = _mm_loadu_pd(d);
-        let e = _mm_add_pd(
-            _mm_setr_pd(1.0, 2.0),
-            _mm_set1_pd(offset as f64),
-        );
+        let e = _mm_add_pd(_mm_setr_pd(1.0, 2.0), _mm_set1_pd(offset as f64));
         assert_eq_m128d(r, e);
     }
 
@@ -5091,12 +4961,8 @@ mod tests {
 
         assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));
 
-        let a = _mm_setr_ps(
-            -1.1,
-            f32::NEG_INFINITY,
-            f32::MAX,
-            f32::NEG_INFINITY,
-        );
+        let a =
+            _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
         let b = _mm_setr_pd(f64::INFINITY, -5.0);
 
         let r = _mm_cvtsd_ss(a, b);
@@ -5161,12 +5027,8 @@ mod tests {
         let r = _mm_cvttps_epi32(a);
         assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6));
 
-        let a = _mm_setr_ps(
-            f32::NEG_INFINITY,
-            f32::INFINITY,
-            f32::MIN,
-            f32::MAX,
-        );
+        let a =
+            _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
         let r = _mm_cvttps_epi32(a);
         assert_eq_m128i(
             r,
diff --git a/coresimd/x86/sse41.rs b/coresimd/x86/sse41.rs
index ba65004966..198bb16ba0 100644
--- a/coresimd/x86/sse41.rs
+++ b/coresimd/x86/sse41.rs
@@ -66,13 +66,9 @@ pub const _MM_FROUND_NEARBYINT: i32 =
 #[cfg_attr(test, assert_instr(pblendvb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blendv_epi8(
-    a: __m128i, b: __m128i, mask: __m128i
+    a: __m128i, b: __m128i, mask: __m128i,
 ) -> __m128i {
-    mem::transmute(pblendvb(
-        a.as_i8x16(),
-        b.as_i8x16(),
-        mask.as_i8x16(),
-    ))
+    mem::transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16()))
 }
 
 /// Blend packed 16-bit integers from `a` and `b` using the mask `imm8`.
@@ -250,11 +246,7 @@ pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32, imm8: i32) -> __m128i {
-    mem::transmute(simd_insert(
-        a.as_i8x16(),
-        (imm8 & 0b1111) as u32,
-        i as i8,
-    ))
+    mem::transmute(simd_insert(a.as_i8x16(), (imm8 & 0b1111) as u32, i as i8))
 }
 
 /// Return a copy of `a` with the 32-bit integer from `i` inserted at a
@@ -267,11 +259,7 @@ pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32, imm8: i32) -> __m128i {
 #[rustc_args_required_const(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i {
-    mem::transmute(simd_insert(
-        a.as_i32x4(),
-        (imm8 & 0b11) as u32,
-        i,
-    ))
+    mem::transmute(simd_insert(a.as_i32x4(), (imm8 & 0b11) as u32, i))
 }
 
 /// Compare packed 8-bit integers in `a` and `b` and return packed maximum
@@ -1778,16 +1766,12 @@ mod tests {
         }
         {
             let a = _mm_setr_epi32(
-                15,
-                2, /* ignored */
-                1234567,
-                4, /* ignored */
+                15, 2, /* ignored */
+                1234567, 4, /* ignored */
             );
             let b = _mm_setr_epi32(
-                -20,
-                -256, /* ignored */
-                666666,
-                666666, /* ignored */
+                -20, -256, /* ignored */
+                666666, 666666, /* ignored */
             );
             let r = _mm_mul_epi32(a, b);
             let e = _mm_setr_epi64x(-300, 823043843622);
diff --git a/coresimd/x86/sse42.rs b/coresimd/x86/sse42.rs
index 0ba76b57b9..845c5bff58 100644
--- a/coresimd/x86/sse42.rs
+++ b/coresimd/x86/sse42.rs
@@ -151,7 +151,7 @@ pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
 ///     let b = _mm_loadu_si128(chunk.as_ptr() as *const _);
 ///     let idx = _mm_cmpistri(a, b, _SIDD_CMP_EQUAL_ORDERED);
 ///     if idx != 16 {
-///        indexes.push((idx as usize) + (i * hop));
+///         indexes.push((idx as usize) + (i * hop));
 ///     }
 /// }
 /// assert_eq!(indexes, vec![34]);
@@ -439,7 +439,7 @@ pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i32) -> i32 {
 #[rustc_args_required_const(4)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpestrm(
-    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
+    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
 ) -> __m128i {
     let a = a.as_i8x16();
     let b = b.as_i8x16();
@@ -544,7 +544,7 @@ pub unsafe fn _mm_cmpestrm(
 #[rustc_args_required_const(4)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpestri(
-    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
+    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
 ) -> i32 {
     let a = a.as_i8x16();
     let b = b.as_i8x16();
@@ -567,7 +567,7 @@ pub unsafe fn _mm_cmpestri(
 #[rustc_args_required_const(4)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpestrz(
-    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
+    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
 ) -> i32 {
     let a = a.as_i8x16();
     let b = b.as_i8x16();
@@ -590,7 +590,7 @@ pub unsafe fn _mm_cmpestrz(
 #[rustc_args_required_const(4)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpestrc(
-    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
+    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
 ) -> i32 {
     let a = a.as_i8x16();
     let b = b.as_i8x16();
@@ -613,7 +613,7 @@ pub unsafe fn _mm_cmpestrc(
 #[rustc_args_required_const(4)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpestrs(
-    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
+    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
 ) -> i32 {
     let a = a.as_i8x16();
     let b = b.as_i8x16();
@@ -636,7 +636,7 @@ pub unsafe fn _mm_cmpestrs(
 #[rustc_args_required_const(4)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpestro(
-    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
+    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
 ) -> i32 {
     let a = a.as_i8x16();
     let b = b.as_i8x16();
@@ -660,7 +660,7 @@ pub unsafe fn _mm_cmpestro(
 #[rustc_args_required_const(4)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpestra(
-    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
+    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32,
 ) -> i32 {
     let a = a.as_i8x16();
     let b = b.as_i8x16();
@@ -917,13 +917,8 @@ mod tests {
     unsafe fn test_mm_cmpestra() {
         let a = str_to_m128i(b"Cannot match a");
         let b = str_to_m128i(b"Null after 14");
-        let i = _mm_cmpestra(
-            a,
-            14,
-            b,
-            16,
-            _SIDD_CMP_EQUAL_EACH | _SIDD_UNIT_MASK,
-        );
+        let i =
+            _mm_cmpestra(a, 14, b, 16, _SIDD_CMP_EQUAL_EACH | _SIDD_UNIT_MASK);
         assert_eq!(1, i);
     }
 
diff --git a/coresimd/x86/sse4a.rs b/coresimd/x86/sse4a.rs
index 7c45ca1d11..2c184166fb 100644
--- a/coresimd/x86/sse4a.rs
+++ b/coresimd/x86/sse4a.rs
@@ -25,8 +25,8 @@ extern "C" {
 /// Extracts the bit range specified by `y` from the lower 64 bits of `x`.
 ///
 /// The `[13:8]` bits of `y` specify the index of the bit-range to extract. The
-/// `[5:0]` bits of `y` specify the length of the bit-range to extract. All other
-/// bits are ignored.
+/// `[5:0]` bits of `y` specify the length of the bit-range to extract. All
+/// other bits are ignored.
 ///
 /// If the length is zero, it is interpreted as `64`. If the length and index
 /// are zero, the lower 64 bits of `x` are extracted.
diff --git a/coresimd/x86/ssse3.rs b/coresimd/x86/ssse3.rs
index 2b66847091..e2c415f1cb 100644
--- a/coresimd/x86/ssse3.rs
+++ b/coresimd/x86/ssse3.rs
@@ -596,24 +596,8 @@ mod tests {
             12, 5, 5, 10,
             4, 1, 8, 0,
         );
-        let expected = _mm_setr_epi8(
-            5,
-            0,
-            5,
-            4,
-            9,
-            13,
-            7,
-            4,
-            13,
-            6,
-            6,
-            11,
-            5,
-            2,
-            9,
-            1,
-        );
+        let expected =
+            _mm_setr_epi8(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1);
         let r = _mm_shuffle_epi8(a, b);
         assert_eq_m128i(r, expected);
     }
diff --git a/coresimd/x86/test.rs b/coresimd/x86/test.rs
index 1b5b6b1fb0..72077f383e 100644
--- a/coresimd/x86/test.rs
+++ b/coresimd/x86/test.rs
@@ -121,7 +121,7 @@ mod x86_polyfill {
 
     #[target_feature(enable = "avx2")]
     pub unsafe fn _mm256_insert_epi64(
-        a: __m256i, val: i64, idx: i32
+        a: __m256i, val: i64, idx: i32,
     ) -> __m256i {
         union A {
             a: __m256i,
diff --git a/coresimd/x86/xsave.rs b/coresimd/x86/xsave.rs
index 98df42da42..66816bdbff 100644
--- a/coresimd/x86/xsave.rs
+++ b/coresimd/x86/xsave.rs
@@ -38,11 +38,7 @@ extern "C" {
 #[cfg_attr(test, assert_instr(xsave))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) {
-    xsave(
-        mem_addr,
-        (save_mask >> 32) as u32,
-        save_mask as u32,
-    );
+    xsave(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
 }
 
 /// Perform a full or partial restore of the enabled processor states using
@@ -110,11 +106,7 @@ pub unsafe fn _xgetbv(xcr_no: u32) -> u64 {
 #[cfg_attr(test, assert_instr(xsaveopt))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) {
-    xsaveopt(
-        mem_addr,
-        (save_mask >> 32) as u32,
-        save_mask as u32,
-    );
+    xsaveopt(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
 }
 
 /// Perform a full or partial save of the enabled processor states to memory
@@ -130,11 +122,7 @@ pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) {
 #[cfg_attr(test, assert_instr(xsavec))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) {
-    xsavec(
-        mem_addr,
-        (save_mask >> 32) as u32,
-        save_mask as u32,
-    );
+    xsavec(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
 }
 
 /// Perform a full or partial save of the enabled processor states to memory at
@@ -151,11 +139,7 @@ pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) {
 #[cfg_attr(test, assert_instr(xsaves))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) {
-    xsaves(
-        mem_addr,
-        (save_mask >> 32) as u32,
-        save_mask as u32,
-    );
+    xsaves(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
 }
 
 /// Perform a full or partial restore of the enabled processor states using the
@@ -196,9 +180,7 @@ mod tests {
 
     impl XsaveArea {
         fn new() -> XsaveArea {
-            XsaveArea {
-                data: [0; 2560],
-            }
+            XsaveArea { data: [0; 2560] }
         }
         fn ptr(&mut self) -> *mut u8 {
             &mut self.data[0] as *mut _ as *mut u8
diff --git a/coresimd/x86_64/bmi.rs b/coresimd/x86_64/bmi.rs
index 61bee8f0f9..831f524714 100644
--- a/coresimd/x86_64/bmi.rs
+++ b/coresimd/x86_64/bmi.rs
@@ -28,8 +28,8 @@ pub unsafe fn _bextr_u64(a: u64, start: u32, len: u32) -> u64 {
 /// Extracts bits of `a` specified by `control` into
 /// the least significant bits of the result.
 ///
-/// Bits `[7,0]` of `control` specify the index to the first bit in the range to
-/// be extracted, and bits `[15,8]` specify the length of the range.
+/// Bits `[7,0]` of `control` specify the index to the first bit in the range
+/// to be extracted, and bits `[15,8]` specify the length of the range.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr2_u64)
 #[inline]
diff --git a/coresimd/x86_64/fxsr.rs b/coresimd/x86_64/fxsr.rs
index d28a6d8d2e..846162e219 100644
--- a/coresimd/x86_64/fxsr.rs
+++ b/coresimd/x86_64/fxsr.rs
@@ -58,7 +58,7 @@ pub unsafe fn _fxrstor64(mem_addr: *const u8) {
 #[cfg(test)]
 mod tests {
     use coresimd::x86_64::*;
-    use std::{fmt, cmp::PartialEq};
+    use std::{cmp::PartialEq, fmt};
     use stdsimd_test::simd_test;
 
     #[repr(align(16))]
diff --git a/coresimd/x86_64/xsave.rs b/coresimd/x86_64/xsave.rs
index a5b630232f..3c0fda249b 100644
--- a/coresimd/x86_64/xsave.rs
+++ b/coresimd/x86_64/xsave.rs
@@ -36,11 +36,7 @@ extern "C" {
 #[cfg_attr(test, assert_instr(xsave64))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) {
-    xsave64(
-        mem_addr,
-        (save_mask >> 32) as u32,
-        save_mask as u32,
-    );
+    xsave64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
 }
 
 /// Perform a full or partial restore of the enabled processor states using
@@ -73,11 +69,7 @@ pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) {
 #[cfg_attr(test, assert_instr(xsaveopt64))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) {
-    xsaveopt64(
-        mem_addr,
-        (save_mask >> 32) as u32,
-        save_mask as u32,
-    );
+    xsaveopt64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
 }
 
 /// Perform a full or partial save of the enabled processor states to memory
@@ -93,11 +85,7 @@ pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) {
 #[cfg_attr(test, assert_instr(xsavec64))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) {
-    xsavec64(
-        mem_addr,
-        (save_mask >> 32) as u32,
-        save_mask as u32,
-    );
+    xsavec64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
 }
 
 /// Perform a full or partial save of the enabled processor states to memory at
@@ -114,11 +102,7 @@ pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) {
 #[cfg_attr(test, assert_instr(xsaves64))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) {
-    xsaves64(
-        mem_addr,
-        (save_mask >> 32) as u32,
-        save_mask as u32,
-    );
+    xsaves64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
 }
 
 /// Perform a full or partial restore of the enabled processor states using the
diff --git a/crates/assert-instr-macro/src/lib.rs b/crates/assert-instr-macro/src/lib.rs
index 3362bdc6f1..317e7cf01b 100644
--- a/crates/assert-instr-macro/src/lib.rs
+++ b/crates/assert-instr-macro/src/lib.rs
@@ -21,7 +21,7 @@ use proc_macro2::TokenStream;
 
 #[proc_macro_attribute]
 pub fn assert_instr(
-    attr: proc_macro::TokenStream, item: proc_macro::TokenStream
+    attr: proc_macro::TokenStream, item: proc_macro::TokenStream,
 ) -> proc_macro::TokenStream {
     let invoc = syn::parse::<Invoc>(attr)
         .expect("expected #[assert_instr(instr, a = b, ...)]");
@@ -36,9 +36,10 @@ pub fn assert_instr(
     let name = &func.ident;
 
     // Disable assert_instr for x86 targets compiled with avx enabled, which
-    // causes LLVM to generate different intrinsics that the ones we are testing
-    // for.
-    let disable_assert_instr = std::env::var("STDSIMD_DISABLE_ASSERT_INSTR").is_ok();
+    // causes LLVM to generate different intrinsics than the ones we are
+    // testing for.
+    let disable_assert_instr =
+        std::env::var("STDSIMD_DISABLE_ASSERT_INSTR").is_ok();
     let maybe_ignore = if cfg!(optimized) && !disable_assert_instr {
         TokenStream::new()
     } else {
@@ -72,11 +73,7 @@ pub fn assert_instr(
             syn::Pat::Ident(ref i) => &i.ident,
             _ => panic!("must have bare arguments"),
         };
-        match invoc
-            .args
-            .iter()
-            .find(|a| *ident == a.0)
-        {
+        match invoc.args.iter().find(|a| *ident == a.0) {
             Some(&(_, ref tts)) => {
                 input_vals.push(quote! { #tts });
             }
@@ -87,7 +84,8 @@ pub fn assert_instr(
         };
     }
 
-    let attrs = func.attrs
+    let attrs = func
+        .attrs
         .iter()
         .filter(|attr| {
             attr.path
@@ -142,9 +140,8 @@ pub fn assert_instr(
         }
     }.into();
     // why? necessary now to get tests to work?
-    let tts: TokenStream = tts.to_string()
-        .parse()
-        .expect("cannot parse tokenstream");
+    let tts: TokenStream =
+        tts.to_string().parse().expect("cannot parse tokenstream");
 
     let tts: TokenStream = quote! {
         #item
diff --git a/crates/coresimd/build.rs b/crates/coresimd/build.rs
index 7126538207..3dc31c52a7 100644
--- a/crates/coresimd/build.rs
+++ b/crates/coresimd/build.rs
@@ -1,8 +1,5 @@
 use std::env;
 
 fn main() {
-    println!(
-        "cargo:rustc-env=TARGET={}",
-        env::var("TARGET").unwrap()
-    );
+    println!("cargo:rustc-env=TARGET={}", env::var("TARGET").unwrap());
 }
diff --git a/crates/coresimd/src/lib.rs b/crates/coresimd/src/lib.rs
index a09495fe92..0751a3a17b 100644
--- a/crates/coresimd/src/lib.rs
+++ b/crates/coresimd/src/lib.rs
@@ -9,29 +9,35 @@
 #![cfg_attr(stdsimd_strict, deny(warnings))]
 #![allow(dead_code)]
 #![allow(unused_features)]
-#![feature(const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd,
-           simd_ffi, asm, proc_macro_gen,
-           integer_atomics, stmt_expr_attributes, core_intrinsics,
-           crate_in_paths, no_core, attr_literals, rustc_attrs, stdsimd,
-           staged_api, core_float, core_slice_ext, align_offset,
-           doc_cfg, mmx_target_feature, tbm_target_feature,
-           sse4a_target_feature, arm_target_feature, aarch64_target_feature,
-           mips_target_feature, powerpc_target_feature)]
-#![cfg_attr(test,
-            feature(proc_macro, test, attr_literals, abi_vectorcall,
-                    untagged_unions))]
-#![cfg_attr(feature = "cargo-clippy",
-            allow(inline_always, too_many_arguments, cast_sign_loss,
-                  cast_lossless, cast_possible_wrap,
-                  cast_possible_truncation, cast_precision_loss,
-                  shadow_reuse, cyclomatic_complexity, similar_names,
-                  many_single_char_names))]
+#![feature(
+    const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, simd_ffi,
+    asm, proc_macro_gen, integer_atomics, stmt_expr_attributes,
+    core_intrinsics, crate_in_paths, no_core, attr_literals, rustc_attrs,
+    stdsimd, staged_api, core_float, core_slice_ext, align_offset, doc_cfg,
+    mmx_target_feature, tbm_target_feature, sse4a_target_feature,
+    arm_target_feature, aarch64_target_feature, mips_target_feature,
+    powerpc_target_feature
+)]
+#![cfg_attr(
+    test,
+    feature(proc_macro, test, attr_literals, abi_vectorcall, untagged_unions)
+)]
+#![cfg_attr(
+    feature = "cargo-clippy",
+    allow(
+        inline_always, too_many_arguments, cast_sign_loss, cast_lossless,
+        cast_possible_wrap, cast_possible_truncation, cast_precision_loss,
+        shadow_reuse, cyclomatic_complexity, similar_names,
+        many_single_char_names
+    )
+)]
 #![cfg_attr(test, allow(unused_imports))]
 #![no_core]
 #![unstable(feature = "stdsimd", issue = "27731")]
-#![doc(test(attr(deny(warnings))),
-       test(attr(allow(dead_code, deprecated, unused_variables,
-                       unused_mut))))]
+#![doc(
+    test(attr(deny(warnings))),
+    test(attr(allow(dead_code, deprecated, unused_variables, unused_mut)))
+)]
 
 #[cfg_attr(not(test), macro_use)]
 extern crate core as _core;
diff --git a/crates/coresimd/tests/cpu-detection.rs b/crates/coresimd/tests/cpu-detection.rs
index 2a788102a4..46f8194c09 100644
--- a/crates/coresimd/tests/cpu-detection.rs
+++ b/crates/coresimd/tests/cpu-detection.rs
@@ -1,7 +1,9 @@
 #![feature(stdsimd)]
 #![cfg_attr(stdsimd_strict, deny(warnings))]
-#![cfg_attr(feature = "cargo-clippy",
-            allow(option_unwrap_used, print_stdout, use_debug))]
+#![cfg_attr(
+    feature = "cargo-clippy",
+    allow(option_unwrap_used, print_stdout, use_debug)
+)]
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 #[macro_use]
@@ -14,53 +16,20 @@ fn x86_all() {
     println!("sse2: {:?}", is_x86_feature_detected!("sse2"));
     println!("sse3: {:?}", is_x86_feature_detected!("sse3"));
     println!("ssse3: {:?}", is_x86_feature_detected!("ssse3"));
-    println!(
-        "sse4.1: {:?}",
-        is_x86_feature_detected!("sse4.1")
-    );
-    println!(
-        "sse4.2: {:?}",
-        is_x86_feature_detected!("sse4.2")
-    );
+    println!("sse4.1: {:?}", is_x86_feature_detected!("sse4.1"));
+    println!("sse4.2: {:?}", is_x86_feature_detected!("sse4.2"));
     println!("sse4a: {:?}", is_x86_feature_detected!("sse4a"));
     println!("avx: {:?}", is_x86_feature_detected!("avx"));
     println!("avx2: {:?}", is_x86_feature_detected!("avx2"));
-    println!(
-        "avx512f {:?}",
-        is_x86_feature_detected!("avx512f")
-    );
-    println!(
-        "avx512cd {:?}",
-        is_x86_feature_detected!("avx512cd")
-    );
-    println!(
-        "avx512er {:?}",
-        is_x86_feature_detected!("avx512er")
-    );
-    println!(
-        "avx512pf {:?}",
-        is_x86_feature_detected!("avx512pf")
-    );
-    println!(
-        "avx512bw {:?}",
-        is_x86_feature_detected!("avx512bw")
-    );
-    println!(
-        "avx512dq {:?}",
-        is_x86_feature_detected!("avx512dq")
-    );
-    println!(
-        "avx512vl {:?}",
-        is_x86_feature_detected!("avx512vl")
-    );
-    println!(
-        "avx512_ifma {:?}",
-        is_x86_feature_detected!("avx512ifma")
-    );
-    println!(
-        "avx512_vbmi {:?}",
-        is_x86_feature_detected!("avx512vbmi")
-    );
+    println!("avx512f {:?}", is_x86_feature_detected!("avx512f"));
+    println!("avx512cd {:?}", is_x86_feature_detected!("avx512cd"));
+    println!("avx512er {:?}", is_x86_feature_detected!("avx512er"));
+    println!("avx512pf {:?}", is_x86_feature_detected!("avx512pf"));
+    println!("avx512bw {:?}", is_x86_feature_detected!("avx512bw"));
+    println!("avx512dq {:?}", is_x86_feature_detected!("avx512dq"));
+    println!("avx512vl {:?}", is_x86_feature_detected!("avx512vl"));
+    println!("avx512_ifma {:?}", is_x86_feature_detected!("avx512ifma"));
+    println!("avx512_vbmi {:?}", is_x86_feature_detected!("avx512vbmi"));
     println!(
         "avx512_vpopcntdq {:?}",
         is_x86_feature_detected!("avx512vpopcntdq")
@@ -70,23 +39,11 @@ fn x86_all() {
     println!("bmi: {:?}", is_x86_feature_detected!("bmi1"));
     println!("bmi2: {:?}", is_x86_feature_detected!("bmi2"));
     println!("tbm: {:?}", is_x86_feature_detected!("tbm"));
-    println!(
-        "popcnt: {:?}",
-        is_x86_feature_detected!("popcnt")
-    );
+    println!("popcnt: {:?}", is_x86_feature_detected!("popcnt"));
     println!("lzcnt: {:?}", is_x86_feature_detected!("lzcnt"));
     println!("fxsr: {:?}", is_x86_feature_detected!("fxsr"));
     println!("xsave: {:?}", is_x86_feature_detected!("xsave"));
-    println!(
-        "xsaveopt: {:?}",
-        is_x86_feature_detected!("xsaveopt")
-    );
-    println!(
-        "xsaves: {:?}",
-        is_x86_feature_detected!("xsaves")
-    );
-    println!(
-        "xsavec: {:?}",
-        is_x86_feature_detected!("xsavec")
-    );
+    println!("xsaveopt: {:?}", is_x86_feature_detected!("xsaveopt"));
+    println!("xsaves: {:?}", is_x86_feature_detected!("xsaves"));
+    println!("xsavec: {:?}", is_x86_feature_detected!("xsavec"));
 }
diff --git a/crates/coresimd/tests/reductions.rs b/crates/coresimd/tests/reductions.rs
index 46908cf503..123410b879 100644
--- a/crates/coresimd/tests/reductions.rs
+++ b/crates/coresimd/tests/reductions.rs
@@ -2,7 +2,7 @@
 #![feature(arm_target_feature)]
 #![feature(aarch64_target_feature)]
 #![feature(powerpc_target_feature)]
-#![allow(unused_attributes)]
+#![allow(unused_attributes, dead_code, unused_imports, unused_macros)]
 
 #[macro_use]
 extern crate stdsimd;
@@ -253,11 +253,7 @@ macro_rules! product_nan_test {
                     }
                 }
                 let v = $id::splat(n0);
-                assert!(
-                    v.product().is_nan(),
-                    "all nans | {:?}",
-                    v
-                );
+                assert!(v.product().is_nan(), "all nans | {:?}", v);
             }
             unsafe { test_fn() };
         }
@@ -355,8 +351,7 @@ mod offset {
                     // tolerate 1 ULP difference:
                     if vsum.as_int() > tsum.as_int() {
                         assert!(
-                            vsum.as_int() - tsum.as_int()
-                                < 2,
+                            vsum.as_int() - tsum.as_int() < 2,
                             "v: {:?} | vsum: {} | tsum: {}",
                             v,
                             vsum,
@@ -364,8 +359,7 @@ mod offset {
                         );
                     } else {
                         assert!(
-                            tsum.as_int() - vsum.as_int()
-                                < 2,
+                            tsum.as_int() - vsum.as_int() < 2,
                             "v: {:?} | vsum: {} | tsum: {}",
                             v,
                             vsum,
diff --git a/crates/simd-test-macro/src/lib.rs b/crates/simd-test-macro/src/lib.rs
index 76da2eb222..e695b7290d 100644
--- a/crates/simd-test-macro/src/lib.rs
+++ b/crates/simd-test-macro/src/lib.rs
@@ -12,7 +12,7 @@ extern crate quote;
 
 use std::env;
 
-use proc_macro2::{Literal, Span, Ident, TokenStream, TokenTree};
+use proc_macro2::{Ident, Literal, Span, TokenStream, TokenTree};
 
 fn string(s: &str) -> TokenTree {
     Literal::string(s).into()
@@ -20,11 +20,9 @@ fn string(s: &str) -> TokenTree {
 
 #[proc_macro_attribute]
 pub fn simd_test(
-    attr: proc_macro::TokenStream, item: proc_macro::TokenStream
+    attr: proc_macro::TokenStream, item: proc_macro::TokenStream,
 ) -> proc_macro::TokenStream {
-    let tokens = TokenStream::from(attr)
-        .into_iter()
-        .collect::<Vec<_>>();
+    let tokens = TokenStream::from(attr).into_iter().collect::<Vec<_>>();
     if tokens.len() != 3 {
         panic!("expected #[simd_test(enable = \"feature\")]");
     }
@@ -53,18 +51,19 @@ pub fn simd_test(
     let item = TokenStream::from(item);
     let name = find_name(item.clone());
 
-    let name: TokenStream = name.to_string().parse().expect(&format!(
-        "failed to parse name: {}",
-        name.to_string()
-    ));
+    let name: TokenStream = name
+        .to_string()
+        .parse()
+        .expect(&format!("failed to parse name: {}", name.to_string()));
 
     let target = env::var("TARGET")
         .expect("TARGET environment variable should be set for rustc");
     let mut force_test = false;
-    let macro_test = match target.split('-').next().expect(&format!(
-        "target triple contained no \"-\": {}",
-        target
-    )) {
+    let macro_test = match target
+        .split('-')
+        .next()
+        .expect(&format!("target triple contained no \"-\": {}", target))
+    {
         "i686" | "x86_64" | "i586" => "is_x86_feature_detected",
         "arm" | "armv7" => "is_arm_feature_detected",
         "aarch64" => "is_aarch64_feature_detected",
diff --git a/crates/stdsimd-test/src/lib.rs b/crates/stdsimd-test/src/lib.rs
index f256b83533..91b949b024 100644
--- a/crates/stdsimd-test/src/lib.rs
+++ b/crates/stdsimd-test/src/lib.rs
@@ -5,8 +5,10 @@
 //! assertions about the disassembly of a function.
 
 #![feature(proc_macro)]
-#![cfg_attr(feature = "cargo-clippy",
-            allow(missing_docs_in_private_items, print_stdout))]
+#![cfg_attr(
+    feature = "cargo-clippy",
+    allow(missing_docs_in_private_items, print_stdout)
+)]
 
 extern crate assert_instr_macro;
 extern crate backtrace;
@@ -25,7 +27,8 @@ pub use assert_instr_macro::*;
 pub use simd_test_macro::*;
 
 lazy_static! {
-    static ref DISASSEMBLY: HashMap<String, Vec<Function>> = disassemble_myself();
+    static ref DISASSEMBLY: HashMap<String, Vec<Function>> =
+        disassemble_myself();
 }
 
 struct Function {
@@ -39,14 +42,16 @@ struct Instruction {
 fn disassemble_myself() -> HashMap<String, Vec<Function>> {
     let me = env::current_exe().expect("failed to get current exe");
 
-    if cfg!(target_arch = "x86_64") && cfg!(target_os = "windows")
+    if cfg!(target_arch = "x86_64")
+        && cfg!(target_os = "windows")
         && cfg!(target_env = "msvc")
     {
         let mut cmd = cc::windows_registry::find(
             "x86_64-pc-windows-msvc",
             "dumpbin.exe",
         ).expect("failed to find `dumpbin` tool");
-        let output = cmd.arg("/DISASM")
+        let output = cmd
+            .arg("/DISASM")
             .arg(&me)
             .output()
             .expect("failed to execute dumpbin");
@@ -76,11 +81,14 @@ fn disassemble_myself() -> HashMap<String, Vec<Function>> {
     } else {
         let objdump =
             env::var("OBJDUMP").unwrap_or_else(|_| "objdump".to_string());
-        let output = Command::new(objdump)
+        let output = Command::new(objdump.clone())
             .arg("--disassemble")
             .arg(&me)
             .output()
-            .expect("failed to execute objdump");
+            .expect(&format!(
+                "failed to execute objdump. OBJDUMP={}",
+                objdump
+            ));
         println!(
             "{}\n{}",
             output.status,
@@ -257,9 +265,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
     // in the disassembly.
     let mut sym = None;
     backtrace::resolve(fnptr as *mut _, |name| {
-        sym = name.name()
-            .and_then(|s| s.as_str())
-            .map(normalize);
+        sym = name.name().and_then(|s| s.as_str()).map(normalize);
     });
 
     let functions =
@@ -270,26 +276,17 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
                 println!("assumed symbol name: `{}`", sym);
             }
             println!("maybe related functions");
-            for f in DISASSEMBLY
-                .keys()
-                .filter(|k| k.contains(fnname))
-            {
+            for f in DISASSEMBLY.keys().filter(|k| k.contains(fnname)) {
                 println!("\t- {}", f);
             }
-            panic!(
-                "failed to find disassembly of {:#x} ({})",
-                fnptr, fnname
-            );
+            panic!("failed to find disassembly of {:#x} ({})", fnptr, fnname);
         };
 
     assert_eq!(functions.len(), 1);
     let function = &functions[0];
 
     let mut instrs = &function.instrs[..];
-    while instrs
-        .last()
-        .map_or(false, |s| s.parts == ["nop"])
-    {
+    while instrs.last().map_or(false, |s| s.parts == ["nop"]) {
         instrs = &instrs[..instrs.len() - 1];
     }
 
@@ -400,10 +397,7 @@ pub fn assert_skip_test_ok(name: &str) {
     if env::var("STDSIMD_TEST_EVERYTHING").is_err() {
         return;
     }
-    panic!(
-        "skipped test `{}` when it shouldn't be skipped",
-        name
-    );
+    panic!("skipped test `{}` when it shouldn't be skipped", name);
 }
 
 // See comment in `assert-instr-macro` crate for why this exists
diff --git a/crates/stdsimd-verify/tests/x86-intel.rs b/crates/stdsimd-verify/tests/x86-intel.rs
index a570e3cd91..0df5a07f06 100644
--- a/crates/stdsimd-verify/tests/x86-intel.rs
+++ b/crates/stdsimd-verify/tests/x86-intel.rs
@@ -1,9 +1,12 @@
 #![feature(proc_macro)]
 #![allow(bad_style)]
-#![cfg_attr(feature = "cargo-clippy",
-            allow(shadow_reuse, cast_lossless, match_same_arms,
-                  nonminimal_bool, print_stdout, use_debug, eq_op,
-                  useless_format))]
+#![cfg_attr(
+    feature = "cargo-clippy",
+    allow(
+        shadow_reuse, cast_lossless, match_same_arms, nonminimal_bool,
+        print_stdout, use_debug, eq_op, useless_format
+    )
+)]
 
 #[macro_use]
 extern crate serde_derive;
@@ -249,10 +252,9 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
             .flat_map(|c| c.to_lowercase())
             .collect::<String>();
 
-        let rust_feature = rust.target_feature.expect(&format!(
-            "no target feature listed for {}",
-            rust.name
-        ));
+        let rust_feature = rust
+            .target_feature
+            .expect(&format!("no target feature listed for {}", rust.name));
         if rust_feature.contains(&cpuid) {
             continue;
         }
@@ -314,25 +316,20 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
         if rust.arguments.len() != intel.parameters.len() {
             bail!("wrong number of arguments on {}", rust.name)
         }
-        for (i, (a, b)) in intel
-            .parameters
-            .iter()
-            .zip(rust.arguments)
-            .enumerate()
+        for (i, (a, b)) in
+            intel.parameters.iter().zip(rust.arguments).enumerate()
         {
             let is_const = rust.required_const.contains(&i);
             equate(b, &a.type_, &intel.name, is_const)?;
         }
     }
 
-    let any_i64 = rust.arguments
-        .iter()
-        .cloned()
-        .chain(rust.ret)
-        .any(|arg| match *arg {
+    let any_i64 = rust.arguments.iter().cloned().chain(rust.ret).any(|arg| {
+        match *arg {
             Type::PrimSigned(64) | Type::PrimUnsigned(64) => true,
             _ => false,
-        });
+        }
+    });
     let any_i64_exempt = match rust.name {
         // These intrinsics have all been manually verified against Clang's
         // headers to be available on x86, and the u64 arguments seem
@@ -363,7 +360,7 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
 }
 
 fn equate(
-    t: &Type, intel: &str, intrinsic: &str, is_const: bool
+    t: &Type, intel: &str, intrinsic: &str, is_const: bool,
 ) -> Result<(), String> {
     let intel = intel.replace(" *", "*");
     let intel = intel.replace(" const*", "*");
@@ -371,9 +368,7 @@ fn equate(
         if is_const {
             return Ok(());
         }
-        Err(format!(
-            "argument required to be const but isn't"
-        ))
+        Err(format!("argument required to be const but isn't"))
     };
     match (t, &intel[..]) {
         (&Type::PrimFloat(32), "float") => {}
diff --git a/crates/stdsimd/src/lib.rs b/crates/stdsimd/src/lib.rs
index abee4fcfd4..4986e839da 100644
--- a/crates/stdsimd/src/lib.rs
+++ b/crates/stdsimd/src/lib.rs
@@ -1,11 +1,7 @@
 //! SIMD and vendor intrinsics support library.
 //!
 //! This crate defines the vendor intrinsics and types primarily used for SIMD
-//! in Rust. The crate here will soon be available in the standard library, but
-//! for now you can also browse the documentation here, primarily in the `arch`
-//! submodule.
-//!
-//! [stdsimd]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/
+//! in Rust.
 
 #![feature(const_fn, integer_atomics, staged_api, stdsimd)]
 #![feature(doc_cfg, allow_internal_unstable)]
diff --git a/crates/stdsimd/tests/cpu-detection.rs b/crates/stdsimd/tests/cpu-detection.rs
index 775b9774f1..962a3fa314 100644
--- a/crates/stdsimd/tests/cpu-detection.rs
+++ b/crates/stdsimd/tests/cpu-detection.rs
@@ -1,155 +1,106 @@
 #![feature(stdsimd)]
 #![cfg_attr(stdsimd_strict, deny(warnings))]
-#![cfg_attr(feature = "cargo-clippy",
-            allow(option_unwrap_used, use_debug, print_stdout))]
+#![cfg_attr(
+    feature = "cargo-clippy",
+    allow(option_unwrap_used, use_debug, print_stdout)
+)]
 
-#[cfg(any(target_arch = "arm", target_arch = "aarch64",
-          target_arch = "x86", target_arch = "x86_64",
-          target_arch = "powerpc", target_arch = "powerpc64"))]
+#[cfg(
+    any(
+        target_arch = "arm",
+        target_arch = "aarch64",
+        target_arch = "x86",
+        target_arch = "x86_64",
+        target_arch = "powerpc",
+        target_arch = "powerpc64"
+    )
+)]
 #[macro_use]
 extern crate stdsimd;
 
 #[test]
-#[cfg(all(target_arch = "arm", target_os = "linux"))]
+#[cfg(all(target_arch = "arm",
+          any(target_os = "linux", target_os = "android")))]
 fn arm_linux() {
     println!("neon: {}", is_arm_feature_detected!("neon"));
     println!("pmull: {}", is_arm_feature_detected!("pmull"));
 }
 
 #[test]
-#[cfg(all(target_arch = "aarch64", target_os = "linux"))]
+#[cfg(all(target_arch = "aarch64",
+          any(target_os = "linux", target_os = "android")))]
 fn aarch64_linux() {
     println!("fp: {}", is_aarch64_feature_detected!("fp"));
     println!("fp16: {}", is_aarch64_feature_detected!("fp16"));
     println!("neon: {}", is_aarch64_feature_detected!("neon"));
-    println!(
-        "asimd: {}",
-        is_aarch64_feature_detected!("asimd")
-    );
+    println!("asimd: {}", is_aarch64_feature_detected!("asimd"));
     println!("sve: {}", is_aarch64_feature_detected!("sve"));
     println!("crc: {}", is_aarch64_feature_detected!("crc"));
-    println!(
-        "crypto: {}",
-        is_aarch64_feature_detected!("crypto")
-    );
+    println!("crypto: {}", is_aarch64_feature_detected!("crypto"));
     println!("lse: {}", is_aarch64_feature_detected!("lse"));
     println!("rdm: {}", is_aarch64_feature_detected!("rdm"));
     println!("rcpc: {}", is_aarch64_feature_detected!("rcpc"));
-    println!(
-        "dotprod: {}",
-        is_aarch64_feature_detected!("dotprod")
-    );
+    println!("dotprod: {}", is_aarch64_feature_detected!("dotprod"));
 }
 
 #[test]
 #[cfg(all(target_arch = "powerpc", target_os = "linux"))]
 fn powerpc_linux() {
-    println!(
-        "altivec: {}",
-        is_powerpc_feature_detected!("altivec")
-    );
+    println!("altivec: {}", is_powerpc_feature_detected!("altivec"));
     println!("vsx: {}", is_powerpc_feature_detected!("vsx"));
-    println!(
-        "power8: {}",
-        is_powerpc_feature_detected!("power8")
-    );
+    println!("power8: {}", is_powerpc_feature_detected!("power8"));
 }
 
 #[test]
 #[cfg(all(target_arch = "powerpc64", target_os = "linux"))]
 fn powerpc64_linux() {
-    println!(
-        "altivec: {}",
-        is_powerpc64_feature_detected!("altivec")
-    );
+    println!("altivec: {}", is_powerpc64_feature_detected!("altivec"));
     println!("vsx: {}", is_powerpc64_feature_detected!("vsx"));
-    println!(
-        "power8: {}",
-        is_powerpc64_feature_detected!("power8")
-    );
+    println!("power8: {}", is_powerpc64_feature_detected!("power8"));
 }
 
 #[test]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 fn x86_all() {
+    println!("aes: {:?}", is_x86_feature_detected!("aes"));
+    println!("pclmulqdq: {:?}", is_x86_feature_detected!("pclmulqdq"));
+    println!("rdrand: {:?}", is_x86_feature_detected!("rdrand"));
+    println!("rdseed: {:?}", is_x86_feature_detected!("rdseed"));
+    println!("tsc: {:?}", is_x86_feature_detected!("tsc"));
+    println!("mmx: {:?}", is_x86_feature_detected!("mmx"));
     println!("sse: {:?}", is_x86_feature_detected!("sse"));
     println!("sse2: {:?}", is_x86_feature_detected!("sse2"));
     println!("sse3: {:?}", is_x86_feature_detected!("sse3"));
     println!("ssse3: {:?}", is_x86_feature_detected!("ssse3"));
-    println!(
-        "sse4.1: {:?}",
-        is_x86_feature_detected!("sse4.1")
-    );
-    println!(
-        "sse4.2: {:?}",
-        is_x86_feature_detected!("sse4.2")
-    );
+    println!("sse4.1: {:?}", is_x86_feature_detected!("sse4.1"));
+    println!("sse4.2: {:?}", is_x86_feature_detected!("sse4.2"));
     println!("sse4a: {:?}", is_x86_feature_detected!("sse4a"));
     println!("sha: {:?}", is_x86_feature_detected!("sha"));
     println!("avx: {:?}", is_x86_feature_detected!("avx"));
     println!("avx2: {:?}", is_x86_feature_detected!("avx2"));
-    println!(
-        "avx512f {:?}",
-        is_x86_feature_detected!("avx512f")
-    );
-    println!(
-        "avx512cd {:?}",
-        is_x86_feature_detected!("avx512cd")
-    );
-    println!(
-        "avx512er {:?}",
-        is_x86_feature_detected!("avx512er")
-    );
-    println!(
-        "avx512pf {:?}",
-        is_x86_feature_detected!("avx512pf")
-    );
-    println!(
-        "avx512bw {:?}",
-        is_x86_feature_detected!("avx512bw")
-    );
-    println!(
-        "avx512dq {:?}",
-        is_x86_feature_detected!("avx512dq")
-    );
-    println!(
-        "avx512vl {:?}",
-        is_x86_feature_detected!("avx512vl")
-    );
-    println!(
-        "avx512_ifma {:?}",
-        is_x86_feature_detected!("avx512ifma")
-    );
-    println!(
-        "avx512_vbmi {:?}",
-        is_x86_feature_detected!("avx512vbmi")
-    );
+    println!("avx512f {:?}", is_x86_feature_detected!("avx512f"));
+    println!("avx512cd {:?}", is_x86_feature_detected!("avx512cd"));
+    println!("avx512er {:?}", is_x86_feature_detected!("avx512er"));
+    println!("avx512pf {:?}", is_x86_feature_detected!("avx512pf"));
+    println!("avx512bw {:?}", is_x86_feature_detected!("avx512bw"));
+    println!("avx512dq {:?}", is_x86_feature_detected!("avx512dq"));
+    println!("avx512vl {:?}", is_x86_feature_detected!("avx512vl"));
+    println!("avx512_ifma {:?}", is_x86_feature_detected!("avx512ifma"));
+    println!("avx512_vbmi {:?}", is_x86_feature_detected!("avx512vbmi"));
     println!(
         "avx512_vpopcntdq {:?}",
         is_x86_feature_detected!("avx512vpopcntdq")
     );
     println!("fma: {:?}", is_x86_feature_detected!("fma"));
-    println!("abm: {:?}", is_x86_feature_detected!("abm"));
-    println!("bmi: {:?}", is_x86_feature_detected!("bmi1"));
+    println!("bmi1: {:?}", is_x86_feature_detected!("bmi1"));
     println!("bmi2: {:?}", is_x86_feature_detected!("bmi2"));
-    println!("tbm: {:?}", is_x86_feature_detected!("tbm"));
-    println!(
-        "popcnt: {:?}",
-        is_x86_feature_detected!("popcnt")
-    );
+    println!("abm: {:?}", is_x86_feature_detected!("abm"));
     println!("lzcnt: {:?}", is_x86_feature_detected!("lzcnt"));
+    println!("tbm: {:?}", is_x86_feature_detected!("tbm"));
+    println!("popcnt: {:?}", is_x86_feature_detected!("popcnt"));
     println!("fxsr: {:?}", is_x86_feature_detected!("fxsr"));
     println!("xsave: {:?}", is_x86_feature_detected!("xsave"));
-    println!(
-        "xsaveopt: {:?}",
-        is_x86_feature_detected!("xsaveopt")
-    );
-    println!(
-        "xsaves: {:?}",
-        is_x86_feature_detected!("xsaves")
-    );
-    println!(
-        "xsavec: {:?}",
-        is_x86_feature_detected!("xsavec")
-    );
+    println!("xsaveopt: {:?}", is_x86_feature_detected!("xsaveopt"));
+    println!("xsaves: {:?}", is_x86_feature_detected!("xsaves"));
+    println!("xsavec: {:?}", is_x86_feature_detected!("xsavec"));
 }
diff --git a/examples/hex.rs b/examples/hex.rs
index 5b045c6126..878f17125d 100644
--- a/examples/hex.rs
+++ b/examples/hex.rs
@@ -14,10 +14,13 @@
 
 #![feature(stdsimd)]
 #![cfg_attr(test, feature(test))]
-#![cfg_attr(feature = "cargo-clippy",
-            allow(result_unwrap_used, print_stdout, option_unwrap_used,
-                  shadow_reuse, cast_possible_wrap, cast_sign_loss,
-                  missing_docs_in_private_items))]
+#![cfg_attr(
+    feature = "cargo-clippy",
+    allow(
+        result_unwrap_used, print_stdout, option_unwrap_used, shadow_reuse,
+        cast_possible_wrap, cast_sign_loss, missing_docs_in_private_items
+    )
+)]
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 #[macro_use]
@@ -68,7 +71,7 @@ fn hex_encode<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> {
 #[target_feature(enable = "avx2")]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 unsafe fn hex_encode_avx2<'a>(
-    mut src: &[u8], dst: &'a mut [u8]
+    mut src: &[u8], dst: &'a mut [u8],
 ) -> Result<&'a str, usize> {
     let ascii_zero = _mm256_set1_epi8(b'0' as i8);
     let nines = _mm256_set1_epi8(9);
@@ -115,16 +118,14 @@ unsafe fn hex_encode_avx2<'a>(
     let i = i as usize;
     let _ = hex_encode_sse41(src, &mut dst[i * 2..]);
 
-    Ok(str::from_utf8_unchecked(
-        &dst[..src.len() * 2 + i * 2],
-    ))
+    Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2]))
 }
 
 // copied from https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp
 #[target_feature(enable = "sse4.1")]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 unsafe fn hex_encode_sse41<'a>(
-    mut src: &[u8], dst: &'a mut [u8]
+    mut src: &[u8], dst: &'a mut [u8],
 ) -> Result<&'a str, usize> {
     let ascii_zero = _mm_set1_epi8(b'0' as i8);
     let nines = _mm_set1_epi8(9);
@@ -157,10 +158,7 @@ unsafe fn hex_encode_sse41<'a>(
         let res2 = _mm_unpackhi_epi8(masked2, masked1);
 
         _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
-        _mm_storeu_si128(
-            dst.as_mut_ptr().offset(i * 2 + 16) as *mut _,
-            res2,
-        );
+        _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2 + 16) as *mut _, res2);
         src = &src[16..];
         i += 16;
     }
@@ -168,13 +166,11 @@ unsafe fn hex_encode_sse41<'a>(
     let i = i as usize;
     let _ = hex_encode_fallback(src, &mut dst[i * 2..]);
 
-    Ok(str::from_utf8_unchecked(
-        &dst[..src.len() * 2 + i * 2],
-    ))
+    Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2]))
 }
 
 fn hex_encode_fallback<'a>(
-    src: &[u8], dst: &'a mut [u8]
+    src: &[u8], dst: &'a mut [u8],
 ) -> Result<&'a str, usize> {
     fn hex(byte: u8) -> u8 {
         static TABLE: &[u8] = b"0123456789abcdef";
@@ -199,10 +195,7 @@ mod tests {
     fn test(input: &[u8], output: &str) {
         let tmp = || vec![0; input.len() * 2];
 
-        assert_eq!(
-            hex_encode_fallback(input, &mut tmp()).unwrap(),
-            output
-        );
+        assert_eq!(hex_encode_fallback(input, &mut tmp()).unwrap(), output);
         assert_eq!(hex_encode(input, &mut tmp()).unwrap(), output);
 
         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
@@ -239,9 +232,7 @@ mod tests {
     fn odd() {
         test(
             &[0; 313],
-            &iter::repeat('0')
-                .take(313 * 2)
-                .collect::<String>(),
+            &iter::repeat('0').take(313 * 2).collect::<String>(),
         );
     }
 
diff --git a/examples/nbody.rs b/examples/nbody.rs
index 8f12ec4365..63281e78e8 100644
--- a/examples/nbody.rs
+++ b/examples/nbody.rs
@@ -5,9 +5,13 @@
 
 #![cfg_attr(stdsimd_strict, deny(warnings))]
 #![feature(stdsimd)]
-#![cfg_attr(feature = "cargo-clippy",
-            allow(similar_names, missing_docs_in_private_items,
-                  shadow_reuse, print_stdout))]
+#![cfg_attr(
+    feature = "cargo-clippy",
+    allow(
+        similar_names, missing_docs_in_private_items, shadow_reuse,
+        print_stdout
+    )
+)]
 
 extern crate stdsimd;
 #[macro_use]
@@ -15,8 +19,6 @@ extern crate cfg_if;
 
 use stdsimd::simd::*;
 
-
-
 const PI: f64 = std::f64::consts::PI;
 const SOLAR_MASS: f64 = 4.0 * PI * PI;
 const DAYS_PER_YEAR: f64 = 365.24;
@@ -81,7 +83,7 @@ struct Body {
 
 impl Body {
     fn new(
-        x0: f64, x1: f64, x2: f64, v0: f64, v1: f64, v2: f64, mass: f64
+        x0: f64, x1: f64, x2: f64, v0: f64, v1: f64, v2: f64, mass: f64,
     ) -> Self {
         Self {
             x: [x0, x1, x2],
diff --git a/stdsimd/arch/detect/bit.rs b/stdsimd/arch/detect/bit.rs
index c4ec43bb85..578f0b16b7 100644
--- a/stdsimd/arch/detect/bit.rs
+++ b/stdsimd/arch/detect/bit.rs
@@ -2,7 +2,8 @@
 
 /// Tests the `bit` of `x`.
 #[allow(dead_code)]
-pub fn test(x: usize, bit: u32) -> bool {
+#[inline]
+pub(crate) fn test(x: usize, bit: u32) -> bool {
     debug_assert!(bit < 32, "bit index out-of-bounds");
     x & (1 << bit) != 0
 }
diff --git a/stdsimd/arch/detect/cache.rs b/stdsimd/arch/detect/cache.rs
index 3d6b11d786..ead91ad86a 100644
--- a/stdsimd/arch/detect/cache.rs
+++ b/stdsimd/arch/detect/cache.rs
@@ -12,12 +12,14 @@ use core::sync::atomic::AtomicU64;
 use core::sync::atomic::AtomicU32;
 
 /// Sets the `bit` of `x`.
-pub const fn set_bit(x: u64, bit: u32) -> u64 {
+#[inline]
+const fn set_bit(x: u64, bit: u32) -> u64 {
     x | 1 << bit
 }
 
 /// Tests the `bit` of `x`.
-pub const fn test_bit(x: u64, bit: u32) -> bool {
+#[inline]
+const fn test_bit(x: u64, bit: u32) -> bool {
     x & (1 << bit) != 0
 }
 
@@ -26,7 +28,7 @@ const CACHE_CAPACITY: u32 = 63;
 
 /// This type is used to initialize the cache
 #[derive(Copy, Clone)]
-pub struct Initializer(u64);
+pub(crate) struct Initializer(u64);
 
 impl Default for Initializer {
     fn default() -> Self {
@@ -37,7 +39,8 @@ impl Default for Initializer {
 impl Initializer {
     /// Tests the `bit` of the cache.
     #[allow(dead_code)]
-    pub fn test(&self, bit: u32) -> bool {
+    #[inline]
+    pub(crate) fn test(&self, bit: u32) -> bool {
         // FIXME: this way of making sure that the cache is large enough is
         // brittle.
         debug_assert!(
@@ -48,7 +51,8 @@ impl Initializer {
     }
 
     /// Sets the `bit` of the cache.
-    pub fn set(&mut self, bit: u32) {
+    #[inline]
+    pub(crate) fn set(&mut self, bit: u32) {
         // FIXME: this way of making sure that the cache is large enough is
         // brittle.
         debug_assert!(
@@ -77,17 +81,20 @@ impl Cache {
         Cache(AtomicU64::new(u64::max_value()))
     }
     /// Is the cache uninitialized?
-    pub fn is_uninitialized(&self) -> bool {
+    #[inline]
+    pub(crate) fn is_uninitialized(&self) -> bool {
         self.0.load(Ordering::Relaxed) == u64::max_value()
     }
 
     /// Is the `bit` in the cache set?
-    pub fn test(&self, bit: u32) -> bool {
+    #[inline]
+    pub(crate) fn test(&self, bit: u32) -> bool {
         test_bit(CACHE.0.load(Ordering::Relaxed), bit)
     }
 
     /// Initializes the cache.
-    pub fn initialize(&self, value: Initializer) {
+    #[inline]
+    pub(crate) fn initialize(&self, value: Initializer) {
         self.0.store(value.0, Ordering::Relaxed);
     }
 }
@@ -109,12 +116,14 @@ impl Cache {
         )
     }
     /// Is the cache uninitialized?
-    pub fn is_uninitialized(&self) -> bool {
+    #[inline]
+    pub(crate) fn is_uninitialized(&self) -> bool {
         self.1.load(Ordering::Relaxed) == u32::max_value()
     }
 
     /// Is the `bit` in the cache set?
-    pub fn test(&self, bit: u32) -> bool {
+    #[inline]
+    pub(crate) fn test(&self, bit: u32) -> bool {
         if bit < 32 {
             test_bit(CACHE.0.load(Ordering::Relaxed) as u64, bit)
         } else {
@@ -123,7 +132,8 @@ impl Cache {
     }
 
     /// Initializes the cache.
-    pub fn initialize(&self, value: Initializer) {
+    #[inline]
+    pub(crate) fn initialize(&self, value: Initializer) {
         let lo: u32 = value.0 as u32;
         let hi: u32 = (value.0 >> 32) as u32;
         self.0.store(lo, Ordering::Relaxed);
@@ -139,9 +149,8 @@ impl Cache {
 ///
 /// It uses the `Feature` variant to index into this variable as a bitset. If
 /// the bit is set, the feature is enabled, and otherwise it is disabled.
-///
-/// PLEASE: do not use this, it is an implementation detail subject to change.
-pub fn test<F>(bit: u32, f: F) -> bool
+#[inline]
+pub(crate) fn test<F>(bit: u32, f: F) -> bool
 where
     F: FnOnce() -> Initializer,
 {
diff --git a/stdsimd/arch/detect/error_macros.rs b/stdsimd/arch/detect/error_macros.rs
index 0bba7b7cfe..743f7ea952 100644
--- a/stdsimd/arch/detect/error_macros.rs
+++ b/stdsimd/arch/detect/error_macros.rs
@@ -25,7 +25,7 @@ macro_rules! is_x86_feature_detected {
 #[macro_export]
 #[unstable(feature = "stdsimd", issue = "27731")]
 macro_rules! is_arm_feature_detected {
-    ($t: tt) => {
+    ($t:tt) => {
         compile_error!(
             r#"
         is_arm_feature_detected can only be used on ARM targets.
@@ -64,7 +64,8 @@ macro_rules! is_aarch64_feature_detected {
 #[unstable(feature = "stdsimd", issue = "27731")]
 macro_rules! is_powerpc_feature_detected {
     ($t:tt) => {
-        compile_error!(r#"
+        compile_error!(
+            r#"
 is_powerpc_feature_detected can only be used on PowerPC targets.
 You can prevent it from being used in other architectures by
 guarding it behind a cfg(target_arch) as follows:
@@ -72,7 +73,8 @@ guarding it behind a cfg(target_arch) as follows:
     #[cfg(target_arch = "powerpc")] {
         if is_powerpc_feature_detected(...) { ... }
     }
-"#)
+"#
+        )
     };
 }
 
@@ -81,7 +83,8 @@ guarding it behind a cfg(target_arch) as follows:
 #[unstable(feature = "stdsimd", issue = "27731")]
 macro_rules! is_powerpc64_feature_detected {
     ($t:tt) => {
-        compile_error!(r#"
+        compile_error!(
+            r#"
 is_powerpc64_feature_detected can only be used on PowerPC64 targets.
 You can prevent it from being used in other architectures by
 guarding it behind a cfg(target_arch) as follows:
@@ -89,7 +92,8 @@ guarding it behind a cfg(target_arch) as follows:
     #[cfg(target_arch = "powerpc64")] {
         if is_powerpc64_feature_detected(...) { ... }
     }
-"#)
+"#
+        )
     };
 }
 
@@ -97,7 +101,7 @@ guarding it behind a cfg(target_arch) as follows:
 #[macro_export]
 #[unstable(feature = "stdsimd", issue = "27731")]
 macro_rules! is_mips_feature_detected {
-    ($t: tt) => {
+    ($t:tt) => {
         compile_error!(
             r#"
         is_mips_feature_detected can only be used on MIPS targets.
@@ -116,7 +120,7 @@ macro_rules! is_mips_feature_detected {
 #[macro_export]
 #[unstable(feature = "stdsimd", issue = "27731")]
 macro_rules! is_mips64_feature_detected {
-    ($t: tt) => {
+    ($t:tt) => {
         compile_error!(
             r#"
         is_mips64_feature_detected can only be used on MIPS64 targets.
diff --git a/stdsimd/arch/detect/mod.rs b/stdsimd/arch/detect/mod.rs
index 7ed3971711..57cab4f120 100644
--- a/stdsimd/arch/detect/mod.rs
+++ b/stdsimd/arch/detect/mod.rs
@@ -60,8 +60,8 @@ cfg_if! {
 }
 pub use self::arch::Feature;
 
-mod cache;
 mod bit;
+mod cache;
 
 cfg_if! {
     if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
diff --git a/stdsimd/arch/detect/os/linux/aarch64.rs b/stdsimd/arch/detect/os/linux/aarch64.rs
index e46ace5a30..ebddf2d75d 100644
--- a/stdsimd/arch/detect/os/linux/aarch64.rs
+++ b/stdsimd/arch/detect/os/linux/aarch64.rs
@@ -7,6 +7,7 @@ use super::auxvec;
 use super::cpuinfo;
 
 /// Performs run-time feature detection.
+#[inline]
 pub fn check_for(x: Feature) -> bool {
     cache::test(x as u32, detect_features)
 }
@@ -125,7 +126,7 @@ impl AtHwcap {
     ///
     /// The features are enabled approximately like in LLVM host feature detection:
     /// https://github.com/llvm-mirror/llvm/blob/master/lib/Support/Host.cpp#L1273
-    pub fn cache(self) -> cache::Initializer {
+    fn cache(self) -> cache::Initializer {
         let mut value = cache::Initializer::default();
         {
             let mut enable_feature = |f, enable| {
diff --git a/stdsimd/arch/detect/os/linux/arm.rs b/stdsimd/arch/detect/os/linux/arm.rs
index 7b964962cb..9d265e02db 100644
--- a/stdsimd/arch/detect/os/linux/arm.rs
+++ b/stdsimd/arch/detect/os/linux/arm.rs
@@ -7,6 +7,7 @@ use super::auxvec;
 use super::cpuinfo;
 
 /// Performs run-time feature detection.
+#[inline]
 pub fn check_for(x: Feature) -> bool {
     cache::test(x as u32, detect_features)
 }
diff --git a/stdsimd/arch/detect/os/linux/auxvec.rs b/stdsimd/arch/detect/os/linux/auxvec.rs
index b387b70da6..20fbb5f588 100644
--- a/stdsimd/arch/detect/os/linux/auxvec.rs
+++ b/stdsimd/arch/detect/os/linux/auxvec.rs
@@ -7,17 +7,17 @@ use fs::File;
 use io::Read;
 
 /// Key to access the CPU Hardware capabilities bitfield.
-pub const AT_HWCAP: usize = 16;
+pub(crate) const AT_HWCAP: usize = 16;
 /// Key to access the CPU Hardware capabilities 2 bitfield.
 #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))]
-pub const AT_HWCAP2: usize = 26;
+pub(crate) const AT_HWCAP2: usize = 26;
 
 /// Cache HWCAP bitfields of the ELF Auxiliary Vector.
 ///
 /// If an entry cannot be read all the bits in the bitfield are set to zero.
 /// This should be interpreted as all the features being disabled.
 #[derive(Debug, Copy, Clone)]
-pub struct AuxVec {
+pub(crate) struct AuxVec {
     pub hwcap: usize,
     #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))]
     pub hwcap2: usize,
@@ -48,7 +48,7 @@ pub struct AuxVec {
 ///
 /// [auxvec_h]: https://github.com/torvalds/linux/blob/master/include/uapi/linux/auxvec.h
 /// [auxv_docs]: https://docs.rs/auxv/0.3.3/auxv/
-pub fn auxv() -> Result<AuxVec, ()> {
+pub(crate) fn auxv() -> Result<AuxVec, ()> {
     // Try to call a dynamically-linked getauxval function.
     if let Ok(hwcap) = getauxval(AT_HWCAP) {
         // Targets with only AT_HWCAP:
diff --git a/stdsimd/arch/detect/os/linux/cpuinfo.rs b/stdsimd/arch/detect/os/linux/cpuinfo.rs
index eeb152ccce..fd7b5876f3 100644
--- a/stdsimd/arch/detect/os/linux/cpuinfo.rs
+++ b/stdsimd/arch/detect/os/linux/cpuinfo.rs
@@ -6,20 +6,20 @@ use fs::File;
 use io::{self, Read};
 
 /// cpuinfo
-pub struct CpuInfo {
+pub(crate) struct CpuInfo {
     raw: String,
 }
 
 impl CpuInfo {
     /// Reads /proc/cpuinfo into CpuInfo.
-    pub fn new() -> Result<Self, io::Error> {
+    pub(crate) fn new() -> Result<Self, io::Error> {
         let mut file = File::open("/proc/cpuinfo")?;
         let mut cpui = Self { raw: String::new() };
         file.read_to_string(&mut cpui.raw)?;
         Ok(cpui)
     }
     /// Returns the value of the cpuinfo `field`.
-    pub fn field(&self, field: &str) -> CpuInfoField {
+    pub(crate) fn field(&self, field: &str) -> CpuInfoField {
         for l in self.raw.lines() {
             if l.trim().starts_with(field) {
                 return CpuInfoField::new(l.split(": ").nth(1));
@@ -44,7 +44,7 @@ impl CpuInfo {
 
 /// Field of cpuinfo
 #[derive(Debug)]
-pub struct CpuInfoField<'a>(Option<&'a str>);
+pub(crate) struct CpuInfoField<'a>(Option<&'a str>);
 
 impl<'a> PartialEq<&'a str> for CpuInfoField<'a> {
     fn eq(&self, other: &&'a str) -> bool {
@@ -56,7 +56,7 @@ impl<'a> PartialEq<&'a str> for CpuInfoField<'a> {
 }
 
 impl<'a> CpuInfoField<'a> {
-    pub fn new<'b>(v: Option<&'b str>) -> CpuInfoField<'b> {
+    pub(crate) fn new<'b>(v: Option<&'b str>) -> CpuInfoField<'b> {
         match v {
             None => CpuInfoField::<'b>(None),
             Some(f) => CpuInfoField::<'b>(Some(f.trim())),
@@ -64,11 +64,11 @@ impl<'a> CpuInfoField<'a> {
     }
     /// Does the field exist?
     #[cfg(test)]
-    pub fn exists(&self) -> bool {
+    pub(crate) fn exists(&self) -> bool {
         self.0.is_some()
     }
     /// Does the field contain `other`?
-    pub fn has(&self, other: &str) -> bool {
+    pub(crate) fn has(&self, other: &str) -> bool {
         match self.0 {
             None => other.is_empty(),
             Some(f) => {
diff --git a/stdsimd/arch/detect/os/linux/mips.rs b/stdsimd/arch/detect/os/linux/mips.rs
index 2c3ee03d22..92e95f057e 100644
--- a/stdsimd/arch/detect/os/linux/mips.rs
+++ b/stdsimd/arch/detect/os/linux/mips.rs
@@ -6,6 +6,7 @@ use arch::detect::bit;
 use super::auxvec;
 
 /// Performs run-time feature detection.
+#[inline]
 pub fn check_for(x: Feature) -> bool {
     cache::test(x as u32, detect_features)
 }
diff --git a/stdsimd/arch/detect/os/linux/powerpc.rs b/stdsimd/arch/detect/os/linux/powerpc.rs
index 8289c09816..41f58508f2 100644
--- a/stdsimd/arch/detect/os/linux/powerpc.rs
+++ b/stdsimd/arch/detect/os/linux/powerpc.rs
@@ -6,6 +6,7 @@ use super::auxvec;
 use super::cpuinfo;
 
 /// Performs run-time feature detection.
+#[inline]
 pub fn check_for(x: Feature) -> bool {
     cache::test(x as u32, detect_features)
 }
diff --git a/stdsimd/arch/detect/os/other.rs b/stdsimd/arch/detect/os/other.rs
index 2b1b378acf..562cbfe8f1 100644
--- a/stdsimd/arch/detect/os/other.rs
+++ b/stdsimd/arch/detect/os/other.rs
@@ -3,6 +3,7 @@
 use arch::detect::Feature;
 
 /// Performs run-time feature detection.
+#[inline]
 pub fn check_for(_x: Feature) -> bool {
     false
 }
diff --git a/stdsimd/arch/detect/os/x86.rs b/stdsimd/arch/detect/os/x86.rs
index b7647cd637..3ac009a2d3 100644
--- a/stdsimd/arch/detect/os/x86.rs
+++ b/stdsimd/arch/detect/os/x86.rs
@@ -12,6 +12,7 @@ use arch::detect::cache;
 use arch::detect::bit;
 
 /// Performs run-time feature detection.
+#[inline]
 pub fn check_for(x: Feature) -> bool {
     cache::test(x as u32, detect_features)
 }
@@ -32,7 +33,7 @@ pub fn check_for(x: Feature) -> bool {
 /// [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
 /// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
 #[cfg_attr(feature = "cargo-clippy", allow(similar_names))]
-pub fn detect_features() -> cache::Initializer {
+fn detect_features() -> cache::Initializer {
     let mut value = cache::Initializer::default();
 
     // If the x86 CPU does not support the CPUID instruction then it is too
@@ -115,7 +116,6 @@ pub fn detect_features() -> cache::Initializer {
 
         enable(proc_info_ecx, 0, Feature::sse3);
         enable(proc_info_ecx, 9, Feature::ssse3);
-        enable(proc_info_ecx, 12, Feature::fma);
         enable(proc_info_ecx, 19, Feature::sse4_1);
         enable(proc_info_ecx, 20, Feature::sse4_2);
         enable(proc_info_ecx, 23, Feature::popcnt);
@@ -149,64 +149,77 @@ pub fn detect_features() -> cache::Initializer {
             // [mozilla_sse_cpp]: https://hg.mozilla.org/mozilla-central/file/64bab5cbb9b6/mozglue/build/SSE.cpp#l190
             let cpu_osxsave = bit::test(proc_info_ecx as usize, 27);
 
-            // 2. The OS must have signaled the CPU that it supports saving and
-            // restoring the SSE and AVX registers by setting `XCR0.SSE[1]` and
-            // `XCR0.AVX[2]` to `1`.
-            //
-            // This is safe because the CPU supports `xsave`
-            let xcr0 = unsafe { _xgetbv(0) };
-            let os_avx_support = xcr0 & 6 == 6;
-            let os_avx512_support = xcr0 & 224 == 224;
-
-            // Only if the OS and the CPU support saving/restoring the AVX
-            // registers we enable `xsave` support:
-            if cpu_osxsave && os_avx_support {
-                // See "13.3 ENABLING THE XSAVE FEATURE SET AND XSAVE-ENABLED
-                // FEATURES" in the "Intel® 64 and IA-32 Architectures Software
-                // Developer’s Manual, Volume 1: Basic Architecture":
+            if cpu_osxsave {
+                // 2. The OS must have signaled the CPU that it supports saving and
+                // restoring the following register state:
+                //
+                // * SSE -> `XCR0.SSE[1]`
+                // * AVX -> `XCR0.AVX[2]`
+                // * AVX-512 -> `XCR0.AVX-512[7:5]`
                 //
-                // "Software enables the XSAVE feature set by setting
-                // CR4.OSXSAVE[bit 18] to 1 (e.g., with the MOV to CR4
-                // instruction). If this bit is 0, execution of any of XGETBV,
-                // XRSTOR, XRSTORS, XSAVE, XSAVEC, XSAVEOPT, XSAVES, and XSETBV
-                // causes an invalid-opcode exception (#UD)"
+                // by setting the corresponding bits of `XCR0` to `1`.
                 //
-                enable(proc_info_ecx, 26, Feature::xsave);
+                // This is safe because the CPU supports `xsave`
+                // and the OS has set `osxsave`.
+                let xcr0 = unsafe { _xgetbv(0) };
+                // Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`:
+                let os_avx_support = xcr0 & 6 == 6;
+                // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 224`:
+                let os_avx512_support = xcr0 & 224 == 224;
 
-                // For `xsaveopt`, `xsavec`, and `xsaves` we need to query:
-                // Processor Extended State Enumeration Sub-leaf (EAX = 0DH,
-                // ECX = 1):
-                if max_basic_leaf >= 0xd {
-                    let CpuidResult {
-                        eax: proc_extended_state1_eax,
-                        ..
-                    } = unsafe { __cpuid_count(0xd_u32, 1) };
-                    enable(proc_extended_state1_eax, 0, Feature::xsaveopt);
-                    enable(proc_extended_state1_eax, 1, Feature::xsavec);
-                    enable(proc_extended_state1_eax, 3, Feature::xsaves);
-                }
+                // We enable `xsave` support only if both the OS and the CPU
+                // support saving/restoring the AVX registers:
+                if os_avx_support {
+                    // See "13.3 ENABLING THE XSAVE FEATURE SET AND XSAVE-ENABLED
+                    // FEATURES" in the "Intel® 64 and IA-32 Architectures Software
+                    // Developer’s Manual, Volume 1: Basic Architecture":
+                    //
+                    // "Software enables the XSAVE feature set by setting
+                    // CR4.OSXSAVE[bit 18] to 1 (e.g., with the MOV to CR4
+                    // instruction). If this bit is 0, execution of any of XGETBV,
+                    // XRSTOR, XRSTORS, XSAVE, XSAVEC, XSAVEOPT, XSAVES, and XSETBV
+                    // causes an invalid-opcode exception (#UD)"
+                    //
+                    enable(proc_info_ecx, 26, Feature::xsave);
+
+                    // For `xsaveopt`, `xsavec`, and `xsaves` we need to query:
+                    // Processor Extended State Enumeration Sub-leaf (EAX = 0DH,
+                    // ECX = 1):
+                    if max_basic_leaf >= 0xd {
+                        let CpuidResult {
+                            eax: proc_extended_state1_eax,
+                            ..
+                        } = unsafe { __cpuid_count(0xd_u32, 1) };
+                        enable(proc_extended_state1_eax, 0, Feature::xsaveopt);
+                        enable(proc_extended_state1_eax, 1, Feature::xsavec);
+                        enable(proc_extended_state1_eax, 3, Feature::xsaves);
+                    }
+
+                    // FMA (uses 256-bit wide registers):
+                    enable(proc_info_ecx, 12, Feature::fma);
 
-                // And AVX/AVX2:
-                enable(proc_info_ecx, 28, Feature::avx);
-                enable(extended_features_ebx, 5, Feature::avx2);
+                    // And AVX/AVX2:
+                    enable(proc_info_ecx, 28, Feature::avx);
+                    enable(extended_features_ebx, 5, Feature::avx2);
 
-                // For AVX-512 the OS also needs to support saving/restoring
-                // the extended state, only then we enable AVX-512 support:
-                if os_avx512_support {
-                    enable(extended_features_ebx, 16, Feature::avx512f);
-                    enable(extended_features_ebx, 17, Feature::avx512dq);
-                    enable(extended_features_ebx, 21, Feature::avx512_ifma);
-                    enable(extended_features_ebx, 26, Feature::avx512pf);
-                    enable(extended_features_ebx, 27, Feature::avx512er);
-                    enable(extended_features_ebx, 28, Feature::avx512cd);
-                    enable(extended_features_ebx, 30, Feature::avx512bw);
-                    enable(extended_features_ebx, 31, Feature::avx512vl);
-                    enable(extended_features_ecx, 1, Feature::avx512_vbmi);
-                    enable(
-                        extended_features_ecx,
-                        14,
-                        Feature::avx512_vpopcntdq,
-                    );
+                    // AVX-512 additionally requires the OS to save/restore the
+                    // extended state; only then do we enable AVX-512 support:
+                    if os_avx512_support {
+                        enable(extended_features_ebx, 16, Feature::avx512f);
+                        enable(extended_features_ebx, 17, Feature::avx512dq);
+                        enable(extended_features_ebx, 21, Feature::avx512_ifma);
+                        enable(extended_features_ebx, 26, Feature::avx512pf);
+                        enable(extended_features_ebx, 27, Feature::avx512er);
+                        enable(extended_features_ebx, 28, Feature::avx512cd);
+                        enable(extended_features_ebx, 30, Feature::avx512bw);
+                        enable(extended_features_ebx, 31, Feature::avx512vl);
+                        enable(extended_features_ecx, 1, Feature::avx512_vbmi);
+                        enable(
+                            extended_features_ecx,
+                            14,
+                            Feature::avx512_vpopcntdq,
+                        );
+                    }
                 }
             }
         }
diff --git a/stdsimd/mod.rs b/stdsimd/mod.rs
index 9eca5b075a..b76deb520e 100644
--- a/stdsimd/mod.rs
+++ b/stdsimd/mod.rs
@@ -188,14 +188,14 @@
 /// * [`powerpc`]
 /// * [`powerpc64`]
 ///
-/// [`x86`]: https://rust-lang-nursery.github.io/stdsimd/i686/stdsimd/arch/x86/index.html
-/// [`x86_64`]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/arch/x86_64/index.html
-/// [`arm`]: https://rust-lang-nursery.github.io/stdsimd/arm/stdsimd/arch/arm/index.html
-/// [`aarch64`]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/aarch64/index.html
-/// [`mips`]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/mips/index.html
-/// [`mips64`]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/mips64/index.html
-/// [`powerpc`]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/powerpc/index.html
-/// [`powerpc64`]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/powerpc64/index.html
+/// [`x86`]: x86/index.html
+/// [`x86_64`]: x86_64/index.html
+/// [`arm`]: arm/index.html
+/// [`aarch64`]: aarch64/index.html
+/// [`mips`]: mips/index.html
+/// [`mips64`]: mips64/index.html
+/// [`powerpc`]: powerpc/index.html
+/// [`powerpc64`]: powerpc64/index.html
 ///
 /// # Examples
 ///