diff --git a/.travis.yml b/.travis.yml index 1c6ea4cb9d..29b02cc710 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,16 +9,20 @@ matrix: - env: TARGET=i686-unknown-linux-gnu - env: TARGET=x86_64-unknown-linux-gnu NO_ADD=1 - env: TARGET=x86_64-unknown-linux-gnu-emulated NO_ADD=1 STDSIMD_TEST_EVERYTHING=1 + - env: TARGET=x86_64-linux-android - env: TARGET=arm-unknown-linux-gnueabihf + - env: TARGET=arm-linux-androideabi - env: TARGET=armv7-unknown-linux-gnueabihf - env: TARGET=aarch64-unknown-linux-gnu - env: TARGET=mips-unknown-linux-gnu NORUN=1 - env: TARGET=mipsel-unknown-linux-gnu NORUN=1 - env: TARGET=mips64-unknown-linux-gnuabi64 NORUN=1 - env: TARGET=mips64el-unknown-linux-gnuabi64 NORUN=1 + - env: TARGET=aarch64-linux-android - env: TARGET=powerpc-unknown-linux-gnu - env: TARGET=powerpc64-unknown-linux-gnu - env: TARGET=powerpc64le-unknown-linux-gnu + - env: TARGET=s390x-unknown-linux-gnu NORUN=1 - os: osx env: TARGET=i686-apple-darwin script: ci/run.sh diff --git a/README.md b/README.md index d8e5b22c62..e56d4ded7a 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,30 @@ -stdsimd +stdsimd - Rust's standard library SIMD components ======= [![Travis-CI Status]][travis] [![Appveyor Status]][appveyor] [![Latest Version]][crates.io] [![docs]][docs.rs] -> Experimental support for SIMD destined to eventually become part of Rust's -> standard library +# Usage -This is a **work in progress**. +`stdsimd` is now shipped with Rust's `std` library - it is part of `libcore` +and `libstd`. + +The easiest way to use it is just to import it via `use std::arch`. + +The `std::arch` component for `x86` is available in stable Rust. The `std::arch` +components for other architectures and the `std::simd` component require nightly +Rust. + +Using `stdsimd` master branch is not recommended. It requires nightly Rust, it +only works with particular Rust nightly versions, and it can (and does) break +often. 
If you need to use `stdsimd` master branch, you can add it to your +`Cargo.toml` as follows: + +```toml +[dependencies] +stdsimd = { git = "https://github.com/rust-lang-nursery/stdsimd.git" } +``` + +# Documentation * [Documentation - i686][i686] * [Documentation - x86\_64][x86_64] diff --git a/ci/android-install-ndk.sh b/ci/android-install-ndk.sh new file mode 100644 index 0000000000..873f6c52c8 --- /dev/null +++ b/ci/android-install-ndk.sh @@ -0,0 +1,37 @@ +#!/bin/sh +# Copyright 2016 The Rust Project Developers. See the COPYRIGHT +# file at the top-level directory of this distribution and at +# http://rust-lang.org/COPYRIGHT. +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +set -ex + +curl -O https://dl.google.com/android/repository/android-ndk-r15b-linux-x86_64.zip +unzip -q android-ndk-r15b-linux-x86_64.zip + +case "$1" in + aarch64) + arch=arm64 + ;; + + i686) + arch=x86 + ;; + + *) + arch=$1 + ;; +esac; + +android-ndk-r15b/build/tools/make_standalone_toolchain.py \ + --unified-headers \ + --install-dir /android/ndk-$1 \ + --arch $arch \ + --api 24 + +rm -rf ./android-ndk-r15b-linux-x86_64.zip ./android-ndk-r15b diff --git a/ci/android-install-sdk.sh b/ci/android-install-sdk.sh new file mode 100644 index 0000000000..ab7e14d95b --- /dev/null +++ b/ci/android-install-sdk.sh @@ -0,0 +1,60 @@ +#!/bin/sh +# Copyright 2016 The Rust Project Developers. See the COPYRIGHT +# file at the top-level directory of this distribution and at +# http://rust-lang.org/COPYRIGHT. +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +set -ex + +# Prep the SDK and emulator +# +# Note that the update process requires that we accept a bunch of licenses, and +# we can't just pipe `yes` into it for some reason, so we take the same strategy +# located in https://github.com/appunite/docker by just wrapping it in a script +# which apparently magically accepts the licenses. + +mkdir sdk +curl https://dl.google.com/android/repository/sdk-tools-linux-3859397.zip -O +unzip -d sdk sdk-tools-linux-3859397.zip + +case "$1" in + arm | armv7) + abi=armeabi-v7a + ;; + + aarch64) + abi=arm64-v8a + ;; + + i686) + abi=x86 + ;; + + x86_64) + abi=x86_64 + ;; + + *) + echo "invalid arch: $1" + exit 1 + ;; +esac; + +# --no_https avoids +# javax.net.ssl.SSLHandshakeException: sun.security.validator.ValidatorException: No trusted certificate found +echo "yes" | \ + ./sdk/tools/bin/sdkmanager --no_https \ + "emulator" \ + "platform-tools" \ + "platforms;android-24" \ + "system-images;android-24;default;$abi" + +echo "no" | + ./sdk/tools/bin/avdmanager create avd \ + --name $1 \ + --package "system-images;android-24;default;$abi" diff --git a/ci/android-sysimage.sh b/ci/android-sysimage.sh new file mode 100644 index 0000000000..9611dfeb0d --- /dev/null +++ b/ci/android-sysimage.sh @@ -0,0 +1,52 @@ +# Copyright 2017 The Rust Project Developers. See the COPYRIGHT +# file at the top-level directory of this distribution and at +# http://rust-lang.org/COPYRIGHT. +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +set -ex + +URL=https://dl.google.com/android/repository/sys-img/android + +main() { + local arch=$1 + local name=$2 + local dest=/system + local td=$(mktemp -d) + + apt-get install --no-install-recommends e2tools + + pushd $td + curl -O $URL/$name + unzip -q $name + + local system=$(find . 
-name system.img) + mkdir -p $dest/{bin,lib,lib64} + + # Extract android linker and libraries to /system + # This allows android executables to be run directly (or with qemu) + if [ $arch = "x86_64" -o $arch = "arm64" ]; then + e2cp -p $system:/bin/linker64 $dest/bin/ + e2cp -p $system:/lib64/libdl.so $dest/lib64/ + e2cp -p $system:/lib64/libc.so $dest/lib64/ + e2cp -p $system:/lib64/libm.so $dest/lib64/ + else + e2cp -p $system:/bin/linker $dest/bin/ + e2cp -p $system:/lib/libdl.so $dest/lib/ + e2cp -p $system:/lib/libc.so $dest/lib/ + e2cp -p $system:/lib/libm.so $dest/lib/ + fi + + # clean up + apt-get purge --auto-remove -y e2tools + + popd + + rm -rf $td +} + +main "${@}" diff --git a/ci/docker/aarch64-linux-android/Dockerfile b/ci/docker/aarch64-linux-android/Dockerfile new file mode 100644 index 0000000000..27bde89c5a --- /dev/null +++ b/ci/docker/aarch64-linux-android/Dockerfile @@ -0,0 +1,47 @@ +FROM ubuntu:16.04 + +RUN dpkg --add-architecture i386 && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + file \ + make \ + curl \ + ca-certificates \ + python \ + unzip \ + expect \ + openjdk-9-jre \ + libstdc++6:i386 \ + libpulse0 \ + gcc \ + libc6-dev + +WORKDIR /android/ +COPY android* /android/ + +ENV ANDROID_ARCH=aarch64 +ENV PATH=$PATH:/android/ndk-$ANDROID_ARCH/bin:/android/sdk/tools:/android/sdk/platform-tools + +RUN sh /android/android-install-ndk.sh $ANDROID_ARCH +RUN sh /android/android-install-sdk.sh $ANDROID_ARCH +RUN mv /root/.android /tmp +RUN chmod 777 -R /tmp/.android +RUN chmod 755 /android/sdk/tools/* /android/sdk/emulator/qemu/linux-x86_64/* + +ENV PATH=$PATH:/rust/bin \ + CARGO_TARGET_AARCH64_LINUX_ANDROID_LINKER=aarch64-linux-android-gcc \ + CARGO_TARGET_AARCH64_LINUX_ANDROID_RUNNER=/tmp/runtest \ + OBJDUMP=aarch64-linux-android-objdump \ + HOME=/tmp + +ADD runtest-android.rs /tmp/runtest.rs +ENTRYPOINT [ \ + "bash", \ + "-c", \ + # set SHELL so android can detect a 64bits system, see + # 
http://stackoverflow.com/a/41789144 + "SHELL=/bin/dash /android/sdk/emulator/emulator @aarch64 -no-window & \ + rustc /tmp/runtest.rs -o /tmp/runtest && \ + exec \"$@\"", \ + "--" \ +] diff --git a/ci/docker/arm-linux-androideabi/Dockerfile b/ci/docker/arm-linux-androideabi/Dockerfile new file mode 100644 index 0000000000..995a9e30e6 --- /dev/null +++ b/ci/docker/arm-linux-androideabi/Dockerfile @@ -0,0 +1,47 @@ +FROM ubuntu:16.04 + +RUN dpkg --add-architecture i386 && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + file \ + make \ + curl \ + ca-certificates \ + python \ + unzip \ + expect \ + openjdk-9-jre \ + libstdc++6:i386 \ + libpulse0 \ + gcc \ + libc6-dev + +WORKDIR /android/ +COPY android* /android/ + +ENV ANDROID_ARCH=arm +ENV PATH=$PATH:/android/ndk-$ANDROID_ARCH/bin:/android/sdk/tools:/android/sdk/platform-tools + +RUN sh /android/android-install-ndk.sh $ANDROID_ARCH +RUN sh /android/android-install-sdk.sh $ANDROID_ARCH +RUN mv /root/.android /tmp +RUN chmod 777 -R /tmp/.android +RUN chmod 755 /android/sdk/tools/* /android/sdk/emulator/qemu/linux-x86_64/* + +ENV PATH=$PATH:/rust/bin \ + CARGO_TARGET_ARM_LINUX_ANDROIDEABI_LINKER=arm-linux-androideabi-gcc \ + CARGO_TARGET_ARM_LINUX_ANDROIDEABI_RUNNER=/tmp/runtest \ + OBJDUMP=arm-linux-androideabi-objdump \ + HOME=/tmp + +ADD runtest-android.rs /tmp/runtest.rs +ENTRYPOINT [ \ + "bash", \ + "-c", \ + # set SHELL so android can detect a 64bits system, see + # http://stackoverflow.com/a/41789144 + "SHELL=/bin/dash /android/sdk/emulator/emulator @arm -no-window & \ + rustc /tmp/runtest.rs -o /tmp/runtest && \ + exec \"$@\"", \ + "--" \ +] diff --git a/ci/docker/s390x-unknown-linux-gnu/Dockerfile b/ci/docker/s390x-unknown-linux-gnu/Dockerfile new file mode 100644 index 0000000000..89d9d87a15 --- /dev/null +++ b/ci/docker/s390x-unknown-linux-gnu/Dockerfile @@ -0,0 +1,13 @@ +FROM ubuntu:17.10 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl ca-certificates \ + 
gcc libc6-dev \ + gcc-s390x-linux-gnu libc6-dev-s390x-cross \ + qemu-user \ + make \ + file + +ENV CARGO_TARGET_S390X_UNKNOWN_LINUX_GNU_LINKER=s390x-linux-gnu-gcc \ + CARGO_TARGET_S390X_UNKNOWN_LINUX_GNU_RUNNER="qemu-s390x -L /usr/s390x-linux-gnu" \ + OBJDUMP=s390x-linux-gnu-objdump \ No newline at end of file diff --git a/ci/docker/x86_64-linux-android/Dockerfile b/ci/docker/x86_64-linux-android/Dockerfile new file mode 100644 index 0000000000..d52dd45b12 --- /dev/null +++ b/ci/docker/x86_64-linux-android/Dockerfile @@ -0,0 +1,29 @@ +FROM ubuntu:16.04 + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + gcc \ + libc-dev \ + python \ + unzip \ + file \ + make + +WORKDIR /android/ +ENV ANDROID_ARCH=x86_64 +COPY android-install-ndk.sh /android/ +RUN sh /android/android-install-ndk.sh $ANDROID_ARCH + +# We do not run x86_64-linux-android tests on an android emulator. +# See ci/android-sysimage.sh for information about how tests are run. +COPY android-sysimage.sh /android/ +RUN bash /android/android-sysimage.sh x86_64 x86_64-24_r07.zip + +ENV PATH=$PATH:/rust/bin:/android/ndk-$ANDROID_ARCH/bin \ + CARGO_TARGET_X86_64_LINUX_ANDROID_LINKER=x86_64-linux-android-gcc \ + CC_x86_64_linux_android=x86_64-linux-android-gcc \ + CXX_x86_64_linux_android=x86_64-linux-android-g++ \ + OBJDUMP=x86_64-linux-android-objdump \ + HOME=/tmp diff --git a/ci/run-docker.sh b/ci/run-docker.sh index e07e1b0dc1..0c560c825c 100755 --- a/ci/run-docker.sh +++ b/ci/run-docker.sh @@ -5,7 +5,7 @@ set -ex run() { echo "Building docker container for TARGET=${1}" - docker build -t stdsimd ci/docker/$1 + docker build -t stdsimd -f ci/docker/$1/Dockerfile ci/ mkdir -p target target=$(echo $1 | sed 's/-emulated//') echo "Running docker" @@ -18,6 +18,7 @@ run() { --volume `rustc --print sysroot`:/rust:ro \ --env TARGET=$target \ --env STDSIMD_TEST_EVERYTHING \ + --env STDSIMD_ASSERT_INSTR_IGNORE \ --volume `pwd`:/checkout:ro \ --volume 
`pwd`/target:/checkout/target \ --workdir /checkout \ diff --git a/ci/run.sh b/ci/run.sh index 708d6ba341..ae4522987d 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -38,6 +38,9 @@ case ${TARGET} in i686-* | i586-*) export RUSTFLAGS="${RUSTFLAGS} -C relocation-model=static" ;; + *android*) + export STDSIMD_DISABLE_ASSERT_INSTR=1 + ;; *) ;; esac @@ -46,6 +49,7 @@ echo "RUSTFLAGS=${RUSTFLAGS}" echo "FEATURES=${FEATURES}" echo "OBJDUMP=${OBJDUMP}" echo "STDSIMD_DISABLE_ASSERT_INSTR=${STDSIMD_DISABLE_ASSERT_INSTR}" +echo "STDSIMD_TEST_EVERYTHING=${STDSIMD_TEST_EVERYTHING}" cargo_test() { cmd="cargo test --target=$TARGET $1" diff --git a/ci/runtest-android.rs b/ci/runtest-android.rs new file mode 100644 index 0000000000..d8968f99f4 --- /dev/null +++ b/ci/runtest-android.rs @@ -0,0 +1,41 @@ +use std::env; +use std::process::Command; +use std::path::{Path, PathBuf}; + +fn main() { + assert_eq!(env::args_os().len(), 2); + let test = PathBuf::from(env::args_os().nth(1).unwrap()); + let dst = Path::new("/data/local/tmp").join(test.file_name().unwrap()); + + let status = Command::new("adb") + .arg("wait-for-device") + .status() + .expect("failed to run: adb wait-for-device"); + assert!(status.success()); + + let status = Command::new("adb") + .arg("push") + .arg(&test) + .arg(&dst) + .status() + .expect("failed to run: adb pushr"); + assert!(status.success()); + + let output = Command::new("adb") + .arg("shell") + .arg(&dst) + .output() + .expect("failed to run: adb shell"); + assert!(status.success()); + + println!("status: {}\nstdout ---\n{}\nstderr ---\n{}", + output.status, + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr)); + + let stdout = String::from_utf8_lossy(&output.stdout); + let mut lines = stdout.lines().filter(|l| l.starts_with("test result")); + if !lines.all(|l| l.contains("test result: ok") && l.contains("0 failed")) { + panic!("failed to find successful test run"); + } +} diff --git a/coresimd/aarch64/crypto.rs 
b/coresimd/aarch64/crypto.rs index a71c0b460d..75f247585c 100644 --- a/coresimd/aarch64/crypto.rs +++ b/coresimd/aarch64/crypto.rs @@ -16,36 +16,36 @@ extern "C" { fn vsha1h_u32_(hash_e: u32) -> u32; #[link_name = "llvm.aarch64.crypto.sha1su0"] fn vsha1su0q_u32_( - w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t + w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t, ) -> uint32x4_t; #[link_name = "llvm.aarch64.crypto.sha1su1"] fn vsha1su1q_u32_(tw0_3: uint32x4_t, w12_15: uint32x4_t) -> uint32x4_t; #[link_name = "llvm.aarch64.crypto.sha1c"] fn vsha1cq_u32_( - hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t + hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t, ) -> uint32x4_t; #[link_name = "llvm.aarch64.crypto.sha1p"] fn vsha1pq_u32_( - hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t + hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t, ) -> uint32x4_t; #[link_name = "llvm.aarch64.crypto.sha1m"] fn vsha1mq_u32_( - hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t + hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t, ) -> uint32x4_t; #[link_name = "llvm.aarch64.crypto.sha256h"] fn vsha256hq_u32_( - hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t + hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t, ) -> uint32x4_t; #[link_name = "llvm.aarch64.crypto.sha256h2"] fn vsha256h2q_u32_( - hash_efgh: uint32x4_t, hash_abcd: uint32x4_t, wk: uint32x4_t + hash_efgh: uint32x4_t, hash_abcd: uint32x4_t, wk: uint32x4_t, ) -> uint32x4_t; #[link_name = "llvm.aarch64.crypto.sha256su0"] fn vsha256su0q_u32_(w0_3: uint32x4_t, w4_7: uint32x4_t) -> uint32x4_t; #[link_name = "llvm.aarch64.crypto.sha256su1"] fn vsha256su1q_u32_( - tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t + tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t, ) -> uint32x4_t; } @@ -97,7 +97,7 @@ pub unsafe fn vsha1h_u32(hash_e: u32) -> u32 { #[target_feature(enable = "crypto")] #[cfg_attr(test, assert_instr(sha1c))] pub unsafe fn vsha1cq_u32( - hash_abcd: uint32x4_t, 
hash_e: u32, wk: uint32x4_t + hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t, ) -> uint32x4_t { vsha1cq_u32_(hash_abcd, hash_e, wk) } @@ -107,7 +107,7 @@ pub unsafe fn vsha1cq_u32( #[target_feature(enable = "crypto")] #[cfg_attr(test, assert_instr(sha1m))] pub unsafe fn vsha1mq_u32( - hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t + hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t, ) -> uint32x4_t { vsha1mq_u32_(hash_abcd, hash_e, wk) } @@ -117,7 +117,7 @@ pub unsafe fn vsha1mq_u32( #[target_feature(enable = "crypto")] #[cfg_attr(test, assert_instr(sha1p))] pub unsafe fn vsha1pq_u32( - hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t + hash_abcd: uint32x4_t, hash_e: u32, wk: uint32x4_t, ) -> uint32x4_t { vsha1pq_u32_(hash_abcd, hash_e, wk) } @@ -127,7 +127,7 @@ pub unsafe fn vsha1pq_u32( #[target_feature(enable = "crypto")] #[cfg_attr(test, assert_instr(sha1su0))] pub unsafe fn vsha1su0q_u32( - w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t + w0_3: uint32x4_t, w4_7: uint32x4_t, w8_11: uint32x4_t, ) -> uint32x4_t { vsha1su0q_u32_(w0_3, w4_7, w8_11) } @@ -137,7 +137,7 @@ pub unsafe fn vsha1su0q_u32( #[target_feature(enable = "crypto")] #[cfg_attr(test, assert_instr(sha1su1))] pub unsafe fn vsha1su1q_u32( - tw0_3: uint32x4_t, w12_15: uint32x4_t + tw0_3: uint32x4_t, w12_15: uint32x4_t, ) -> uint32x4_t { vsha1su1q_u32_(tw0_3, w12_15) } @@ -147,7 +147,7 @@ pub unsafe fn vsha1su1q_u32( #[target_feature(enable = "crypto")] #[cfg_attr(test, assert_instr(sha256h))] pub unsafe fn vsha256hq_u32( - hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t + hash_abcd: uint32x4_t, hash_efgh: uint32x4_t, wk: uint32x4_t, ) -> uint32x4_t { vsha256hq_u32_(hash_abcd, hash_efgh, wk) } @@ -157,7 +157,7 @@ pub unsafe fn vsha256hq_u32( #[target_feature(enable = "crypto")] #[cfg_attr(test, assert_instr(sha256h2))] pub unsafe fn vsha256h2q_u32( - hash_efgh: uint32x4_t, hash_abcd: uint32x4_t, wk: uint32x4_t + hash_efgh: uint32x4_t, hash_abcd: uint32x4_t, wk: 
uint32x4_t, ) -> uint32x4_t { vsha256h2q_u32_(hash_efgh, hash_abcd, wk) } @@ -167,7 +167,7 @@ pub unsafe fn vsha256h2q_u32( #[target_feature(enable = "crypto")] #[cfg_attr(test, assert_instr(sha256su0))] pub unsafe fn vsha256su0q_u32( - w0_3: uint32x4_t, w4_7: uint32x4_t + w0_3: uint32x4_t, w4_7: uint32x4_t, ) -> uint32x4_t { vsha256su0q_u32_(w0_3, w4_7) } @@ -177,7 +177,7 @@ pub unsafe fn vsha256su0q_u32( #[target_feature(enable = "crypto")] #[cfg_attr(test, assert_instr(sha256su1))] pub unsafe fn vsha256su1q_u32( - tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t + tw0_3: uint32x4_t, w8_11: uint32x4_t, w12_15: uint32x4_t, ) -> uint32x4_t { vsha256su1q_u32_(tw0_3, w8_11, w12_15) } @@ -199,22 +199,8 @@ mod tests { assert_eq!( r, u8x16::new( - 124, - 123, - 124, - 118, - 124, - 123, - 124, - 197, - 124, - 123, - 124, - 118, - 124, - 123, - 124, - 197 + 124, 123, 124, 118, 124, 123, 124, 197, 124, 123, 124, 118, + 124, 123, 124, 197 ) ); } @@ -229,22 +215,7 @@ mod tests { assert_eq!( r, u8x16::new( - 9, - 213, - 9, - 251, - 9, - 213, - 9, - 56, - 9, - 213, - 9, - 251, - 9, - 213, - 9, - 56 + 9, 213, 9, 251, 9, 213, 9, 56, 9, 213, 9, 251, 9, 213, 9, 56 ) ); } @@ -256,24 +227,7 @@ mod tests { let r: u8x16 = vaesmcq_u8(data).into_bits(); assert_eq!( r, - u8x16::new( - 3, - 4, - 9, - 10, - 15, - 8, - 21, - 30, - 3, - 4, - 9, - 10, - 15, - 8, - 21, - 30 - ) + u8x16::new(3, 4, 9, 10, 15, 8, 21, 30, 3, 4, 9, 10, 15, 8, 21, 30) ); } @@ -285,22 +239,8 @@ mod tests { assert_eq!( r, u8x16::new( - 43, - 60, - 33, - 50, - 103, - 80, - 125, - 70, - 43, - 60, - 33, - 50, - 103, - 80, - 125, - 70 + 43, 60, 33, 50, 103, 80, 125, 70, 43, 60, 33, 50, 103, 80, + 125, 70 ) ); } diff --git a/coresimd/aarch64/neon.rs b/coresimd/aarch64/neon.rs index b05113056e..9656c36302 100644 --- a/coresimd/aarch64/neon.rs +++ b/coresimd/aarch64/neon.rs @@ -546,7 +546,6 @@ pub unsafe fn vpmaxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { vpmaxq_f64_(a, b) } - #[cfg(test)] mod tests { use 
coresimd::aarch64::*; @@ -800,20 +799,11 @@ mod tests { #[simd_test(enable = "neon")] unsafe fn test_vpminq_s8() { #[cfg_attr(rustfmt, skip)] - let a = i8x16::new( - 1, -2, 3, -4, 5, 6, 7, 8, - 1, 2, 3, 4, 5, 6, 7, 8 - ); + let a = i8x16::new(1, -2, 3, -4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); #[cfg_attr(rustfmt, skip)] - let b = i8x16::new( - 0, 3, 2, 5, 4, 7, 6, 9, - 0, 3, 2, 5, 4, 7, 6, 9 - ); + let b = i8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9); #[cfg_attr(rustfmt, skip)] - let e = i8x16::new( - -2, -4, 5, 7, 1, 3, 5, 7, - 0, 2, 4, 6, 0, 2, 4, 6, - ); + let e = i8x16::new(-2, -4, 5, 7, 1, 3, 5, 7, 0, 2, 4, 6, 0, 2, 4, 6); let r: i8x16 = vpminq_s8(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } @@ -839,20 +829,11 @@ mod tests { #[simd_test(enable = "neon")] unsafe fn test_vpminq_u8() { #[cfg_attr(rustfmt, skip)] - let a = u8x16::new( - 1, 2, 3, 4, 5, 6, 7, 8, - 1, 2, 3, 4, 5, 6, 7, 8 - ); + let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); #[cfg_attr(rustfmt, skip)] - let b = u8x16::new( - 0, 3, 2, 5, 4, 7, 6, 9, - 0, 3, 2, 5, 4, 7, 6, 9 - ); + let b = u8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9); #[cfg_attr(rustfmt, skip)] - let e = u8x16::new( - 1, 3, 5, 7, 1, 3, 5, 7, - 0, 2, 4, 6, 0, 2, 4, 6, - ); + let e = u8x16::new(1, 3, 5, 7, 1, 3, 5, 7, 0, 2, 4, 6, 0, 2, 4, 6); let r: u8x16 = vpminq_u8(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } @@ -896,20 +877,11 @@ mod tests { #[simd_test(enable = "neon")] unsafe fn test_vpmaxq_s8() { #[cfg_attr(rustfmt, skip)] - let a = i8x16::new( - 1, -2, 3, -4, 5, 6, 7, 8, - 1, 2, 3, 4, 5, 6, 7, 8 - ); + let a = i8x16::new(1, -2, 3, -4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); #[cfg_attr(rustfmt, skip)] - let b = i8x16::new( - 0, 3, 2, 5, 4, 7, 6, 9, - 0, 3, 2, 5, 4, 7, 6, 9 - ); + let b = i8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9); #[cfg_attr(rustfmt, skip)] - let e = i8x16::new( - 1, 3, 6, 8, 2, 4, 6, 8, - 3, 5, 7, 9, 3, 5, 7, 9, - ); + 
let e = i8x16::new(1, 3, 6, 8, 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9); let r: i8x16 = vpmaxq_s8(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } @@ -935,20 +907,11 @@ mod tests { #[simd_test(enable = "neon")] unsafe fn test_vpmaxq_u8() { #[cfg_attr(rustfmt, skip)] - let a = u8x16::new( - 1, 2, 3, 4, 5, 6, 7, 8, - 1, 2, 3, 4, 5, 6, 7, 8 - ); + let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); #[cfg_attr(rustfmt, skip)] - let b = u8x16::new( - 0, 3, 2, 5, 4, 7, 6, 9, - 0, 3, 2, 5, 4, 7, 6, 9 - ); + let b = u8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9); #[cfg_attr(rustfmt, skip)] - let e = u8x16::new( - 2, 4, 6, 8, 2, 4, 6, 8, - 3, 5, 7, 9, 3, 5, 7, 9, - ); + let e = u8x16::new(2, 4, 6, 8, 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9); let r: u8x16 = vpmaxq_u8(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } diff --git a/coresimd/arm/mod.rs b/coresimd/arm/mod.rs index 9798db59bc..10648eff38 100644 --- a/coresimd/arm/mod.rs +++ b/coresimd/arm/mod.rs @@ -19,11 +19,19 @@ pub use self::v7::*; // NEON is supported on AArch64, and on ARM when built with the v7 and neon // features. Building ARM without neon produces incorrect codegen. 
-#[cfg(any(target_arch = "aarch64", - all(target_feature = "v7", target_feature = "neon"), - dox))] +#[cfg( + any( + target_arch = "aarch64", + all(target_feature = "v7", target_feature = "neon"), + dox + ) +)] mod neon; -#[cfg(any(target_arch = "aarch64", - all(target_feature = "v7", target_feature = "neon"), - dox))] +#[cfg( + any( + target_arch = "aarch64", + all(target_feature = "v7", target_feature = "neon"), + dox + ) +)] pub use self::neon::*; diff --git a/coresimd/arm/neon.rs b/coresimd/arm/neon.rs index 1d786144d9..f00096505a 100644 --- a/coresimd/arm/neon.rs +++ b/coresimd/arm/neon.rs @@ -366,52 +366,82 @@ impl_from_bits_!( #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f32")] + #[cfg_attr( + target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f32" + )] #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v2f32")] fn frsqrte_v2f32(a: float32x2_t) -> float32x2_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v8i8")] + #[cfg_attr( + target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v8i8" + )] fn vpmins_v8i8(a: int8x8_t, b: int8x8_t) -> int8x8_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v4i16")] + #[cfg_attr( + target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v4i16" + )] fn vpmins_v4i16(a: int16x4_t, b: int16x4_t) -> int16x4_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v2i32")] + #[cfg_attr( + target_arch = "aarch64", link_name = "llvm.aarch64.neon.sminp.v2i32" + )] fn vpmins_v2i32(a: int32x2_t, b: int32x2_t) -> int32x2_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v8i8")] - #[cfg_attr(target_arch = 
"aarch64", link_name = "llvm.aarch64.neon.uminp.v8i8")] + #[cfg_attr( + target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v8i8" + )] fn vpminu_v8i8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v4i16")] + #[cfg_attr( + target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v4i16" + )] fn vpminu_v4i16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpminu.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v2i32")] + #[cfg_attr( + target_arch = "aarch64", link_name = "llvm.aarch64.neon.uminp.v2i32" + )] fn vpminu_v2i32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmins.v2f32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminp.v2f32")] + #[cfg_attr( + target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminp.v2f32" + )] fn vpminf_v2f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v8i8")] + #[cfg_attr( + target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v8i8" + )] fn vpmaxs_v8i8(a: int8x8_t, b: int8x8_t) -> int8x8_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v4i16")] + #[cfg_attr( + target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v4i16" + )] fn vpmaxs_v4i16(a: int16x4_t, b: int16x4_t) -> int16x4_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v2i32")] + #[cfg_attr( + target_arch = "aarch64", link_name = "llvm.aarch64.neon.smaxp.v2i32" + )] fn 
vpmaxs_v2i32(a: int32x2_t, b: int32x2_t) -> int32x2_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v8i8")] + #[cfg_attr( + target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v8i8" + )] fn vpmaxu_v8i8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v4i16")] + #[cfg_attr( + target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v4i16" + )] fn vpmaxu_v4i16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxu.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v2i32")] + #[cfg_attr( + target_arch = "aarch64", link_name = "llvm.aarch64.neon.umaxp.v2i32" + )] fn vpmaxu_v2i32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpmaxs.v2f32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxp.v2f32")] + #[cfg_attr( + target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxp.v2f32" + )] fn vpmaxf_v2f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; } @@ -782,7 +812,7 @@ pub unsafe fn vrsqrte_f32(a: float32x2_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sminp))] -pub unsafe fn vpmin_s8 (a: int8x8_t, b: int8x8_t) -> int8x8_t { +pub unsafe fn vpmin_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { vpmins_v8i8(a, b) } @@ -792,7 +822,7 @@ pub unsafe fn vpmin_s8 (a: int8x8_t, b: int8x8_t) -> int8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))] #[cfg_attr(all(test, target_arch = "aarch64"), 
assert_instr(sminp))] -pub unsafe fn vpmin_s16 (a: int16x4_t, b: int16x4_t) -> int16x4_t { +pub unsafe fn vpmin_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { vpmins_v4i16(a, b) } @@ -802,7 +832,7 @@ pub unsafe fn vpmin_s16 (a: int16x4_t, b: int16x4_t) -> int16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sminp))] -pub unsafe fn vpmin_s32 (a: int32x2_t, b: int32x2_t) -> int32x2_t { +pub unsafe fn vpmin_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { vpmins_v2i32(a, b) } @@ -812,7 +842,7 @@ pub unsafe fn vpmin_s32 (a: int32x2_t, b: int32x2_t) -> int32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uminp))] -pub unsafe fn vpmin_u8 (a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { +pub unsafe fn vpmin_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { vpminu_v8i8(a, b) } @@ -822,7 +852,7 @@ pub unsafe fn vpmin_u8 (a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uminp))] -pub unsafe fn vpmin_u16 (a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { +pub unsafe fn vpmin_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { vpminu_v4i16(a, b) } @@ -832,7 +862,7 @@ pub unsafe fn vpmin_u16 (a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uminp))] -pub unsafe fn vpmin_u32 (a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { +pub unsafe fn vpmin_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { vpminu_v2i32(a, b) } @@ -842,7 +872,7 @@ pub unsafe 
fn vpmin_u32 (a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmin))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fminp))] -pub unsafe fn vpmin_f32 (a: float32x2_t, b: float32x2_t) -> float32x2_t { +pub unsafe fn vpmin_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { vpminf_v2f32(a, b) } @@ -852,7 +882,7 @@ pub unsafe fn vpmin_f32 (a: float32x2_t, b: float32x2_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smaxp))] -pub unsafe fn vpmax_s8 (a: int8x8_t, b: int8x8_t) -> int8x8_t { +pub unsafe fn vpmax_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { vpmaxs_v8i8(a, b) } @@ -862,7 +892,7 @@ pub unsafe fn vpmax_s8 (a: int8x8_t, b: int8x8_t) -> int8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smaxp))] -pub unsafe fn vpmax_s16 (a: int16x4_t, b: int16x4_t) -> int16x4_t { +pub unsafe fn vpmax_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { vpmaxs_v4i16(a, b) } @@ -872,7 +902,7 @@ pub unsafe fn vpmax_s16 (a: int16x4_t, b: int16x4_t) -> int16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smaxp))] -pub unsafe fn vpmax_s32 (a: int32x2_t, b: int32x2_t) -> int32x2_t { +pub unsafe fn vpmax_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { vpmaxs_v2i32(a, b) } @@ -882,7 +912,7 @@ pub unsafe fn vpmax_s32 (a: int32x2_t, b: int32x2_t) -> int32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))] #[cfg_attr(all(test, 
target_arch = "aarch64"), assert_instr(umaxp))] -pub unsafe fn vpmax_u8 (a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { +pub unsafe fn vpmax_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { vpmaxu_v8i8(a, b) } @@ -892,7 +922,7 @@ pub unsafe fn vpmax_u8 (a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umaxp))] -pub unsafe fn vpmax_u16 (a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { +pub unsafe fn vpmax_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { vpmaxu_v4i16(a, b) } @@ -902,7 +932,7 @@ pub unsafe fn vpmax_u16 (a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umaxp))] -pub unsafe fn vpmax_u32 (a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { +pub unsafe fn vpmax_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { vpmaxu_v2i32(a, b) } @@ -912,11 +942,10 @@ pub unsafe fn vpmax_u32 (a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpmax))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmaxp))] -pub unsafe fn vpmax_f32 (a: float32x2_t, b: float32x2_t) -> float32x2_t { +pub unsafe fn vpmax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { vpmaxf_v2f32(a, b) } - #[cfg(test)] mod tests { use coresimd::arm::*; diff --git a/coresimd/mod.rs b/coresimd/mod.rs index 5007ac30de..6fc312f420 100644 --- a/coresimd/mod.rs +++ b/coresimd/mod.rs @@ -134,7 +134,6 @@ pub mod arch { /// Platform-specific intrinsics for the `PowerPC64` platform. /// /// See the [module documentation](../index.html) for more details. 
- #[cfg(target_arch = "powerpc64")] #[cfg(any(target_arch = "powerpc64", dox))] #[doc(cfg(target_arch = "powerpc64"))] #[unstable(feature = "stdsimd", issue = "27731")] diff --git a/coresimd/powerpc/altivec.rs b/coresimd/powerpc/altivec.rs index 1765f79bb5..0790474b2b 100644 --- a/coresimd/powerpc/altivec.rs +++ b/coresimd/powerpc/altivec.rs @@ -75,8 +75,7 @@ impl_from_bits_!( vector_bool_int ); impl_from_bits_!( - i8x16: - vector_signed_char, + i8x16: vector_signed_char, vector_unsigned_char, vector_bool_char, vector_signed_short, @@ -114,8 +113,7 @@ impl_from_bits_!( vector_bool_int ); impl_from_bits_!( - u8x16: - vector_signed_char, + u8x16: vector_signed_char, vector_unsigned_char, vector_bool_char, vector_signed_short, @@ -135,11 +133,7 @@ impl_from_bits_!( vector_bool_short, vector_bool_int ); -impl_from_bits_!( - m8x16: vector_bool_char, - vector_bool_short, - vector_bool_int -); +impl_from_bits_!(m8x16: vector_bool_char, vector_bool_short, vector_bool_int); impl_from_bits_!( vector_signed_short: u64x2, @@ -166,8 +160,7 @@ impl_from_bits_!( vector_bool_int ); impl_from_bits_!( - i16x8: - vector_signed_char, + i16x8: vector_signed_char, vector_unsigned_char, vector_bool_char, vector_signed_short, @@ -204,8 +197,7 @@ impl_from_bits_!( vector_bool_int ); impl_from_bits_!( - u16x8: - vector_signed_char, + u16x8: vector_signed_char, vector_unsigned_char, vector_bool_char, vector_signed_short, @@ -251,8 +243,7 @@ impl_from_bits_!( vector_bool_int ); impl_from_bits_!( - i32x4: - vector_signed_char, + i32x4: vector_signed_char, vector_unsigned_char, vector_bool_char, vector_signed_short, @@ -289,8 +280,7 @@ impl_from_bits_!( vector_bool_int ); impl_from_bits_!( - u32x4: - vector_signed_char, + u32x4: vector_signed_char, vector_unsigned_char, vector_bool_char, vector_signed_short, @@ -345,8 +335,7 @@ impl_from_bits_!( vector_bool_int ); impl_from_bits_!( - f32x4: - vector_signed_char, + f32x4: vector_signed_char, vector_unsigned_char, vector_bool_char, 
vector_signed_short, @@ -360,10 +349,18 @@ impl_from_bits_!( #[allow(improper_ctypes)] extern "C" { -#[ link_name = "llvm.ppc.altivec.vperm" ] -fn vperm(a: vector_signed_int, b: vector_signed_int, c: vector_unsigned_char) -> vector_signed_int; -#[ link_name = "llvm.ppc.altivec.vmhaddshs" ] -fn vmhaddshs(a: vector_signed_short, b: vector_signed_short, c: vector_signed_short) -> vector_signed_short; + #[link_name = "llvm.ppc.altivec.vperm"] + fn vperm( + a: vector_signed_int, b: vector_signed_int, c: vector_unsigned_char, + ) -> vector_signed_int; + #[link_name = "llvm.ppc.altivec.vmhaddshs"] + fn vmhaddshs( + a: vector_signed_short, b: vector_signed_short, c: vector_signed_short, + ) -> vector_signed_short; + #[link_name = "llvm.ppc.altivec.vmhraddshs"] + fn vmhraddshs( + a: vector_signed_short, b: vector_signed_short, c: vector_signed_short, + ) -> vector_signed_short; } mod sealed { @@ -373,7 +370,9 @@ mod sealed { #[inline] #[target_feature(enable = "altivec")] #[cfg_attr(test, assert_instr(vperm))] - unsafe fn vec_vperm(a: vector_signed_int, b: vector_signed_int, c: vector_unsigned_char) -> vector_signed_int { + unsafe fn vec_vperm( + a: vector_signed_int, b: vector_signed_int, c: vector_unsigned_char, + ) -> vector_signed_int { vperm(a, b, c) } @@ -703,7 +702,6 @@ where a.vec_add(b) } - /// Endian-biased intrinsics #[cfg(target_endian = "little")] mod endian { @@ -718,8 +716,10 @@ mod endian { // vperm has big-endian bias // // Xor the mask and flip the arguments - let d = u8x16::new(255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255).into_bits(); + let d = u8x16::new( + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, + ).into_bits(); let c = simd_xor(c, d); b.vec_vperm(a, c) @@ -730,10 +730,22 @@ mod endian { #[inline] #[target_feature(enable = "altivec")] #[cfg_attr(test, assert_instr(vmhaddshs))] -pub unsafe fn vec_madds(a: vector_signed_short, b: vector_signed_short, c: vector_signed_short) -> 
vector_signed_short { +pub unsafe fn vec_madds( + a: vector_signed_short, b: vector_signed_short, c: vector_signed_short, +) -> vector_signed_short { vmhaddshs(a, b, c) } +/// Vector Multiply Round and Add Saturated +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr(vmhraddshs))] +pub unsafe fn vec_mradds( + a: vector_signed_short, b: vector_signed_short, c: vector_signed_short, +) -> vector_signed_short { + vmhraddshs(a, b, c) +} + #[cfg(target_endian = "big")] mod endian { use super::*; @@ -776,89 +788,122 @@ mod tests { } test_vec_perm!{test_vec_perm_u8x16, - u8x16, vector_unsigned_char, - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115], - [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, - 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], - [0, 1, 100, 101, 2, 3, 102, 103, 4, 5, 104, 105, 6, 7, 106, 107]} + u8x16, vector_unsigned_char, + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115], + [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, + 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], + [0, 1, 100, 101, 2, 3, 102, 103, 4, 5, 104, 105, 6, 7, 106, 107]} test_vec_perm!{test_vec_perm_i8x16, - i8x16, vector_signed_char, - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115], - [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, - 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], - [0, 1, 100, 101, 2, 3, 102, 103, 4, 5, 104, 105, 6, 7, 106, 107]} + i8x16, vector_signed_char, + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115], + [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, + 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], + [0, 1, 100, 101, 2, 3, 102, 103, 4, 5, 104, 105, 6, 7, 106, 
107]} test_vec_perm!{test_vec_perm_m8x16, - m8x16, vector_bool_char, - [false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false], - [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true], - [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, - 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], - [false, false, true, true, false, false, true, true, false, false, true, true, false, false, true, true]} + m8x16, vector_bool_char, + [false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false], + [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true], + [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, + 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], + [false, false, true, true, false, false, true, true, false, false, true, true, false, false, true, true]} test_vec_perm!{test_vec_perm_u16x8, - u16x8, vector_unsigned_short, - [0, 1, 2, 3, 4, 5, 6, 7], - [10, 11, 12, 13, 14, 15, 16, 17], - [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, - 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], - [0, 10, 1, 11, 2, 12, 3, 13]} + u16x8, vector_unsigned_short, + [0, 1, 2, 3, 4, 5, 6, 7], + [10, 11, 12, 13, 14, 15, 16, 17], + [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, + 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], + [0, 10, 1, 11, 2, 12, 3, 13]} test_vec_perm!{test_vec_perm_i16x8, - i16x8, vector_signed_short, - [0, 1, 2, 3, 4, 5, 6, 7], - [10, 11, 12, 13, 14, 15, 16, 17], - [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, - 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], - [0, 10, 1, 11, 2, 12, 3, 13]} + i16x8, vector_signed_short, + [0, 1, 2, 3, 4, 5, 6, 7], + [10, 11, 12, 13, 14, 15, 16, 17], + [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, + 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], + [0, 10, 1, 11, 2, 12, 3, 13]} test_vec_perm!{test_vec_perm_m16x8, - m16x8, vector_bool_short, 
- [false, false, false, false, false, false, false, false], - [true, true, true, true, true, true, true, true], - [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, - 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], - [false, true, false, true, false, true, false, true]} + m16x8, vector_bool_short, + [false, false, false, false, false, false, false, false], + [true, true, true, true, true, true, true, true], + [0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, + 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17], + [false, true, false, true, false, true, false, true]} test_vec_perm!{test_vec_perm_u32x4, - u32x4, vector_unsigned_int, - [0, 1, 2, 3], - [10, 11, 12, 13], - [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, - 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17], - [0, 10, 1, 11]} + u32x4, vector_unsigned_int, + [0, 1, 2, 3], + [10, 11, 12, 13], + [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, + 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17], + [0, 10, 1, 11]} test_vec_perm!{test_vec_perm_i32x4, - i32x4, vector_signed_int, - [0, 1, 2, 3], - [10, 11, 12, 13], - [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, - 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17], - [0, 10, 1, 11]} + i32x4, vector_signed_int, + [0, 1, 2, 3], + [10, 11, 12, 13], + [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, + 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17], + [0, 10, 1, 11]} test_vec_perm!{test_vec_perm_m32x4, - m32x4, vector_bool_int, - [false, false, false, false], - [true, true, true, true], - [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, - 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17], - [false, true, false, true]} + m32x4, vector_bool_int, + [false, false, false, false], + [true, true, true, true], + [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, + 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17], + [false, true, false, true]} test_vec_perm!{test_vec_perm_f32x4, - f32x4, vector_float, - [0.0, 1.0, 2.0, 3.0], - [1.0, 1.1, 1.2, 1.3], - [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 
0x12, 0x13, - 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17], - [0.0, 1.0, 1.0, 1.1]} + f32x4, vector_float, + [0.0, 1.0, 2.0, 3.0], + [1.0, 1.1, 1.2, 1.3], + [0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, + 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17], + [0.0, 1.0, 1.0, 1.1]} #[simd_test(enable = "altivec")] unsafe fn test_vec_madds() { - let a: vector_signed_short = i16x8::new(0 * 256, 1 * 256, 2 * 256, 3 * 256, 4 * 256, 5 * 256, 6 * 256, 7 * 256).into_bits(); - let b: vector_signed_short = i16x8::new(256, 256, 256, 256, 256, 256, 256, 256).into_bits(); - let c: vector_signed_short = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7).into_bits(); + let a: vector_signed_short = i16x8::new( + 0 * 256, + 1 * 256, + 2 * 256, + 3 * 256, + 4 * 256, + 5 * 256, + 6 * 256, + 7 * 256, + ).into_bits(); + let b: vector_signed_short = + i16x8::new(256, 256, 256, 256, 256, 256, 256, 256).into_bits(); + let c: vector_signed_short = + i16x8::new(0, 1, 2, 3, 4, 5, 6, 7).into_bits(); let d = i16x8::new(0, 3, 6, 9, 12, 15, 18, 21); assert_eq!(d, vec_madds(a, b, c).into_bits()); } + #[simd_test(enable = "altivec")] + unsafe fn test_vec_mradds() { + let a: vector_signed_short = i16x8::new( + 0 * 256, + 1 * 256, + 2 * 256, + 3 * 256, + 4 * 256, + 5 * 256, + 6 * 256, + 7 * 256, + ).into_bits(); + let b: vector_signed_short = + i16x8::new(256, 256, 256, 256, 256, 256, 256, 256).into_bits(); + let c: vector_signed_short = + i16x8::new(0, 1, 2, 3, 4, 5, 6, i16::max_value() - 1).into_bits(); + + let d = i16x8::new(0, 3, 6, 9, 12, 15, 18, i16::max_value()); + + assert_eq!(d, vec_mradds(a, b, c).into_bits()); + } + #[simd_test(enable = "altivec")] unsafe fn vec_add_i32x4_i32x4() { let x = i32x4::new(1, 2, 3, 4); diff --git a/coresimd/powerpc64/mod.rs b/coresimd/powerpc64/mod.rs index 9049b294d0..4d7d9076fb 100644 --- a/coresimd/powerpc64/mod.rs +++ b/coresimd/powerpc64/mod.rs @@ -1,6 +1,7 @@ //! PowerPC 64 //! -//! The reference is the [64-Bit ELF V2 ABI Specification - Power Architecture]. +//! 
The reference is the [64-Bit ELF V2 ABI Specification - Power +//! Architecture]. //! //! [64-Bit ELF V2 ABI Specification - Power Architecture]: http://openpowerfoundation.org/wp-content/uploads/resources/leabi/leabi-20170510.pdf diff --git a/coresimd/powerpc64/vsx.rs b/coresimd/powerpc64/vsx.rs index 1953e1501a..51a8e824c1 100644 --- a/coresimd/powerpc64/vsx.rs +++ b/coresimd/powerpc64/vsx.rs @@ -63,8 +63,7 @@ impl_from_bits_!( vector_double ); impl_from_bits_!( - i64x2: - vector_signed_char, + i64x2: vector_signed_char, vector_unsigned_char, vector_bool_char, vector_signed_short, @@ -109,8 +108,7 @@ impl_from_bits_!( vector_double ); impl_from_bits_!( - u64x2: - vector_signed_char, + u64x2: vector_signed_char, vector_unsigned_char, vector_bool_char, vector_signed_short, @@ -155,8 +153,7 @@ impl_from_bits_!( vector_bool_long ); impl_from_bits_!( - f64x2: - vector_signed_char, + f64x2: vector_signed_char, vector_unsigned_char, vector_bool_char, vector_signed_short, @@ -234,8 +231,12 @@ mod sealed { // xxpermdi has an big-endian bias and extended mnemonics #[inline] #[target_feature(enable = "vsx")] - #[cfg_attr(all(test, target_endian="little"), assert_instr(xxmrgld, dm = 0x0))] - #[cfg_attr(all(test, target_endian="big"), assert_instr(xxspltd, dm = 0x0))] + #[cfg_attr( + all(test, target_endian = "little"), assert_instr(xxmrgld, dm = 0x0) + )] + #[cfg_attr( + all(test, target_endian = "big"), assert_instr(xxspltd, dm = 0x0) + )] unsafe fn xxpermdi(a: i64x2, b: i64x2, dm: u8) -> i64x2 { match dm & 0b11 { 0 => simd_shuffle2(a, b, [0b00, 0b10]), diff --git a/coresimd/ppsv/api/arithmetic_reductions.rs b/coresimd/ppsv/api/arithmetic_reductions.rs index c8d4d23566..7b324a7bab 100644 --- a/coresimd/ppsv/api/arithmetic_reductions.rs +++ b/coresimd/ppsv/api/arithmetic_reductions.rs @@ -165,7 +165,6 @@ macro_rules! impl_float_arithmetic_reductions { }; } - #[cfg(test)] macro_rules! 
test_int_arithmetic_reductions { ($id:ident, $elem_ty:ident) => { @@ -237,10 +236,7 @@ macro_rules! test_float_arithmetic_reductions { let v = $id::splat(1 as $elem_ty); assert_eq!(v.sum(), $id::lanes() as $elem_ty); let v = alternating(2); - assert_eq!( - v.sum(), - ($id::lanes() / 2 + $id::lanes()) as $elem_ty - ); + assert_eq!(v.sum(), ($id::lanes() / 2 + $id::lanes()) as $elem_ty); } #[test] fn product() { diff --git a/coresimd/ppsv/api/float_math.rs b/coresimd/ppsv/api/float_math.rs index 32de85eb55..9092460a76 100644 --- a/coresimd/ppsv/api/float_math.rs +++ b/coresimd/ppsv/api/float_math.rs @@ -59,7 +59,6 @@ macro_rules! impl_float_math { macro_rules! test_float_math { ($id:ident, $elem_ty:ident) => { - fn sqrt2() -> $elem_ty { match ::mem::size_of::<$elem_ty>() { 4 => 1.4142135 as $elem_ty, diff --git a/coresimd/ppsv/api/load_store.rs b/coresimd/ppsv/api/load_store.rs index 11ea10d30c..59749da0e1 100644 --- a/coresimd/ppsv/api/load_store.rs +++ b/coresimd/ppsv/api/load_store.rs @@ -46,7 +46,7 @@ macro_rules! impl_load_store { /// undefined. #[inline] pub unsafe fn store_aligned_unchecked( - self, slice: &mut [$elem_ty] + self, slice: &mut [$elem_ty], ) { *(slice.get_unchecked_mut(0) as *mut $elem_ty as *mut Self) = self; @@ -59,7 +59,7 @@ macro_rules! impl_load_store { /// If `slice.len() < Self::lanes()` the behavior is undefined. #[inline] pub unsafe fn store_unaligned_unchecked( - self, slice: &mut [$elem_ty] + self, slice: &mut [$elem_ty], ) { let target_ptr = slice.get_unchecked_mut(0) as *mut $elem_ty as *mut u8; @@ -121,7 +121,7 @@ macro_rules! impl_load_store { /// If `slice.len() < Self::lanes()` the behavior is undefined. #[inline] pub unsafe fn load_unaligned_unchecked( - slice: &[$elem_ty] + slice: &[$elem_ty], ) -> Self { use mem::size_of; let target_ptr = @@ -238,7 +238,8 @@ macro_rules! 
test_load_store { data: [0 as $elem_ty; 2 * $id::lanes()], }; // offset the aligned data by one byte: - let s: &mut [u8; 2 * $id::lanes() + let s: &mut [u8; 2 + * $id::lanes() * mem::size_of::<$elem_ty>()] = mem::transmute(&mut aligned.data); let s: &mut [$elem_ty] = slice::from_raw_parts_mut( @@ -296,7 +297,8 @@ macro_rules! test_load_store { data: [0 as $elem_ty; 2 * $id::lanes()], }; // offset the aligned data by one byte: - let s: &[u8; 2 * $id::lanes() + let s: &[u8; 2 + * $id::lanes() * mem::size_of::<$elem_ty>()] = mem::transmute(&aligned.data); let s: &[$elem_ty] = slice::from_raw_parts( diff --git a/coresimd/ppsv/api/minmax.rs b/coresimd/ppsv/api/minmax.rs old mode 100644 new mode 100755 index 7ba93b22f3..c1c7499c06 --- a/coresimd/ppsv/api/minmax.rs +++ b/coresimd/ppsv/api/minmax.rs @@ -27,7 +27,7 @@ macro_rules! impl_int_minmax_ops { /// Maximum of two vectors. /// - /// Returns a new vector containing the minimum value of each of + /// Returns a new vector containing the maximum value of each of /// the input vector lanes. #[inline] pub fn max(self, x: Self) -> Self { @@ -86,7 +86,7 @@ macro_rules! impl_float_minmax_ops { /// Maximum of two vectors. /// - /// Returns a new vector containing the minimum value of each of the + /// Returns a new vector containing the maximum value of each of the /// input vector lanes. The lane-wise semantics are the same as that /// of `max` for the primitive floating-point types. #[inline] diff --git a/coresimd/ppsv/api/mod.rs b/coresimd/ppsv/api/mod.rs index 857264b9f3..4379e1c713 100644 --- a/coresimd/ppsv/api/mod.rs +++ b/coresimd/ppsv/api/mod.rs @@ -1,57 +1,4 @@ //! This module defines the API of portable vector types. -//! -//! # API -//! -//! ## Traits -//! -//! All portable vector types implement the following traits: -//! -//! * [x] `Copy`, -//! * [x] `Clone`, -//! * [x] `Debug`, -//! * [x] `Default` -//! * [x] `PartialEq` -//! * [x] `PartialOrd` (TODO: tests) -//! -//! 
Non-floating-point vector types also implement: -//! -//! * [x] `Hash`, -//! * [x] `Eq`, and -//! * [x] `Ord`. -//! -//! Integer vector types also implement: -//! -//! * [x] `fmt::LowerHex`. -//! -//! ## Conversions -//! -//! * [x]: `FromBits/IntoBits`: bitwise lossless transmutes between vectors of -//! the same size (i.e., same `mem::size_of`). -//! * [x]: `From/Into`: casts between vectors with the same number of lanes -//! (potentially lossy). -//! -//! ## Inherent methods -//! -//! * [x] minimal API: implemented by all vector types except for boolean -//! vectors. -//! * [x] minimal boolean vector API: implemented by boolean vectors. -//! * [x] load/store API: aligned and unaligned memory loads and -//! stores - implemented by all vectors. -//! * [x] comparison API: vector lane-wise comparison producing -//! boolean vectors - implemented by all vectors. -//! * [x] arithmetic operations: implemented by all non-boolean vectors. -//! * [x] `std::ops::Neg`: implemented by signed-integer and floating-point -//! vectors. -//! * [x] bitwise operations: implemented by integer and boolean -//! vectors. -//! * [x] shift operations: implemented by integer vectors. -//! * [x] arithmetic reductions: implemented by integer and floating-point -//! vectors. -//! * [x] bitwise reductions: implemented by integer and boolean -//! vectors. -//! * [x] boolean reductions: implemented by boolean vectors. -//! * [ ] portable shuffles: `shufflevector`. -//! * [ ] portable `gather`/`scatter`: #![allow(unused)] /// Adds the vector type `$id`, with elements of types `$elem_tys`. @@ -59,7 +6,8 @@ macro_rules! 
define_ty { ($id:ident, $($elem_tys:ident),+ | $(#[$doc:meta])*) => { $(#[$doc])* #[repr(simd)] - #[derive(Copy, Clone, Debug, /*FIXME:*/ PartialOrd)] + #[derive(Copy, Clone, Debug, + /*FIXME: manually implement and add tests*/ PartialOrd)] #[allow(non_camel_case_types)] pub struct $id($($elem_tys),*); } diff --git a/coresimd/ppsv/api/scalar_shifts.rs b/coresimd/ppsv/api/scalar_shifts.rs index dac89e6bbb..586d909c32 100644 --- a/coresimd/ppsv/api/scalar_shifts.rs +++ b/coresimd/ppsv/api/scalar_shifts.rs @@ -41,18 +41,7 @@ macro_rules! impl_shifts { macro_rules! impl_all_scalar_shifts { ($id:ident, $elem_ty:ident) => { impl_shifts!( - $id, - $elem_ty, - u8, - u16, - u32, - u64, - usize, - i8, - i16, - i32, - i64, - isize + $id, $elem_ty, u8, u16, u32, u64, usize, i8, i16, i32, i64, isize ); }; } @@ -125,18 +114,7 @@ macro_rules! test_shift_ops { macro_rules! test_all_scalar_shift_ops { ($id:ident, $elem_ty:ident) => { test_shift_ops!( - $id, - $elem_ty, - u8, - u16, - u32, - u64, - usize, - i8, - i16, - i32, - i64, - isize + $id, $elem_ty, u8, u16, u32, u64, usize, i8, i16, i32, i64, isize ); }; } diff --git a/coresimd/ppsv/codegen/abs.rs b/coresimd/ppsv/codegen/abs.rs index edca549c24..c829ff8c5b 100644 --- a/coresimd/ppsv/codegen/abs.rs +++ b/coresimd/ppsv/codegen/abs.rs @@ -1,9 +1,14 @@ //! Vector absolute value - +#![allow(dead_code)] use coresimd::simd::*; #[allow(improper_ctypes)] extern "C" { + #[link_name = "llvm.fabs.f32"] + fn abs_f32(x: f32) -> f32; + #[link_name = "llvm.fabs.f64"] + fn abs_f64(x: f64) -> f64; + #[link_name = "llvm.fabs.v2f32"] fn abs_v2f32(x: f32x2) -> f32x2; #[link_name = "llvm.fabs.v4f32"] @@ -24,14 +29,43 @@ pub(crate) trait FloatAbs { fn abs(self) -> Self; } +trait RawAbs { + fn raw_abs(self) -> Self; +} + +impl RawAbs for f32 { + fn raw_abs(self) -> Self { + unsafe { abs_f32(self) } + } +} + +impl RawAbs for f64 { + fn raw_abs(self) -> Self { + unsafe { abs_f64(self) } + } +} + + macro_rules! 
impl_fabs { - ($id:ident: $fn:ident) => { + ($id:ident : $fn:ident) => { + #[cfg(not(target_arch = "s390x"))] impl FloatAbs for $id { fn abs(self) -> Self { unsafe { $fn(self) } } } - } + // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/501 + #[cfg(target_arch = "s390x")] + impl FloatAbs for $id { + fn abs(self) -> Self { + let mut v = $id::splat(0.); + for i in 0..$id::lanes() { + v = v.replace(i, self.extract(i).raw_abs()) + } + v + } + } + }; } impl_fabs!(f32x2: abs_v2f32); diff --git a/coresimd/ppsv/codegen/cos.rs b/coresimd/ppsv/codegen/cos.rs index fdc61ea464..38dce584f8 100644 --- a/coresimd/ppsv/codegen/cos.rs +++ b/coresimd/ppsv/codegen/cos.rs @@ -1,9 +1,14 @@ //! Exact vector cos - +#![allow(dead_code)] use coresimd::simd::*; #[allow(improper_ctypes)] extern "C" { + #[link_name = "llvm.cos.f32"] + fn cos_f32(x: f32) -> f32; + #[link_name = "llvm.cos.f64"] + fn cos_f64(x: f64) -> f64; + #[link_name = "llvm.cos.v2f32"] fn cos_v2f32(x: f32x2) -> f32x2; #[link_name = "llvm.cos.v4f32"] @@ -24,14 +29,44 @@ pub(crate) trait FloatCos { fn cos(self) -> Self; } +trait RawCos { + fn raw_cos(self) -> Self; +} + +impl RawCos for f32 { + fn raw_cos(self) -> Self { + unsafe { cos_f32(self) } + } +} + +impl RawCos for f64 { + fn raw_cos(self) -> Self { + unsafe { cos_f64(self) } + } +} + + macro_rules! 
impl_fcos { - ($id:ident: $fn:ident) => { + ($id:ident : $fn:ident) => { + #[cfg(not(target_arch = "s390x"))] impl FloatCos for $id { fn cos(self) -> Self { unsafe { $fn(self) } } } - } + + // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/501 + #[cfg(target_arch = "s390x")] + impl FloatCos for $id { + fn cos(self) -> Self { + let mut v = $id::splat(0.); + for i in 0..$id::lanes() { + v = v.replace(i, self.extract(i).raw_cos()) + } + v + } + } + }; } impl_fcos!(f32x2: cos_v2f32); diff --git a/coresimd/ppsv/codegen/fma.rs b/coresimd/ppsv/codegen/fma.rs index 9d63ac6bee..a0f0e8f729 100644 --- a/coresimd/ppsv/codegen/fma.rs +++ b/coresimd/ppsv/codegen/fma.rs @@ -1,5 +1,5 @@ //! Vector fused multiply add - +#![allow(dead_code)] use coresimd::simd::*; #[allow(improper_ctypes)] @@ -25,13 +25,21 @@ pub(crate) trait FloatFma { } macro_rules! impl_fma { - ($id:ident: $fn:ident) => { + ($id:ident : $fn:ident) => { + #[cfg(not(target_arch = "s390x"))] impl FloatFma for $id { fn fma(self, y: Self, z: Self) -> Self { unsafe { $fn(self, y, z) } } } - } + // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/501 + #[cfg(target_arch = "s390x")] + impl FloatFma for $id { + fn fma(self, y: Self, z: Self) -> Self { + self * y + z + } + } + }; } impl_fma!(f32x2: fma_v2f32); diff --git a/coresimd/ppsv/codegen/masks_reductions.rs b/coresimd/ppsv/codegen/masks_reductions.rs index b06c2d0a29..617f1fd300 100644 --- a/coresimd/ppsv/codegen/masks_reductions.rs +++ b/coresimd/ppsv/codegen/masks_reductions.rs @@ -25,11 +25,13 @@ macro_rules! default_impl { impl All for $id { #[inline] unsafe fn all(self) -> bool { - #[cfg(not(target_arch = "aarch64"))] { + #[cfg(not(target_arch = "aarch64"))] + { use coresimd::simd_llvm::simd_reduce_all; simd_reduce_all(self) } - #[cfg(target_arch = "aarch64")] { + #[cfg(target_arch = "aarch64")] + { // FIXME: Broken on AArch64 // https://bugs.llvm.org/show_bug.cgi?id=36796 self.and() @@ -40,11 +42,13 @@ macro_rules! 
default_impl { impl Any for $id { #[inline] unsafe fn any(self) -> bool { - #[cfg(not(target_arch = "aarch64"))] { + #[cfg(not(target_arch = "aarch64"))] + { use coresimd::simd_llvm::simd_reduce_any; simd_reduce_any(self) } - #[cfg(target_arch = "aarch64")] { + #[cfg(target_arch = "aarch64")] + { // FIXME: Broken on AArch64 // https://bugs.llvm.org/show_bug.cgi?id=36796 self.or() @@ -63,7 +67,12 @@ macro_rules! default_impl { // or floating point vectors, we can't currently work around this yet. The // performance impact for this shouldn't be large, but this is filled as: // https://bugs.llvm.org/show_bug.cgi?id=37087 -#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2"))] +#[cfg( + all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "sse2" + ) +)] macro_rules! x86_128_sse2_movemask_impl { ($id:ident) => { impl All for $id { @@ -71,13 +80,15 @@ macro_rules! x86_128_sse2_movemask_impl { #[target_feature(enable = "sse2")] unsafe fn all(self) -> bool { #[cfg(target_arch = "x86")] - use ::coresimd::arch::x86::_mm_movemask_epi8; + use coresimd::arch::x86::_mm_movemask_epi8; #[cfg(target_arch = "x86_64")] - use ::coresimd::arch::x86_64::_mm_movemask_epi8; - // _mm_movemask_epi8(a) creates a 16bit mask containing the most - // significant bit of each byte of `a`. If all bits are set, - // then all 16 lanes of the mask are true. - _mm_movemask_epi8(::mem::transmute(self)) == u16::max_value() as i32 + use coresimd::arch::x86_64::_mm_movemask_epi8; + // _mm_movemask_epi8(a) creates a 16bit mask containing the + // most significant bit of each byte of `a`. If all + // bits are set, then all 16 lanes of the mask are + // true. + _mm_movemask_epi8(::mem::transmute(self)) + == u16::max_value() as i32 } } impl Any for $id { @@ -85,14 +96,14 @@ macro_rules! 
x86_128_sse2_movemask_impl { #[target_feature(enable = "sse2")] unsafe fn any(self) -> bool { #[cfg(target_arch = "x86")] - use ::coresimd::arch::x86::_mm_movemask_epi8; + use coresimd::arch::x86::_mm_movemask_epi8; #[cfg(target_arch = "x86_64")] - use ::coresimd::arch::x86_64::_mm_movemask_epi8; + use coresimd::arch::x86_64::_mm_movemask_epi8; _mm_movemask_epi8(::mem::transmute(self)) != 0 } } - } + }; } // On x86 with AVX we use _mm256_testc_si256 and _mm256_testz_si256. @@ -103,7 +114,12 @@ macro_rules! x86_128_sse2_movemask_impl { // integer or floating point vectors, we can't currently work around this yet. // // TODO: investigate perf impact and fill LLVM bugs as necessary. -#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx"))] +#[cfg( + all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx" + ) +)] macro_rules! x86_256_avx_test_impl { ($id:ident) => { impl All for $id { @@ -111,11 +127,13 @@ macro_rules! x86_256_avx_test_impl { #[target_feature(enable = "avx")] unsafe fn all(self) -> bool { #[cfg(target_arch = "x86")] - use ::coresimd::arch::x86::_mm256_testc_si256; + use coresimd::arch::x86::_mm256_testc_si256; #[cfg(target_arch = "x86_64")] - use ::coresimd::arch::x86_64::_mm256_testc_si256; - _mm256_testc_si256(::mem::transmute(self), - ::mem::transmute($id::splat(true))) != 0 + use coresimd::arch::x86_64::_mm256_testc_si256; + _mm256_testc_si256( + ::mem::transmute(self), + ::mem::transmute($id::splat(true)), + ) != 0 } } impl Any for $id { @@ -123,20 +141,27 @@ macro_rules! 
x86_256_avx_test_impl { #[target_feature(enable = "avx")] unsafe fn any(self) -> bool { #[cfg(target_arch = "x86")] - use ::coresimd::arch::x86::_mm256_testz_si256; + use coresimd::arch::x86::_mm256_testz_si256; #[cfg(target_arch = "x86_64")] - use ::coresimd::arch::x86_64::_mm256_testz_si256; - _mm256_testz_si256(::mem::transmute(self), - ::mem::transmute(self)) == 0 + use coresimd::arch::x86_64::_mm256_testz_si256; + _mm256_testz_si256( + ::mem::transmute(self), + ::mem::transmute(self), + ) == 0 } } - } + }; } -// On x86 with SSE2 all/any for 256-bit wide vectors is implemented by executing -// the algorithm for 128-bit on the higher and lower elements of the vector -// independently. -#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2"))] +// On x86 with SSE2 all/any for 256-bit wide vectors is implemented by +// executing the algorithm for 128-bit on the higher and lower elements of the +// vector independently. +#[cfg( + all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "sse2" + ) +)] macro_rules! x86_256_sse2_impl { ($id:ident, $v128:ident) => { impl All for $id { @@ -146,9 +171,9 @@ macro_rules! x86_256_sse2_impl { unsafe { union U { halves: ($v128, $v128), - vec: $id + vec: $id, } - let halves = U {vec: self}.halves; + let halves = U { vec: self }.halves; halves.0.all() && halves.1.all() } } @@ -160,14 +185,14 @@ macro_rules! x86_256_sse2_impl { unsafe { union U { halves: ($v128, $v128), - vec: $id + vec: $id, } - let halves = U {vec: self}.halves; + let halves = U { vec: self }.halves; halves.0.any() || halves.1.any() } } } - } + }; } // Implementation for 64-bit wide masks on x86. @@ -179,13 +204,14 @@ macro_rules! 
x86_64_mmx_movemask_impl { #[target_feature(enable = "mmx")] unsafe fn all(self) -> bool { #[cfg(target_arch = "x86")] - use ::coresimd::arch::x86::_mm_movemask_pi8; + use coresimd::arch::x86::_mm_movemask_pi8; #[cfg(target_arch = "x86_64")] - use ::coresimd::arch::x86_64::_mm_movemask_pi8; + use coresimd::arch::x86_64::_mm_movemask_pi8; // _mm_movemask_pi8(a) creates an 8bit mask containing the most // significant bit of each byte of `a`. If all bits are set, // then all 8 lanes of the mask are true. - _mm_movemask_pi8(::mem::transmute(self)) == u8::max_value() as i32 + _mm_movemask_pi8(::mem::transmute(self)) + == u8::max_value() as i32 } } impl Any for $id { @@ -193,14 +219,14 @@ macro_rules! x86_64_mmx_movemask_impl { #[target_feature(enable = "mmx")] unsafe fn any(self) -> bool { #[cfg(target_arch = "x86")] - use ::coresimd::arch::x86::_mm_movemask_pi8; + use coresimd::arch::x86::_mm_movemask_pi8; #[cfg(target_arch = "x86_64")] - use ::coresimd::arch::x86_64::_mm_movemask_pi8; + use coresimd::arch::x86_64::_mm_movemask_pi8; _mm_movemask_pi8(::mem::transmute(self)) != 0 } } - } + }; } // Implementation for 128-bit wide masks on x86 @@ -214,7 +240,7 @@ macro_rules! x86_128_impl { default_impl!($id); } } - } + }; } // Implementation for 256-bit wide masks on x86 @@ -230,22 +256,25 @@ macro_rules! x86_256_impl { default_impl!($id); } } - } + }; } // Implementation for ARM + v7 + NEON using vpmin and vpmax (folding // minimum/maximum of adjacent pairs) for 64-bit wide two-element vectors. -#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] +#[cfg( + all(target_arch = "arm", target_feature = "v7", target_feature = "neon") +)] macro_rules! 
arm_64_x2_v7_neon_impl { ($id:ident, $vpmin:ident, $vpmax:ident) => { impl All for $id { #[inline] #[target_feature(enable = "v7,neon")] unsafe fn all(self) -> bool { - use ::coresimd::arch::arm::$vpmin; - use ::mem::transmute; + use coresimd::arch::arm::$vpmin; + use mem::transmute; // pmin((a, b), (-,-)) => (b, -).0 => b - let tmp: $id = transmute($vpmin(transmute(self), ::mem::uninitialized())); + let tmp: $id = + transmute($vpmin(transmute(self), ::mem::uninitialized())); tmp.extract(0) } } @@ -253,27 +282,30 @@ macro_rules! arm_64_x2_v7_neon_impl { #[inline] #[target_feature(enable = "v7,neon")] unsafe fn any(self) -> bool { - use ::coresimd::arch::arm::$vpmax; - use ::mem::transmute; + use coresimd::arch::arm::$vpmax; + use mem::transmute; // pmax((a, b), (-,-)) => (b, -).0 => b - let tmp: $id = transmute($vpmax(transmute(self), ::mem::uninitialized())); + let tmp: $id = + transmute($vpmax(transmute(self), ::mem::uninitialized())); tmp.extract(0) } } - } + }; } // Implementation for ARM + v7 + NEON using vpmin and vpmax (folding // minimum/maximum of adjacent pairs) for 64-bit wide four-element vectors. -#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] +#[cfg( + all(target_arch = "arm", target_feature = "v7", target_feature = "neon") +)] macro_rules! arm_64_x4_v7_neon_impl { ($id:ident, $vpmin:ident, $vpmax:ident) => { impl All for $id { #[inline] #[target_feature(enable = "v7,neon")] unsafe fn all(self) -> bool { - use ::coresimd::arch::arm::$vpmin; - use ::mem::transmute; + use coresimd::arch::arm::$vpmin; + use mem::transmute; // tmp = pmin((a, b, c, d), (-,-,-,-)) => (a, c, -, -) let tmp = $vpmin(transmute(self), ::mem::uninitialized()); // tmp = pmin((a, b, -, -), (-,-,-,-)) => (c, -, -, -).0 => c @@ -285,29 +317,31 @@ macro_rules! 
arm_64_x4_v7_neon_impl { #[inline] #[target_feature(enable = "v7,neon")] unsafe fn any(self) -> bool { - use ::coresimd::arch::arm::$vpmax; - use ::mem::transmute; + use coresimd::arch::arm::$vpmax; + use mem::transmute; // tmp = pmax((a, b, c, d), (-,-,-,-)) => (a, c, -, -) - let tmp = $vpmax(transmute(self), ::mem::uninitialized()); + let tmp = $vpmax(transmute(self), ::mem::uninitialized()); // tmp = pmax((a, b, -, -), (-,-,-,-)) => (c, -, -, -).0 => c let tmp: $id = transmute($vpmax(tmp, ::mem::uninitialized())); tmp.extract(0) } } - } + }; } // Implementation for ARM + v7 + NEON using vpmin and vpmax (folding // minimum/maximum of adjacent pairs) for 64-bit wide eight-element vectors. -#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] +#[cfg( + all(target_arch = "arm", target_feature = "v7", target_feature = "neon") +)] macro_rules! arm_64_x8_v7_neon_impl { ($id:ident, $vpmin:ident, $vpmax:ident) => { impl All for $id { #[inline] #[target_feature(enable = "v7,neon")] unsafe fn all(self) -> bool { - use ::coresimd::arch::arm::$vpmin; - use ::mem::transmute; + use coresimd::arch::arm::$vpmin; + use mem::transmute; // tmp = pmin( // (a, b, c, d, e, f, g, h), // (-, -, -, -, -, -, -, -) @@ -330,8 +364,8 @@ macro_rules! arm_64_x8_v7_neon_impl { #[inline] #[target_feature(enable = "v7,neon")] unsafe fn any(self) -> bool { - use ::coresimd::arch::arm::$vpmax; - use ::mem::transmute; + use coresimd::arch::arm::$vpmax; + use mem::transmute; // tmp = pmax( // (a, b, c, d, e, f, g, h), // (-, -, -, -, -, -, -, -) @@ -350,28 +384,32 @@ macro_rules! arm_64_x8_v7_neon_impl { tmp.extract(0) } } - } + }; } - // Implementation for ARM + v7 + NEON using vpmin and vpmax (folding // minimum/maximum of adjacent pairs) for 64-bit or 128-bit wide vectors with // more than two elements. 
-#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] +#[cfg( + all(target_arch = "arm", target_feature = "v7", target_feature = "neon") +)] macro_rules! arm_128_v7_neon_impl { ($id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => { impl All for $id { #[inline] #[target_feature(enable = "v7,neon")] unsafe fn all(self) -> bool { - use ::coresimd::arch::arm::$vpmin; - use ::mem::transmute; + use coresimd::arch::arm::$vpmin; + use mem::transmute; union U { halves: ($half, $half), - vec: $id + vec: $id, } let halves = U { vec: self }.halves; - let h: $half = transmute($vpmin(transmute(halves.0), transmute(halves.1))); + let h: $half = transmute($vpmin( + transmute(halves.0), + transmute(halves.1), + )); h.all() } } @@ -379,18 +417,21 @@ macro_rules! arm_128_v7_neon_impl { #[inline] #[target_feature(enable = "v7,neon")] unsafe fn any(self) -> bool { - use ::coresimd::arch::arm::$vpmax; - use ::mem::transmute; + use coresimd::arch::arm::$vpmax; + use mem::transmute; union U { halves: ($half, $half), - vec: $id + vec: $id, } let halves = U { vec: self }.halves; - let h: $half = transmute($vpmax(transmute(halves.0), transmute(halves.1))); + let h: $half = transmute($vpmax( + transmute(halves.0), + transmute(halves.1), + )); h.any() } } - } + }; } // Implementation for AArch64 + NEON using vmin and vmax (horizontal vector @@ -402,7 +443,7 @@ macro_rules! aarch64_128_neon_impl { #[inline] #[target_feature(enable = "neon")] unsafe fn all(self) -> bool { - use ::coresimd::arch::aarch64::$vmin; + use coresimd::arch::aarch64::$vmin; $vmin(::mem::transmute(self)) != 0 } } @@ -410,11 +451,11 @@ macro_rules! aarch64_128_neon_impl { #[inline] #[target_feature(enable = "neon")] unsafe fn any(self) -> bool { - use ::coresimd::arch::aarch64::$vmax; + use coresimd::arch::aarch64::$vmax; $vmax(::mem::transmute(self)) != 0 } } - } + }; } // Implementation for AArch64 + NEON using vmin and vmax (horizontal vector @@ -431,9 +472,12 @@ macro_rules! 
aarch64_64_neon_impl { unsafe fn all(self) -> bool { union U { halves: ($id, $id), - vec: $vec128 + vec: $vec128, } - U { halves: (self, self) }.vec.all() + U { + halves: (self, self), + }.vec + .all() } } impl Any for $id { @@ -442,12 +486,15 @@ macro_rules! aarch64_64_neon_impl { unsafe fn any(self) -> bool { union U { halves: ($id, $id), - vec: $vec128 + vec: $vec128, } - U { halves: (self, self) }.vec.any() + U { + halves: (self, self), + }.vec + .any() } } - } + }; } macro_rules! impl_mask_all_any { diff --git a/coresimd/ppsv/codegen/mod.rs b/coresimd/ppsv/codegen/mod.rs index a1e8c24f6b..004f7b66f5 100644 --- a/coresimd/ppsv/codegen/mod.rs +++ b/coresimd/ppsv/codegen/mod.rs @@ -5,8 +5,10 @@ pub mod wrapping; pub mod masks_reductions; -pub mod sqrt; pub mod abs; +pub mod cos; pub mod fma; pub mod sin; -pub mod cos; +pub mod sqrt; + +pub mod swap_bytes; diff --git a/coresimd/ppsv/codegen/sin.rs b/coresimd/ppsv/codegen/sin.rs index cf7f3dea20..c13ae31d34 100644 --- a/coresimd/ppsv/codegen/sin.rs +++ b/coresimd/ppsv/codegen/sin.rs @@ -1,9 +1,14 @@ //! Exact vector sin - +#![allow(dead_code)] use coresimd::simd::*; #[allow(improper_ctypes)] extern "C" { + #[link_name = "llvm.sin.f32"] + fn sin_f32(x: f32) -> f32; + #[link_name = "llvm.sin.f64"] + fn sin_f64(x: f64) -> f64; + #[link_name = "llvm.sin.v2f32"] fn sin_v2f32(x: f32x2) -> f32x2; #[link_name = "llvm.sin.v4f32"] @@ -24,14 +29,44 @@ pub(crate) trait FloatSin { fn sin(self) -> Self; } +trait RawSin { + fn raw_sin(self) -> Self; +} + +impl RawSin for f32 { + fn raw_sin(self) -> Self { + unsafe { sin_f32(self) } + } +} + +impl RawSin for f64 { + fn raw_sin(self) -> Self { + unsafe { sin_f64(self) } + } +} + macro_rules! 
impl_fsin { - ($id:ident: $fn:ident) => { + ($id:ident : $fn:ident) => { + #[cfg(not(target_arch = "s390x"))] impl FloatSin for $id { fn sin(self) -> Self { unsafe { $fn(self) } } } - } + + // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/501 + #[cfg(target_arch = "s390x")] + impl FloatSin for $id { + fn sin(self) -> Self { + let mut v = $id::splat(0.); + for i in 0..$id::lanes() { + v = v.replace(i, self.extract(i).raw_sin()) + } + v + } + } + + }; } impl_fsin!(f32x2: sin_v2f32); diff --git a/coresimd/ppsv/codegen/sqrt.rs b/coresimd/ppsv/codegen/sqrt.rs index 8e86650555..6a18589e71 100644 --- a/coresimd/ppsv/codegen/sqrt.rs +++ b/coresimd/ppsv/codegen/sqrt.rs @@ -1,9 +1,14 @@ //! Exact vector square-root - +#![allow(dead_code)] use coresimd::simd::*; #[allow(improper_ctypes)] extern "C" { + #[link_name = "llvm.sqrt.f32"] + fn sqrt_f32(x: f32) -> f32; + #[link_name = "llvm.sqrt.f64"] + fn sqrt_f64(x: f64) -> f64; + #[link_name = "llvm.sqrt.v2f32"] fn sqrt_v2f32(x: f32x2) -> f32x2; #[link_name = "llvm.sqrt.v4f32"] @@ -24,14 +29,43 @@ pub(crate) trait FloatSqrt { fn sqrt(self) -> Self; } +trait RawSqrt { + fn raw_sqrt(self) -> Self; +} + +impl RawSqrt for f32 { + fn raw_sqrt(self) -> Self { + unsafe { sqrt_f32(self) } + } +} + +impl RawSqrt for f64 { + fn raw_sqrt(self) -> Self { + unsafe { sqrt_f64(self) } + } +} + macro_rules! 
impl_fsqrt { - ($id:ident: $fn:ident) => { + ($id:ident : $fn:ident) => { + #[cfg(not(target_arch = "s390x"))] impl FloatSqrt for $id { fn sqrt(self) -> Self { unsafe { $fn(self) } } } - } + // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/501 + #[cfg(target_arch = "s390x")] + impl FloatSqrt for $id { + fn sqrt(self) -> Self { + let mut v = $id::splat(0.); + for i in 0..$id::lanes() { + v = v.replace(i, self.extract(i).raw_sqrt()); + } + v + } + } + + }; } impl_fsqrt!(f32x2: sqrt_v2f32); diff --git a/coresimd/ppsv/codegen/swap_bytes.rs b/coresimd/ppsv/codegen/swap_bytes.rs new file mode 100644 index 0000000000..a9df5c1fa2 --- /dev/null +++ b/coresimd/ppsv/codegen/swap_bytes.rs @@ -0,0 +1,141 @@ +//! swap bytes horizontally + +use coresimd::simd::*; + +pub(crate) trait SwapBytes { + fn swap_bytes(self) -> Self; +} + +macro_rules! impl_swap_bytes { + ($vec8:ident, $shuf:ident, $indices:expr, $id:ident) => ( + impl SwapBytes for $id { + fn swap_bytes(self) -> Self { + let vec8 = $vec8::from_bits(self); + let shuffled: $vec8 = unsafe { $shuf(vec8, vec8, $indices) }; + $id::from_bits(shuffled) + } + } + ); + + // bulk impl for a vector width + ($vec8:ident, $shuf:ident, $indices:expr, $($id:ident,)+) => ($( + impl_swap_bytes! { $vec8, $shuf, $indices, $id } + )+); +} + +impl_swap_bytes! { + u8x2, + simd_shuffle2, + [1, 0], + u8x2, i8x2, +} + +impl_swap_bytes! { + u8x4, + simd_shuffle4, + [3, 2, 1, 0], + u8x4, i8x4, + u16x2, i16x2, +} + +impl_swap_bytes! { + u8x8, + simd_shuffle8, + [7, 6, 5, 4, 3, 2, 1, 0], + u8x8, i8x8, + u16x4, i16x4, + u32x2, i32x2, +} + +impl_swap_bytes! { + u8x16, + simd_shuffle16, + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], + u8x16, i8x16, + u16x8, i16x8, + u32x4, i32x4, + u64x2, i64x2, +} + +impl_swap_bytes! 
{ + u8x32, + simd_shuffle32, + [ + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + ], + u8x32, i8x32, + u16x16, i16x16, + u32x8, i32x8, + u64x4, i64x4, +} + +impl_swap_bytes! { + u8x64, + simd_shuffle64, + [ + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, + 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + ], + u8x64, i8x64, + u16x32, i16x32, + u32x16, i32x16, + u64x8, i64x8, +} + +#[cfg(test)] +mod tests { + use super::*; + use std::mem; + + // testing larger vectors is less simple + #[test] + #[cfg(feature = "simd_support")] + fn swap_bytes_128() { + let x: u128 = 0x2d99787926d46932a4c1f32680f70c55; + let expected = x.swap_bytes(); + + let vec: u8x16 = unsafe { mem::transmute(x) }; + let actual = unsafe { mem::transmute(vec.swap_bytes()) }; + + assert_eq!(expected, actual); + } + + #[test] + #[cfg(feature = "simd_support")] + fn swap_bytes_64() { + let x: u64 = 0x2d99787926d46932; + let expected = x.swap_bytes(); + + let vec: u8x8 = unsafe { mem::transmute(x) }; + let actual = unsafe { mem::transmute(vec.swap_bytes()) }; + + assert_eq!(expected, actual); + } + + #[test] + #[cfg(feature = "simd_support")] + fn swap_bytes_32() { + let x: u32 = 0x2d997872; + let expected = x.swap_bytes(); + + let vec: u8x4 = unsafe { mem::transmute(x) }; + let actual = unsafe { mem::transmute(vec.swap_bytes()) }; + + assert_eq!(expected, actual); + } + + #[test] + #[cfg(feature = "simd_support")] + fn swap_bytes_16() { + let x: u16 = 0x2d99; + let expected = x.swap_bytes(); + + let vec: u8x2 = unsafe { mem::transmute(x) }; + let actual = unsafe { mem::transmute(vec.swap_bytes()) }; + + assert_eq!(expected, actual); + } +} diff --git a/coresimd/ppsv/mod.rs b/coresimd/ppsv/mod.rs index 4d5c92dad0..eb2ba49541 100644 --- a/coresimd/ppsv/mod.rs +++ 
b/coresimd/ppsv/mod.rs @@ -66,8 +66,12 @@ where U: FromBits, { // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/449 - #[cfg_attr(any(target_arch = "powerpc", target_arch = "powerpc64"), inline(always))] - #[cfg_attr(not(any(target_arch = "powerpc", target_arch = "powerpc64")), inline)] + #[cfg_attr( + any(target_arch = "powerpc", target_arch = "powerpc64"), inline(always) + )] + #[cfg_attr( + not(any(target_arch = "powerpc", target_arch = "powerpc64")), inline + )] fn into_bits(self) -> U { debug_assert!(::mem::size_of::() == ::mem::size_of::()); U::from_bits(self) @@ -77,8 +81,12 @@ where // FromBits (and thus IntoBits) is reflexive. impl FromBits for T { // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/449 - #[cfg_attr(any(target_arch = "powerpc", target_arch = "powerpc64"), inline(always))] - #[cfg_attr(not(any(target_arch = "powerpc", target_arch = "powerpc64")), inline)] + #[cfg_attr( + any(target_arch = "powerpc", target_arch = "powerpc64"), inline(always) + )] + #[cfg_attr( + not(any(target_arch = "powerpc", target_arch = "powerpc64")), inline + )] fn from_bits(t: Self) -> Self { t } diff --git a/coresimd/ppsv/v128.rs b/coresimd/ppsv/v128.rs index 85a18f7863..08593bcef7 100644 --- a/coresimd/ppsv/v128.rs +++ b/coresimd/ppsv/v128.rs @@ -110,41 +110,53 @@ macro_rules! 
from_bits_x86 { }; } -#[cfg(all(target_arch = "arm", target_feature = "neon", - target_feature = "v7"))] -use coresimd::arch::arm::{// FIXME: float16x8_t, - float32x4_t, - int16x8_t, - int32x4_t, - int64x2_t, - int8x16_t, - poly16x8_t, - poly8x16_t, - uint16x8_t, - uint32x4_t, - uint64x2_t, - uint8x16_t}; +#[cfg( + all(target_arch = "arm", target_feature = "neon", target_feature = "v7") +)] +use coresimd::arch::arm::{ + // FIXME: float16x8_t, + float32x4_t, + int16x8_t, + int32x4_t, + int64x2_t, + int8x16_t, + poly16x8_t, + poly8x16_t, + uint16x8_t, + uint32x4_t, + uint64x2_t, + uint8x16_t, +}; #[cfg(target_arch = "aarch64")] -use coresimd::arch::aarch64::{// FIXME: float16x8_t, - float32x4_t, - float64x2_t, - int16x8_t, - int32x4_t, - int64x2_t, - int8x16_t, - poly16x8_t, - poly8x16_t, - uint16x8_t, - uint32x4_t, - uint64x2_t, - uint8x16_t}; +use coresimd::arch::aarch64::{ + // FIXME: float16x8_t, + float32x4_t, + float64x2_t, + int16x8_t, + int32x4_t, + int64x2_t, + int8x16_t, + poly16x8_t, + poly8x16_t, + uint16x8_t, + uint32x4_t, + uint64x2_t, + uint8x16_t, +}; macro_rules! 
from_bits_arm { ($id:ident, $elem_ty:ident, $test_mod_arm:ident, $test_mod_a64:ident) => { - #[cfg(any(all(target_arch = "arm", target_feature = "neon", - target_feature = "v7"), - target_arch = "aarch64"))] + #[cfg( + any( + all( + target_arch = "arm", + target_feature = "neon", + target_feature = "v7" + ), + target_arch = "aarch64" + ) + )] impl_from_bits_!( $id: int8x16_t, uint8x16_t, @@ -182,12 +194,7 @@ impl_from_bits!( m8x16 ); from_bits_x86!(u64x2, u64, u64x2_from_bits_x86); -from_bits_arm!( - u64x2, - u64, - u64x2_from_bits_arm, - u64x2_from_bits_aarch64 -); +from_bits_arm!(u64x2, u64, u64x2_from_bits_arm, u64x2_from_bits_aarch64); impl_from_bits!( i64x2: i64, @@ -207,12 +214,7 @@ impl_from_bits!( m8x16 ); from_bits_x86!(i64x2, i64, i64x2_from_bits_x86); -from_bits_arm!( - i64x2, - i64, - i64x2_from_bits_arm, - i64x2_from_bits_aarch64 -); +from_bits_arm!(i64x2, i64, i64x2_from_bits_arm, i64x2_from_bits_aarch64); impl_from_bits!( f64x2: f64, @@ -232,12 +234,7 @@ impl_from_bits!( m8x16 ); from_bits_x86!(f64x2, f64, f64x2_from_bits_x86); -from_bits_arm!( - f64x2, - f64, - f64x2_from_bits_arm, - f64x2_from_bits_aarch64 -); +from_bits_arm!(f64x2, f64, f64x2_from_bits_arm, f64x2_from_bits_aarch64); impl_from_bits!( u32x4: u32, @@ -257,12 +254,7 @@ impl_from_bits!( m8x16 ); from_bits_x86!(u32x4, u32, u32x4_from_bits_x86); -from_bits_arm!( - u32x4, - u32, - u32x4_from_bits_arm, - u32x4_from_bits_aarch64 -); +from_bits_arm!(u32x4, u32, u32x4_from_bits_arm, u32x4_from_bits_aarch64); impl_from_bits!( i32x4: i32, @@ -282,12 +274,7 @@ impl_from_bits!( m8x16 ); from_bits_x86!(i32x4, i32, i32x4_from_bits_x86); -from_bits_arm!( - i32x4, - i32, - i32x4_from_bits_arm, - i32x4_from_bits_aarch64 -); +from_bits_arm!(i32x4, i32, i32x4_from_bits_arm, i32x4_from_bits_aarch64); impl_from_bits!( f32x4: f32, @@ -307,12 +294,7 @@ impl_from_bits!( m8x16 ); from_bits_x86!(f32x4, f32, f32x4_from_bits_x86); -from_bits_arm!( - f32x4, - f32, - f32x4_from_bits_arm, - f32x4_from_bits_aarch64 
-); +from_bits_arm!(f32x4, f32, f32x4_from_bits_arm, f32x4_from_bits_aarch64); impl_from_bits!( u16x8: u16, @@ -332,12 +314,7 @@ impl_from_bits!( m8x16 ); from_bits_x86!(u16x8, u16, u16x8_from_bits_x86); -from_bits_arm!( - u16x8, - u16, - u16x8_from_bits_arm, - u16x8_from_bits_aarch64 -); +from_bits_arm!(u16x8, u16, u16x8_from_bits_arm, u16x8_from_bits_aarch64); impl_from_bits!( i16x8: i16, @@ -357,12 +334,7 @@ impl_from_bits!( m8x16 ); from_bits_x86!(i16x8, i16, i16x8_from_bits_x86); -from_bits_arm!( - i16x8, - i16, - i16x8_from_bits_arm, - i16x8_from_bits_aarch64 -); +from_bits_arm!(i16x8, i16, i16x8_from_bits_arm, i16x8_from_bits_aarch64); impl_from_bits!( u8x16: u8, @@ -382,12 +354,7 @@ impl_from_bits!( m8x16 ); from_bits_x86!(u8x16, u8, u8x16_from_bits_x86); -from_bits_arm!( - u8x16, - u8, - u8x16_from_bits_arm, - u8x16_from_bits_aarch64 -); +from_bits_arm!(u8x16, u8, u8x16_from_bits_arm, u8x16_from_bits_aarch64); impl_from_bits!( i8x16: i8, @@ -407,12 +374,7 @@ impl_from_bits!( m8x16 ); from_bits_x86!(i8x16, i8, i8x16_from_bits_x86); -from_bits_arm!( - i8x16, - i8, - i8x16_from_bits_arm, - i8x16_from_bits_aarch64 -); +from_bits_arm!(i8x16, i8, i8x16_from_bits_arm, i8x16_from_bits_aarch64); impl_from!( f64x2: f64, @@ -552,31 +514,37 @@ impl_from!( m8x8 ); -impl_from!(u8x16: u8, u8x16_from, test_v128 | i32x16, u32x16, f32x16, m1x16, i16x16, u16x16, m16x16, i8x16, m8x16); -impl_from!(i8x16: i8, i8x16_from, test_v128 | i32x16, u32x16, f32x16, m1x16, i16x16, u16x16, m16x16, u8x16, m8x16); - -impl_from!(m8x16: i8, m8x16_from, test_v128 | m1x16, m16x16); - impl_from!( - m16x8: i16, - m16x8_from, - test_v128 | m1x8, - m32x8, - m8x8 + u8x16: u8, + u8x16_from, + test_v128 | i32x16, + u32x16, + f32x16, + m1x16, + i16x16, + u16x16, + m16x16, + i8x16, + m8x16 ); - impl_from!( - m32x4: i32, - m32x4_from, - test_v128 | m64x4, - m16x4, - m8x4 + i8x16: i8, + i8x16_from, + test_v128 | i32x16, + u32x16, + f32x16, + m1x16, + i16x16, + u16x16, + m16x16, + u8x16, + m8x16 ); 
-impl_from!( - m64x2: i64, - m64x2_from, - test_v128 | m32x2, - m16x2, - m8x2 -); +impl_from!(m8x16: i8, m8x16_from, test_v128 | m1x16, m16x16); + +impl_from!(m16x8: i16, m16x8_from, test_v128 | m1x8, m32x8, m8x8); + +impl_from!(m32x4: i32, m32x4_from, test_v128 | m64x4, m16x4, m8x4); + +impl_from!(m64x2: i64, m64x2_from, test_v128 | m32x2, m16x2, m8x2); diff --git a/coresimd/ppsv/v16.rs b/coresimd/ppsv/v16.rs index 8bc08452c4..a2baf8dfc5 100644 --- a/coresimd/ppsv/v16.rs +++ b/coresimd/ppsv/v16.rs @@ -57,10 +57,4 @@ impl_from!( m8x2 ); -impl_from!( - m8x2: i8, - m8x2_from, - test_v16 | m64x2, - m32x2, - m16x2 -); +impl_from!(m8x2: i8, m8x2_from, test_v16 | m64x2, m32x2, m16x2); diff --git a/coresimd/ppsv/v256.rs b/coresimd/ppsv/v256.rs index 849897d4ea..c68ec9118e 100644 --- a/coresimd/ppsv/v256.rs +++ b/coresimd/ppsv/v256.rs @@ -465,25 +465,8 @@ impl_from!( impl_from!(m8x32: i8, m8x32_from, test_v256 | m1x32); -impl_from!( - m16x16: i16, - m16x16_from, - test_v256 | m1x16, - m8x16 -); +impl_from!(m16x16: i16, m16x16_from, test_v256 | m1x16, m8x16); -impl_from!( - m32x8: i32, - m32x8_from, - test_v256 | m1x8, - m16x8, - m8x8 -); +impl_from!(m32x8: i32, m32x8_from, test_v256 | m1x8, m16x8, m8x8); -impl_from!( - m64x4: i64, - m64x4_from, - test_v256 | m32x4, - m16x4, - m8x4 -); +impl_from!(m64x4: i64, m64x4_from, test_v256 | m32x4, m16x4, m8x4); diff --git a/coresimd/ppsv/v32.rs b/coresimd/ppsv/v32.rs index 854837e9ba..ab56b5ad80 100644 --- a/coresimd/ppsv/v32.rs +++ b/coresimd/ppsv/v32.rs @@ -151,18 +151,6 @@ impl_from!( m8x4 ); -impl_from!( - m8x4: i8, - m8x4_from, - test_v32 | m64x4, - m32x4, - m16x4 -); +impl_from!(m8x4: i8, m8x4_from, test_v32 | m64x4, m32x4, m16x4); -impl_from!( - m16x2: i16, - m16x2_from, - test_v32 | m64x2, - m32x2, - m8x2 -); +impl_from!(m16x2: i16, m16x2_from, test_v32 | m64x2, m32x2, m8x2); diff --git a/coresimd/ppsv/v512.rs b/coresimd/ppsv/v512.rs index 7fd42175a9..6bea72c73b 100644 --- a/coresimd/ppsv/v512.rs +++ b/coresimd/ppsv/v512.rs 
@@ -446,17 +446,6 @@ impl_from!(u8x64: u8, u8x64_from, test_v512 | i8x64, m1x64); impl_from!(m1x32: i16, m1x32_from, test_v512 | m8x32); -impl_from!( - m1x16: i32, - m1x16_from, - test_v512 | m16x16, - m8x16 -); +impl_from!(m1x16: i32, m1x16_from, test_v512 | m16x16, m8x16); -impl_from!( - m1x8: i64, - m1x8_from, - test_v512 | m32x8, - m16x8, - m8x8 -); +impl_from!(m1x8: i64, m1x8_from, test_v512 | m32x8, m16x8, m8x8); diff --git a/coresimd/ppsv/v64.rs b/coresimd/ppsv/v64.rs index cfa56a234a..64a86b601d 100644 --- a/coresimd/ppsv/v64.rs +++ b/coresimd/ppsv/v64.rs @@ -83,41 +83,53 @@ macro_rules! from_bits_x86 { }; } -#[cfg(all(target_arch = "arm", target_feature = "neon", - target_feature = "v7"))] -use coresimd::arch::arm::{// FIXME: float16x4_t, - float32x2_t, - int16x4_t, - int32x2_t, - int64x1_t, - int8x8_t, - poly16x4_t, - poly8x8_t, - uint16x4_t, - uint32x2_t, - uint64x1_t, - uint8x8_t}; +#[cfg( + all(target_arch = "arm", target_feature = "neon", target_feature = "v7") +)] +use coresimd::arch::arm::{ + // FIXME: float16x4_t, + float32x2_t, + int16x4_t, + int32x2_t, + int64x1_t, + int8x8_t, + poly16x4_t, + poly8x8_t, + uint16x4_t, + uint32x2_t, + uint64x1_t, + uint8x8_t, +}; #[cfg(target_arch = "aarch64")] -use coresimd::arch::aarch64::{// FIXME: float16x4_t, - float32x2_t, - float64x1_t, - int16x4_t, - int32x2_t, - int64x1_t, - int8x8_t, - poly16x4_t, - poly8x8_t, - uint16x4_t, - uint32x2_t, - uint64x1_t, - uint8x8_t}; +use coresimd::arch::aarch64::{ + // FIXME: float16x4_t, + float32x2_t, + float64x1_t, + int16x4_t, + int32x2_t, + int64x1_t, + int8x8_t, + poly16x4_t, + poly8x8_t, + uint16x4_t, + uint32x2_t, + uint64x1_t, + uint8x8_t, +}; macro_rules! 
from_bits_arm { ($id:ident, $elem_ty:ident, $test_mod_arm:ident, $test_mod_a64:ident) => { - #[cfg(any(all(target_arch = "arm", target_feature = "neon", - target_feature = "v7"), - target_arch = "aarch64"))] + #[cfg( + any( + all( + target_arch = "arm", + target_feature = "neon", + target_feature = "v7" + ), + target_arch = "aarch64" + ) + )] impl_from_bits_!( $id: int64x1_t, uint64x1_t, @@ -151,12 +163,7 @@ impl_from_bits!( m8x8 ); from_bits_x86!(u32x2, u32, u32x2_from_bits_x86); -from_bits_arm!( - u32x2, - u32, - u32x2_from_bits_arm, - u32x2_from_bits_aarch64 -); +from_bits_arm!(u32x2, u32, u32x2_from_bits_arm, u32x2_from_bits_aarch64); impl_from_bits!( i32x2: i32, @@ -172,12 +179,7 @@ impl_from_bits!( m8x8 ); from_bits_x86!(i32x2, i32, i32x2_from_bits_x86); -from_bits_arm!( - i32x2, - i32, - i32x2_from_bits_arm, - i32x2_from_bits_aarch64 -); +from_bits_arm!(i32x2, i32, i32x2_from_bits_arm, i32x2_from_bits_aarch64); impl_from_bits!( f32x2: f32, @@ -193,12 +195,7 @@ impl_from_bits!( m8x8 ); from_bits_x86!(f32x2, f32, f32x2_from_bits_x86); -from_bits_arm!( - f32x2, - f32, - f32x2_from_bits_arm, - f32x2_from_bits_aarch64 -); +from_bits_arm!(f32x2, f32, f32x2_from_bits_arm, f32x2_from_bits_aarch64); impl_from_bits!( u16x4: u16, @@ -213,12 +210,7 @@ impl_from_bits!( m8x8 ); from_bits_x86!(u16x4, u16, u16x4_from_bits_x86); -from_bits_arm!( - u16x4, - u16, - u16x4_from_bits_arm, - u16x4_from_bits_aarch64 -); +from_bits_arm!(u16x4, u16, u16x4_from_bits_arm, u16x4_from_bits_aarch64); impl_from_bits!( i16x4: i16, @@ -233,12 +225,7 @@ impl_from_bits!( m8x8 ); from_bits_x86!(i16x4, i16, i16x4_from_bits_x86); -from_bits_arm!( - i16x4, - i16, - i16x4_from_bits_arm, - i16x4_from_bits_aarch64 -); +from_bits_arm!(i16x4, i16, i16x4_from_bits_arm, i16x4_from_bits_aarch64); impl_from_bits!( u8x8: u8, @@ -253,12 +240,7 @@ impl_from_bits!( m8x8 ); from_bits_x86!(u8x8, u8, u8x8_from_bits_x86); -from_bits_arm!( - u8x8, - u8, - u8x8_from_bits_arm, - u8x8_from_bits_aarch64 -); 
+from_bits_arm!(u8x8, u8, u8x8_from_bits_arm, u8x8_from_bits_aarch64); impl_from_bits!( i8x8: i8, @@ -273,12 +255,7 @@ impl_from_bits!( m8x8 ); from_bits_x86!(i8x8, i8, i8x8_from_bits_x86); -from_bits_arm!( - i8x8, - i8, - i8x8_from_bits_arm, - i8x8_from_bits_aarch64 -); +from_bits_arm!(i8x8, i8, i8x8_from_bits_arm, i8x8_from_bits_aarch64); impl_from!( f32x2: f32, @@ -404,26 +381,8 @@ impl_from!( m8x8 ); -impl_from!( - m8x8: i8, - m8x8_from, - test_v64 | m1x8, - m32x8, - m16x8 -); +impl_from!(m8x8: i8, m8x8_from, test_v64 | m1x8, m32x8, m16x8); -impl_from!( - m16x4: i16, - m16x4_from, - test_v64 | m64x4, - m32x4, - m8x4 -); +impl_from!(m16x4: i16, m16x4_from, test_v64 | m64x4, m32x4, m8x4); -impl_from!( - m32x2: i32, - m32x2_from, - test_v64 | m64x2, - m16x2, - m8x2 -); +impl_from!(m32x2: i32, m32x2_from, test_v64 | m64x2, m16x2, m8x2); diff --git a/coresimd/x86/avx.rs b/coresimd/x86/avx.rs index 7fe4c0a51d..f41ebb8974 100644 --- a/coresimd/x86/avx.rs +++ b/coresimd/x86/avx.rs @@ -1387,7 +1387,7 @@ pub unsafe fn _mm_permute_pd(a: __m128d, imm8: i32) -> __m128d { #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_permute2f128_ps( - a: __m256, b: __m256, imm8: i32 + a: __m256, b: __m256, imm8: i32, ) -> __m256 { macro_rules! call { ($imm8:expr) => { @@ -1407,7 +1407,7 @@ pub unsafe fn _mm256_permute2f128_ps( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_permute2f128_pd( - a: __m256d, b: __m256d, imm8: i32 + a: __m256d, b: __m256d, imm8: i32, ) -> __m256d { macro_rules! 
call { ($imm8:expr) => { @@ -1427,7 +1427,7 @@ pub unsafe fn _mm256_permute2f128_pd( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_permute2f128_si256( - a: __m256i, b: __m256i, imm8: i32 + a: __m256i, b: __m256i, imm8: i32, ) -> __m256i { let a = a.as_i32x8(); let b = b.as_i32x8(); @@ -1529,7 +1529,7 @@ pub unsafe fn _mm256_insertf128_ps(a: __m256, b: __m128, imm8: i32) -> __m256 { #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_insertf128_pd( - a: __m256d, b: __m128d, imm8: i32 + a: __m256d, b: __m128d, imm8: i32, ) -> __m256d { match imm8 & 1 { 0 => simd_shuffle4(a, _mm256_castpd128_pd256(b), [4, 5, 2, 3]), @@ -1547,7 +1547,7 @@ pub unsafe fn _mm256_insertf128_pd( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_insertf128_si256( - a: __m256i, b: __m128i, imm8: i32 + a: __m256i, b: __m128i, imm8: i32, ) -> __m256i { let b = _mm256_castsi128_si256(b).as_i64x4(); let dst: i64x4 = match imm8 & 1 { @@ -1567,11 +1567,7 @@ pub unsafe fn _mm256_insertf128_si256( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8, index: i32) -> __m256i { - mem::transmute(simd_insert( - a.as_i8x32(), - (index as u32) & 31, - i, - )) + mem::transmute(simd_insert(a.as_i8x32(), (index as u32) & 31, i)) } /// Copy `a` to result, and insert the 16-bit integer `i` into result @@ -1584,11 +1580,7 @@ pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8, index: i32) -> __m256i { #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_insert_epi16(a: __m256i, i: i16, index: i32) -> __m256i { - mem::transmute(simd_insert( - a.as_i16x16(), - (index as u32) & 15, - i, - )) + mem::transmute(simd_insert(a.as_i16x16(), (index as u32) & 15, i)) } /// Copy `a` to result, and insert the 32-bit integer `i` into result 
@@ -1790,7 +1782,7 @@ pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) { #[cfg_attr(test, assert_instr(vmaskmovpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskload_pd( - mem_addr: *const f64, mask: __m256i + mem_addr: *const f64, mask: __m256i, ) -> __m256d { maskloadpd256(mem_addr as *const i8, mask.as_i64x4()) } @@ -1804,7 +1796,7 @@ pub unsafe fn _mm256_maskload_pd( #[cfg_attr(test, assert_instr(vmaskmovpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskstore_pd( - mem_addr: *mut f64, mask: __m256i, a: __m256d + mem_addr: *mut f64, mask: __m256i, a: __m256d, ) { maskstorepd256(mem_addr as *mut i8, mask.as_i64x4(), a); } @@ -1844,7 +1836,7 @@ pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) { #[cfg_attr(test, assert_instr(vmaskmovps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskload_ps( - mem_addr: *const f32, mask: __m256i + mem_addr: *const f32, mask: __m256i, ) -> __m256 { maskloadps256(mem_addr as *const i8, mask.as_i32x8()) } @@ -1858,7 +1850,7 @@ pub unsafe fn _mm256_maskload_ps( #[cfg_attr(test, assert_instr(vmaskmovps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskstore_ps( - mem_addr: *mut f32, mask: __m256i, a: __m256 + mem_addr: *mut f32, mask: __m256i, a: __m256, ) { maskstoreps256(mem_addr as *mut i8, mask.as_i32x8(), a); } @@ -2383,7 +2375,7 @@ pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { // This intrinsic has no corresponding instruction. #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_set_ps( - a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32 + a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32, ) -> __m256 { _mm256_setr_ps(h, g, f, e, d, c, b, a) } @@ -2440,7 +2432,7 @@ pub unsafe fn _mm256_set_epi16( // This intrinsic has no corresponding instruction. 
#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_set_epi32( - e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32 + e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32, ) -> __m256i { _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0) } @@ -2477,7 +2469,7 @@ pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { // This intrinsic has no corresponding instruction. #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_setr_ps( - a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32 + a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32, ) -> __m256 { __m256(a, b, c, d, e, f, g, h) } @@ -2536,7 +2528,7 @@ pub unsafe fn _mm256_setr_epi16( // This intrinsic has no corresponding instruction. #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_setr_epi32( - e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32 + e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32, ) -> __m256i { mem::transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) } @@ -2950,7 +2942,7 @@ pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i { // This intrinsic has no corresponding instruction. #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_loadu2_m128( - hiaddr: *const f32, loaddr: *const f32 + hiaddr: *const f32, loaddr: *const f32, ) -> __m256 { let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr)); _mm256_insertf128_ps(a, _mm_loadu_ps(hiaddr), 1) @@ -2967,7 +2959,7 @@ pub unsafe fn _mm256_loadu2_m128( // This intrinsic has no corresponding instruction. 
#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_loadu2_m128d( - hiaddr: *const f64, loaddr: *const f64 + hiaddr: *const f64, loaddr: *const f64, ) -> __m256d { let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr)); _mm256_insertf128_pd(a, _mm_loadu_pd(hiaddr), 1) @@ -2983,7 +2975,7 @@ pub unsafe fn _mm256_loadu2_m128d( // This intrinsic has no corresponding instruction. #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_loadu2_m128i( - hiaddr: *const __m128i, loaddr: *const __m128i + hiaddr: *const __m128i, loaddr: *const __m128i, ) -> __m256i { let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr)); _mm256_insertf128_si256(a, _mm_loadu_si128(hiaddr), 1) @@ -3000,7 +2992,7 @@ pub unsafe fn _mm256_loadu2_m128i( // This intrinsic has no corresponding instruction. #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_storeu2_m128( - hiaddr: *mut f32, loaddr: *mut f32, a: __m256 + hiaddr: *mut f32, loaddr: *mut f32, a: __m256, ) { let lo = _mm256_castps256_ps128(a); _mm_storeu_ps(loaddr, lo); @@ -3019,7 +3011,7 @@ pub unsafe fn _mm256_storeu2_m128( // This intrinsic has no corresponding instruction. #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_storeu2_m128d( - hiaddr: *mut f64, loaddr: *mut f64, a: __m256d + hiaddr: *mut f64, loaddr: *mut f64, a: __m256d, ) { let lo = _mm256_castpd256_pd128(a); _mm_storeu_pd(loaddr, lo); @@ -3037,7 +3029,7 @@ pub unsafe fn _mm256_storeu2_m128d( // This intrinsic has no corresponding instruction. 
#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_storeu2_m128i( - hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i + hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i, ) { let lo = _mm256_castsi256_si128(a); _mm_storeu_si128(loaddr, lo); @@ -3500,20 +3492,11 @@ mod tests { let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.); let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.); let r = _mm256_blend_ps(a, b, 0x0); - assert_eq_m256( - r, - _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.), - ); + assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.)); let r = _mm256_blend_ps(a, b, 0x3); - assert_eq_m256( - r, - _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.), - ); + assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.)); let r = _mm256_blend_ps(a, b, 0xF); - assert_eq_m256( - r, - _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.), - ); + assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.)); } #[simd_test(enable = "avx")] @@ -3544,16 +3527,8 @@ mod tests { let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); let r = _mm256_dp_ps(a, b, 0xFF); - let e = _mm256_setr_ps( - 200., - 200., - 200., - 200., - 2387., - 2387., - 2387., - 2387., - ); + let e = + _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.); assert_eq_m256(r, e); } @@ -4234,9 +4209,7 @@ mod tests { pub data: [f64; 4], } let a = _mm256_set1_pd(7.0); - let mut mem = Memory { - data: [-1.0; 4], - }; + let mut mem = Memory { data: [-1.0; 4] }; _mm256_stream_pd(&mut mem.data[0] as *mut f64, a); for i in 0..4 { @@ -4251,9 +4224,7 @@ mod tests { pub data: [f32; 8], } let a = _mm256_set1_ps(7.0); - let mut mem = Memory { - data: [-1.0; 8], - }; + let mut mem = Memory { data: [-1.0; 8] }; _mm256_stream_ps(&mut mem.data[0] as *mut f32, a); for i in 0..8 { @@ -4534,10 +4505,7 @@ mod tests { #[simd_test(enable = "avx")] unsafe fn test_mm256_set_ps() { 
let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m256( - r, - _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.), - ); + assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.)); } #[simd_test(enable = "avx")] @@ -4595,10 +4563,7 @@ mod tests { #[simd_test(enable = "avx")] unsafe fn test_mm256_setr_ps() { let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m256( - r, - _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.), - ); + assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.)); } #[simd_test(enable = "avx")] diff --git a/coresimd/x86/avx2.rs b/coresimd/x86/avx2.rs index 982b293b88..c4ead715ae 100644 --- a/coresimd/x86/avx2.rs +++ b/coresimd/x86/avx2.rs @@ -413,7 +413,7 @@ pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blend_epi32( - a: __m256i, b: __m256i, imm8: i32 + a: __m256i, b: __m256i, imm8: i32, ) -> __m256i { let imm8 = (imm8 & 0xFF) as u8; let a = a.as_i32x8(); @@ -480,7 +480,7 @@ pub unsafe fn _mm256_blend_epi32( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blend_epi16( - a: __m256i, b: __m256i, imm8: i32 + a: __m256i, b: __m256i, imm8: i32, ) -> __m256i { let imm8 = (imm8 & 0xFF) as u8; let a = a.as_i16x16(); @@ -531,76 +531,20 @@ pub unsafe fn _mm256_blend_epi16( ) => { match (imm8 >> 6) & 0b11 { 0b00 => blend4!( - $a, - $b, - $c, - $d, - $e, - $f, - 6, - 7, - $a2, - $b2, - $c2, - $d2, - $e2, - $f2, - 14, - 15 + $a, $b, $c, $d, $e, $f, 6, 7, $a2, $b2, $c2, $d2, $e2, + $f2, 14, 15 ), 0b01 => blend4!( - $a, - $b, - $c, - $d, - $e, - $f, - 22, - 7, - $a2, - $b2, - $c2, - $d2, - $e2, - $f2, - 30, - 15 + $a, $b, $c, $d, $e, $f, 22, 7, $a2, $b2, $c2, $d2, + $e2, $f2, 30, 15 ), 0b10 => blend4!( - $a, - $b, - $c, - $d, - $e, - $f, - 6, - 23, - $a2, - $b2, - $c2, - $d2, - $e2, - $f2, - 14, - 31 + $a, $b, $c, $d, $e, $f, 6, 
23, $a2, $b2, $c2, $d2, + $e2, $f2, 14, 31 ), _ => blend4!( - $a, - $b, - $c, - $d, - $e, - $f, - 22, - 23, - $a2, - $b2, - $c2, - $d2, - $e2, - $f2, - 30, - 31 + $a, $b, $c, $d, $e, $f, 22, 23, $a2, $b2, $c2, $d2, + $e2, $f2, 30, 31 ), } }; @@ -618,60 +562,16 @@ pub unsafe fn _mm256_blend_epi16( ) => { match (imm8 >> 4) & 0b11 { 0b00 => blend3!( - $a, - $b, - $c, - $d, - 4, - 5, - $a2, - $b2, - $c2, - $d2, - 12, - 13 + $a, $b, $c, $d, 4, 5, $a2, $b2, $c2, $d2, 12, 13 ), 0b01 => blend3!( - $a, - $b, - $c, - $d, - 20, - 5, - $a2, - $b2, - $c2, - $d2, - 28, - 13 + $a, $b, $c, $d, 20, 5, $a2, $b2, $c2, $d2, 28, 13 ), 0b10 => blend3!( - $a, - $b, - $c, - $d, - 4, - 21, - $a2, - $b2, - $c2, - $d2, - 12, - 29 + $a, $b, $c, $d, 4, 21, $a2, $b2, $c2, $d2, 12, 29 ), _ => blend3!( - $a, - $b, - $c, - $d, - 20, - 21, - $a2, - $b2, - $c2, - $d2, - 28, - 29 + $a, $b, $c, $d, 20, 21, $a2, $b2, $c2, $d2, 28, 29 ), } }; @@ -703,13 +603,9 @@ pub unsafe fn _mm256_blend_epi16( #[cfg_attr(test, assert_instr(vpblendvb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blendv_epi8( - a: __m256i, b: __m256i, mask: __m256i + a: __m256i, b: __m256i, mask: __m256i, ) -> __m256i { - mem::transmute(pblendvb( - a.as_i8x32(), - b.as_i8x32(), - mask.as_i8x32(), - )) + mem::transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32())) } /// Broadcast the low packed 8-bit integer from `a` to all elements of @@ -1226,7 +1122,7 @@ pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_i32gather_epi32( - slice: *const i32, offsets: __m128i, scale: i32 + slice: *const i32, offsets: __m128i, scale: i32, ) -> __m128i { let zero = _mm_setzero_si128().as_i32x4(); let neg_one = _mm_set1_epi32(-1).as_i32x4(); @@ -1280,7 +1176,7 @@ pub unsafe fn _mm_mask_i32gather_epi32( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn 
_mm256_i32gather_epi32( - slice: *const i32, offsets: __m256i, scale: i32 + slice: *const i32, offsets: __m256i, scale: i32, ) -> __m256i { let zero = _mm256_setzero_si256().as_i32x8(); let neg_one = _mm256_set1_epi32(-1).as_i32x8(); @@ -1334,7 +1230,7 @@ pub unsafe fn _mm256_mask_i32gather_epi32( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_i32gather_ps( - slice: *const f32, offsets: __m128i, scale: i32 + slice: *const f32, offsets: __m128i, scale: i32, ) -> __m128 { let zero = _mm_setzero_ps(); let neg_one = _mm_set1_ps(-1.0); @@ -1360,7 +1256,7 @@ pub unsafe fn _mm_i32gather_ps( #[rustc_args_required_const(4)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mask_i32gather_ps( - src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32 + src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32, ) -> __m128 { let offsets = offsets.as_i32x4(); let slice = slice as *const i8; @@ -1383,7 +1279,7 @@ pub unsafe fn _mm_mask_i32gather_ps( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_i32gather_ps( - slice: *const f32, offsets: __m256i, scale: i32 + slice: *const f32, offsets: __m256i, scale: i32, ) -> __m256 { let zero = _mm256_setzero_ps(); let neg_one = _mm256_set1_ps(-1.0); @@ -1409,7 +1305,7 @@ pub unsafe fn _mm256_i32gather_ps( #[rustc_args_required_const(4)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mask_i32gather_ps( - src: __m256, slice: *const f32, offsets: __m256i, mask: __m256, scale: i32 + src: __m256, slice: *const f32, offsets: __m256i, mask: __m256, scale: i32, ) -> __m256 { let offsets = offsets.as_i32x8(); let slice = slice as *const i8; @@ -1432,7 +1328,7 @@ pub unsafe fn _mm256_mask_i32gather_ps( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_i32gather_epi64( - slice: *const i64, offsets: __m128i, scale: i32 
+ slice: *const i64, offsets: __m128i, scale: i32, ) -> __m128i { let zero = _mm_setzero_si128().as_i64x2(); let neg_one = _mm_set1_epi64x(-1).as_i64x2(); @@ -1486,7 +1382,7 @@ pub unsafe fn _mm_mask_i32gather_epi64( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_i32gather_epi64( - slice: *const i64, offsets: __m128i, scale: i32 + slice: *const i64, offsets: __m128i, scale: i32, ) -> __m256i { let zero = _mm256_setzero_si256().as_i64x4(); let neg_one = _mm256_set1_epi64x(-1).as_i64x4(); @@ -1540,7 +1436,7 @@ pub unsafe fn _mm256_mask_i32gather_epi64( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_i32gather_pd( - slice: *const f64, offsets: __m128i, scale: i32 + slice: *const f64, offsets: __m128i, scale: i32, ) -> __m128d { let zero = _mm_setzero_pd(); let neg_one = _mm_set1_pd(-1.0); @@ -1590,7 +1486,7 @@ pub unsafe fn _mm_mask_i32gather_pd( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_i32gather_pd( - slice: *const f64, offsets: __m128i, scale: i32 + slice: *const f64, offsets: __m128i, scale: i32, ) -> __m256d { let zero = _mm256_setzero_pd(); let neg_one = _mm256_set1_pd(-1.0); @@ -1640,7 +1536,7 @@ pub unsafe fn _mm256_mask_i32gather_pd( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_i64gather_epi32( - slice: *const i32, offsets: __m128i, scale: i32 + slice: *const i32, offsets: __m128i, scale: i32, ) -> __m128i { let zero = _mm_setzero_si128().as_i32x4(); let neg_one = _mm_set1_epi64x(-1).as_i32x4(); @@ -1694,7 +1590,7 @@ pub unsafe fn _mm_mask_i64gather_epi32( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_i64gather_epi32( - slice: *const i32, offsets: __m256i, scale: i32 + slice: *const i32, offsets: __m256i, scale: i32, ) -> __m128i { let zero = _mm_setzero_si128().as_i32x4(); let 
neg_one = _mm_set1_epi64x(-1).as_i32x4(); @@ -1748,7 +1644,7 @@ pub unsafe fn _mm256_mask_i64gather_epi32( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_i64gather_ps( - slice: *const f32, offsets: __m128i, scale: i32 + slice: *const f32, offsets: __m128i, scale: i32, ) -> __m128 { let zero = _mm_setzero_ps(); let neg_one = _mm_set1_ps(-1.0); @@ -1774,7 +1670,7 @@ pub unsafe fn _mm_i64gather_ps( #[rustc_args_required_const(4)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mask_i64gather_ps( - src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32 + src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32, ) -> __m128 { let offsets = offsets.as_i64x2(); let slice = slice as *const i8; @@ -1797,7 +1693,7 @@ pub unsafe fn _mm_mask_i64gather_ps( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_i64gather_ps( - slice: *const f32, offsets: __m256i, scale: i32 + slice: *const f32, offsets: __m256i, scale: i32, ) -> __m128 { let zero = _mm_setzero_ps(); let neg_one = _mm_set1_ps(-1.0); @@ -1823,7 +1719,7 @@ pub unsafe fn _mm256_i64gather_ps( #[rustc_args_required_const(4)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mask_i64gather_ps( - src: __m128, slice: *const f32, offsets: __m256i, mask: __m128, scale: i32 + src: __m128, slice: *const f32, offsets: __m256i, mask: __m128, scale: i32, ) -> __m128 { let offsets = offsets.as_i64x4(); let slice = slice as *const i8; @@ -1846,7 +1742,7 @@ pub unsafe fn _mm256_mask_i64gather_ps( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_i64gather_epi64( - slice: *const i64, offsets: __m128i, scale: i32 + slice: *const i64, offsets: __m128i, scale: i32, ) -> __m128i { let zero = _mm_setzero_si128().as_i64x2(); let neg_one = _mm_set1_epi64x(-1).as_i64x2(); @@ -1900,7 +1796,7 @@ pub unsafe fn 
_mm_mask_i64gather_epi64( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_i64gather_epi64( - slice: *const i64, offsets: __m256i, scale: i32 + slice: *const i64, offsets: __m256i, scale: i32, ) -> __m256i { let zero = _mm256_setzero_si256().as_i64x4(); let neg_one = _mm256_set1_epi64x(-1).as_i64x4(); @@ -1954,7 +1850,7 @@ pub unsafe fn _mm256_mask_i64gather_epi64( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_i64gather_pd( - slice: *const f64, offsets: __m128i, scale: i32 + slice: *const f64, offsets: __m128i, scale: i32, ) -> __m128d { let zero = _mm_setzero_pd(); let neg_one = _mm_set1_pd(-1.0); @@ -2004,7 +1900,7 @@ pub unsafe fn _mm_mask_i64gather_pd( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_i64gather_pd( - slice: *const f64, offsets: __m256i, scale: i32 + slice: *const f64, offsets: __m256i, scale: i32, ) -> __m256d { let zero = _mm256_setzero_pd(); let neg_one = _mm256_set1_pd(-1.0); @@ -2053,7 +1949,7 @@ pub unsafe fn _mm256_mask_i64gather_pd( #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_inserti128_si256( - a: __m256i, b: __m128i, imm8: i32 + a: __m256i, b: __m128i, imm8: i32, ) -> __m256i { let a = a.as_i64x4(); let b = _mm256_castsi128_si256(b).as_i64x4(); @@ -2101,12 +1997,9 @@ pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpmaskmovd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskload_epi32( - mem_addr: *const i32, mask: __m128i + mem_addr: *const i32, mask: __m128i, ) -> __m128i { - mem::transmute(maskloadd( - mem_addr as *const i8, - mask.as_i32x4(), - )) + mem::transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4())) } /// Load packed 32-bit integers from memory pointed by `mem_addr` using `mask` @@ -2119,12 +2012,9 @@ pub unsafe fn 
_mm_maskload_epi32( #[cfg_attr(test, assert_instr(vpmaskmovd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskload_epi32( - mem_addr: *const i32, mask: __m256i + mem_addr: *const i32, mask: __m256i, ) -> __m256i { - mem::transmute(maskloadd256( - mem_addr as *const i8, - mask.as_i32x8(), - )) + mem::transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8())) } /// Load packed 64-bit integers from memory pointed by `mem_addr` using `mask` @@ -2137,12 +2027,9 @@ pub unsafe fn _mm256_maskload_epi32( #[cfg_attr(test, assert_instr(vpmaskmovq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskload_epi64( - mem_addr: *const i64, mask: __m128i + mem_addr: *const i64, mask: __m128i, ) -> __m128i { - mem::transmute(maskloadq( - mem_addr as *const i8, - mask.as_i64x2(), - )) + mem::transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2())) } /// Load packed 64-bit integers from memory pointed by `mem_addr` using `mask` @@ -2155,12 +2042,9 @@ pub unsafe fn _mm_maskload_epi64( #[cfg_attr(test, assert_instr(vpmaskmovq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskload_epi64( - mem_addr: *const i64, mask: __m256i + mem_addr: *const i64, mask: __m256i, ) -> __m256i { - mem::transmute(maskloadq256( - mem_addr as *const i8, - mask.as_i64x4(), - )) + mem::transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4())) } /// Store packed 32-bit integers from `a` into memory pointed by `mem_addr` @@ -2173,13 +2057,9 @@ pub unsafe fn _mm256_maskload_epi64( #[cfg_attr(test, assert_instr(vpmaskmovd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskstore_epi32( - mem_addr: *mut i32, mask: __m128i, a: __m128i + mem_addr: *mut i32, mask: __m128i, a: __m128i, ) { - maskstored( - mem_addr as *mut i8, - mask.as_i32x4(), - a.as_i32x4(), - ) + maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4()) } /// Store packed 32-bit integers from `a` into memory pointed by `mem_addr` 
@@ -2192,13 +2072,9 @@ pub unsafe fn _mm_maskstore_epi32( #[cfg_attr(test, assert_instr(vpmaskmovd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskstore_epi32( - mem_addr: *mut i32, mask: __m256i, a: __m256i + mem_addr: *mut i32, mask: __m256i, a: __m256i, ) { - maskstored256( - mem_addr as *mut i8, - mask.as_i32x8(), - a.as_i32x8(), - ) + maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8()) } /// Store packed 64-bit integers from `a` into memory pointed by `mem_addr` @@ -2211,13 +2087,9 @@ pub unsafe fn _mm256_maskstore_epi32( #[cfg_attr(test, assert_instr(vpmaskmovq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskstore_epi64( - mem_addr: *mut i64, mask: __m128i, a: __m128i + mem_addr: *mut i64, mask: __m128i, a: __m128i, ) { - maskstoreq( - mem_addr as *mut i8, - mask.as_i64x2(), - a.as_i64x2(), - ) + maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2()) } /// Store packed 64-bit integers from `a` into memory pointed by `mem_addr` @@ -2230,13 +2102,9 @@ pub unsafe fn _mm_maskstore_epi64( #[cfg_attr(test, assert_instr(vpmaskmovq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskstore_epi64( - mem_addr: *mut i64, mask: __m256i, a: __m256i + mem_addr: *mut i64, mask: __m256i, a: __m256i, ) { - maskstoreq256( - mem_addr as *mut i8, - mask.as_i64x4(), - a.as_i64x4(), - ) + maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4()) } /// Compare packed 16-bit integers in `a` and `b`, and return the packed @@ -2410,7 +2278,7 @@ pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 { #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mpsadbw_epu8( - a: __m256i, b: __m256i, imm8: i32 + a: __m256i, b: __m256i, imm8: i32, ) -> __m256i { let a = a.as_u8x32(); let b = b.as_u8x32(); @@ -2656,7 +2524,7 @@ pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i { #[rustc_args_required_const(2)] 
#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_permute2x128_si256( - a: __m256i, b: __m256i, imm8: i32 + a: __m256i, b: __m256i, imm8: i32, ) -> __m256i { let a = a.as_i64x4(); let b = b.as_i64x4(); @@ -3559,16 +3427,23 @@ pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { /// # if is_x86_feature_detected!("avx2") { /// # #[target_feature(enable = "avx2")] /// # unsafe fn worker() { -/// let a = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -/// 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); -/// let b = _mm256_setr_epi8(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15, -/// -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31); +/// let a = _mm256_setr_epi8( +/// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, +/// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +/// ); +/// let b = _mm256_setr_epi8( +/// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, +/// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, +/// -30, -31, +/// ); /// /// let c = _mm256_unpackhi_epi8(a, b); /// -/// let expected = _mm256_setr_epi8(8,-8, 9,-9, 10,-10, 11,-11, 12,-12, 13,-13, -/// 14,-14, 15,-15, 24,-24, 25,-25, 26,-26, 27,-27, 28,-28, 29,-29, 30,-30, -/// 31,-31); +/// let expected = _mm256_setr_epi8( +/// 8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15, +/// 24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31, +/// -31, +/// ); /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0); /// /// # } @@ -3612,15 +3487,22 @@ pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { /// # if is_x86_feature_detected!("avx2") { /// # #[target_feature(enable = "avx2")] /// # unsafe fn worker() { -/// let a = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -/// 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); -/// let b = 
_mm256_setr_epi8(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15, -/// -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31); +/// let a = _mm256_setr_epi8( +/// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, +/// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +/// ); +/// let b = _mm256_setr_epi8( +/// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, +/// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, +/// -30, -31, +/// ); /// /// let c = _mm256_unpacklo_epi8(a, b); /// -/// let expected = _mm256_setr_epi8(0, 0, 1,-1, 2,-2, 3,-3, 4,-4, 5,-5, 6,-6, 7,-7, -/// 16,-16, 17,-17, 18,-18, 19,-19, 20,-20, 21,-21, 22,-22, 23,-23); +/// let expected = _mm256_setr_epi8( +/// 0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17, +/// -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23, +/// ); /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0); /// /// # } @@ -3664,13 +3546,18 @@ pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { /// # if is_x86_feature_detected!("avx2") { /// # #[target_feature(enable = "avx2")] /// # unsafe fn worker() { -/// let a = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -/// let b = _mm256_setr_epi16(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15); +/// let a = _mm256_setr_epi16( +/// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +/// ); +/// let b = _mm256_setr_epi16( +/// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, +/// ); /// /// let c = _mm256_unpackhi_epi16(a, b); /// -/// let expected = _mm256_setr_epi16(4,-4, 5,-5, 6,-6, 7,-7, 12,-12, 13,-13, 14,-14, -/// 15,-15); +/// let expected = _mm256_setr_epi16( +/// 4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15, +/// ); /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0); /// /// # } @@ -3688,9 +3575,7 @@ pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, 
b: __m256i) -> __m256i { let r: i16x16 = simd_shuffle16( a.as_i16x16(), b.as_i16x16(), - [ - 4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31 - ], + [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31], ); mem::transmute(r) } @@ -3715,13 +3600,18 @@ pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { /// # #[target_feature(enable = "avx2")] /// # unsafe fn worker() { /// -/// let a = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -/// let b = _mm256_setr_epi16(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15); +/// let a = _mm256_setr_epi16( +/// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +/// ); +/// let b = _mm256_setr_epi16( +/// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, +/// ); /// /// let c = _mm256_unpacklo_epi16(a, b); /// -/// let expected = _mm256_setr_epi16(0, 0, 1,-1, 2,-2, 3,-3, 8,-8, 9,-9, 10,-10, -/// 11,-11); +/// let expected = _mm256_setr_epi16( +/// 0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11, +/// ); /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0); /// /// # } @@ -3739,9 +3629,7 @@ pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { let r: i16x16 = simd_shuffle16( a.as_i16x16(), b.as_i16x16(), - [ - 0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27 - ], + [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27], ); mem::transmute(r) } @@ -3766,11 +3654,11 @@ pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { /// # #[target_feature(enable = "avx2")] /// # unsafe fn worker() { /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); -/// let b = _mm256_setr_epi32(0,-1,-2,-3,-4,-5,-6,-7); +/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7); /// /// let c = _mm256_unpackhi_epi32(a, b); /// -/// let expected = _mm256_setr_epi32(2,-2, 3,-3, 6,-6, 7,-7); +/// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7); /// 
assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0); /// /// # } @@ -3813,11 +3701,11 @@ pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { /// # #[target_feature(enable = "avx2")] /// # unsafe fn worker() { /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); -/// let b = _mm256_setr_epi32(0,-1,-2,-3,-4,-5,-6,-7); +/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7); /// /// let c = _mm256_unpacklo_epi32(a, b); /// -/// let expected = _mm256_setr_epi32(0, 0, 1,-1, 4,-4, 5,-5); +/// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5); /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0); /// /// # } @@ -3832,11 +3720,8 @@ pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vunpcklps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { - let r: i32x8 = simd_shuffle8( - a.as_i32x8(), - b.as_i32x8(), - [0, 8, 1, 9, 4, 12, 5, 13], - ); + let r: i32x8 = + simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]); mem::transmute(r) } @@ -3860,11 +3745,11 @@ pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { /// # #[target_feature(enable = "avx2")] /// # unsafe fn worker() { /// let a = _mm256_setr_epi64x(0, 1, 2, 3); -/// let b = _mm256_setr_epi64x(0,-1,-2,-3); +/// let b = _mm256_setr_epi64x(0, -1, -2, -3); /// /// let c = _mm256_unpackhi_epi64(a, b); /// -/// let expected = _mm256_setr_epi64x(1,-1, 3,-3); +/// let expected = _mm256_setr_epi64x(1, -1, 3, -3); /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0); /// /// # } @@ -3903,11 +3788,11 @@ pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { /// # #[target_feature(enable = "avx2")] /// # unsafe fn worker() { /// let a = _mm256_setr_epi64x(0, 1, 2, 3); -/// let b = _mm256_setr_epi64x(0,-1,-2,-3); +/// let b = _mm256_setr_epi64x(0, -1, -2, 
-3); /// /// let c = _mm256_unpacklo_epi64(a, b); /// -/// let expected = _mm256_setr_epi64x(0, 0, 2,-2); +/// let expected = _mm256_setr_epi64x(0, 0, 2, -2); /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0); /// /// # } @@ -4183,35 +4068,35 @@ extern "C" { fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4; #[link_name = "llvm.x86.avx2.gather.d.d"] fn pgatherdd( - src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8 + src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8, ) -> i32x4; #[link_name = "llvm.x86.avx2.gather.d.d.256"] fn vpgatherdd( - src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8 + src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8, ) -> i32x8; #[link_name = "llvm.x86.avx2.gather.d.q"] fn pgatherdq( - src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8 + src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8, ) -> i64x2; #[link_name = "llvm.x86.avx2.gather.d.q.256"] fn vpgatherdq( - src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8 + src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8, ) -> i64x4; #[link_name = "llvm.x86.avx2.gather.q.d"] fn pgatherqd( - src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8 + src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8, ) -> i32x4; #[link_name = "llvm.x86.avx2.gather.q.d.256"] fn vpgatherqd( - src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8 + src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8, ) -> i32x4; #[link_name = "llvm.x86.avx2.gather.q.q"] fn pgatherqq( - src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8 + src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8, ) -> i64x2; #[link_name = "llvm.x86.avx2.gather.q.q.256"] fn vpgatherqq( - src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8 + src: i64x4, slice: *const i8, 
offsets: i64x4, mask: i64x4, scale: i8, ) -> i64x4; #[link_name = "llvm.x86.avx2.gather.d.pd"] fn pgatherdpd( @@ -4235,19 +4120,19 @@ extern "C" { ) -> __m256d; #[link_name = "llvm.x86.avx2.gather.d.ps"] fn pgatherdps( - src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8 + src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8, ) -> __m128; #[link_name = "llvm.x86.avx2.gather.d.ps.256"] fn vpgatherdps( - src: __m256, slice: *const i8, offsets: i32x8, mask: __m256, scale: i8 + src: __m256, slice: *const i8, offsets: i32x8, mask: __m256, scale: i8, ) -> __m256; #[link_name = "llvm.x86.avx2.gather.q.ps"] fn pgatherqps( - src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8 + src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8, ) -> __m128; #[link_name = "llvm.x86.avx2.gather.q.ps.256"] fn vpgatherqps( - src: __m128, slice: *const i8, offsets: i64x4, mask: __m128, scale: i8 + src: __m128, slice: *const i8, offsets: i64x4, mask: __m128, scale: i8, ) -> __m128; #[link_name = "llvm.x86.avx2.psll.dq"] fn vpslldq(a: i64x4, b: i32) -> i64x4; @@ -4718,10 +4603,7 @@ mod tests { 7, 6, 5, 4, 3, 2, 1, 0, ); let r = _mm256_cmpeq_epi8(a, b); - assert_eq_m256i( - r, - _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 2), - ); + assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 2)); } #[simd_test(enable = "avx2")] @@ -4737,10 +4619,7 @@ mod tests { 7, 6, 5, 4, 3, 2, 1, 0, ); let r = _mm256_cmpeq_epi16(a, b); - assert_eq_m256i( - r, - _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 2), - ); + assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 2)); } #[simd_test(enable = "avx2")] @@ -4758,10 +4637,7 @@ mod tests { let a = _mm256_setr_epi64x(0, 1, 2, 3); let b = _mm256_setr_epi64x(3, 2, 2, 0); let r = _mm256_cmpeq_epi64(a, b); - assert_eq_m256i( - r, - _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 2), - ); + assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 2)); } 
#[simd_test(enable = "avx2")] @@ -4769,10 +4645,7 @@ mod tests { let a = _mm256_insert_epi8(_mm256_set1_epi8(0), 5, 0); let b = _mm256_set1_epi8(0); let r = _mm256_cmpgt_epi8(a, b); - assert_eq_m256i( - r, - _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 0), - ); + assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 0)); } #[simd_test(enable = "avx2")] @@ -4780,10 +4653,7 @@ mod tests { let a = _mm256_insert_epi16(_mm256_set1_epi16(0), 5, 0); let b = _mm256_set1_epi16(0); let r = _mm256_cmpgt_epi16(a, b); - assert_eq_m256i( - r, - _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 0), - ); + assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 0)); } #[simd_test(enable = "avx2")] @@ -4791,10 +4661,7 @@ mod tests { let a = _mm256_insert_epi32(_mm256_set1_epi32(0), 5, 0); let b = _mm256_set1_epi32(0); let r = _mm256_cmpgt_epi32(a, b); - assert_eq_m256i( - r, - _mm256_insert_epi32(_mm256_set1_epi32(0), !0, 0), - ); + assert_eq_m256i(r, _mm256_insert_epi32(_mm256_set1_epi32(0), !0, 0)); } #[simd_test(enable = "avx2")] @@ -4802,10 +4669,7 @@ mod tests { let a = _mm256_insert_epi64(_mm256_set1_epi64x(0), 5, 0); let b = _mm256_set1_epi64x(0); let r = _mm256_cmpgt_epi64(a, b); - assert_eq_m256i( - r, - _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 0), - ); + assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 0)); } #[simd_test(enable = "avx2")] @@ -5997,16 +5861,7 @@ mod tests { ); assert_eq_m256( r, - _mm256_setr_ps( - 0.0, - 16.0, - 64.0, - 256.0, - 256.0, - 256.0, - 256.0, - 256.0, - ), + _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0), ); } diff --git a/coresimd/x86/bmi1.rs b/coresimd/x86/bmi1.rs index 953e3d9135..a84763b7e8 100644 --- a/coresimd/x86/bmi1.rs +++ b/coresimd/x86/bmi1.rs @@ -21,17 +21,14 @@ use stdsimd_test::assert_instr; #[cfg_attr(test, assert_instr(bextr))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { - _bextr2_u32( - a, - (start & 
0xff_u32) | ((len & 0xff_u32) << 8_u32), - ) + _bextr2_u32(a, (start & 0xff_u32) | ((len & 0xff_u32) << 8_u32)) } /// Extracts bits of `a` specified by `control` into /// the least significant bits of the result. /// -/// Bits `[7,0]` of `control` specify the index to the first bit in the range to -/// be extracted, and bits `[15,8]` specify the length of the range. +/// Bits `[7,0]` of `control` specify the index to the first bit in the range +/// to be extracted, and bits `[15,8]` specify the length of the range. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr2_u32) #[inline] diff --git a/coresimd/x86/cpuid.rs b/coresimd/x86/cpuid.rs index 7e000625ce..6217d8824a 100644 --- a/coresimd/x86/cpuid.rs +++ b/coresimd/x86/cpuid.rs @@ -86,26 +86,58 @@ pub fn has_cpuid() -> bool { } #[cfg(target_arch = "x86")] { - use coresimd::x86::{__readeflags, __writeeflags}; + // Optimization for i586 and i686 Rust targets which have SSE enabled + // and support cpuid: + #[cfg(target_feature = "sse")] { + true + } - // On `x86` the `cpuid` instruction is not always available. - // This follows the approach indicated in: - // http://wiki.osdev.org/CPUID#Checking_CPUID_availability + // If SSE is not enabled, detect whether cpuid is available: + #[cfg(not(target_feature = "sse"))] unsafe { - // Read EFLAGS: - let eflags: u32 = __readeflags(); - - // Invert the ID bit in EFLAGS: - let eflags_mod: u32 = eflags | 0x0020_0000; - - // Store the modified EFLAGS (ID bit may or may not be inverted) - __writeeflags(eflags_mod); - - // Read EFLAGS again: - let eflags_after: u32 = __readeflags(); - - // Check if the ID bit changed: - eflags_after != eflags + // On `x86` the `cpuid` instruction is not always available. 
+ // This follows the approach indicated in: + // http://wiki.osdev.org/CPUID#Checking_CPUID_availability + // https://software.intel.com/en-us/articles/using-cpuid-to-detect-the-presence-of-sse-41-and-sse-42-instruction-sets/ + // which detects whether `cpuid` is available by checking whether the 21st bit of the EFLAGS register is modifiable or not. + // If it is, then `cpuid` is available. + let result: u32; + let _temp: u32; + asm!(r#" + # Read eflags into $0 and copy it into $1: + pushfd + pop $0 + mov $1, $0 + # Flip 21st bit of $0. + xor $0, 0x200000 + # Set eflags to the value of $0 + # + # Bit 21st can only be modified if cpuid is available + push $0 + popfd # A + # Read eflags into $0: + pushfd # B + pop $0 + # xor with the original eflags sets the bits that + # have been modified: + xor $0, $1 + "# + : "=r"(result), "=r"(_temp) + : + : "cc", "memory" + : "intel"); + // There is a race between popfd (A) and pushfd (B) + // where other bits beyond 21st may have been modified due to + // interrupts, a debugger stepping through the asm, etc. + // + // Therefore, explicitly check whether the 21st bit + // was modified or not. + // + // If the result is zero, the cpuid bit was not modified. 
+ // If the result is 0x200000 (non-zero), then the cpuid + // bit was correctly modified and the CPU supports the cpuid + // instruction: + (result & 0x200000) != 0 } } } @@ -138,17 +170,8 @@ mod tests { assert!(cpuid::has_cpuid()); } - #[cfg(target_arch = "x86")] #[test] - fn test_has_cpuid() { - unsafe { - let before = __readeflags(); - - if cpuid::has_cpuid() { - assert!(before != __readeflags()); - } else { - assert!(before == __readeflags()); - } - } + fn test_has_cpuid_idempotent() { + assert_eq!(cpuid::has_cpuid(), cpuid::has_cpuid()); } } diff --git a/coresimd/x86/eflags.rs b/coresimd/x86/eflags.rs index 0a7ba919a7..42380e6e38 100644 --- a/coresimd/x86/eflags.rs +++ b/coresimd/x86/eflags.rs @@ -6,6 +6,8 @@ #[cfg(target_arch = "x86")] #[inline(always)] #[stable(feature = "simd_x86", since = "1.27.0")] +#[rustc_deprecated(since = "1.29.0", reason = "See issue #51810 - use inline assembly instead")] +#[doc(hidden)] pub unsafe fn __readeflags() -> u32 { let eflags: u32; asm!("pushfd; popl $0" : "=r"(eflags) : : : "volatile"); @@ -18,6 +20,8 @@ pub unsafe fn __readeflags() -> u32 { #[cfg(target_arch = "x86_64")] #[inline(always)] #[stable(feature = "simd_x86", since = "1.27.0")] +#[rustc_deprecated(since = "1.29.0", reason = "See issue #51810 - use inline assembly instead")] +#[doc(hidden)] pub unsafe fn __readeflags() -> u64 { let eflags: u64; asm!("pushfq; popq $0" : "=r"(eflags) : : : "volatile"); @@ -30,6 +34,8 @@ pub unsafe fn __readeflags() -> u64 { #[cfg(target_arch = "x86")] #[inline(always)] #[stable(feature = "simd_x86", since = "1.27.0")] +#[rustc_deprecated(since = "1.29.0", reason = "See issue #51810 - use inline assembly instead")] +#[doc(hidden)] pub unsafe fn __writeeflags(eflags: u32) { asm!("pushl $0; popfd" : : "r"(eflags) : "cc", "flags" : "volatile"); } @@ -40,6 +46,8 @@ pub unsafe fn __writeeflags(eflags: u32) { #[cfg(target_arch = "x86_64")] #[inline(always)] #[stable(feature = "simd_x86", since = "1.27.0")] +#[rustc_deprecated(since = 
"1.29.0", reason = "See issue #51810 - use inline assembly instead")] +#[doc(hidden)] pub unsafe fn __writeeflags(eflags: u64) { asm!("pushq $0; popfq" : : "r"(eflags) : "cc", "flags" : "volatile"); } @@ -49,6 +57,7 @@ mod tests { use coresimd::x86::*; #[test] + #[allow(deprecated)] fn test_eflags() { unsafe { // reads eflags, writes them back, reads them again, diff --git a/coresimd/x86/fxsr.rs b/coresimd/x86/fxsr.rs index b70e84eaf9..91261c721c 100644 --- a/coresimd/x86/fxsr.rs +++ b/coresimd/x86/fxsr.rs @@ -58,7 +58,7 @@ pub unsafe fn _fxrstor(mem_addr: *const u8) { #[cfg(test)] mod tests { use coresimd::x86::*; - use std::{fmt, cmp::PartialEq}; + use std::{cmp::PartialEq, fmt}; use stdsimd_test::simd_test; #[repr(align(16))] diff --git a/coresimd/x86/mmx.rs b/coresimd/x86/mmx.rs index 6e6b0b7d4f..c58a97c3cd 100644 --- a/coresimd/x86/mmx.rs +++ b/coresimd/x86/mmx.rs @@ -380,7 +380,7 @@ pub unsafe fn _mm_set_pi32(e1: i32, e0: i32) -> __m64 { #[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_set_pi8( - e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8 + e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8, ) -> __m64 { _mm_setr_pi8(e0, e1, e2, e3, e4, e5, e6, e7) } @@ -426,7 +426,7 @@ pub unsafe fn _mm_setr_pi32(e0: i32, e1: i32) -> __m64 { #[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_setr_pi8( - e0: i8, e1: i8, e2: i8, e3: i8, e4: i8, e5: i8, e6: i8, e7: i8 + e0: i8, e1: i8, e2: i8, e3: i8, e4: i8, e5: i8, e6: i8, e7: i8, ) -> __m64 { mem::transmute(i8x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) } @@ -514,12 +514,8 @@ mod tests { -30001, i16::max_value() - 1, ); - let e = _mm_setr_pi16( - i16::min_value(), - 30000, - -30000, - i16::max_value(), - ); + let e = + _mm_setr_pi16(i16::min_value(), 30000, -30000, i16::max_value()); assert_eq_m64(e, _mm_add_pi16(a, b)); assert_eq_m64(e, _m_paddw(a, b)); } @@ -537,16 +533,8 @@ mod tests { unsafe fn test_mm_adds_pi8() { let a = _mm_setr_pi8(-100, -1, 1, 100, -1, 0, 1, 0); 
let b = _mm_setr_pi8(-100, 1, -1, 100, 0, -1, 0, 1); - let e = _mm_setr_pi8( - i8::min_value(), - 0, - 0, - i8::max_value(), - -1, - -1, - 1, - 1, - ); + let e = + _mm_setr_pi8(i8::min_value(), 0, 0, i8::max_value(), -1, -1, 1, 1); assert_eq_m64(e, _mm_adds_pi8(a, b)); assert_eq_m64(e, _m_paddsb(a, b)); } diff --git a/coresimd/x86/mod.rs b/coresimd/x86/mod.rs index aba3d137c0..31d950a2c6 100644 --- a/coresimd/x86/mod.rs +++ b/coresimd/x86/mod.rs @@ -276,13 +276,13 @@ types! { /// use std::arch::x86_64::*; /// /// # fn main() { - /// # #[target_feature(enable = "sse")] + /// # #[target_feature(enable = "avx")] /// # unsafe fn foo() { /// let eight_zeros = _mm256_setzero_ps(); /// let eight_ones = _mm256_set1_ps(1.0); /// let eight_floats = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); /// # } - /// # if is_x86_feature_detected!("sse") { unsafe { foo() } } + /// # if is_x86_feature_detected!("avx") { unsafe { foo() } } /// # } /// ``` #[stable(feature = "simd_x86", since = "1.27.0")] @@ -444,11 +444,12 @@ impl m256iExt for __m256i { } } -use coresimd::simd::{f32x2, f32x4, f32x8, f64x2, f64x4, i16x16, i16x4, i16x8, - i32x2, i32x4, i32x8, i64x2, i64x4, i8x16, i8x32, i8x8, - m16x16, m16x4, m16x8, m32x2, m32x4, m32x8, m64x2, m64x4, - m8x16, m8x32, m8x8, u16x16, u16x4, u16x8, u32x2, u32x4, - u32x8, u64x2, u64x4, u8x16, u8x32, u8x8}; +use coresimd::simd::{ + f32x2, f32x4, f32x8, f64x2, f64x4, i16x16, i16x4, i16x8, i32x2, i32x4, + i32x8, i64x2, i64x4, i8x16, i8x32, i8x8, m16x16, m16x4, m16x8, m32x2, + m32x4, m32x8, m64x2, m64x4, m8x16, m8x32, m8x8, u16x16, u16x4, u16x8, + u32x2, u32x4, u32x8, u64x2, u64x4, u8x16, u8x32, u8x8, +}; impl_from_bits_!( __m64: u32x2, diff --git a/coresimd/x86/pclmulqdq.rs b/coresimd/x86/pclmulqdq.rs index e33928f06d..987ac89d79 100644 --- a/coresimd/x86/pclmulqdq.rs +++ b/coresimd/x86/pclmulqdq.rs @@ -25,20 +25,25 @@ extern "C" { /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128) #[inline] #[target_feature(enable = "pclmulqdq")] -#[cfg_attr(all(test, not(target_os = "linux")), - assert_instr(pclmulqdq, imm8 = 0))] -#[cfg_attr(all(test, target_os = "linux"), - assert_instr(pclmullqlqdq, imm8 = 0))] -#[cfg_attr(all(test, target_os = "linux"), - assert_instr(pclmulhqlqdq, imm8 = 1))] -#[cfg_attr(all(test, target_os = "linux"), - assert_instr(pclmullqhqdq, imm8 = 16))] -#[cfg_attr(all(test, target_os = "linux"), - assert_instr(pclmulhqhqdq, imm8 = 17))] +#[cfg_attr( + all(test, not(target_os = "linux")), assert_instr(pclmulqdq, imm8 = 0) +)] +#[cfg_attr( + all(test, target_os = "linux"), assert_instr(pclmullqlqdq, imm8 = 0) +)] +#[cfg_attr( + all(test, target_os = "linux"), assert_instr(pclmulhqlqdq, imm8 = 1) +)] +#[cfg_attr( + all(test, target_os = "linux"), assert_instr(pclmullqhqdq, imm8 = 16) +)] +#[cfg_attr( + all(test, target_os = "linux"), assert_instr(pclmulhqhqdq, imm8 = 17) +)] #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_clmulepi64_si128( - a: __m128i, b: __m128i, imm8: i32 + a: __m128i, b: __m128i, imm8: i32, ) -> __m128i { macro_rules! call { ($imm8:expr) => { diff --git a/coresimd/x86/rdrand.rs b/coresimd/x86/rdrand.rs index a20cee0747..2b900837fd 100644 --- a/coresimd/x86/rdrand.rs +++ b/coresimd/x86/rdrand.rs @@ -1,4 +1,3 @@ - //! RDRAND and RDSEED instructions for returning random numbers from an Intel //! on-chip hardware random number generator which has been seeded by an //! on-chip entropy source. 
diff --git a/coresimd/x86/sha.rs b/coresimd/x86/sha.rs index 344cb43991..f6546fa1b4 100644 --- a/coresimd/x86/sha.rs +++ b/coresimd/x86/sha.rs @@ -75,7 +75,7 @@ pub unsafe fn _mm_sha1nexte_epu32(a: __m128i, b: __m128i) -> __m128i { #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sha1rnds4_epu32( - a: __m128i, b: __m128i, func: i32 + a: __m128i, b: __m128i, func: i32, ) -> __m128i { let a = a.as_i32x4(); let b = b.as_i32x4(); @@ -126,13 +126,9 @@ pub unsafe fn _mm_sha256msg2_epu32(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(sha256rnds2))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sha256rnds2_epu32( - a: __m128i, b: __m128i, k: __m128i + a: __m128i, b: __m128i, k: __m128i, ) -> __m128i { - mem::transmute(sha256rnds2( - a.as_i32x4(), - b.as_i32x4(), - k.as_i32x4(), - )) + mem::transmute(sha256rnds2(a.as_i32x4(), b.as_i32x4(), k.as_i32x4())) } #[cfg(test)] diff --git a/coresimd/x86/sse.rs b/coresimd/x86/sse.rs index a51f3f1423..c53b46a774 100644 --- a/coresimd/x86/sse.rs +++ b/coresimd/x86/sse.rs @@ -230,8 +230,10 @@ pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 { #[inline] #[target_feature(enable = "sse")] // i586 only seems to generate plain `and` instructions, so ignore it. -#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), - assert_instr(andps))] +#[cfg_attr( + all(test, any(target_arch = "x86_64", target_feature = "sse2")), + assert_instr(andps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 { let a: __m128i = mem::transmute(a); @@ -249,8 +251,10 @@ pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "sse")] // i586 only seems to generate plain `not` and `and` instructions, so ignore // it. 
-#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), - assert_instr(andnps))] +#[cfg_attr( + all(test, any(target_arch = "x86_64", target_feature = "sse2")), + assert_instr(andnps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 { let a: __m128i = mem::transmute(a); @@ -265,8 +269,10 @@ pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 { #[inline] #[target_feature(enable = "sse")] // i586 only seems to generate plain `or` instructions, so we ignore it. -#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), - assert_instr(orps))] +#[cfg_attr( + all(test, any(target_arch = "x86_64", target_feature = "sse2")), + assert_instr(orps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 { let a: __m128i = mem::transmute(a); @@ -281,8 +287,10 @@ pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 { #[inline] #[target_feature(enable = "sse")] // i586 only seems to generate plain `xor` instructions, so we ignore it. -#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), - assert_instr(xorps))] +#[cfg_attr( + all(test, any(target_arch = "x86_64", target_feature = "sse2")), + assert_instr(xorps) +)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 { let a: __m128i = mem::transmute(a); @@ -968,6 +976,14 @@ pub unsafe fn _mm_setzero_ps() -> __m128 { __m128(0.0, 0.0, 0.0, 0.0) } +/// A utility function for creating masks to use with Intel shuffle and permute intrinsics. +#[inline] +#[allow(non_snake_case)] +#[stable(feature = "simd_x86", since = "1.28.0")] +pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> u32 { + (z << 6) | (y << 4) | (x << 2) | w +} + /// Shuffle packed single-precision (32-bit) floating-point elements in `a` and /// `b` using `mask`. 
/// @@ -1117,7 +1133,7 @@ pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 { /// # /// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); /// let data: [f32; 4] = [5.0, 6.0, 7.0, 8.0]; -/// let r = _mm_loadh_pi(a, data[..].as_ptr() as *const _) ; +/// let r = _mm_loadh_pi(a, data[..].as_ptr() as *const _); /// // assert_eq!(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0)); /// # /// # } @@ -1132,10 +1148,14 @@ pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 { #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movhpd))] // 32-bit codegen does not generate `movhps` or `movhpd`, but instead // `movsd` followed by `unpcklpd` (or `movss'/`unpcklps` if there's no SSE2). -#[cfg_attr(all(test, target_arch = "x86", target_feature = "sse2"), - assert_instr(movlhps))] -#[cfg_attr(all(test, target_arch = "x86", not(target_feature = "sse2")), - assert_instr(unpcklps))] +#[cfg_attr( + all(test, target_arch = "x86", target_feature = "sse2"), + assert_instr(movlhps) +)] +#[cfg_attr( + all(test, target_arch = "x86", not(target_feature = "sse2")), + assert_instr(unpcklps) +)] // TODO: This function is actually not limited to floats, but that's what // what matches the C type most closely: (__m128, *const __m64) -> __m128 pub unsafe fn _mm_loadh_pi(a: __m128, p: *const __m64) -> __m128 { @@ -1171,7 +1191,7 @@ pub unsafe fn _mm_loadh_pi(a: __m128, p: *const __m64) -> __m128 { /// # /// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); /// let data: [f32; 4] = [5.0, 6.0, 7.0, 8.0]; -/// let r = _mm_loadh_pi(a, data[..].as_ptr() as *const _) ; +/// let r = _mm_loadh_pi(a, data[..].as_ptr() as *const _); /// // assert_eq!(r, _mm_setr_ps(5.0, 6.0, 3.0, 4.0)); /// # /// # } @@ -1185,11 +1205,15 @@ pub unsafe fn _mm_loadh_pi(a: __m128, p: *const __m64) -> __m128 { // #[cfg_attr(test, assert_instr(movlps))] #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movlpd))] // On 32-bit targets with SSE2, it just generates two `movsd`. 
-#[cfg_attr(all(test, target_arch = "x86", target_feature = "sse2"), - assert_instr(movsd))] +#[cfg_attr( + all(test, target_arch = "x86", target_feature = "sse2"), + assert_instr(movsd) +)] // It should really generate "movlps", but oh well... -#[cfg_attr(all(test, target_arch = "x86", not(target_feature = "sse2")), - assert_instr(movss))] +#[cfg_attr( + all(test, target_arch = "x86", not(target_feature = "sse2")), + assert_instr(movss) +)] // TODO: Like _mm_loadh_pi, this also isn't limited to floats. pub unsafe fn _mm_loadl_pi(a: __m128, p: *const __m64) -> __m128 { let q = p as *const f32x2; @@ -1321,8 +1345,10 @@ pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { // On i686 and up LLVM actually generates MOVHPD instead of MOVHPS, that's // fine. // On i586 (no SSE2) it just generates plain MOV instructions. -#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), - assert_instr(movhpd))] +#[cfg_attr( + all(test, any(target_arch = "x86_64", target_feature = "sse2")), + assert_instr(movhpd) +)] pub unsafe fn _mm_storeh_pi(p: *mut __m64, a: __m128) { #[cfg(target_arch = "x86")] { @@ -1349,8 +1375,10 @@ pub unsafe fn _mm_storeh_pi(p: *mut __m64, a: __m128) { #[inline] #[target_feature(enable = "sse")] // On i586 the codegen just generates plane MOVs. No need to test for that. 
-#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), - assert_instr(movlps))] +#[cfg_attr( + all(test, any(target_arch = "x86_64", target_feature = "sse2")), + assert_instr(movlps) +)] pub unsafe fn _mm_storel_pi(p: *mut __m64, a: __m128) { #[cfg(target_arch = "x86")] { @@ -1929,7 +1957,7 @@ pub unsafe fn _mm_undefined_ps() -> __m128 { #[target_feature(enable = "sse")] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _MM_TRANSPOSE4_PS( - row0: &mut __m128, row1: &mut __m128, row2: &mut __m128, row3: &mut __m128 + row0: &mut __m128, row1: &mut __m128, row2: &mut __m128, row3: &mut __m128, ) { let tmp0 = _mm_unpacklo_ps(*row0, *row1); let tmp2 = _mm_unpacklo_ps(*row2, *row3); @@ -2040,6 +2068,8 @@ extern "C" { fn pminub(a: __m64, b: __m64) -> __m64; #[link_name = "llvm.x86.mmx.pmulhu.w"] fn pmulhuw(a: __m64, b: __m64) -> __m64; + #[link_name = "llvm.x86.mmx.pmull.w"] + fn pmullw(a: __m64, b: __m64) -> __m64; #[link_name = "llvm.x86.mmx.pavg.b"] fn pavgb(a: __m64, b: __m64) -> __m64; #[link_name = "llvm.x86.mmx.pavg.w"] @@ -2157,6 +2187,16 @@ pub unsafe fn _mm_mulhi_pu16(a: __m64, b: __m64) -> __m64 { pmulhuw(a, b) } +/// Multiplies packed 16-bit integer values and writes the +/// low-order 16 bits of each 32-bit product to the corresponding bits in +/// the destination. +#[inline] +#[target_feature(enable = "sse,mmx")] +#[cfg_attr(test, assert_instr(pmullw))] +pub unsafe fn _mm_mullo_pi16(a: __m64, b: __m64) -> __m64 { + pmullw(a, b) +} + /// Multiplies packed 16-bit unsigned integer values and writes the /// high-order 16 bits of each 32-bit product to the corresponding bits in /// the destination. 
@@ -2722,12 +2762,8 @@ mod tests { let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2)); - let e2: u32x4 = transmute(_mm_setr_ps( - transmute(0xffffffffu32), - 2.0, - 3.0, - 4.0, - )); + let e2: u32x4 = + transmute(_mm_setr_ps(transmute(0xffffffffu32), 2.0, 3.0, 4.0)); assert_eq!(r2, e2); } @@ -3441,22 +3477,9 @@ mod tests { #[simd_test(enable = "sse")] unsafe fn test_mm_cvtss_si32() { - let inputs = &[ - 42.0f32, - -3.1, - 4.0e10, - 4.0e-20, - NAN, - 2147483500.1, - ]; - let result = &[ - 42i32, - -3, - i32::min_value(), - 0, - i32::min_value(), - 2147483520, - ]; + let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1]; + let result = + &[42i32, -3, i32::min_value(), 0, i32::min_value(), 2147483520]; for i in 0..inputs.len() { let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0); let e = result[i]; @@ -3570,6 +3593,13 @@ mod tests { assert_eq_m128(r, _mm_set1_ps(0.0)); } + #[simd_test(enable = "sse")] + unsafe fn test_mm_shuffle() { + assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11); + assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00); + assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01); + } + #[simd_test(enable = "sse")] unsafe fn test_mm_shuffle_ps() { let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); @@ -3660,10 +3690,8 @@ mod tests { } let r = _mm_load_ps(p); - let e = _mm_add_ps( - _mm_setr_ps(1.0, 2.0, 3.0, 4.0), - _mm_set1_ps(fixup), - ); + let e = + _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup)); assert_eq_m128(r, e); } @@ -3693,10 +3721,8 @@ mod tests { } let r = _mm_loadr_ps(p); - let e = _mm_add_ps( - _mm_setr_ps(4.0, 3.0, 2.0, 1.0), - _mm_set1_ps(fixup), - ); + let e = + _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup)); assert_eq_m128(r, e); } @@ -3935,9 +3961,7 @@ mod tests { #[simd_test(enable = "sse")] unsafe fn test_mm_stream_ps() { let a = _mm_set1_ps(7.0); - let mut mem = Memory { - data: [-1.0; 4], - }; + let mut mem = Memory { data: [-1.0; 4] }; _mm_stream_ps(&mut mem.data[0] 
as *mut f32, a); for i in 0..4 { @@ -4001,6 +4025,13 @@ mod tests { assert_eq_m64(r, _mm_set1_pi16(15)); } + #[simd_test(enable = "sse,mmx")] + unsafe fn test_mm_mullo_pi16() { + let (a, b) = (_mm_set1_pi16(1000), _mm_set1_pi16(1001)); + let r = _mm_mullo_pi16(a, b); + assert_eq_m64(r, _mm_set1_pi16(17960)); + } + #[simd_test(enable = "sse,mmx")] unsafe fn test_m_pmulhuw() { let (a, b) = (_mm_set1_pi16(1000), _mm_set1_pi16(1001)); @@ -4138,12 +4169,8 @@ mod tests { #[simd_test(enable = "sse,mmx")] unsafe fn test_mm_movemask_pi8() { - let a = _mm_setr_pi16( - 0b1000_0000, - 0b0100_0000, - 0b1000_0000, - 0b0100_0000, - ); + let a = + _mm_setr_pi16(0b1000_0000, 0b0100_0000, 0b1000_0000, 0b0100_0000); let r = _mm_movemask_pi8(a); assert_eq!(r, 0b10001); diff --git a/coresimd/x86/sse2.rs b/coresimd/x86/sse2.rs index 7f7af002a4..d7d2a12fc5 100644 --- a/coresimd/x86/sse2.rs +++ b/coresimd/x86/sse2.rs @@ -1010,7 +1010,7 @@ pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { // no particular instruction to test #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_set_epi16( - e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16 + e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16, ) -> __m128i { mem::transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) } @@ -1095,7 +1095,7 @@ pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { // no particular instruction to test #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_setr_epi16( - e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16 + e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16, ) -> __m128i { _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7) } @@ -1134,10 +1134,15 @@ pub unsafe fn _mm_setzero_si128() -> __m128i { #[inline] #[target_feature(enable = "sse2")] // FIXME movsd on windows -#[cfg_attr(all(test, not(windows), - not(all(target_os = "linux", target_arch = 
"x86_64")), - target_arch = "x86_64"), - assert_instr(movq))] +#[cfg_attr( + all( + test, + not(windows), + not(all(target_os = "linux", target_arch = "x86_64")), + target_arch = "x86_64" + ), + assert_instr(movq) +)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i { _mm_set_epi64x(0, simd_extract((*mem_addr).as_i64x2(), 0)) @@ -1190,7 +1195,7 @@ pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i { #[cfg_attr(test, assert_instr(maskmovdqu))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskmoveu_si128( - a: __m128i, mask: __m128i, mem_addr: *mut i8 + a: __m128i, mask: __m128i, mem_addr: *mut i8, ) { maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr) } @@ -1229,10 +1234,15 @@ pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) { #[inline] #[target_feature(enable = "sse2")] // FIXME mov on windows, movlps on i686 -#[cfg_attr(all(test, not(windows), - not(all(target_os = "linux", target_arch = "x86_64")), - target_arch = "x86_64"), - assert_instr(movq))] +#[cfg_attr( + all( + test, + not(windows), + not(all(target_os = "linux", target_arch = "x86_64")), + target_arch = "x86_64" + ), + assert_instr(movq) +)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) { ptr::copy_nonoverlapping( @@ -1275,8 +1285,9 @@ pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) { #[inline] #[target_feature(enable = "sse2")] // FIXME movd on windows, movd on i686 -#[cfg_attr(all(test, not(windows), target_arch = "x86_64"), - assert_instr(movq))] +#[cfg_attr( + all(test, not(windows), target_arch = "x86_64"), assert_instr(movq) +)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_move_epi64(a: __m128i) -> __m128i { let zero = _mm_setzero_si128(); @@ -1341,11 +1352,7 @@ pub unsafe fn _mm_extract_epi16(a: __m128i, imm8: i32) -> i32 { #[rustc_args_required_const(2)] #[stable(feature 
= "simd_x86", since = "1.27.0")] pub unsafe fn _mm_insert_epi16(a: __m128i, i: i32, imm8: i32) -> __m128i { - mem::transmute(simd_insert( - a.as_i16x8(), - (imm8 & 7) as u32, - i as i16, - )) + mem::transmute(simd_insert(a.as_i16x8(), (imm8 & 7) as u32, i as i16)) } /// Return a mask of the most significant bit of each element in `a`. @@ -1443,16 +1450,7 @@ pub unsafe fn _mm_shufflehi_epi16(a: __m128i, imm8: i32) -> __m128i { simd_shuffle8( a, a, - [ - 0, - 1, - 2, - 3, - $x01 + 4, - $x23 + 4, - $x45 + 4, - $x67 + 4, - ], + [0, 1, 2, 3, $x01 + 4, $x23 + 4, $x45 + 4, $x67 + 4], ) }; } @@ -1567,9 +1565,7 @@ pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i { mem::transmute::(simd_shuffle16( a.as_i8x16(), b.as_i8x16(), - [ - 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 - ], + [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31], )) } @@ -1630,9 +1626,7 @@ pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i { mem::transmute::(simd_shuffle16( a.as_i8x16(), b.as_i8x16(), - [ - 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 - ], + [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23], )) } @@ -1644,11 +1638,8 @@ pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(punpcklwd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { - let x = simd_shuffle8( - a.as_i16x8(), - b.as_i16x8(), - [0, 8, 1, 9, 2, 10, 3, 11], - ); + let x = + simd_shuffle8(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]); mem::transmute::(x) } @@ -1947,11 +1938,7 @@ pub unsafe fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d { #[cfg_attr(test, assert_instr(cmpltsd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d { - simd_insert( - _mm_cmplt_sd(b, a), - 1, - simd_extract::<_, f64>(a, 1), - ) + simd_insert(_mm_cmplt_sd(b, a), 1, simd_extract::<_, 
f64>(a, 1)) } /// Return a new vector with the low element of `a` replaced by the @@ -1963,11 +1950,7 @@ pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d { #[cfg_attr(test, assert_instr(cmplesd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d { - simd_insert( - _mm_cmple_sd(b, a), - 1, - simd_extract::<_, f64>(a, 1), - ) + simd_insert(_mm_cmple_sd(b, a), 1, simd_extract::<_, f64>(a, 1)) } /// Return a new vector with the low element of `a` replaced by the result @@ -2042,11 +2025,7 @@ pub unsafe fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d { #[cfg_attr(test, assert_instr(cmpnltsd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d { - simd_insert( - _mm_cmpnlt_sd(b, a), - 1, - simd_extract::<_, f64>(a, 1), - ) + simd_insert(_mm_cmpnlt_sd(b, a), 1, simd_extract::<_, f64>(a, 1)) } /// Return a new vector with the low element of `a` replaced by the @@ -2058,11 +2037,7 @@ pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d { #[cfg_attr(test, assert_instr(cmpnlesd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d { - simd_insert( - _mm_cmpnle_sd(b, a), - 1, - simd_extract::<_, f64>(a, 1), - ) + simd_insert(_mm_cmpnle_sd(b, a), 1, simd_extract::<_, f64>(a, 1)) } /// Compare corresponding elements in `a` and `b` for equality. 
@@ -2881,8 +2856,9 @@ pub unsafe fn _mm_undefined_si128() -> __m128i { /// The resulting `__m128d` element is composed by the low-order values of /// the two `__m128d` interleaved input elements, i.e.: /// -/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input -/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input +/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second +/// input * The `[63:0]` bits are copied from the `[127:64]` bits of the first +/// input /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd) #[inline] @@ -3223,22 +3199,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_add_epi8() { let a = _mm_setr_epi8( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); #[cfg_attr(rustfmt, rustfmt_skip)] let b = _mm_setr_epi8( @@ -3290,22 +3251,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_adds_epi8() { let a = _mm_setr_epi8( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); #[cfg_attr(rustfmt, rustfmt_skip)] let b = _mm_setr_epi8( @@ -3363,22 +3309,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_adds_epu8() { let a = _mm_setr_epi8( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); #[cfg_attr(rustfmt, rustfmt_skip)] let b = _mm_setr_epi8( @@ -3629,22 +3560,7 @@ mod tests { ); let r = _mm_slli_si128(a, 1); let e = _mm_setr_epi8( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); assert_eq_m128i(r, e); @@ -3888,41 +3804,10 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpeq_epi8() { let a = 
_mm_setr_epi8( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - ); - let b = _mm_setr_epi8( - 15, - 14, - 2, - 12, - 11, - 10, - 9, - 8, - 7, - 6, - 5, - 4, - 3, - 2, - 1, - 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); + let b = + _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); let r = _mm_cmpeq_epi8(a, b); #[cfg_attr(rustfmt, rustfmt_skip)] assert_eq_m128i( @@ -4869,9 +4754,7 @@ mod tests { pub data: [f64; 2], } let a = _mm_set1_pd(7.0); - let mut mem = Memory { - data: [-1.0; 2], - }; + let mut mem = Memory { data: [-1.0; 2] }; _mm_stream_pd(&mut mem.data[0] as *mut f64, a); for i in 0..2 { @@ -4889,9 +4772,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_store_pd() { - let mut mem = Memory { - data: [0.0f64; 4], - }; + let mut mem = Memory { data: [0.0f64; 4] }; let vals = &mut mem.data; let a = _mm_setr_pd(1.0, 2.0); let d = vals.as_mut_ptr(); @@ -4903,9 +4784,7 @@ mod tests { #[simd_test(enable = "sse")] unsafe fn test_mm_storeu_pd() { - let mut mem = Memory { - data: [0.0f64; 4], - }; + let mut mem = Memory { data: [0.0f64; 4] }; let vals = &mut mem.data; let a = _mm_setr_pd(1.0, 2.0); @@ -4929,9 +4808,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_store1_pd() { - let mut mem = Memory { - data: [0.0f64; 4], - }; + let mut mem = Memory { data: [0.0f64; 4] }; let vals = &mut mem.data; let a = _mm_setr_pd(1.0, 2.0); let d = vals.as_mut_ptr(); @@ -4943,9 +4820,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_store_pd1() { - let mut mem = Memory { - data: [0.0f64; 4], - }; + let mut mem = Memory { data: [0.0f64; 4] }; let vals = &mut mem.data; let a = _mm_setr_pd(1.0, 2.0); let d = vals.as_mut_ptr(); @@ -4957,9 +4832,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_storer_pd() { - let mut mem = Memory { - data: [0.0f64; 4], - }; + let mut mem = Memory { data: [0.0f64; 4] }; let vals = &mut mem.data; let a = _mm_setr_pd(1.0, 
2.0); let d = vals.as_mut_ptr(); @@ -5013,10 +4886,7 @@ mod tests { } let r = _mm_loadu_pd(d); - let e = _mm_add_pd( - _mm_setr_pd(1.0, 2.0), - _mm_set1_pd(offset as f64), - ); + let e = _mm_add_pd(_mm_setr_pd(1.0, 2.0), _mm_set1_pd(offset as f64)); assert_eq_m128d(r, e); } @@ -5091,12 +4961,8 @@ mod tests { assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4)); - let a = _mm_setr_ps( - -1.1, - f32::NEG_INFINITY, - f32::MAX, - f32::NEG_INFINITY, - ); + let a = + _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY); let b = _mm_setr_pd(f64::INFINITY, -5.0); let r = _mm_cvtsd_ss(a, b); @@ -5161,12 +5027,8 @@ mod tests { let r = _mm_cvttps_epi32(a); assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6)); - let a = _mm_setr_ps( - f32::NEG_INFINITY, - f32::INFINITY, - f32::MIN, - f32::MAX, - ); + let a = + _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX); let r = _mm_cvttps_epi32(a); assert_eq_m128i( r, diff --git a/coresimd/x86/sse41.rs b/coresimd/x86/sse41.rs index ba65004966..198bb16ba0 100644 --- a/coresimd/x86/sse41.rs +++ b/coresimd/x86/sse41.rs @@ -66,13 +66,9 @@ pub const _MM_FROUND_NEARBYINT: i32 = #[cfg_attr(test, assert_instr(pblendvb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blendv_epi8( - a: __m128i, b: __m128i, mask: __m128i + a: __m128i, b: __m128i, mask: __m128i, ) -> __m128i { - mem::transmute(pblendvb( - a.as_i8x16(), - b.as_i8x16(), - mask.as_i8x16(), - )) + mem::transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16())) } /// Blend packed 16-bit integers from `a` and `b` using the mask `imm8`. 
@@ -250,11 +246,7 @@ pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32, imm8: i32) -> __m128i { - mem::transmute(simd_insert( - a.as_i8x16(), - (imm8 & 0b1111) as u32, - i as i8, - )) + mem::transmute(simd_insert(a.as_i8x16(), (imm8 & 0b1111) as u32, i as i8)) } /// Return a copy of `a` with the 32-bit integer from `i` inserted at a @@ -267,11 +259,7 @@ pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32, imm8: i32) -> __m128i { #[rustc_args_required_const(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i { - mem::transmute(simd_insert( - a.as_i32x4(), - (imm8 & 0b11) as u32, - i, - )) + mem::transmute(simd_insert(a.as_i32x4(), (imm8 & 0b11) as u32, i)) } /// Compare packed 8-bit integers in `a` and `b` and return packed maximum @@ -1778,16 +1766,12 @@ mod tests { } { let a = _mm_setr_epi32( - 15, - 2, /* ignored */ - 1234567, - 4, /* ignored */ + 15, 2, /* ignored */ + 1234567, 4, /* ignored */ ); let b = _mm_setr_epi32( - -20, - -256, /* ignored */ - 666666, - 666666, /* ignored */ + -20, -256, /* ignored */ + 666666, 666666, /* ignored */ ); let r = _mm_mul_epi32(a, b); let e = _mm_setr_epi64x(-300, 823043843622); diff --git a/coresimd/x86/sse42.rs b/coresimd/x86/sse42.rs index 0ba76b57b9..845c5bff58 100644 --- a/coresimd/x86/sse42.rs +++ b/coresimd/x86/sse42.rs @@ -151,7 +151,7 @@ pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// let b = _mm_loadu_si128(chunk.as_ptr() as *const _); /// let idx = _mm_cmpistri(a, b, _SIDD_CMP_EQUAL_ORDERED); /// if idx != 16 { -/// indexes.push((idx as usize) + (i * hop)); +/// indexes.push((idx as usize) + (i * hop)); /// } /// } /// assert_eq!(indexes, vec![34]); @@ -439,7 +439,7 @@ pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i32) -> i32 { 
#[rustc_args_required_const(4)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpestrm( - a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32 + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32, ) -> __m128i { let a = a.as_i8x16(); let b = b.as_i8x16(); @@ -544,7 +544,7 @@ pub unsafe fn _mm_cmpestrm( #[rustc_args_required_const(4)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpestri( - a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32 + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32, ) -> i32 { let a = a.as_i8x16(); let b = b.as_i8x16(); @@ -567,7 +567,7 @@ pub unsafe fn _mm_cmpestri( #[rustc_args_required_const(4)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpestrz( - a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32 + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32, ) -> i32 { let a = a.as_i8x16(); let b = b.as_i8x16(); @@ -590,7 +590,7 @@ pub unsafe fn _mm_cmpestrz( #[rustc_args_required_const(4)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpestrc( - a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32 + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32, ) -> i32 { let a = a.as_i8x16(); let b = b.as_i8x16(); @@ -613,7 +613,7 @@ pub unsafe fn _mm_cmpestrc( #[rustc_args_required_const(4)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpestrs( - a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32 + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32, ) -> i32 { let a = a.as_i8x16(); let b = b.as_i8x16(); @@ -636,7 +636,7 @@ pub unsafe fn _mm_cmpestrs( #[rustc_args_required_const(4)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpestro( - a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32 + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32, ) -> i32 { let a = a.as_i8x16(); let b = b.as_i8x16(); @@ -660,7 +660,7 @@ pub unsafe fn _mm_cmpestro( #[rustc_args_required_const(4)] #[stable(feature = "simd_x86", 
since = "1.27.0")] pub unsafe fn _mm_cmpestra( - a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32 + a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32, ) -> i32 { let a = a.as_i8x16(); let b = b.as_i8x16(); @@ -917,13 +917,8 @@ mod tests { unsafe fn test_mm_cmpestra() { let a = str_to_m128i(b"Cannot match a"); let b = str_to_m128i(b"Null after 14"); - let i = _mm_cmpestra( - a, - 14, - b, - 16, - _SIDD_CMP_EQUAL_EACH | _SIDD_UNIT_MASK, - ); + let i = + _mm_cmpestra(a, 14, b, 16, _SIDD_CMP_EQUAL_EACH | _SIDD_UNIT_MASK); assert_eq!(1, i); } diff --git a/coresimd/x86/sse4a.rs b/coresimd/x86/sse4a.rs index 7c45ca1d11..2c184166fb 100644 --- a/coresimd/x86/sse4a.rs +++ b/coresimd/x86/sse4a.rs @@ -25,8 +25,8 @@ extern "C" { /// Extracts the bit range specified by `y` from the lower 64 bits of `x`. /// /// The `[13:8]` bits of `y` specify the index of the bit-range to extract. The -/// `[5:0]` bits of `y` specify the length of the bit-range to extract. All other -/// bits are ignored. +/// `[5:0]` bits of `y` specify the length of the bit-range to extract. All +/// other bits are ignored. /// /// If the length is zero, it is interpreted as `64`. If the length and index /// are zero, the lower 64 bits of `x` are extracted. 
diff --git a/coresimd/x86/ssse3.rs b/coresimd/x86/ssse3.rs index 2b66847091..e2c415f1cb 100644 --- a/coresimd/x86/ssse3.rs +++ b/coresimd/x86/ssse3.rs @@ -596,24 +596,8 @@ mod tests { 12, 5, 5, 10, 4, 1, 8, 0, ); - let expected = _mm_setr_epi8( - 5, - 0, - 5, - 4, - 9, - 13, - 7, - 4, - 13, - 6, - 6, - 11, - 5, - 2, - 9, - 1, - ); + let expected = + _mm_setr_epi8(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1); let r = _mm_shuffle_epi8(a, b); assert_eq_m128i(r, expected); } diff --git a/coresimd/x86/test.rs b/coresimd/x86/test.rs index 1b5b6b1fb0..72077f383e 100644 --- a/coresimd/x86/test.rs +++ b/coresimd/x86/test.rs @@ -121,7 +121,7 @@ mod x86_polyfill { #[target_feature(enable = "avx2")] pub unsafe fn _mm256_insert_epi64( - a: __m256i, val: i64, idx: i32 + a: __m256i, val: i64, idx: i32, ) -> __m256i { union A { a: __m256i, diff --git a/coresimd/x86/xsave.rs b/coresimd/x86/xsave.rs index 98df42da42..66816bdbff 100644 --- a/coresimd/x86/xsave.rs +++ b/coresimd/x86/xsave.rs @@ -38,11 +38,7 @@ extern "C" { #[cfg_attr(test, assert_instr(xsave))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) { - xsave( - mem_addr, - (save_mask >> 32) as u32, - save_mask as u32, - ); + xsave(mem_addr, (save_mask >> 32) as u32, save_mask as u32); } /// Perform a full or partial restore of the enabled processor states using @@ -110,11 +106,7 @@ pub unsafe fn _xgetbv(xcr_no: u32) -> u64 { #[cfg_attr(test, assert_instr(xsaveopt))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) { - xsaveopt( - mem_addr, - (save_mask >> 32) as u32, - save_mask as u32, - ); + xsaveopt(mem_addr, (save_mask >> 32) as u32, save_mask as u32); } /// Perform a full or partial save of the enabled processor states to memory @@ -130,11 +122,7 @@ pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) { #[cfg_attr(test, assert_instr(xsavec))] #[stable(feature = "simd_x86", since = "1.27.0")] 
pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) { - xsavec( - mem_addr, - (save_mask >> 32) as u32, - save_mask as u32, - ); + xsavec(mem_addr, (save_mask >> 32) as u32, save_mask as u32); } /// Perform a full or partial save of the enabled processor states to memory at @@ -151,11 +139,7 @@ pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) { #[cfg_attr(test, assert_instr(xsaves))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) { - xsaves( - mem_addr, - (save_mask >> 32) as u32, - save_mask as u32, - ); + xsaves(mem_addr, (save_mask >> 32) as u32, save_mask as u32); } /// Perform a full or partial restore of the enabled processor states using the @@ -196,9 +180,7 @@ mod tests { impl XsaveArea { fn new() -> XsaveArea { - XsaveArea { - data: [0; 2560], - } + XsaveArea { data: [0; 2560] } } fn ptr(&mut self) -> *mut u8 { &mut self.data[0] as *mut _ as *mut u8 diff --git a/coresimd/x86_64/bmi.rs b/coresimd/x86_64/bmi.rs index 61bee8f0f9..831f524714 100644 --- a/coresimd/x86_64/bmi.rs +++ b/coresimd/x86_64/bmi.rs @@ -28,8 +28,8 @@ pub unsafe fn _bextr_u64(a: u64, start: u32, len: u32) -> u64 { /// Extracts bits of `a` specified by `control` into /// the least significant bits of the result. /// -/// Bits `[7,0]` of `control` specify the index to the first bit in the range to -/// be extracted, and bits `[15,8]` specify the length of the range. +/// Bits `[7,0]` of `control` specify the index to the first bit in the range +/// to be extracted, and bits `[15,8]` specify the length of the range. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr2_u64) #[inline] diff --git a/coresimd/x86_64/fxsr.rs b/coresimd/x86_64/fxsr.rs index d28a6d8d2e..846162e219 100644 --- a/coresimd/x86_64/fxsr.rs +++ b/coresimd/x86_64/fxsr.rs @@ -58,7 +58,7 @@ pub unsafe fn _fxrstor64(mem_addr: *const u8) { #[cfg(test)] mod tests { use coresimd::x86_64::*; - use std::{fmt, cmp::PartialEq}; + use std::{cmp::PartialEq, fmt}; use stdsimd_test::simd_test; #[repr(align(16))] diff --git a/coresimd/x86_64/xsave.rs b/coresimd/x86_64/xsave.rs index a5b630232f..3c0fda249b 100644 --- a/coresimd/x86_64/xsave.rs +++ b/coresimd/x86_64/xsave.rs @@ -36,11 +36,7 @@ extern "C" { #[cfg_attr(test, assert_instr(xsave64))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) { - xsave64( - mem_addr, - (save_mask >> 32) as u32, - save_mask as u32, - ); + xsave64(mem_addr, (save_mask >> 32) as u32, save_mask as u32); } /// Perform a full or partial restore of the enabled processor states using @@ -73,11 +69,7 @@ pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) { #[cfg_attr(test, assert_instr(xsaveopt64))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) { - xsaveopt64( - mem_addr, - (save_mask >> 32) as u32, - save_mask as u32, - ); + xsaveopt64(mem_addr, (save_mask >> 32) as u32, save_mask as u32); } /// Perform a full or partial save of the enabled processor states to memory @@ -93,11 +85,7 @@ pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) { #[cfg_attr(test, assert_instr(xsavec64))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) { - xsavec64( - mem_addr, - (save_mask >> 32) as u32, - save_mask as u32, - ); + xsavec64(mem_addr, (save_mask >> 32) as u32, save_mask as u32); } /// Perform a full or partial save of the enabled processor states to memory 
at @@ -114,11 +102,7 @@ pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) { #[cfg_attr(test, assert_instr(xsaves64))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) { - xsaves64( - mem_addr, - (save_mask >> 32) as u32, - save_mask as u32, - ); + xsaves64(mem_addr, (save_mask >> 32) as u32, save_mask as u32); } /// Perform a full or partial restore of the enabled processor states using the diff --git a/crates/assert-instr-macro/src/lib.rs b/crates/assert-instr-macro/src/lib.rs index 3362bdc6f1..317e7cf01b 100644 --- a/crates/assert-instr-macro/src/lib.rs +++ b/crates/assert-instr-macro/src/lib.rs @@ -21,7 +21,7 @@ use proc_macro2::TokenStream; #[proc_macro_attribute] pub fn assert_instr( - attr: proc_macro::TokenStream, item: proc_macro::TokenStream + attr: proc_macro::TokenStream, item: proc_macro::TokenStream, ) -> proc_macro::TokenStream { let invoc = syn::parse::(attr) .expect("expected #[assert_instr(instr, a = b, ...)]"); @@ -36,9 +36,10 @@ pub fn assert_instr( let name = &func.ident; // Disable assert_instr for x86 targets compiled with avx enabled, which - // causes LLVM to generate different intrinsics that the ones we are testing - // for. - let disable_assert_instr = std::env::var("STDSIMD_DISABLE_ASSERT_INSTR").is_ok(); + // causes LLVM to generate different intrinsics that the ones we are + // testing for. + let disable_assert_instr = + std::env::var("STDSIMD_DISABLE_ASSERT_INSTR").is_ok(); let maybe_ignore = if cfg!(optimized) && !disable_assert_instr { TokenStream::new() } else { @@ -72,11 +73,7 @@ pub fn assert_instr( syn::Pat::Ident(ref i) => &i.ident, _ => panic!("must have bare arguments"), }; - match invoc - .args - .iter() - .find(|a| *ident == a.0) - { + match invoc.args.iter().find(|a| *ident == a.0) { Some(&(_, ref tts)) => { input_vals.push(quote! 
{ #tts }); } @@ -87,7 +84,8 @@ pub fn assert_instr( }; } - let attrs = func.attrs + let attrs = func + .attrs .iter() .filter(|attr| { attr.path @@ -142,9 +140,8 @@ pub fn assert_instr( } }.into(); // why? necessary now to get tests to work? - let tts: TokenStream = tts.to_string() - .parse() - .expect("cannot parse tokenstream"); + let tts: TokenStream = + tts.to_string().parse().expect("cannot parse tokenstream"); let tts: TokenStream = quote! { #item diff --git a/crates/coresimd/build.rs b/crates/coresimd/build.rs index 7126538207..3dc31c52a7 100644 --- a/crates/coresimd/build.rs +++ b/crates/coresimd/build.rs @@ -1,8 +1,5 @@ use std::env; fn main() { - println!( - "cargo:rustc-env=TARGET={}", - env::var("TARGET").unwrap() - ); + println!("cargo:rustc-env=TARGET={}", env::var("TARGET").unwrap()); } diff --git a/crates/coresimd/src/lib.rs b/crates/coresimd/src/lib.rs index a09495fe92..0751a3a17b 100644 --- a/crates/coresimd/src/lib.rs +++ b/crates/coresimd/src/lib.rs @@ -9,29 +9,35 @@ #![cfg_attr(stdsimd_strict, deny(warnings))] #![allow(dead_code)] #![allow(unused_features)] -#![feature(const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, - simd_ffi, asm, proc_macro_gen, - integer_atomics, stmt_expr_attributes, core_intrinsics, - crate_in_paths, no_core, attr_literals, rustc_attrs, stdsimd, - staged_api, core_float, core_slice_ext, align_offset, - doc_cfg, mmx_target_feature, tbm_target_feature, - sse4a_target_feature, arm_target_feature, aarch64_target_feature, - mips_target_feature, powerpc_target_feature)] -#![cfg_attr(test, - feature(proc_macro, test, attr_literals, abi_vectorcall, - untagged_unions))] -#![cfg_attr(feature = "cargo-clippy", - allow(inline_always, too_many_arguments, cast_sign_loss, - cast_lossless, cast_possible_wrap, - cast_possible_truncation, cast_precision_loss, - shadow_reuse, cyclomatic_complexity, similar_names, - many_single_char_names))] +#![feature( + const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, 
simd_ffi, + asm, proc_macro_gen, integer_atomics, stmt_expr_attributes, + core_intrinsics, crate_in_paths, no_core, attr_literals, rustc_attrs, + stdsimd, staged_api, core_float, core_slice_ext, align_offset, doc_cfg, + mmx_target_feature, tbm_target_feature, sse4a_target_feature, + arm_target_feature, aarch64_target_feature, mips_target_feature, + powerpc_target_feature +)] +#![cfg_attr( + test, + feature(proc_macro, test, attr_literals, abi_vectorcall, untagged_unions) +)] +#![cfg_attr( + feature = "cargo-clippy", + allow( + inline_always, too_many_arguments, cast_sign_loss, cast_lossless, + cast_possible_wrap, cast_possible_truncation, cast_precision_loss, + shadow_reuse, cyclomatic_complexity, similar_names, + many_single_char_names + ) +)] #![cfg_attr(test, allow(unused_imports))] #![no_core] #![unstable(feature = "stdsimd", issue = "27731")] -#![doc(test(attr(deny(warnings))), - test(attr(allow(dead_code, deprecated, unused_variables, - unused_mut))))] +#![doc( + test(attr(deny(warnings))), + test(attr(allow(dead_code, deprecated, unused_variables, unused_mut))) +)] #[cfg_attr(not(test), macro_use)] extern crate core as _core; diff --git a/crates/coresimd/tests/cpu-detection.rs b/crates/coresimd/tests/cpu-detection.rs index 2a788102a4..46f8194c09 100644 --- a/crates/coresimd/tests/cpu-detection.rs +++ b/crates/coresimd/tests/cpu-detection.rs @@ -1,7 +1,9 @@ #![feature(stdsimd)] #![cfg_attr(stdsimd_strict, deny(warnings))] -#![cfg_attr(feature = "cargo-clippy", - allow(option_unwrap_used, print_stdout, use_debug))] +#![cfg_attr( + feature = "cargo-clippy", + allow(option_unwrap_used, print_stdout, use_debug) +)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[macro_use] @@ -14,53 +16,20 @@ fn x86_all() { println!("sse2: {:?}", is_x86_feature_detected!("sse2")); println!("sse3: {:?}", is_x86_feature_detected!("sse3")); println!("ssse3: {:?}", is_x86_feature_detected!("ssse3")); - println!( - "sse4.1: {:?}", - is_x86_feature_detected!("sse4.1") - ); - 
println!( - "sse4.2: {:?}", - is_x86_feature_detected!("sse4.2") - ); + println!("sse4.1: {:?}", is_x86_feature_detected!("sse4.1")); + println!("sse4.2: {:?}", is_x86_feature_detected!("sse4.2")); println!("sse4a: {:?}", is_x86_feature_detected!("sse4a")); println!("avx: {:?}", is_x86_feature_detected!("avx")); println!("avx2: {:?}", is_x86_feature_detected!("avx2")); - println!( - "avx512f {:?}", - is_x86_feature_detected!("avx512f") - ); - println!( - "avx512cd {:?}", - is_x86_feature_detected!("avx512cd") - ); - println!( - "avx512er {:?}", - is_x86_feature_detected!("avx512er") - ); - println!( - "avx512pf {:?}", - is_x86_feature_detected!("avx512pf") - ); - println!( - "avx512bw {:?}", - is_x86_feature_detected!("avx512bw") - ); - println!( - "avx512dq {:?}", - is_x86_feature_detected!("avx512dq") - ); - println!( - "avx512vl {:?}", - is_x86_feature_detected!("avx512vl") - ); - println!( - "avx512_ifma {:?}", - is_x86_feature_detected!("avx512ifma") - ); - println!( - "avx512_vbmi {:?}", - is_x86_feature_detected!("avx512vbmi") - ); + println!("avx512f {:?}", is_x86_feature_detected!("avx512f")); + println!("avx512cd {:?}", is_x86_feature_detected!("avx512cd")); + println!("avx512er {:?}", is_x86_feature_detected!("avx512er")); + println!("avx512pf {:?}", is_x86_feature_detected!("avx512pf")); + println!("avx512bw {:?}", is_x86_feature_detected!("avx512bw")); + println!("avx512dq {:?}", is_x86_feature_detected!("avx512dq")); + println!("avx512vl {:?}", is_x86_feature_detected!("avx512vl")); + println!("avx512_ifma {:?}", is_x86_feature_detected!("avx512ifma")); + println!("avx512_vbmi {:?}", is_x86_feature_detected!("avx512vbmi")); println!( "avx512_vpopcntdq {:?}", is_x86_feature_detected!("avx512vpopcntdq") @@ -70,23 +39,11 @@ fn x86_all() { println!("bmi: {:?}", is_x86_feature_detected!("bmi1")); println!("bmi2: {:?}", is_x86_feature_detected!("bmi2")); println!("tbm: {:?}", is_x86_feature_detected!("tbm")); - println!( - "popcnt: {:?}", - 
is_x86_feature_detected!("popcnt") - ); + println!("popcnt: {:?}", is_x86_feature_detected!("popcnt")); println!("lzcnt: {:?}", is_x86_feature_detected!("lzcnt")); println!("fxsr: {:?}", is_x86_feature_detected!("fxsr")); println!("xsave: {:?}", is_x86_feature_detected!("xsave")); - println!( - "xsaveopt: {:?}", - is_x86_feature_detected!("xsaveopt") - ); - println!( - "xsaves: {:?}", - is_x86_feature_detected!("xsaves") - ); - println!( - "xsavec: {:?}", - is_x86_feature_detected!("xsavec") - ); + println!("xsaveopt: {:?}", is_x86_feature_detected!("xsaveopt")); + println!("xsaves: {:?}", is_x86_feature_detected!("xsaves")); + println!("xsavec: {:?}", is_x86_feature_detected!("xsavec")); } diff --git a/crates/coresimd/tests/reductions.rs b/crates/coresimd/tests/reductions.rs index 46908cf503..123410b879 100644 --- a/crates/coresimd/tests/reductions.rs +++ b/crates/coresimd/tests/reductions.rs @@ -2,7 +2,7 @@ #![feature(arm_target_feature)] #![feature(aarch64_target_feature)] #![feature(powerpc_target_feature)] -#![allow(unused_attributes)] +#![allow(unused_attributes, dead_code, unused_imports, unused_macros)] #[macro_use] extern crate stdsimd; @@ -253,11 +253,7 @@ macro_rules! 
product_nan_test { } } let v = $id::splat(n0); - assert!( - v.product().is_nan(), - "all nans | {:?}", - v - ); + assert!(v.product().is_nan(), "all nans | {:?}", v); } unsafe { test_fn() }; } @@ -355,8 +351,7 @@ mod offset { // tolerate 1 ULP difference: if vsum.as_int() > tsum.as_int() { assert!( - vsum.as_int() - tsum.as_int() - < 2, + vsum.as_int() - tsum.as_int() < 2, "v: {:?} | vsum: {} | tsum: {}", v, vsum, @@ -364,8 +359,7 @@ mod offset { ); } else { assert!( - tsum.as_int() - vsum.as_int() - < 2, + tsum.as_int() - vsum.as_int() < 2, "v: {:?} | vsum: {} | tsum: {}", v, vsum, diff --git a/crates/simd-test-macro/src/lib.rs b/crates/simd-test-macro/src/lib.rs index 76da2eb222..e695b7290d 100644 --- a/crates/simd-test-macro/src/lib.rs +++ b/crates/simd-test-macro/src/lib.rs @@ -12,7 +12,7 @@ extern crate quote; use std::env; -use proc_macro2::{Literal, Span, Ident, TokenStream, TokenTree}; +use proc_macro2::{Ident, Literal, Span, TokenStream, TokenTree}; fn string(s: &str) -> TokenTree { Literal::string(s).into() @@ -20,11 +20,9 @@ fn string(s: &str) -> TokenTree { #[proc_macro_attribute] pub fn simd_test( - attr: proc_macro::TokenStream, item: proc_macro::TokenStream + attr: proc_macro::TokenStream, item: proc_macro::TokenStream, ) -> proc_macro::TokenStream { - let tokens = TokenStream::from(attr) - .into_iter() - .collect::>(); + let tokens = TokenStream::from(attr).into_iter().collect::>(); if tokens.len() != 3 { panic!("expected #[simd_test(enable = \"feature\")]"); } @@ -53,18 +51,19 @@ pub fn simd_test( let item = TokenStream::from(item); let name = find_name(item.clone()); - let name: TokenStream = name.to_string().parse().expect(&format!( - "failed to parse name: {}", - name.to_string() - )); + let name: TokenStream = name + .to_string() + .parse() + .expect(&format!("failed to parse name: {}", name.to_string())); let target = env::var("TARGET") .expect("TARGET environment variable should be set for rustc"); let mut force_test = false; - let macro_test 
= match target.split('-').next().expect(&format!( - "target triple contained no \"-\": {}", - target - )) { + let macro_test = match target + .split('-') + .next() + .expect(&format!("target triple contained no \"-\": {}", target)) + { "i686" | "x86_64" | "i586" => "is_x86_feature_detected", "arm" | "armv7" => "is_arm_feature_detected", "aarch64" => "is_aarch64_feature_detected", diff --git a/crates/stdsimd-test/src/lib.rs b/crates/stdsimd-test/src/lib.rs index f256b83533..91b949b024 100644 --- a/crates/stdsimd-test/src/lib.rs +++ b/crates/stdsimd-test/src/lib.rs @@ -5,8 +5,10 @@ //! assertions about the disassembly of a function. #![feature(proc_macro)] -#![cfg_attr(feature = "cargo-clippy", - allow(missing_docs_in_private_items, print_stdout))] +#![cfg_attr( + feature = "cargo-clippy", + allow(missing_docs_in_private_items, print_stdout) +)] extern crate assert_instr_macro; extern crate backtrace; @@ -25,7 +27,8 @@ pub use assert_instr_macro::*; pub use simd_test_macro::*; lazy_static! 
{ - static ref DISASSEMBLY: HashMap> = disassemble_myself(); + static ref DISASSEMBLY: HashMap> = + disassemble_myself(); } struct Function { @@ -39,14 +42,16 @@ struct Instruction { fn disassemble_myself() -> HashMap> { let me = env::current_exe().expect("failed to get current exe"); - if cfg!(target_arch = "x86_64") && cfg!(target_os = "windows") + if cfg!(target_arch = "x86_64") + && cfg!(target_os = "windows") && cfg!(target_env = "msvc") { let mut cmd = cc::windows_registry::find( "x86_64-pc-windows-msvc", "dumpbin.exe", ).expect("failed to find `dumpbin` tool"); - let output = cmd.arg("/DISASM") + let output = cmd + .arg("/DISASM") .arg(&me) .output() .expect("failed to execute dumpbin"); @@ -76,11 +81,14 @@ fn disassemble_myself() -> HashMap> { } else { let objdump = env::var("OBJDUMP").unwrap_or_else(|_| "objdump".to_string()); - let output = Command::new(objdump) + let output = Command::new(objdump.clone()) .arg("--disassemble") .arg(&me) .output() - .expect("failed to execute objdump"); + .expect(&format!( + "failed to execute objdump. OBJDUMP={}", + objdump + )); println!( "{}\n{}", output.status, @@ -257,9 +265,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) { // in the disassembly. 
let mut sym = None; backtrace::resolve(fnptr as *mut _, |name| { - sym = name.name() - .and_then(|s| s.as_str()) - .map(normalize); + sym = name.name().and_then(|s| s.as_str()).map(normalize); }); let functions = @@ -270,26 +276,17 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) { println!("assumed symbol name: `{}`", sym); } println!("maybe related functions"); - for f in DISASSEMBLY - .keys() - .filter(|k| k.contains(fnname)) - { + for f in DISASSEMBLY.keys().filter(|k| k.contains(fnname)) { println!("\t- {}", f); } - panic!( - "failed to find disassembly of {:#x} ({})", - fnptr, fnname - ); + panic!("failed to find disassembly of {:#x} ({})", fnptr, fnname); }; assert_eq!(functions.len(), 1); let function = &functions[0]; let mut instrs = &function.instrs[..]; - while instrs - .last() - .map_or(false, |s| s.parts == ["nop"]) - { + while instrs.last().map_or(false, |s| s.parts == ["nop"]) { instrs = &instrs[..instrs.len() - 1]; } @@ -400,10 +397,7 @@ pub fn assert_skip_test_ok(name: &str) { if env::var("STDSIMD_TEST_EVERYTHING").is_err() { return; } - panic!( - "skipped test `{}` when it shouldn't be skipped", - name - ); + panic!("skipped test `{}` when it shouldn't be skipped", name); } // See comment in `assert-instr-macro` crate for why this exists diff --git a/crates/stdsimd-verify/tests/x86-intel.rs b/crates/stdsimd-verify/tests/x86-intel.rs index a570e3cd91..0df5a07f06 100644 --- a/crates/stdsimd-verify/tests/x86-intel.rs +++ b/crates/stdsimd-verify/tests/x86-intel.rs @@ -1,9 +1,12 @@ #![feature(proc_macro)] #![allow(bad_style)] -#![cfg_attr(feature = "cargo-clippy", - allow(shadow_reuse, cast_lossless, match_same_arms, - nonminimal_bool, print_stdout, use_debug, eq_op, - useless_format))] +#![cfg_attr( + feature = "cargo-clippy", + allow( + shadow_reuse, cast_lossless, match_same_arms, nonminimal_bool, + print_stdout, use_debug, eq_op, useless_format + ) +)] #[macro_use] extern crate serde_derive; @@ -249,10 +252,9 @@ fn matches(rust: 
&Function, intel: &Intrinsic) -> Result<(), String> { .flat_map(|c| c.to_lowercase()) .collect::(); - let rust_feature = rust.target_feature.expect(&format!( - "no target feature listed for {}", - rust.name - )); + let rust_feature = rust + .target_feature + .expect(&format!("no target feature listed for {}", rust.name)); if rust_feature.contains(&cpuid) { continue; } @@ -314,25 +316,20 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { if rust.arguments.len() != intel.parameters.len() { bail!("wrong number of arguments on {}", rust.name) } - for (i, (a, b)) in intel - .parameters - .iter() - .zip(rust.arguments) - .enumerate() + for (i, (a, b)) in + intel.parameters.iter().zip(rust.arguments).enumerate() { let is_const = rust.required_const.contains(&i); equate(b, &a.type_, &intel.name, is_const)?; } } - let any_i64 = rust.arguments - .iter() - .cloned() - .chain(rust.ret) - .any(|arg| match *arg { + let any_i64 = rust.arguments.iter().cloned().chain(rust.ret).any(|arg| { + match *arg { Type::PrimSigned(64) | Type::PrimUnsigned(64) => true, _ => false, - }); + } + }); let any_i64_exempt = match rust.name { // These intrinsics have all been manually verified against Clang's // headers to be available on x86, and the u64 arguments seem @@ -363,7 +360,7 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { } fn equate( - t: &Type, intel: &str, intrinsic: &str, is_const: bool + t: &Type, intel: &str, intrinsic: &str, is_const: bool, ) -> Result<(), String> { let intel = intel.replace(" *", "*"); let intel = intel.replace(" const*", "*"); @@ -371,9 +368,7 @@ fn equate( if is_const { return Ok(()); } - Err(format!( - "argument required to be const but isn't" - )) + Err(format!("argument required to be const but isn't")) }; match (t, &intel[..]) { (&Type::PrimFloat(32), "float") => {} diff --git a/crates/stdsimd/src/lib.rs b/crates/stdsimd/src/lib.rs index abee4fcfd4..4986e839da 100644 --- a/crates/stdsimd/src/lib.rs +++ 
b/crates/stdsimd/src/lib.rs @@ -1,11 +1,7 @@ //! SIMD and vendor intrinsics support library. //! //! This crate defines the vendor intrinsics and types primarily used for SIMD -//! in Rust. The crate here will soon be available in the standard library, but -//! for now you can also browse the documentation here, primarily in the `arch` -//! submodule. -//! -//! [stdsimd]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/ +//! in Rust. #![feature(const_fn, integer_atomics, staged_api, stdsimd)] #![feature(doc_cfg, allow_internal_unstable)] diff --git a/crates/stdsimd/tests/cpu-detection.rs b/crates/stdsimd/tests/cpu-detection.rs index 775b9774f1..962a3fa314 100644 --- a/crates/stdsimd/tests/cpu-detection.rs +++ b/crates/stdsimd/tests/cpu-detection.rs @@ -1,155 +1,106 @@ #![feature(stdsimd)] #![cfg_attr(stdsimd_strict, deny(warnings))] -#![cfg_attr(feature = "cargo-clippy", - allow(option_unwrap_used, use_debug, print_stdout))] +#![cfg_attr( + feature = "cargo-clippy", + allow(option_unwrap_used, use_debug, print_stdout) +)] -#[cfg(any(target_arch = "arm", target_arch = "aarch64", - target_arch = "x86", target_arch = "x86_64", - target_arch = "powerpc", target_arch = "powerpc64"))] +#[cfg( + any( + target_arch = "arm", + target_arch = "aarch64", + target_arch = "x86", + target_arch = "x86_64", + target_arch = "powerpc", + target_arch = "powerpc64" + ) +)] #[macro_use] extern crate stdsimd; #[test] -#[cfg(all(target_arch = "arm", target_os = "linux"))] +#[cfg(all(target_arch = "arm", + any(target_os = "linux", target_os = "android")))] fn arm_linux() { println!("neon: {}", is_arm_feature_detected!("neon")); println!("pmull: {}", is_arm_feature_detected!("pmull")); } #[test] -#[cfg(all(target_arch = "aarch64", target_os = "linux"))] +#[cfg(all(target_arch = "aarch64", + any(target_os = "linux", target_os = "android")))] fn aarch64_linux() { println!("fp: {}", is_aarch64_feature_detected!("fp")); println!("fp16: {}", is_aarch64_feature_detected!("fp16")); 
println!("neon: {}", is_aarch64_feature_detected!("neon")); - println!( - "asimd: {}", - is_aarch64_feature_detected!("asimd") - ); + println!("asimd: {}", is_aarch64_feature_detected!("asimd")); println!("sve: {}", is_aarch64_feature_detected!("sve")); println!("crc: {}", is_aarch64_feature_detected!("crc")); - println!( - "crypto: {}", - is_aarch64_feature_detected!("crypto") - ); + println!("crypto: {}", is_aarch64_feature_detected!("crypto")); println!("lse: {}", is_aarch64_feature_detected!("lse")); println!("rdm: {}", is_aarch64_feature_detected!("rdm")); println!("rcpc: {}", is_aarch64_feature_detected!("rcpc")); - println!( - "dotprod: {}", - is_aarch64_feature_detected!("dotprod") - ); + println!("dotprod: {}", is_aarch64_feature_detected!("dotprod")); } #[test] #[cfg(all(target_arch = "powerpc", target_os = "linux"))] fn powerpc_linux() { - println!( - "altivec: {}", - is_powerpc_feature_detected!("altivec") - ); + println!("altivec: {}", is_powerpc_feature_detected!("altivec")); println!("vsx: {}", is_powerpc_feature_detected!("vsx")); - println!( - "power8: {}", - is_powerpc_feature_detected!("power8") - ); + println!("power8: {}", is_powerpc_feature_detected!("power8")); } #[test] #[cfg(all(target_arch = "powerpc64", target_os = "linux"))] fn powerpc64_linux() { - println!( - "altivec: {}", - is_powerpc64_feature_detected!("altivec") - ); + println!("altivec: {}", is_powerpc64_feature_detected!("altivec")); println!("vsx: {}", is_powerpc64_feature_detected!("vsx")); - println!( - "power8: {}", - is_powerpc64_feature_detected!("power8") - ); + println!("power8: {}", is_powerpc64_feature_detected!("power8")); } #[test] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn x86_all() { + println!("aes: {:?}", is_x86_feature_detected!("aes")); + println!("pcmulqdq: {:?}", is_x86_feature_detected!("pclmulqdq")); + println!("rdrand: {:?}", is_x86_feature_detected!("rdrand")); + println!("rdseed: {:?}", is_x86_feature_detected!("rdseed")); + 
println!("tsc: {:?}", is_x86_feature_detected!("tsc")); + println!("mmx: {:?}", is_x86_feature_detected!("mmx")); println!("sse: {:?}", is_x86_feature_detected!("sse")); println!("sse2: {:?}", is_x86_feature_detected!("sse2")); println!("sse3: {:?}", is_x86_feature_detected!("sse3")); println!("ssse3: {:?}", is_x86_feature_detected!("ssse3")); - println!( - "sse4.1: {:?}", - is_x86_feature_detected!("sse4.1") - ); - println!( - "sse4.2: {:?}", - is_x86_feature_detected!("sse4.2") - ); + println!("sse4.1: {:?}", is_x86_feature_detected!("sse4.1")); + println!("sse4.2: {:?}", is_x86_feature_detected!("sse4.2")); println!("sse4a: {:?}", is_x86_feature_detected!("sse4a")); println!("sha: {:?}", is_x86_feature_detected!("sha")); println!("avx: {:?}", is_x86_feature_detected!("avx")); println!("avx2: {:?}", is_x86_feature_detected!("avx2")); - println!( - "avx512f {:?}", - is_x86_feature_detected!("avx512f") - ); - println!( - "avx512cd {:?}", - is_x86_feature_detected!("avx512cd") - ); - println!( - "avx512er {:?}", - is_x86_feature_detected!("avx512er") - ); - println!( - "avx512pf {:?}", - is_x86_feature_detected!("avx512pf") - ); - println!( - "avx512bw {:?}", - is_x86_feature_detected!("avx512bw") - ); - println!( - "avx512dq {:?}", - is_x86_feature_detected!("avx512dq") - ); - println!( - "avx512vl {:?}", - is_x86_feature_detected!("avx512vl") - ); - println!( - "avx512_ifma {:?}", - is_x86_feature_detected!("avx512ifma") - ); - println!( - "avx512_vbmi {:?}", - is_x86_feature_detected!("avx512vbmi") - ); + println!("avx512f {:?}", is_x86_feature_detected!("avx512f")); + println!("avx512cd {:?}", is_x86_feature_detected!("avx512cd")); + println!("avx512er {:?}", is_x86_feature_detected!("avx512er")); + println!("avx512pf {:?}", is_x86_feature_detected!("avx512pf")); + println!("avx512bw {:?}", is_x86_feature_detected!("avx512bw")); + println!("avx512dq {:?}", is_x86_feature_detected!("avx512dq")); + println!("avx512vl {:?}", is_x86_feature_detected!("avx512vl")); + 
println!("avx512_ifma {:?}", is_x86_feature_detected!("avx512ifma")); + println!("avx512_vbmi {:?}", is_x86_feature_detected!("avx512vbmi")); println!( "avx512_vpopcntdq {:?}", is_x86_feature_detected!("avx512vpopcntdq") ); println!("fma: {:?}", is_x86_feature_detected!("fma")); - println!("abm: {:?}", is_x86_feature_detected!("abm")); - println!("bmi: {:?}", is_x86_feature_detected!("bmi1")); + println!("bmi1: {:?}", is_x86_feature_detected!("bmi1")); println!("bmi2: {:?}", is_x86_feature_detected!("bmi2")); - println!("tbm: {:?}", is_x86_feature_detected!("tbm")); - println!( - "popcnt: {:?}", - is_x86_feature_detected!("popcnt") - ); + println!("abm: {:?}", is_x86_feature_detected!("abm")); println!("lzcnt: {:?}", is_x86_feature_detected!("lzcnt")); + println!("tbm: {:?}", is_x86_feature_detected!("tbm")); + println!("popcnt: {:?}", is_x86_feature_detected!("popcnt")); println!("fxsr: {:?}", is_x86_feature_detected!("fxsr")); println!("xsave: {:?}", is_x86_feature_detected!("xsave")); - println!( - "xsaveopt: {:?}", - is_x86_feature_detected!("xsaveopt") - ); - println!( - "xsaves: {:?}", - is_x86_feature_detected!("xsaves") - ); - println!( - "xsavec: {:?}", - is_x86_feature_detected!("xsavec") - ); + println!("xsaveopt: {:?}", is_x86_feature_detected!("xsaveopt")); + println!("xsaves: {:?}", is_x86_feature_detected!("xsaves")); + println!("xsavec: {:?}", is_x86_feature_detected!("xsavec")); } diff --git a/examples/hex.rs b/examples/hex.rs index 5b045c6126..878f17125d 100644 --- a/examples/hex.rs +++ b/examples/hex.rs @@ -14,10 +14,13 @@ #![feature(stdsimd)] #![cfg_attr(test, feature(test))] -#![cfg_attr(feature = "cargo-clippy", - allow(result_unwrap_used, print_stdout, option_unwrap_used, - shadow_reuse, cast_possible_wrap, cast_sign_loss, - missing_docs_in_private_items))] +#![cfg_attr( + feature = "cargo-clippy", + allow( + result_unwrap_used, print_stdout, option_unwrap_used, shadow_reuse, + cast_possible_wrap, cast_sign_loss, missing_docs_in_private_items 
+ ) +)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[macro_use] @@ -68,7 +71,7 @@ fn hex_encode<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { #[target_feature(enable = "avx2")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] unsafe fn hex_encode_avx2<'a>( - mut src: &[u8], dst: &'a mut [u8] + mut src: &[u8], dst: &'a mut [u8], ) -> Result<&'a str, usize> { let ascii_zero = _mm256_set1_epi8(b'0' as i8); let nines = _mm256_set1_epi8(9); @@ -115,16 +118,14 @@ unsafe fn hex_encode_avx2<'a>( let i = i as usize; let _ = hex_encode_sse41(src, &mut dst[i * 2..]); - Ok(str::from_utf8_unchecked( - &dst[..src.len() * 2 + i * 2], - )) + Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2])) } // copied from https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp #[target_feature(enable = "sse4.1")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] unsafe fn hex_encode_sse41<'a>( - mut src: &[u8], dst: &'a mut [u8] + mut src: &[u8], dst: &'a mut [u8], ) -> Result<&'a str, usize> { let ascii_zero = _mm_set1_epi8(b'0' as i8); let nines = _mm_set1_epi8(9); @@ -157,10 +158,7 @@ unsafe fn hex_encode_sse41<'a>( let res2 = _mm_unpackhi_epi8(masked2, masked1); _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1); - _mm_storeu_si128( - dst.as_mut_ptr().offset(i * 2 + 16) as *mut _, - res2, - ); + _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2 + 16) as *mut _, res2); src = &src[16..]; i += 16; } @@ -168,13 +166,11 @@ unsafe fn hex_encode_sse41<'a>( let i = i as usize; let _ = hex_encode_fallback(src, &mut dst[i * 2..]); - Ok(str::from_utf8_unchecked( - &dst[..src.len() * 2 + i * 2], - )) + Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2])) } fn hex_encode_fallback<'a>( - src: &[u8], dst: &'a mut [u8] + src: &[u8], dst: &'a mut [u8], ) -> Result<&'a str, usize> { fn hex(byte: u8) -> u8 { static TABLE: &[u8] = b"0123456789abcdef"; @@ -199,10 +195,7 @@ mod tests { fn test(input: &[u8], output: &str) { 
let tmp = || vec![0; input.len() * 2]; - assert_eq!( - hex_encode_fallback(input, &mut tmp()).unwrap(), - output - ); + assert_eq!(hex_encode_fallback(input, &mut tmp()).unwrap(), output); assert_eq!(hex_encode(input, &mut tmp()).unwrap(), output); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] @@ -239,9 +232,7 @@ mod tests { fn odd() { test( &[0; 313], - &iter::repeat('0') - .take(313 * 2) - .collect::(), + &iter::repeat('0').take(313 * 2).collect::(), ); } diff --git a/examples/nbody.rs b/examples/nbody.rs index 8f12ec4365..63281e78e8 100644 --- a/examples/nbody.rs +++ b/examples/nbody.rs @@ -5,9 +5,13 @@ #![cfg_attr(stdsimd_strict, deny(warnings))] #![feature(stdsimd)] -#![cfg_attr(feature = "cargo-clippy", - allow(similar_names, missing_docs_in_private_items, - shadow_reuse, print_stdout))] +#![cfg_attr( + feature = "cargo-clippy", + allow( + similar_names, missing_docs_in_private_items, shadow_reuse, + print_stdout + ) +)] extern crate stdsimd; #[macro_use] @@ -15,8 +19,6 @@ extern crate cfg_if; use stdsimd::simd::*; - - const PI: f64 = std::f64::consts::PI; const SOLAR_MASS: f64 = 4.0 * PI * PI; const DAYS_PER_YEAR: f64 = 365.24; @@ -81,7 +83,7 @@ struct Body { impl Body { fn new( - x0: f64, x1: f64, x2: f64, v0: f64, v1: f64, v2: f64, mass: f64 + x0: f64, x1: f64, x2: f64, v0: f64, v1: f64, v2: f64, mass: f64, ) -> Self { Self { x: [x0, x1, x2], diff --git a/stdsimd/arch/detect/bit.rs b/stdsimd/arch/detect/bit.rs index c4ec43bb85..578f0b16b7 100644 --- a/stdsimd/arch/detect/bit.rs +++ b/stdsimd/arch/detect/bit.rs @@ -2,7 +2,8 @@ /// Tests the `bit` of `x`. 
#[allow(dead_code)] -pub fn test(x: usize, bit: u32) -> bool { +#[inline] +pub(crate) fn test(x: usize, bit: u32) -> bool { debug_assert!(bit < 32, "bit index out-of-bounds"); x & (1 << bit) != 0 } diff --git a/stdsimd/arch/detect/cache.rs b/stdsimd/arch/detect/cache.rs index 3d6b11d786..ead91ad86a 100644 --- a/stdsimd/arch/detect/cache.rs +++ b/stdsimd/arch/detect/cache.rs @@ -12,12 +12,14 @@ use core::sync::atomic::AtomicU64; use core::sync::atomic::AtomicU32; /// Sets the `bit` of `x`. -pub const fn set_bit(x: u64, bit: u32) -> u64 { +#[inline] +const fn set_bit(x: u64, bit: u32) -> u64 { x | 1 << bit } /// Tests the `bit` of `x`. -pub const fn test_bit(x: u64, bit: u32) -> bool { +#[inline] +const fn test_bit(x: u64, bit: u32) -> bool { x & (1 << bit) != 0 } @@ -26,7 +28,7 @@ const CACHE_CAPACITY: u32 = 63; /// This type is used to initialize the cache #[derive(Copy, Clone)] -pub struct Initializer(u64); +pub(crate) struct Initializer(u64); impl Default for Initializer { fn default() -> Self { @@ -37,7 +39,8 @@ impl Default for Initializer { impl Initializer { /// Tests the `bit` of the cache. #[allow(dead_code)] - pub fn test(&self, bit: u32) -> bool { + #[inline] + pub(crate) fn test(&self, bit: u32) -> bool { // FIXME: this way of making sure that the cache is large enough is // brittle. debug_assert!( @@ -48,7 +51,8 @@ impl Initializer { } /// Sets the `bit` of the cache. - pub fn set(&mut self, bit: u32) { + #[inline] + pub(crate) fn set(&mut self, bit: u32) { // FIXME: this way of making sure that the cache is large enough is // brittle. debug_assert!( @@ -77,17 +81,20 @@ impl Cache { Cache(AtomicU64::new(u64::max_value())) } /// Is the cache uninitialized? - pub fn is_uninitialized(&self) -> bool { + #[inline] + pub(crate) fn is_uninitialized(&self) -> bool { self.0.load(Ordering::Relaxed) == u64::max_value() } /// Is the `bit` in the cache set? 
- pub fn test(&self, bit: u32) -> bool { + #[inline] + pub(crate) fn test(&self, bit: u32) -> bool { test_bit(CACHE.0.load(Ordering::Relaxed), bit) } /// Initializes the cache. - pub fn initialize(&self, value: Initializer) { + #[inline] + pub(crate) fn initialize(&self, value: Initializer) { self.0.store(value.0, Ordering::Relaxed); } } @@ -109,12 +116,14 @@ impl Cache { ) } /// Is the cache uninitialized? - pub fn is_uninitialized(&self) -> bool { + #[inline] + pub(crate) fn is_uninitialized(&self) -> bool { self.1.load(Ordering::Relaxed) == u32::max_value() } /// Is the `bit` in the cache set? - pub fn test(&self, bit: u32) -> bool { + #[inline] + pub(crate) fn test(&self, bit: u32) -> bool { if bit < 32 { test_bit(CACHE.0.load(Ordering::Relaxed) as u64, bit) } else { @@ -123,7 +132,8 @@ impl Cache { } /// Initializes the cache. - pub fn initialize(&self, value: Initializer) { + #[inline] + pub(crate) fn initialize(&self, value: Initializer) { let lo: u32 = value.0 as u32; let hi: u32 = (value.0 >> 32) as u32; self.0.store(lo, Ordering::Relaxed); @@ -139,9 +149,8 @@ impl Cache { /// /// It uses the `Feature` variant to index into this variable as a bitset. If /// the bit is set, the feature is enabled, and otherwise it is disabled. -/// -/// PLEASE: do not use this, it is an implementation detail subject to change. -pub fn test(bit: u32, f: F) -> bool +#[inline] +pub(crate) fn test(bit: u32, f: F) -> bool where F: FnOnce() -> Initializer, { diff --git a/stdsimd/arch/detect/error_macros.rs b/stdsimd/arch/detect/error_macros.rs index 0bba7b7cfe..743f7ea952 100644 --- a/stdsimd/arch/detect/error_macros.rs +++ b/stdsimd/arch/detect/error_macros.rs @@ -25,7 +25,7 @@ macro_rules! is_x86_feature_detected { #[macro_export] #[unstable(feature = "stdsimd", issue = "27731")] macro_rules! is_arm_feature_detected { - ($t: tt) => { + ($t:tt) => { compile_error!( r#" is_arm_feature_detected can only be used on ARM targets. @@ -64,7 +64,8 @@ macro_rules! 
is_aarch64_feature_detected { #[unstable(feature = "stdsimd", issue = "27731")] macro_rules! is_powerpc_feature_detected { ($t:tt) => { - compile_error!(r#" + compile_error!( + r#" is_powerpc_feature_detected can only be used on PowerPC targets. You can prevent it from being used in other architectures by guarding it behind a cfg(target_arch) as follows: @@ -72,7 +73,8 @@ guarding it behind a cfg(target_arch) as follows: #[cfg(target_arch = "powerpc")] { if is_powerpc_feature_detected(...) { ... } } -"#) +"# + ) }; } @@ -81,7 +83,8 @@ guarding it behind a cfg(target_arch) as follows: #[unstable(feature = "stdsimd", issue = "27731")] macro_rules! is_powerpc64_feature_detected { ($t:tt) => { - compile_error!(r#" + compile_error!( + r#" is_powerpc64_feature_detected can only be used on PowerPC64 targets. You can prevent it from being used in other architectures by guarding it behind a cfg(target_arch) as follows: @@ -89,7 +92,8 @@ guarding it behind a cfg(target_arch) as follows: #[cfg(target_arch = "powerpc64")] { if is_powerpc64_feature_detected(...) { ... } } -"#) +"# + ) }; } @@ -97,7 +101,7 @@ guarding it behind a cfg(target_arch) as follows: #[macro_export] #[unstable(feature = "stdsimd", issue = "27731")] macro_rules! is_mips_feature_detected { - ($t: tt) => { + ($t:tt) => { compile_error!( r#" is_mips_feature_detected can only be used on MIPS targets. @@ -116,7 +120,7 @@ macro_rules! is_mips_feature_detected { #[macro_export] #[unstable(feature = "stdsimd", issue = "27731")] macro_rules! is_mips64_feature_detected { - ($t: tt) => { + ($t:tt) => { compile_error!( r#" is_mips64_feature_detected can only be used on MIPS64 targets. diff --git a/stdsimd/arch/detect/mod.rs b/stdsimd/arch/detect/mod.rs index 7ed3971711..57cab4f120 100644 --- a/stdsimd/arch/detect/mod.rs +++ b/stdsimd/arch/detect/mod.rs @@ -60,8 +60,8 @@ cfg_if! { } pub use self::arch::Feature; -mod cache; mod bit; +mod cache; cfg_if! 
{ if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { diff --git a/stdsimd/arch/detect/os/linux/aarch64.rs b/stdsimd/arch/detect/os/linux/aarch64.rs index e46ace5a30..ebddf2d75d 100644 --- a/stdsimd/arch/detect/os/linux/aarch64.rs +++ b/stdsimd/arch/detect/os/linux/aarch64.rs @@ -7,6 +7,7 @@ use super::auxvec; use super::cpuinfo; /// Performs run-time feature detection. +#[inline] pub fn check_for(x: Feature) -> bool { cache::test(x as u32, detect_features) } @@ -125,7 +126,7 @@ impl AtHwcap { /// /// The features are enabled approximately like in LLVM host feature detection: /// https://github.com/llvm-mirror/llvm/blob/master/lib/Support/Host.cpp#L1273 - pub fn cache(self) -> cache::Initializer { + fn cache(self) -> cache::Initializer { let mut value = cache::Initializer::default(); { let mut enable_feature = |f, enable| { diff --git a/stdsimd/arch/detect/os/linux/arm.rs b/stdsimd/arch/detect/os/linux/arm.rs index 7b964962cb..9d265e02db 100644 --- a/stdsimd/arch/detect/os/linux/arm.rs +++ b/stdsimd/arch/detect/os/linux/arm.rs @@ -7,6 +7,7 @@ use super::auxvec; use super::cpuinfo; /// Performs run-time feature detection. +#[inline] pub fn check_for(x: Feature) -> bool { cache::test(x as u32, detect_features) } diff --git a/stdsimd/arch/detect/os/linux/auxvec.rs b/stdsimd/arch/detect/os/linux/auxvec.rs index b387b70da6..20fbb5f588 100644 --- a/stdsimd/arch/detect/os/linux/auxvec.rs +++ b/stdsimd/arch/detect/os/linux/auxvec.rs @@ -7,17 +7,17 @@ use fs::File; use io::Read; /// Key to access the CPU Hardware capabilities bitfield. -pub const AT_HWCAP: usize = 16; +pub(crate) const AT_HWCAP: usize = 16; /// Key to access the CPU Hardware capabilities 2 bitfield. #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))] -pub const AT_HWCAP2: usize = 26; +pub(crate) const AT_HWCAP2: usize = 26; /// Cache HWCAP bitfields of the ELF Auxiliary Vector. /// /// If an entry cannot be read all the bits in the bitfield are set to zero. 
/// This should be interpreted as all the features being disabled. #[derive(Debug, Copy, Clone)] -pub struct AuxVec { +pub(crate) struct AuxVec { pub hwcap: usize, #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))] pub hwcap2: usize, @@ -48,7 +48,7 @@ pub struct AuxVec { /// /// [auxvec_h]: https://github.com/torvalds/linux/blob/master/include/uapi/linux/auxvec.h /// [auxv_docs]: https://docs.rs/auxv/0.3.3/auxv/ -pub fn auxv() -> Result { +pub(crate) fn auxv() -> Result { // Try to call a dynamically-linked getauxval function. if let Ok(hwcap) = getauxval(AT_HWCAP) { // Targets with only AT_HWCAP: diff --git a/stdsimd/arch/detect/os/linux/cpuinfo.rs b/stdsimd/arch/detect/os/linux/cpuinfo.rs index eeb152ccce..fd7b5876f3 100644 --- a/stdsimd/arch/detect/os/linux/cpuinfo.rs +++ b/stdsimd/arch/detect/os/linux/cpuinfo.rs @@ -6,20 +6,20 @@ use fs::File; use io::{self, Read}; /// cpuinfo -pub struct CpuInfo { +pub(crate) struct CpuInfo { raw: String, } impl CpuInfo { /// Reads /proc/cpuinfo into CpuInfo. - pub fn new() -> Result { + pub(crate) fn new() -> Result { let mut file = File::open("/proc/cpuinfo")?; let mut cpui = Self { raw: String::new() }; file.read_to_string(&mut cpui.raw)?; Ok(cpui) } /// Returns the value of the cpuinfo `field`. 
- pub fn field(&self, field: &str) -> CpuInfoField { + pub(crate) fn field(&self, field: &str) -> CpuInfoField { for l in self.raw.lines() { if l.trim().starts_with(field) { return CpuInfoField::new(l.split(": ").nth(1)); @@ -44,7 +44,7 @@ impl CpuInfo { /// Field of cpuinfo #[derive(Debug)] -pub struct CpuInfoField<'a>(Option<&'a str>); +pub(crate) struct CpuInfoField<'a>(Option<&'a str>); impl<'a> PartialEq<&'a str> for CpuInfoField<'a> { fn eq(&self, other: &&'a str) -> bool { @@ -56,7 +56,7 @@ impl<'a> PartialEq<&'a str> for CpuInfoField<'a> { } impl<'a> CpuInfoField<'a> { - pub fn new<'b>(v: Option<&'b str>) -> CpuInfoField<'b> { + pub(crate) fn new<'b>(v: Option<&'b str>) -> CpuInfoField<'b> { match v { None => CpuInfoField::<'b>(None), Some(f) => CpuInfoField::<'b>(Some(f.trim())), @@ -64,11 +64,11 @@ impl<'a> CpuInfoField<'a> { } /// Does the field exist? #[cfg(test)] - pub fn exists(&self) -> bool { + pub(crate) fn exists(&self) -> bool { self.0.is_some() } /// Does the field contain `other`? - pub fn has(&self, other: &str) -> bool { + pub(crate) fn has(&self, other: &str) -> bool { match self.0 { None => other.is_empty(), Some(f) => { diff --git a/stdsimd/arch/detect/os/linux/mips.rs b/stdsimd/arch/detect/os/linux/mips.rs index 2c3ee03d22..92e95f057e 100644 --- a/stdsimd/arch/detect/os/linux/mips.rs +++ b/stdsimd/arch/detect/os/linux/mips.rs @@ -6,6 +6,7 @@ use arch::detect::bit; use super::auxvec; /// Performs run-time feature detection. +#[inline] pub fn check_for(x: Feature) -> bool { cache::test(x as u32, detect_features) } diff --git a/stdsimd/arch/detect/os/linux/powerpc.rs b/stdsimd/arch/detect/os/linux/powerpc.rs index 8289c09816..41f58508f2 100644 --- a/stdsimd/arch/detect/os/linux/powerpc.rs +++ b/stdsimd/arch/detect/os/linux/powerpc.rs @@ -6,6 +6,7 @@ use super::auxvec; use super::cpuinfo; /// Performs run-time feature detection. 
+#[inline] pub fn check_for(x: Feature) -> bool { cache::test(x as u32, detect_features) } diff --git a/stdsimd/arch/detect/os/other.rs b/stdsimd/arch/detect/os/other.rs index 2b1b378acf..562cbfe8f1 100644 --- a/stdsimd/arch/detect/os/other.rs +++ b/stdsimd/arch/detect/os/other.rs @@ -3,6 +3,7 @@ use arch::detect::Feature; /// Performs run-time feature detection. +#[inline] pub fn check_for(_x: Feature) -> bool { false } diff --git a/stdsimd/arch/detect/os/x86.rs b/stdsimd/arch/detect/os/x86.rs index b7647cd637..3ac009a2d3 100644 --- a/stdsimd/arch/detect/os/x86.rs +++ b/stdsimd/arch/detect/os/x86.rs @@ -12,6 +12,7 @@ use arch::detect::cache; use arch::detect::bit; /// Performs run-time feature detection. +#[inline] pub fn check_for(x: Feature) -> bool { cache::test(x as u32, detect_features) } @@ -32,7 +33,7 @@ pub fn check_for(x: Feature) -> bool { /// [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf /// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf #[cfg_attr(feature = "cargo-clippy", allow(similar_names))] -pub fn detect_features() -> cache::Initializer { +fn detect_features() -> cache::Initializer { let mut value = cache::Initializer::default(); // If the x86 CPU does not support the CPUID instruction then it is too @@ -115,7 +116,6 @@ pub fn detect_features() -> cache::Initializer { enable(proc_info_ecx, 0, Feature::sse3); enable(proc_info_ecx, 9, Feature::ssse3); - enable(proc_info_ecx, 12, Feature::fma); enable(proc_info_ecx, 19, Feature::sse4_1); enable(proc_info_ecx, 20, Feature::sse4_2); enable(proc_info_ecx, 23, Feature::popcnt); @@ -149,64 +149,77 @@ pub fn detect_features() -> cache::Initializer { // [mozilla_sse_cpp]: https://hg.mozilla.org/mozilla-central/file/64bab5cbb9b6/mozglue/build/SSE.cpp#l190 let cpu_osxsave = bit::test(proc_info_ecx as usize, 27); - // 2. 
The OS must have signaled the CPU that it supports saving and - // restoring the SSE and AVX registers by setting `XCR0.SSE[1]` and - // `XCR0.AVX[2]` to `1`. - // - // This is safe because the CPU supports `xsave` - let xcr0 = unsafe { _xgetbv(0) }; - let os_avx_support = xcr0 & 6 == 6; - let os_avx512_support = xcr0 & 224 == 224; - - // Only if the OS and the CPU support saving/restoring the AVX - // registers we enable `xsave` support: - if cpu_osxsave && os_avx_support { - // See "13.3 ENABLING THE XSAVE FEATURE SET AND XSAVE-ENABLED - // FEATURES" in the "Intel® 64 and IA-32 Architectures Software - // Developer’s Manual, Volume 1: Basic Architecture": + if cpu_osxsave { + // 2. The OS must have signaled the CPU that it supports saving and + // restoring the: + // + // * SSE -> `XCR0.SSE[1]` + // * AVX -> `XCR0.AVX[2]` + // * AVX-512 -> `XCR0.AVX-512[7:5]`. // - // "Software enables the XSAVE feature set by setting - // CR4.OSXSAVE[bit 18] to 1 (e.g., with the MOV to CR4 - // instruction). If this bit is 0, execution of any of XGETBV, - // XRSTOR, XRSTORS, XSAVE, XSAVEC, XSAVEOPT, XSAVES, and XSETBV - // causes an invalid-opcode exception (#UD)" + // by setting the corresponding bits of `XCR0` to `1`. // - enable(proc_info_ecx, 26, Feature::xsave); + // This is safe because the CPU supports `xsave` + // and the OS has set `osxsave`. + let xcr0 = unsafe { _xgetbv(0) }; + // Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`: + let os_avx_support = xcr0 & 6 == 6; + // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 224`: + let os_avx512_support = xcr0 & 224 == 224; - // For `xsaveopt`, `xsavec`, and `xsaves` we need to query: - // Processor Extended State Enumeration Sub-leaf (EAX = 0DH, - // ECX = 1): - if max_basic_leaf >= 0xd { - let CpuidResult { - eax: proc_extended_state1_eax, - .. 
- } = unsafe { __cpuid_count(0xd_u32, 1) }; - enable(proc_extended_state1_eax, 0, Feature::xsaveopt); - enable(proc_extended_state1_eax, 1, Feature::xsavec); - enable(proc_extended_state1_eax, 3, Feature::xsaves); - } + // Only if the OS and the CPU support saving/restoring the AVX + // registers we enable `xsave` support: + if os_avx_support { + // See "13.3 ENABLING THE XSAVE FEATURE SET AND XSAVE-ENABLED + // FEATURES" in the "Intel® 64 and IA-32 Architectures Software + // Developer’s Manual, Volume 1: Basic Architecture": + // + // "Software enables the XSAVE feature set by setting + // CR4.OSXSAVE[bit 18] to 1 (e.g., with the MOV to CR4 + // instruction). If this bit is 0, execution of any of XGETBV, + // XRSTOR, XRSTORS, XSAVE, XSAVEC, XSAVEOPT, XSAVES, and XSETBV + // causes an invalid-opcode exception (#UD)" + // + enable(proc_info_ecx, 26, Feature::xsave); + + // For `xsaveopt`, `xsavec`, and `xsaves` we need to query: + // Processor Extended State Enumeration Sub-leaf (EAX = 0DH, + // ECX = 1): + if max_basic_leaf >= 0xd { + let CpuidResult { + eax: proc_extended_state1_eax, + .. 
+ } = unsafe { __cpuid_count(0xd_u32, 1) }; + enable(proc_extended_state1_eax, 0, Feature::xsaveopt); + enable(proc_extended_state1_eax, 1, Feature::xsavec); + enable(proc_extended_state1_eax, 3, Feature::xsaves); + } + + // FMA (uses 256-bit wide registers): + enable(proc_info_ecx, 12, Feature::fma); - // And AVX/AVX2: - enable(proc_info_ecx, 28, Feature::avx); - enable(extended_features_ebx, 5, Feature::avx2); + // And AVX/AVX2: + enable(proc_info_ecx, 28, Feature::avx); + enable(extended_features_ebx, 5, Feature::avx2); - // For AVX-512 the OS also needs to support saving/restoring - // the extended state, only then we enable AVX-512 support: - if os_avx512_support { - enable(extended_features_ebx, 16, Feature::avx512f); - enable(extended_features_ebx, 17, Feature::avx512dq); - enable(extended_features_ebx, 21, Feature::avx512_ifma); - enable(extended_features_ebx, 26, Feature::avx512pf); - enable(extended_features_ebx, 27, Feature::avx512er); - enable(extended_features_ebx, 28, Feature::avx512cd); - enable(extended_features_ebx, 30, Feature::avx512bw); - enable(extended_features_ebx, 31, Feature::avx512vl); - enable(extended_features_ecx, 1, Feature::avx512_vbmi); - enable( - extended_features_ecx, - 14, - Feature::avx512_vpopcntdq, - ); + // For AVX-512 the OS also needs to support saving/restoring + // the extended state, only then we enable AVX-512 support: + if os_avx512_support { + enable(extended_features_ebx, 16, Feature::avx512f); + enable(extended_features_ebx, 17, Feature::avx512dq); + enable(extended_features_ebx, 21, Feature::avx512_ifma); + enable(extended_features_ebx, 26, Feature::avx512pf); + enable(extended_features_ebx, 27, Feature::avx512er); + enable(extended_features_ebx, 28, Feature::avx512cd); + enable(extended_features_ebx, 30, Feature::avx512bw); + enable(extended_features_ebx, 31, Feature::avx512vl); + enable(extended_features_ecx, 1, Feature::avx512_vbmi); + enable( + extended_features_ecx, + 14, + Feature::avx512_vpopcntdq, + ); + } 
} } } diff --git a/stdsimd/mod.rs b/stdsimd/mod.rs index 9eca5b075a..b76deb520e 100644 --- a/stdsimd/mod.rs +++ b/stdsimd/mod.rs @@ -188,14 +188,14 @@ /// * [`powerpc`] /// * [`powerpc64`] /// -/// [`x86`]: https://rust-lang-nursery.github.io/stdsimd/i686/stdsimd/arch/x86/index.html -/// [`x86_64`]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/arch/x86_64/index.html -/// [`arm`]: https://rust-lang-nursery.github.io/stdsimd/arm/stdsimd/arch/arm/index.html -/// [`aarch64`]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/aarch64/index.html -/// [`mips`]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/mips/index.html -/// [`mips64`]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/mips64/index.html -/// [`powerpc`]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/powerpc/index.html -/// [`powerpc64`]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/powerpc64/index.html +/// [`x86`]: x86/index.html +/// [`x86_64`]: x86_64/index.html +/// [`arm`]: arm/index.html +/// [`aarch64`]: aarch64/index.html +/// [`mips`]: mips/index.html +/// [`mips64`]: mips64/index.html +/// [`powerpc`]: powerpc/index.html +/// [`powerpc64`]: powerpc64/index.html /// /// # Examples ///