Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.InteropServices;
using static System.IO.Hashing.VectorHelper;

namespace System.IO.Hashing
{
Expand All @@ -17,7 +18,9 @@ public partial class Crc32
private static bool CanBeVectorized(ReadOnlySpan<byte> source) =>
BitConverter.IsLittleEndian
&& VectorHelper.IsSupported
&& source.Length >= Vector128<byte>.Count * 4;
// Vectorization can process spans as short as a single vector (16 bytes), but when the ARM CRC32 intrinsics
// are supported they appear to outperform vectorization for spans shorter than 8 vectors (128 bytes).
&& source.Length >= Vector128<byte>.Count * (System.Runtime.Intrinsics.Arm.Crc32.IsSupported ? 8 : 1);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is JIT smart enough to turn Vector128<byte>.Count * (System.Runtime.Intrinsics.Arm.Crc32.IsSupported ? 8 : 1) into a constant?

cc @EgorBo

Copy link
Member

@EgorBo EgorBo May 22, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is JIT smart enough to turn Vector128<byte>.Count * (System.Runtime.Intrinsics.Arm.Crc32.IsSupported ? 8 : 1) into a constant?

cc @EgorBo

Just checked: folded to a constant (on both x64 and arm64)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@EgorBo thank you very much!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was also one of my concerns. My testing in sharplab.io also shows it folding into a constant.


// Processes the bytes in source in 64 byte chunks using carryless/polynomial multiplication intrinsics,
// followed by processing 16 byte chunks, and then processing remaining bytes individually. Requires
Expand All @@ -35,102 +38,81 @@ private static uint UpdateVectorized(uint crc, ReadOnlySpan<byte> source)
ref byte srcRef = ref MemoryMarshal.GetReference(source);
int length = source.Length;

Vector128<ulong> x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
Vector128<ulong> x2 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
Vector128<ulong> x3 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
Vector128<ulong> x4 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
Vector128<ulong> x5;
Vector128<ulong> kConstants;
Vector128<ulong> x1; // Accumulator for the new CRC
Vector128<ulong> x2;

x1 ^= Vector128.CreateScalar(crc).AsUInt64();
Vector128<ulong> x0 = Vector128.Create(0x0154442bd4UL, 0x01c6e41596UL); // k1, k2

srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
length -= Vector128<byte>.Count * 4;

// Parallel fold blocks of 64, if any.
while (length >= Vector128<byte>.Count * 4)
if (length >= Vector128<byte>.Count * 8)
{
x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
Vector128<ulong> x6 = VectorHelper.CarrylessMultiplyLower(x2, x0);
Vector128<ulong> x7 = VectorHelper.CarrylessMultiplyLower(x3, x0);
Vector128<ulong> x8 = VectorHelper.CarrylessMultiplyLower(x4, x0);

x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
x2 = VectorHelper.CarrylessMultiplyUpper(x2, x0);
x3 = VectorHelper.CarrylessMultiplyUpper(x3, x0);
x4 = VectorHelper.CarrylessMultiplyUpper(x4, x0);

Vector128<ulong> y5 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
Vector128<ulong> y6 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
Vector128<ulong> y7 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
Vector128<ulong> y8 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();

x1 ^= x5;
x2 ^= x6;
x3 ^= x7;
x4 ^= x8;

x1 ^= y5;
x2 ^= y6;
x3 ^= y7;
x4 ^= y8;
x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
x2 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
Vector128<ulong> x3 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
Vector128<ulong> x4 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();

srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
length -= Vector128<byte>.Count * 4;
}

// Fold into 128-bits.
x0 = Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL); // k3, k4

x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
x1 ^= x2;
x1 ^= x5;
// Load and XOR the initial CRC value
x1 ^= Vector128.CreateScalar(crc).AsUInt64();

kConstants = Vector128.Create(0x0154442bd4UL, 0x01c6e41596UL); // k1, k2

// Parallel fold blocks of 64, if any.
do
{
Vector128<ulong> y5 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
Vector128<ulong> y6 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
Vector128<ulong> y7 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
Vector128<ulong> y8 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();

x1 = FoldPolynomialPair(y5, x1, kConstants);
x2 = FoldPolynomialPair(y6, x2, kConstants);
x3 = FoldPolynomialPair(y7, x3, kConstants);
x4 = FoldPolynomialPair(y8, x4, kConstants);

srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
length -= Vector128<byte>.Count * 4;
} while (length >= Vector128<byte>.Count * 4);

// Fold into 128-bits.
kConstants = Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL); // k3, k4
x1 = FoldPolynomialPair(x2, x1, kConstants);
x1 = FoldPolynomialPair(x3, x1, kConstants);
x1 = FoldPolynomialPair(x4, x1, kConstants);
}
else
{
// For shorter sources just load the first vector and XOR with the CRC
Debug.Assert(length >= 16);

x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
x1 ^= x3;
x1 ^= x5;
x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
x1 ^= Vector128.CreateScalar(crc).AsUInt64();

x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
x1 ^= x4;
x1 ^= x5;
srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
length -= Vector128<byte>.Count;
}

// Single fold blocks of 16, if any.
while (length >= Vector128<byte>.Count)
{
x2 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();

x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
x1 ^= x2;
x1 ^= x5;
x1 = FoldPolynomialPair(Vector128.LoadUnsafe(ref srcRef).AsUInt64(), x1,
Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL));

srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
length -= Vector128<byte>.Count;
}

// Fold 128 bits to 64 bits.
x2 = VectorHelper.CarrylessMultiplyLeftLowerRightUpper(x1, x0);
x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64();
x1 = VectorHelper.ShiftRightBytesInVector(x1, 8);
x1 ^= x2;

x0 = Vector128.CreateScalar(0x0163cd6124UL); // k5, k0

x2 = VectorHelper.ShiftRightBytesInVector(x1, 4);
x1 &= x3;
x1 = VectorHelper.CarrylessMultiplyLower(x1, x0);
x1 ^= x2;
Vector128<ulong> bitmask = Vector128.Create(~0, 0, ~0, 0).AsUInt64();
x1 = ShiftRightBytesInVector(x1, 8) ^
CarrylessMultiplyLower(x1, Vector128.CreateScalar(0x00ccaa009eUL));
x1 = CarrylessMultiplyLower(x1 & bitmask, Vector128.CreateScalar(0x0163cd6124UL)) ^ // k5, k0
ShiftRightBytesInVector(x1, 4);

// Reduce to 32 bits.
x0 = Vector128.Create(0x01db710641UL, 0x01f7011641UL); // polynomial

x2 = x1 & x3;
x2 = VectorHelper.CarrylessMultiplyLeftLowerRightUpper(x2, x0);
x2 &= x3;
x2 = VectorHelper.CarrylessMultiplyLower(x2, x0);
kConstants = Vector128.Create(0x01db710641UL, 0x01f7011641UL); // polynomial
x2 = CarrylessMultiplyLeftLowerRightUpper(x1 & bitmask, kConstants) & bitmask;
x2 = CarrylessMultiplyLower(x2, kConstants);
x1 ^= x2;

// Process the remaining bytes, if any
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.InteropServices;
using static System.IO.Hashing.VectorHelper;

namespace System.IO.Hashing
{
Expand Down Expand Up @@ -72,7 +73,7 @@ private static ulong UpdateVectorized(ulong crc, ReadOnlySpan<byte> source)
// Load and XOR the initial CRC value
// The CRC value does not need to be byte-reflected, but it needs to be moved to the high part of the register,
// because the data will be byte-reflected and will then align with the initial CRC at the correct place.
x0 ^= VectorHelper.ShiftLowerToUpper(Vector128.CreateScalar(crc));
x0 ^= ShiftLowerToUpper(Vector128.CreateScalar(crc));

kConstants = Vector128.Create(0x5cf79dea9ac37d6UL, 0x001067e571d7d5c2UL); // k3, k4

Expand All @@ -81,36 +82,36 @@ private static ulong UpdateVectorized(ulong crc, ReadOnlySpan<byte> source)
{
Vector128<ulong> y1 = LoadFromSource(ref srcRef, 0);
Vector128<ulong> y2 = LoadFromSource(ref srcRef, 16);
x0 = VectorHelper.FoldPolynomialPair(y1, x0, kConstants);
x1 = VectorHelper.FoldPolynomialPair(y2, x1, kConstants);
x0 = FoldPolynomialPair(y1, x0, kConstants);
x1 = FoldPolynomialPair(y2, x1, kConstants);

y1 = LoadFromSource(ref srcRef, 32);
y2 = LoadFromSource(ref srcRef, 48);
x2 = VectorHelper.FoldPolynomialPair(y1, x2, kConstants);
x3 = VectorHelper.FoldPolynomialPair(y2, x3, kConstants);
x2 = FoldPolynomialPair(y1, x2, kConstants);
x3 = FoldPolynomialPair(y2, x3, kConstants);

y1 = LoadFromSource(ref srcRef, 64);
y2 = LoadFromSource(ref srcRef, 80);
x4 = VectorHelper.FoldPolynomialPair(y1, x4, kConstants);
x5 = VectorHelper.FoldPolynomialPair(y2, x5, kConstants);
x4 = FoldPolynomialPair(y1, x4, kConstants);
x5 = FoldPolynomialPair(y2, x5, kConstants);

y1 = LoadFromSource(ref srcRef, 96);
y2 = LoadFromSource(ref srcRef, 112);
x6 = VectorHelper.FoldPolynomialPair(y1, x6, kConstants);
x7 = VectorHelper.FoldPolynomialPair(y2, x7, kConstants);
x6 = FoldPolynomialPair(y1, x6, kConstants);
x7 = FoldPolynomialPair(y2, x7, kConstants);

srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 8);
length -= Vector128<byte>.Count * 8;
} while (length >= Vector128<byte>.Count * 8);

// Fold into 128-bits in x7
x7 = VectorHelper.FoldPolynomialPair(x7, x0, Vector128.Create(0xe464f4df5fb60ac1UL, 0xb649c5b35a759cf2UL)); // k9, k10
x7 = VectorHelper.FoldPolynomialPair(x7, x1, Vector128.Create(0x9af04e1eff82d0ddUL, 0x6e82e609297f8fe8UL)); // k11, k12
x7 = VectorHelper.FoldPolynomialPair(x7, x2, Vector128.Create(0x97c516e98bd2e73UL, 0xb76477b31e22e7bUL)); // k13, k14
x7 = VectorHelper.FoldPolynomialPair(x7, x3, Vector128.Create(0x5f6843ca540df020UL, 0xddf4b6981205b83fUL)); // k15, k16
x7 = VectorHelper.FoldPolynomialPair(x7, x4, Vector128.Create(0x54819d8713758b2cUL, 0x4a6b90073eb0af5aUL)); // k17, k18
x7 = VectorHelper.FoldPolynomialPair(x7, x5, Vector128.Create(0x571bee0a227ef92bUL, 0x44bef2a201b5200cUL)); // k19, k20
x7 = VectorHelper.FoldPolynomialPair(x7, x6, Vector128.Create(0x5f5c3c7eb52fab6UL, 0x4eb938a7d257740eUL)); // k1, k2
x7 = FoldPolynomialPair(x7, x0, Vector128.Create(0xe464f4df5fb60ac1UL, 0xb649c5b35a759cf2UL)); // k9, k10
x7 = FoldPolynomialPair(x7, x1, Vector128.Create(0x9af04e1eff82d0ddUL, 0x6e82e609297f8fe8UL)); // k11, k12
x7 = FoldPolynomialPair(x7, x2, Vector128.Create(0x97c516e98bd2e73UL, 0xb76477b31e22e7bUL)); // k13, k14
x7 = FoldPolynomialPair(x7, x3, Vector128.Create(0x5f6843ca540df020UL, 0xddf4b6981205b83fUL)); // k15, k16
x7 = FoldPolynomialPair(x7, x4, Vector128.Create(0x54819d8713758b2cUL, 0x4a6b90073eb0af5aUL)); // k17, k18
x7 = FoldPolynomialPair(x7, x5, Vector128.Create(0x571bee0a227ef92bUL, 0x44bef2a201b5200cUL)); // k19, k20
x7 = FoldPolynomialPair(x7, x6, Vector128.Create(0x5f5c3c7eb52fab6UL, 0x4eb938a7d257740eUL)); // k1, k2
}
else
{
Expand All @@ -122,7 +123,7 @@ private static ulong UpdateVectorized(ulong crc, ReadOnlySpan<byte> source)
// Load and XOR the initial CRC value
// The CRC value does not need to be byte-reflected, but it needs to be moved to the high part of the register,
// because the data will be byte-reflected and will then align with the initial CRC at the correct place.
x7 ^= VectorHelper.ShiftLowerToUpper(Vector128.CreateScalar(crc));
x7 ^= ShiftLowerToUpper(Vector128.CreateScalar(crc));

srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
length -= Vector128<byte>.Count;
Expand All @@ -131,22 +132,22 @@ private static ulong UpdateVectorized(ulong crc, ReadOnlySpan<byte> source)
// Single fold blocks of 16, if any, into x7
while (length >= Vector128<byte>.Count)
{
x7 = VectorHelper.FoldPolynomialPair(LoadFromSource(ref srcRef, 0), x7,
x7 = FoldPolynomialPair(LoadFromSource(ref srcRef, 0), x7,
Vector128.Create(0x5f5c3c7eb52fab6UL, 0x4eb938a7d257740eUL)); // k1, k2

srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
length -= Vector128<byte>.Count;
}

// Compute CRC of a 128-bit value and fold to the upper 64-bits
x7 = VectorHelper.CarrylessMultiplyLeftUpperRightLower(x7, Vector128.CreateScalar(0x5f5c3c7eb52fab6UL)) ^ // k5
VectorHelper.ShiftLowerToUpper(x7);
x7 = CarrylessMultiplyLeftUpperRightLower(x7, Vector128.CreateScalar(0x5f5c3c7eb52fab6UL)) ^ // k5
ShiftLowerToUpper(x7);

// Barrett reduction
kConstants = Vector128.Create(0x578d29d06cc4f872UL, 0x42f0e1eba9ea3693UL); // k7, k8
Vector128<ulong> temp = x7;
x7 = VectorHelper.CarrylessMultiplyLeftUpperRightLower(x7, kConstants) ^ (x7 & Vector128.Create(0UL, ~0UL));
x7 = VectorHelper.CarrylessMultiplyUpper(x7, kConstants);
x7 = CarrylessMultiplyLeftUpperRightLower(x7, kConstants) ^ (x7 & Vector128.Create(0UL, ~0UL));
x7 = CarrylessMultiplyUpper(x7, kConstants);
x7 ^= temp;

// Process the remaining bytes, if any
Expand Down