<PackageReference Include="BouncyCastle.Cryptography" Version="2.7.0-beta.98" />

Blake2b_X86

static class Blake2b_X86
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using Org.BouncyCastle.Runtime.Intrinsics;
using Org.BouncyCastle.Runtime.Intrinsics.X86;

// Both imported X86 namespaces declare a type named Avx2. The alias makes the
// unqualified name refer to the hardware-intrinsics class; the BouncyCastle
// feature switch is always spelled out in full where it is needed.
using Avx2 = System.Runtime.Intrinsics.X86.Avx2;

namespace Org.BouncyCastle.Crypto.Digests
{
    /// <summary>
    /// AVX2 implementation of the 12-round BLAKE2b block compression. The 16-word working
    /// state is held as four <see cref="Vector256{T}"/> rows of four 64-bit words each.
    /// </summary>
    internal static class Blake2b_X86
    {
        /// <summary>Number of bytes occupied by one 256-bit state row.</summary>
        private const int RowBytes = 32;

        /// <summary>Blend control 0b1111_0000: 32-bit elements 4..7 come from the second operand.</summary>
        private const byte BlendUpperLane = 240;

        /// <summary>Blend control 0b1100_1100: 32-bit elements 2,3,6,7 come from the second operand.</summary>
        private const byte BlendOddQwords = 204;

        /// <summary>32-bit shuffle control 0b01_00_11_10: exchanges the two 64-bit halves of each 128-bit lane.</summary>
        private const byte SwapQwords = 78;

        /// <summary>32-bit shuffle control 0b10_11_00_01: exchanges the two 32-bit halves of each 64-bit word.</summary>
        private const byte SwapDwords = 177;

        /// <summary>
        /// True when the BouncyCastle AVX2 switch is enabled and vectors are reported as
        /// packed little-endian; <see cref="Compress"/> throws when this is false.
        /// </summary>
        internal static bool IsSupported =>
            Org.BouncyCastle.Runtime.Intrinsics.X86.Avx2.IsEnabled && Vector.IsPackedLittleEndian;

        /// <summary>
        /// Runs one compression over <paramref name="message"/> and updates the first
        /// eight words of <paramref name="hashBuffer"/> in place.
        /// </summary>
        /// <param name="hashBuffer">Chaining state; the first 64 bytes are read and then overwritten.</param>
        /// <param name="blakeIV">Initialisation vector; the first 64 bytes are read.</param>
        /// <param name="t0">XORed into word 0 of the second IV row (presumably the low counter word - confirm with caller).</param>
        /// <param name="t1">XORed into word 1 of the second IV row (presumably the high counter word - confirm with caller).</param>
        /// <param name="f0">XORed into word 2 of the second IV row (presumably the finalisation flag - confirm with caller).</param>
        /// <param name="message">Message block; the first 128 bytes are consumed.</param>
        /// <exception cref="PlatformNotSupportedException"><see cref="IsSupported"/> is false.</exception>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static void Compress(Span<ulong> hashBuffer, ReadOnlySpan<ulong> blakeIV, ulong t0, ulong t1, ulong f0, ReadOnlySpan<byte> message)
        {
            if (!IsSupported)
            {
                throw new PlatformNotSupportedException(nameof(Blake2b_X86));
            }

            Span<byte> stateBytes = MemoryMarshal.AsBytes(hashBuffer);
            ReadOnlySpan<byte> ivBytes = MemoryMarshal.AsBytes(blakeIV);

            // Rows 1-2 come from the chaining state, rows 3-4 from the IV.
            Vector256<ulong> row1 = MemoryMarshal.Read<Vector256<ulong>>(stateBytes);
            Vector256<ulong> row2 = MemoryMarshal.Read<Vector256<ulong>>(stateBytes.Slice(RowBytes));
            Vector256<ulong> row3 = MemoryMarshal.Read<Vector256<ulong>>(ivBytes);
            Vector256<ulong> row4 = MemoryMarshal.Read<Vector256<ulong>>(ivBytes.Slice(RowBytes));
            row4 = Avx2.Xor(row4, Vector256.Create(t0, t1, f0, 0));

            // Keep the incoming chaining value for the feed-forward after the rounds.
            Vector256<ulong> savedRow1 = row1;
            Vector256<ulong> savedRow2 = row2;

            Perform12Rounds(message, ref row1, ref row2, ref row3, ref row4);

            // New state = old state ^ upper half of working state ^ lower half of working state.
            row1 = Avx2.Xor(row1, row3);
            row2 = Avx2.Xor(row2, row4);
            row1 = Avx2.Xor(row1, savedRow1);
            row2 = Avx2.Xor(row2, savedRow2);

            MemoryMarshal.Write(stateBytes, ref row1);
            MemoryMarshal.Write(stateBytes.Slice(RowBytes), ref row2);
        }

        /// <summary>
        /// Applies twelve rounds to the four state rows. For every round the message words
        /// are gathered into four vectors using unpack / blend / align / shuffle operations
        /// on the eight broadcast 16-byte message pieces; rounds 11 and 12 reuse the
        /// gathering of rounds 1 and 2.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void Perform12Rounds(ReadOnlySpan<byte> m, ref Vector256<ulong> row1, ref Vector256<ulong> row2, ref Vector256<ulong> row3, ref Vector256<ulong> row4)
        {
            // t0 / t1 are scratch; colA / colB feed the two G steps before Diagonalize,
            // diagA / diagB feed the two G steps after it.
            Vector256<ulong> t0, t1, colA, colB, diagA, diagB;

            // mN holds message bytes [16*N, 16*N + 16) duplicated into both 128-bit lanes.
            Vector256<ulong> m0 = Broadcast128ToVector256<ulong>(m);
            Vector256<ulong> m1 = Broadcast128ToVector256<ulong>(m.Slice(16));
            Vector256<ulong> m2 = Broadcast128ToVector256<ulong>(m.Slice(32));
            Vector256<ulong> m3 = Broadcast128ToVector256<ulong>(m.Slice(48));

            // ---- Round 1 ----
            t0 = Avx2.UnpackLow(m0, m1);
            t1 = Avx2.UnpackLow(m2, m3);
            colA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackHigh(m0, m1);
            t1 = Avx2.UnpackHigh(m2, m3);
            colB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            Vector256<ulong> m4 = Broadcast128ToVector256<ulong>(m.Slice(64));
            Vector256<ulong> m5 = Broadcast128ToVector256<ulong>(m.Slice(80));
            Vector256<ulong> m6 = Broadcast128ToVector256<ulong>(m.Slice(96));
            Vector256<ulong> m7 = Broadcast128ToVector256<ulong>(m.Slice(112));

            t0 = Avx2.UnpackLow(m7, m4);
            t1 = Avx2.UnpackLow(m5, m6);
            diagA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackHigh(m7, m4);
            t1 = Avx2.UnpackHigh(m5, m6);
            diagB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            Round(ref row1, ref row2, ref row3, ref row4, colA, colB, diagA, diagB);

            // ---- Round 2 ----
            t0 = Avx2.UnpackLow(m7, m2);
            t1 = Avx2.UnpackHigh(m4, m6);
            colA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackLow(m5, m4);
            t1 = Avx2.AlignRight(m3, m7, 8);
            colB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackHigh(m2, m0);
            t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), BlendOddQwords).AsUInt64();
            diagA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.AlignRight(m6, m1, 8);
            t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), BlendOddQwords).AsUInt64();
            diagB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            Round(ref row1, ref row2, ref row3, ref row4, colA, colB, diagA, diagB);

            // ---- Round 3 ----
            t0 = Avx2.AlignRight(m6, m5, 8);
            t1 = Avx2.UnpackHigh(m2, m7);
            colA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackLow(m4, m0);
            t1 = Avx2.Blend(m1.AsUInt32(), m6.AsUInt32(), BlendOddQwords).AsUInt64();
            colB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.AlignRight(m5, m4, 8);
            t1 = Avx2.UnpackHigh(m1, m3);
            diagA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackLow(m2, m7);
            t1 = Avx2.Blend(m3.AsUInt32(), m0.AsUInt32(), BlendOddQwords).AsUInt64();
            diagB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            Round(ref row1, ref row2, ref row3, ref row4, colA, colB, diagA, diagB);

            // ---- Round 4 ----
            t0 = Avx2.UnpackHigh(m3, m1);
            t1 = Avx2.UnpackHigh(m6, m5);
            colA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackHigh(m4, m0);
            t1 = Avx2.UnpackLow(m6, m7);
            colB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.AlignRight(m1, m7, 8);
            t1 = Avx2.Shuffle(m2.AsUInt32(), SwapQwords).AsUInt64();
            diagA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackLow(m4, m3);
            t1 = Avx2.UnpackLow(m5, m0);
            diagB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            Round(ref row1, ref row2, ref row3, ref row4, colA, colB, diagA, diagB);

            // ---- Round 5 ----
            t0 = Avx2.UnpackHigh(m4, m2);
            t1 = Avx2.UnpackLow(m1, m5);
            colA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.Blend(m0.AsUInt32(), m3.AsUInt32(), BlendOddQwords).AsUInt64();
            t1 = Avx2.Blend(m2.AsUInt32(), m7.AsUInt32(), BlendOddQwords).AsUInt64();
            colB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.AlignRight(m7, m1, 8);
            t1 = Avx2.AlignRight(m3, m5, 8);
            diagA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackHigh(m6, m0);
            t1 = Avx2.UnpackLow(m6, m4);
            diagB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            Round(ref row1, ref row2, ref row3, ref row4, colA, colB, diagA, diagB);

            // ---- Round 6 ----
            t0 = Avx2.UnpackLow(m1, m3);
            t1 = Avx2.UnpackLow(m0, m4);
            colA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackLow(m6, m5);
            t1 = Avx2.UnpackHigh(m5, m1);
            colB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.AlignRight(m2, m0, 8);
            t1 = Avx2.UnpackHigh(m3, m7);
            diagA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackHigh(m4, m6);
            t1 = Avx2.AlignRight(m7, m2, 8);
            diagB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            Round(ref row1, ref row2, ref row3, ref row4, colA, colB, diagA, diagB);

            // ---- Round 7 ----
            t0 = Avx2.Blend(m6.AsUInt32(), m0.AsUInt32(), BlendOddQwords).AsUInt64();
            t1 = Avx2.UnpackLow(m7, m2);
            colA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackHigh(m2, m7);
            t1 = Avx2.AlignRight(m5, m6, 8);
            colB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackLow(m4, m0);
            t1 = Avx2.Blend(m3.AsUInt32(), m4.AsUInt32(), BlendOddQwords).AsUInt64();
            diagA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackHigh(m5, m3);
            t1 = Avx2.Shuffle(m1.AsUInt32(), SwapQwords).AsUInt64();
            diagB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            Round(ref row1, ref row2, ref row3, ref row4, colA, colB, diagA, diagB);

            // ---- Round 8 ----
            t0 = Avx2.UnpackHigh(m6, m3);
            t1 = Avx2.Blend(m6.AsUInt32(), m1.AsUInt32(), BlendOddQwords).AsUInt64();
            colA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.AlignRight(m7, m5, 8);
            t1 = Avx2.UnpackHigh(m0, m4);
            colB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.Blend(m1.AsUInt32(), m2.AsUInt32(), BlendOddQwords).AsUInt64();
            t1 = Avx2.AlignRight(m4, m7, 8);
            diagA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackLow(m5, m0);
            t1 = Avx2.UnpackLow(m2, m3);
            diagB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            Round(ref row1, ref row2, ref row3, ref row4, colA, colB, diagA, diagB);

            // ---- Round 9 ----
            t0 = Avx2.UnpackLow(m3, m7);
            t1 = Avx2.AlignRight(m0, m5, 8);
            colA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackHigh(m7, m4);
            t1 = Avx2.AlignRight(m4, m1, 8);
            colB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackLow(m5, m6);
            t1 = Avx2.UnpackHigh(m6, m0);
            diagA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.AlignRight(m1, m2, 8);
            t1 = Avx2.AlignRight(m2, m3, 8);
            diagB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            Round(ref row1, ref row2, ref row3, ref row4, colA, colB, diagA, diagB);

            // ---- Round 10 ----
            t0 = Avx2.UnpackLow(m5, m4);
            t1 = Avx2.UnpackHigh(m3, m0);
            colA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackLow(m1, m2);
            t1 = Avx2.Blend(m3.AsUInt32(), m2.AsUInt32(), BlendOddQwords).AsUInt64();
            colB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackHigh(m6, m7);
            t1 = Avx2.UnpackHigh(m4, m1);
            diagA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), BlendOddQwords).AsUInt64();
            t1 = Avx2.UnpackLow(m7, m6);
            diagB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            Round(ref row1, ref row2, ref row3, ref row4, colA, colB, diagA, diagB);

            // ---- Round 11 (same gathering as round 1) ----
            t0 = Avx2.UnpackLow(m0, m1);
            t1 = Avx2.UnpackLow(m2, m3);
            colA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackHigh(m0, m1);
            t1 = Avx2.UnpackHigh(m2, m3);
            colB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackLow(m7, m4);
            t1 = Avx2.UnpackLow(m5, m6);
            diagA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackHigh(m7, m4);
            t1 = Avx2.UnpackHigh(m5, m6);
            diagB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            Round(ref row1, ref row2, ref row3, ref row4, colA, colB, diagA, diagB);

            // ---- Round 12 (same gathering as round 2) ----
            t0 = Avx2.UnpackLow(m7, m2);
            t1 = Avx2.UnpackHigh(m4, m6);
            colA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackLow(m5, m4);
            t1 = Avx2.AlignRight(m3, m7, 8);
            colB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.UnpackHigh(m2, m0);
            t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), BlendOddQwords).AsUInt64();
            diagA = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            t0 = Avx2.AlignRight(m6, m1, 8);
            t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), BlendOddQwords).AsUInt64();
            diagB = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), BlendUpperLane).AsUInt64();

            Round(ref row1, ref row2, ref row3, ref row4, colA, colB, diagA, diagB);
        }

        /// <summary>
        /// One full round: G1/G2 on the rows as they stand, then the same pair again between
        /// <see cref="Diagonalize"/> and <see cref="Undiagonalize"/>.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void Round(ref Vector256<ulong> row1, ref Vector256<ulong> row2, ref Vector256<ulong> row3, ref Vector256<ulong> row4, Vector256<ulong> b1, Vector256<ulong> b2, Vector256<ulong> b3, Vector256<ulong> b4)
        {
            // Byte-shuffle table: within every 64-bit word, output byte i = input byte (i + 3) mod 8,
            // i.e. each word is rotated right by 24 bits.
            Vector256<byte> rotate24 = Vector256.Create(
                (byte)3, (byte)4, (byte)5, (byte)6, (byte)7, (byte)0, (byte)1, (byte)2,
                (byte)11, (byte)12, (byte)13, (byte)14, (byte)15, (byte)8, (byte)9, (byte)10,
                (byte)3, (byte)4, (byte)5, (byte)6, (byte)7, (byte)0, (byte)1, (byte)2,
                (byte)11, (byte)12, (byte)13, (byte)14, (byte)15, (byte)8, (byte)9, (byte)10);

            // Byte-shuffle table: output byte i = input byte (i + 2) mod 8,
            // i.e. each word is rotated right by 16 bits.
            Vector256<byte> rotate16 = Vector256.Create(
                (byte)2, (byte)3, (byte)4, (byte)5, (byte)6, (byte)7, (byte)0, (byte)1,
                (byte)10, (byte)11, (byte)12, (byte)13, (byte)14, (byte)15, (byte)8, (byte)9,
                (byte)2, (byte)3, (byte)4, (byte)5, (byte)6, (byte)7, (byte)0, (byte)1,
                (byte)10, (byte)11, (byte)12, (byte)13, (byte)14, (byte)15, (byte)8, (byte)9);

            G1(rotate24, ref row1, ref row2, ref row3, ref row4, b1);
            G2(rotate16, ref row1, ref row2, ref row3, ref row4, b2);

            Diagonalize(ref row1, ref row3, ref row4);

            G1(rotate24, ref row1, ref row2, ref row3, ref row4, b3);
            G2(rotate16, ref row1, ref row2, ref row3, ref row4, b4);

            Undiagonalize(ref row1, ref row3, ref row4);
        }

        /// <summary>
        /// Permutes the words of rows 1, 3 and 4 (row 2 is left in place) ahead of the
        /// second G1/G2 pair of a round. <see cref="Undiagonalize"/> reverses it.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void Diagonalize(ref Vector256<ulong> row1, ref Vector256<ulong> row3, ref Vector256<ulong> row4)
        {
            row1 = Avx2.Permute4x64(row1, 147); // 0b10_01_00_11
            row3 = Avx2.Permute4x64(row3, 57);  // 0b00_11_10_01
            row4 = Avx2.Permute4x64(row4, 78);  // 0b01_00_11_10
        }

        /// <summary>
        /// First half of the mixing step: adds the message vector, then rotates row 4 by
        /// 32 bits (dword swap) and row 2 by 24 bits (byte shuffle with <paramref name="r24"/>).
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void G1(Vector256<byte> r24, ref Vector256<ulong> row1, ref Vector256<ulong> row2, ref Vector256<ulong> row3, ref Vector256<ulong> row4, Vector256<ulong> b0)
        {
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);

            row4 = Avx2.Shuffle(Avx2.Xor(row4, row1).AsUInt32(), SwapDwords).AsUInt64();

            row3 = Avx2.Add(row3, row4);

            row2 = Avx2.Shuffle(Avx2.Xor(row2, row3).AsByte(), r24).AsUInt64();
        }

        /// <summary>
        /// Second half of the mixing step: adds the message vector, then rotates row 4 by
        /// 16 bits (byte shuffle with <paramref name="r16"/>) and row 2 right by 63 bits.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void G2(Vector256<byte> r16, ref Vector256<ulong> row1, ref Vector256<ulong> row2, ref Vector256<ulong> row3, ref Vector256<ulong> row4, Vector256<ulong> b0)
        {
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);

            row4 = Avx2.Shuffle(Avx2.Xor(row4, row1).AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);

            row2 = Avx2.Xor(row2, row3);
            // (x >> 63) ^ (x + x): x + x is x << 1, so this is a rotate-left by one bit,
            // which equals a rotate-right by 63.
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
        }

        /// <summary>Restores the word order of rows 1, 3 and 4 changed by <see cref="Diagonalize"/>.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void Undiagonalize(ref Vector256<ulong> row1, ref Vector256<ulong> row3, ref Vector256<ulong> row4)
        {
            row1 = Avx2.Permute4x64(row1, 57);  // 0b00_11_10_01
            row3 = Avx2.Permute4x64(row3, 147); // 0b10_01_00_11
            row4 = Avx2.Permute4x64(row4, 78);  // 0b01_00_11_10
        }

        /// <summary>
        /// Reads one 128-bit vector from the start of <paramref name="source"/> and returns a
        /// 256-bit vector whose lower and upper halves both hold that value.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static Vector256<T> Broadcast128ToVector256<T>(ReadOnlySpan<byte> source) where T : struct
        {
            Vector128<T> half = MemoryMarshal.Read<Vector128<T>>(source);

            // The upper half is undefined after ToVector256Unsafe, so it is filled explicitly.
            Vector256<T> widened = half.ToVector256Unsafe();
            return widened.WithUpper(half);
        }
    }
}