<PackageReference Include="BouncyCastle.Cryptography" Version="2.6.0" />

Blake2s_X86

static class Blake2s_X86
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using Org.BouncyCastle.Runtime.Intrinsics;
using Org.BouncyCastle.Runtime.Intrinsics.X86;

namespace Org.BouncyCastle.Crypto.Digests
{
    // Both imported X86 namespaces expose Sse2/Ssse3/Sse41; these aliases pin the
    // short names used below to the System.Runtime.Intrinsics.X86 hardware intrinsics.
    using Sse2 = System.Runtime.Intrinsics.X86.Sse2;
    using Sse41 = System.Runtime.Intrinsics.X86.Sse41;
    using Ssse3 = System.Runtime.Intrinsics.X86.Ssse3;

    /// <summary>
    /// BLAKE2s block compression implemented with 128-bit x86 SIMD intrinsics
    /// (SSE, SSE2, SSSE3 and SSE4.1 instructions are used).
    /// </summary>
    internal static class Blake2s_X86
    {
        /// <summary>
        /// True only when the library reports SSE4.1 as enabled and the vector
        /// layout is packed little-endian. The second check is skipped when the
        /// first one fails.
        /// </summary>
        internal static bool IsSupported =>
            Org.BouncyCastle.Runtime.Intrinsics.X86.Sse41.IsEnabled && Vector.IsPackedLittleEndian;

        /// <summary>
        /// Runs the ten-round compression over one message block and folds the
        /// result back into <paramref name="hashBuffer"/>.
        /// </summary>
        /// <param name="hashBuffer">Chaining state; its first 32 bytes are read and then overwritten.</param>
        /// <param name="blakeIV">Initialisation vector; its first 32 bytes are read.</param>
        /// <param name="t0">Placed in lane 0 of the vector XORed into the fourth row (presumably the low counter word - confirm with caller).</param>
        /// <param name="t1">Placed in lane 1 of that vector (presumably the high counter word - confirm with caller).</param>
        /// <param name="f0">Placed in lane 2 of that vector (presumably the finalisation flag - confirm with caller).</param>
        /// <param name="message">Message block; its first 64 bytes are read.</param>
        /// <exception cref="PlatformNotSupportedException">When <see cref="IsSupported"/> is false.</exception>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static void Compress(Span<uint> hashBuffer, ReadOnlySpan<uint> blakeIV, uint t0, uint t1, uint f0, ReadOnlySpan<byte> message)
        {
            if (!IsSupported)
            {
                throw new PlatformNotSupportedException("Blake2s_X86");
            }

            Span<byte> hashBytes = MemoryMarshal.AsBytes(hashBuffer);
            ReadOnlySpan<byte> ivBytes = MemoryMarshal.AsBytes(blakeIV);

            Vector128<uint> counterAndFlag = Vector128.Create(t0, t1, f0, 0);

            // Rows 1-2 come from the chaining state, rows 3-4 from the IV.
            Vector128<uint> row1 = MemoryMarshal.Read<Vector128<uint>>(hashBytes);
            Vector128<uint> row2 = MemoryMarshal.Read<Vector128<uint>>(hashBytes.Slice(16));
            Vector128<uint> row3 = MemoryMarshal.Read<Vector128<uint>>(ivBytes);
            Vector128<uint> row4 = MemoryMarshal.Read<Vector128<uint>>(ivBytes.Slice(16));
            row4 = Sse2.Xor(row4, counterAndFlag);

            // Remember the incoming chaining value for the final feed-forward.
            Vector128<uint> savedRow1 = row1;
            Vector128<uint> savedRow2 = row2;

            Perform10Rounds(message, ref row1, ref row2, ref row3, ref row4);

            // new state = old state ^ (upper rows) ^ (lower rows)
            row1 = Sse2.Xor(Sse2.Xor(row1, row3), savedRow1);
            row2 = Sse2.Xor(Sse2.Xor(row2, row4), savedRow2);

            MemoryMarshal.Write(hashBytes, ref row1);
            MemoryMarshal.Write(hashBytes.Slice(16), ref row2);
        }

        /// <summary>
        /// Loads the 64-byte message block as four vectors and applies ten calls
        /// to <see cref="Round"/>. Before each call the sixteen message words are
        /// rearranged into four round-specific operand vectors using
        /// shuffle/blend/unpack/byte-shift instructions.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the immediates presumably encode the BLAKE2 sigma message
        /// schedule; they are reproduced verbatim and must not be altered.
        /// </remarks>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void Perform10Rounds(ReadOnlySpan<byte> m, ref Vector128<uint> row1, ref Vector128<uint> row2, ref Vector128<uint> row3, ref Vector128<uint> row4)
        {
            Vector128<uint> m0 = MemoryMarshal.Read<Vector128<uint>>(m);
            Vector128<uint> m1 = MemoryMarshal.Read<Vector128<uint>>(m.Slice(16));
            Vector128<uint> m2 = MemoryMarshal.Read<Vector128<uint>>(m.Slice(32));
            Vector128<uint> m3 = MemoryMarshal.Read<Vector128<uint>>(m.Slice(48));

            // t0..t2 are scratch registers; b1..b4 are the four operands handed to Round.
            Vector128<uint> t0, t1, t2;
            Vector128<uint> b1, b2, b3, b4;

            // ---- Round 1 ----
            b1 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 136).AsUInt32();

            b2 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 221).AsUInt32();

            t0 = Sse2.Shuffle(m2, 225);
            t1 = Sse2.Shuffle(m3, 30);
            b3 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 195).AsUInt32();

            t0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 60).AsUInt32();
            b4 = Sse2.Shuffle(t0, 177);

            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);

            // ---- Round 2 ----
            t0 = Sse41.Blend(m1.AsUInt16(), m2.AsUInt16(), 12).AsUInt32();
            t1 = Sse2.ShiftLeftLogical128BitLane(m3, 4);
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 240).AsUInt32();
            b1 = Sse2.Shuffle(t2, 147);

            t0 = Sse2.Shuffle(m2, 8);
            t1 = Sse41.Blend(m1.AsUInt16(), m3.AsUInt16(), 192).AsUInt32();
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 240).AsUInt32();
            b2 = Sse2.Shuffle(t2, 177);

            t0 = Sse2.ShiftLeftLogical128BitLane(m1, 4);
            t1 = Sse41.Blend(m2.AsUInt16(), t0.AsUInt16(), 48).AsUInt32();
            t2 = Sse41.Blend(m0.AsUInt16(), t1.AsUInt16(), 240).AsUInt32();
            b3 = Sse2.Shuffle(t2, 198);

            t0 = Sse2.UnpackHigh(m0, m1);
            t1 = Sse2.ShiftLeftLogical128BitLane(m3, 4);
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 12).AsUInt32();
            b4 = Sse2.Shuffle(t2, 198);

            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);

            // ---- Round 3 ----
            t0 = Sse2.UnpackHigh(m2, m3);
            t1 = Sse41.Blend(m3.AsUInt16(), m1.AsUInt16(), 12).AsUInt32();
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 15).AsUInt32();
            b1 = Sse2.Shuffle(t2, 210);

            t0 = Sse2.UnpackLow(m2, m0);
            t1 = Sse41.Blend(t0.AsUInt16(), m0.AsUInt16(), 240).AsUInt32();
            t2 = Sse2.ShiftLeftLogical128BitLane(m3, 8);
            b2 = Sse41.Blend(t1.AsUInt16(), t2.AsUInt16(), 192).AsUInt32();

            t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 60).AsUInt32();
            t1 = Sse2.ShiftRightLogical128BitLane(m1, 12);
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 3).AsUInt32();
            b3 = Sse2.Shuffle(t2, 57);

            t0 = Sse2.ShiftLeftLogical128BitLane(m3, 4);
            t1 = Sse41.Blend(m0.AsUInt16(), m1.AsUInt16(), 51).AsUInt32();
            t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 192).AsUInt32();
            b4 = Sse2.Shuffle(t2, 108);

            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);

            // ---- Round 4 ----
            t0 = Sse2.UnpackHigh(m0, m1);
            t1 = Sse2.UnpackHigh(t0, m2);
            t2 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 12).AsUInt32();
            b1 = Sse2.Shuffle(t2, 210);

            t0 = Sse2.ShiftLeftLogical128BitLane(m2, 8);
            t1 = Sse41.Blend(m3.AsUInt16(), m0.AsUInt16(), 12).AsUInt32();
            t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 192).AsUInt32();
            b2 = Sse2.Shuffle(t2, 135);

            t0 = Sse41.Blend(m0.AsUInt16(), m1.AsUInt16(), 15).AsUInt32();
            t1 = Sse41.Blend(t0.AsUInt16(), m3.AsUInt16(), 192).AsUInt32();
            b3 = Sse2.Shuffle(t1, 27);

            t0 = Ssse3.AlignRight(m0, m1, 4);
            b4 = Sse41.Blend(t0.AsUInt16(), m2.AsUInt16(), 51).AsUInt32();

            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);

            // ---- Round 5 ----
            t0 = Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32();
            t1 = Sse2.UnpackHigh(m0.AsUInt64(), m2.AsUInt64()).AsUInt32();
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 51).AsUInt32();
            b1 = Sse2.Shuffle(t2, 135);

            t0 = Sse2.UnpackHigh(m1.AsUInt64(), m3.AsUInt64()).AsUInt32();
            t1 = Sse2.UnpackLow(m0.AsUInt64(), m1.AsUInt64()).AsUInt32();
            b2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 51).AsUInt32();

            t0 = Sse2.UnpackHigh(m3.AsUInt64(), m1.AsUInt64()).AsUInt32();
            t1 = Sse2.UnpackHigh(m2.AsUInt64(), m0.AsUInt64()).AsUInt32();
            t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 51).AsUInt32();
            b3 = Sse2.Shuffle(t2, 147);

            t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 3).AsUInt32();
            t1 = Sse2.ShiftLeftLogical128BitLane(t0, 8);
            t2 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 15).AsUInt32();
            b4 = Sse2.Shuffle(t2, 141);

            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);

            // ---- Round 6 ----
            t0 = Sse2.UnpackHigh(m0, m1);
            t1 = Sse2.UnpackLow(m0, m2);
            b1 = Sse2.UnpackLow(t0.AsUInt64(), t1.AsUInt64()).AsUInt32();

            t0 = Sse2.ShiftRightLogical128BitLane(m2, 4);
            t1 = Sse41.Blend(m0.AsUInt16(), m3.AsUInt16(), 3).AsUInt32();
            b2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 60).AsUInt32();

            t0 = Sse41.Blend(m1.AsUInt16(), m0.AsUInt16(), 12).AsUInt32();
            t1 = Sse2.ShiftRightLogical128BitLane(m3, 4);
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 48).AsUInt32();
            b3 = Sse2.Shuffle(t2, 177);

            t0 = Sse2.UnpackLow(m2.AsUInt64(), m1.AsUInt64()).AsUInt32();
            t1 = Sse2.Shuffle(m3, 132);
            t2 = Sse2.ShiftRightLogical128BitLane(t0, 4);
            b4 = Sse41.Blend(t1.AsUInt16(), t2.AsUInt16(), 51).AsUInt32();

            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);

            // ---- Round 7 ----
            t0 = Sse2.ShiftLeftLogical128BitLane(m1, 12);
            t1 = Sse41.Blend(m0.AsUInt16(), m3.AsUInt16(), 51).AsUInt32();
            b1 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 192).AsUInt32();

            t0 = Sse41.Blend(m3.AsUInt16(), m2.AsUInt16(), 48).AsUInt32();
            t1 = Sse2.ShiftRightLogical128BitLane(m1, 4);
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 3).AsUInt32();
            b2 = Sse2.Shuffle(t2, 156);

            t0 = Sse2.UnpackLow(m0.AsUInt64(), m2.AsUInt64()).AsUInt32();
            t1 = Sse2.ShiftRightLogical128BitLane(m1, 4);
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 12).AsUInt32();
            b3 = Sse2.Shuffle(t2, 210);

            t0 = Sse2.UnpackHigh(m1, m2);
            t1 = Sse2.UnpackHigh(m0.AsUInt64(), t0.AsUInt64()).AsUInt32();
            b4 = Sse2.Shuffle(t1, 27);

            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);

            // ---- Round 8 ----
            t0 = Sse2.UnpackHigh(m0, m1);
            t1 = Sse41.Blend(t0.AsUInt16(), m3.AsUInt16(), 15).AsUInt32();
            b1 = Sse2.Shuffle(t1, 141);

            t0 = Sse41.Blend(m2.AsUInt16(), m3.AsUInt16(), 48).AsUInt32();
            t1 = Sse2.ShiftRightLogical128BitLane(m0, 4);
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 3).AsUInt32();
            b2 = Sse2.Shuffle(t2, 75);

            t0 = Sse2.UnpackHigh(m0.AsUInt64(), m3.AsUInt64()).AsUInt32();
            t1 = Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32();
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 60).AsUInt32();
            b3 = Sse2.Shuffle(t2, 180);

            t0 = Sse2.UnpackLow(m0, m1);
            t1 = Sse2.UnpackHigh(m1, m2);
            t2 = Sse2.UnpackLow(t0.AsUInt64(), t1.AsUInt64()).AsUInt32();
            b4 = Sse2.Shuffle(t2, 147);

            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);

            // ---- Round 9 ----
            t0 = Sse2.UnpackHigh(m1, m3);
            t1 = Sse2.UnpackLow(t0.AsUInt64(), m0.AsUInt64()).AsUInt32();
            t2 = Sse41.Blend(t1.AsUInt16(), m2.AsUInt16(), 192).AsUInt32();
            b1 = Sse2.ShuffleHigh(t2.AsUInt16(), 78).AsUInt32();

            t0 = Sse2.UnpackHigh(m0, m3);
            t1 = Sse41.Blend(m2.AsUInt16(), t0.AsUInt16(), 240).AsUInt32();
            b2 = Sse2.Shuffle(t1, 39);

            t0 = Sse2.UnpackLow(m0.AsUInt64(), m3.AsUInt64()).AsUInt32();
            t1 = Sse2.ShiftRightLogical128BitLane(m2, 8);
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 3).AsUInt32();
            b3 = Sse2.Shuffle(t2, 120);

            t0 = Sse41.Blend(m1.AsUInt16(), m0.AsUInt16(), 48).AsUInt32();
            b4 = Sse2.Shuffle(t0, 57);

            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);

            // ---- Round 10 ----
            t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 3).AsUInt32();
            t1 = Sse41.Blend(m1.AsUInt16(), m2.AsUInt16(), 48).AsUInt32();
            t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 15).AsUInt32();
            b1 = Sse2.Shuffle(t2, 114);

            t0 = Sse2.ShiftLeftLogical128BitLane(m0, 4);
            t1 = Sse41.Blend(m1.AsUInt16(), t0.AsUInt16(), 192).AsUInt32();
            b2 = Sse2.Shuffle(t1, 99);

            t0 = Sse2.UnpackHigh(m0, m3);
            t1 = Sse2.UnpackLow(m2, m3);
            t2 = Sse2.UnpackHigh(t0.AsUInt64(), t1.AsUInt64()).AsUInt32();
            b3 = Sse2.Shuffle(t2, 39);

            t0 = Sse41.Blend(m3.AsUInt16(), m2.AsUInt16(), 192).AsUInt32();
            t1 = Sse2.UnpackLow(m0, m3);
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 15).AsUInt32();
            b4 = Sse2.Shuffle(t2, 108);

            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
        }

        /// <summary>
        /// One full round: G1/G2 on the rows as they stand, then the same pair
        /// again between <see cref="Diagonalize"/> and <see cref="Undiagonalize"/>.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void Round(ref Vector128<uint> row1, ref Vector128<uint> row2, ref Vector128<uint> row3, ref Vector128<uint> row4, Vector128<uint> b1, Vector128<uint> b2, Vector128<uint> b3, Vector128<uint> b4)
        {
            // Byte-shuffle control that rotates every 32-bit lane right by 8 bits.
            Vector128<byte> rotate8 = Vector128.Create(
                (byte)1, (byte)2, (byte)3, (byte)0,
                (byte)5, (byte)6, (byte)7, (byte)4,
                (byte)9, (byte)10, (byte)11, (byte)8,
                (byte)13, (byte)14, (byte)15, (byte)12);

            // Byte-shuffle control that rotates every 32-bit lane by 16 bits.
            Vector128<byte> rotate16 = Vector128.Create(
                (byte)2, (byte)3, (byte)0, (byte)1,
                (byte)6, (byte)7, (byte)4, (byte)5,
                (byte)10, (byte)11, (byte)8, (byte)9,
                (byte)14, (byte)15, (byte)12, (byte)13);

            // First half of the round.
            G1(rotate16, ref row1, ref row2, ref row3, ref row4, b1);
            G2(rotate8, ref row1, ref row2, ref row3, ref row4, b2);

            Diagonalize(ref row1, ref row3, ref row4);

            // Second half, on the lane-rotated rows.
            G1(rotate16, ref row1, ref row2, ref row3, ref row4, b3);
            G2(rotate8, ref row1, ref row2, ref row3, ref row4, b4);

            Undiagonalize(ref row1, ref row3, ref row4);
        }

        /// <summary>
        /// Permutes the 32-bit lanes of rows 1, 3 and 4 (row 2 is left alone);
        /// <see cref="Undiagonalize"/> applies the inverse permutations.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void Diagonalize(ref Vector128<uint> row1, ref Vector128<uint> row3, ref Vector128<uint> row4)
        {
            row1 = Sse2.Shuffle(row1, 147);
            row3 = Sse2.Shuffle(row3, 57);
            row4 = Sse2.Shuffle(row4, 78);
        }

        /// <summary>
        /// First half of the mixing step: adds <paramref name="b0"/> and row 2
        /// into row 1, rotates row 4 with the supplied byte-shuffle control, then
        /// rotates row 2 right by 12 bits.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void G1(Vector128<byte> r16, ref Vector128<uint> row1, ref Vector128<uint> row2, ref Vector128<uint> row3, ref Vector128<uint> row4, Vector128<uint> b0)
        {
            row1 = Sse2.Add(row1, b0);
            row1 = Sse2.Add(row1, row2);

            row4 = Ssse3.Shuffle(Sse2.Xor(row4, row1).AsByte(), r16).AsUInt32();

            row3 = Sse2.Add(row3, row4);

            row2 = Sse2.Xor(row2, row3);
            // (x >> 12) ^ (x << 20): the two halves never overlap, so this is a rotate.
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));
        }

        /// <summary>
        /// Second half of the mixing step: same shape as <see cref="G1"/>, but row 4
        /// uses the <paramref name="r8"/> control and row 2 is rotated right by 7 bits.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void G2(Vector128<byte> r8, ref Vector128<uint> row1, ref Vector128<uint> row2, ref Vector128<uint> row3, ref Vector128<uint> row4, Vector128<uint> b0)
        {
            row1 = Sse2.Add(row1, b0);
            row1 = Sse2.Add(row1, row2);

            row4 = Ssse3.Shuffle(Sse2.Xor(row4, row1).AsByte(), r8).AsUInt32();

            row3 = Sse2.Add(row3, row4);

            row2 = Sse2.Xor(row2, row3);
            // (x >> 7) ^ (x << 25): rotate right by 7.
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));
        }

        /// <summary>
        /// Reverses the lane permutations applied by <see cref="Diagonalize"/>.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void Undiagonalize(ref Vector128<uint> row1, ref Vector128<uint> row3, ref Vector128<uint> row4)
        {
            row1 = Sse2.Shuffle(row1, 57);
            row3 = Sse2.Shuffle(row3, 147);
            row4 = Sse2.Shuffle(row4, 78);
        }
    }
}