// Blake2s_X86
using Org.BouncyCastle.Runtime.Intrinsics;
using Org.BouncyCastle.Runtime.Intrinsics.X86;
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace Org.BouncyCastle.Crypto.Digests
{
internal static class Blake2s_X86
{
/// <summary>
/// Gets a value indicating whether this accelerated implementation may be used:
/// true only when the project's SSE4.1 switch reports enabled and the project's
/// <c>Vector.IsPackedLittleEndian</c> check also holds.
/// </summary>
internal static bool IsSupported =>
    // Short-circuit: the endianness check is only consulted when SSE4.1 is enabled.
    Org.BouncyCastle.Runtime.Intrinsics.X86.Sse41.IsEnabled && Vector.IsPackedLittleEndian;
/// <summary>
/// Applies one compression step to the 8-word chaining state in <paramref name="hashBuffer"/>,
/// updating it in place.
/// </summary>
/// <param name="hashBuffer">Chaining state; the first 32 bytes are read and then overwritten.</param>
/// <param name="blakeIV">Initialisation vector; the first 32 bytes are read.</param>
/// <param name="t0">XORed into lane 0 of the second IV vector.</param>
/// <param name="t1">XORed into lane 1 of the second IV vector.</param>
/// <param name="f0">XORed into lane 2 of the second IV vector (lane 3 is XORed with zero).</param>
/// <param name="message">Message block; 64 bytes are consumed by <c>Perform10Rounds</c>.</param>
/// <exception cref="PlatformNotSupportedException">Thrown when <c>IsSupported</c> is false.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void Compress(Span<uint> hashBuffer, ReadOnlySpan<uint> blakeIV, uint t0, uint t1, uint f0, ReadOnlySpan<byte> message)
{
    if (!IsSupported)
    {
        throw new PlatformNotSupportedException("Blake2s_X86");
    }

    Span<byte> stateBytes = MemoryMarshal.AsBytes(hashBuffer);
    ReadOnlySpan<byte> ivBytes = MemoryMarshal.AsBytes(blakeIV);
    Vector128<uint> counterAndFlags = Vector128.Create(t0, t1, f0, 0);

    // Working rows: two from the chaining state, two from the IV.
    Vector128<uint> v0 = MemoryMarshal.Read<Vector128<uint>>(stateBytes);
    Vector128<uint> v1 = MemoryMarshal.Read<Vector128<uint>>(stateBytes.Slice(16));
    Vector128<uint> v2 = MemoryMarshal.Read<Vector128<uint>>(ivBytes);
    Vector128<uint> v3 = System.Runtime.Intrinsics.X86.Sse2.Xor(
        MemoryMarshal.Read<Vector128<uint>>(ivBytes.Slice(16)),
        counterAndFlags);

    // Keep the incoming state so it can be folded back in after the rounds.
    Vector128<uint> savedLow = v0;
    Vector128<uint> savedHigh = v1;

    Perform10Rounds(message, ref v0, ref v1, ref v2, ref v3);

    // new state = old state ^ upper working rows ^ lower working rows
    v0 = System.Runtime.Intrinsics.X86.Sse2.Xor(System.Runtime.Intrinsics.X86.Sse2.Xor(v0, v2), savedLow);
    v1 = System.Runtime.Intrinsics.X86.Sse2.Xor(System.Runtime.Intrinsics.X86.Sse2.Xor(v1, v3), savedHigh);

    MemoryMarshal.Write(stateBytes, ref v0);
    MemoryMarshal.Write(stateBytes.Slice(16), ref v1);
}
/// <summary>
/// Calls <c>Round</c> ten times on the four working rows. Before each call, the four
/// message vectors (b, b2, b3, b4) handed to <c>Round</c> are assembled from the 64-byte
/// block <paramref name="m"/> using shuffle / blend / unpack / byte-shift intrinsics.
/// </summary>
/// <param name="m">Message block; four 16-byte vectors are read at byte offsets 0, 16, 32 and 48, so at least 64 bytes are required.</param>
/// <param name="row1">First working row, updated in place by each round.</param>
/// <param name="row2">Second working row, updated in place by each round.</param>
/// <param name="row3">Third working row, updated in place by each round.</param>
/// <param name="row4">Fourth working row, updated in place by each round.</param>
/// <remarks>
/// NOTE(review): the per-round immediates presumably encode the BLAKE2s message permutation
/// (sigma) for that round; verify against the reference SSE4.1 implementation before changing any of them.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void Perform10Rounds(ReadOnlySpan<byte> m, ref Vector128<uint> row1, ref Vector128<uint> row2, ref Vector128<uint> row3, ref Vector128<uint> row4)
{
// Load the message block as four 128-bit vectors. These four are never reassigned below;
// vector5, vector6 and value are scratch temporaries reused in every round.
Vector128<uint> vector = MemoryMarshal.Read<Vector128<uint>>(m);
Vector128<uint> vector2 = MemoryMarshal.Read<Vector128<uint>>(m.Slice(16, m.Length - 16));
Vector128<uint> vector3 = MemoryMarshal.Read<Vector128<uint>>(m.Slice(32, m.Length - 32));
Vector128<uint> vector4 = MemoryMarshal.Read<Vector128<uint>>(m.Slice(48, m.Length - 48));
// Round 1: build b, b2, b3, b4, then mix.
Vector128<uint> b = Sse.Shuffle(vector.AsSingle(), vector2.AsSingle(), 136).AsUInt32();
Vector128<uint> b2 = Sse.Shuffle(vector.AsSingle(), vector2.AsSingle(), 221).AsUInt32();
Vector128<uint> vector5 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(vector3, 225);
Vector128<uint> vector6 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(vector4, 30);
Vector128<uint> b3 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector6.AsUInt16(), 195).AsUInt32();
vector5 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector6.AsUInt16(), 60).AsUInt32();
Vector128<uint> b4 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(vector5, 177);
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 2: rebuild b, b2, b3, b4 from the same four message vectors, then mix.
vector5 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector2.AsUInt16(), vector3.AsUInt16(), 12).AsUInt32();
vector6 = System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical128BitLane(vector4, 4);
Vector128<uint> value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector6.AsUInt16(), 240).AsUInt32();
b = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 147);
vector5 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(vector3, 8);
vector6 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector2.AsUInt16(), vector4.AsUInt16(), 192).AsUInt32();
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector6.AsUInt16(), 240).AsUInt32();
b2 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 177);
vector5 = System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical128BitLane(vector2, 4);
vector6 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector3.AsUInt16(), vector5.AsUInt16(), 48).AsUInt32();
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector.AsUInt16(), vector6.AsUInt16(), 240).AsUInt32();
b3 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 198);
vector5 = System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(vector, vector2);
vector6 = System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical128BitLane(vector4, 4);
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector6.AsUInt16(), 12).AsUInt32();
b4 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 198);
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 3.
vector5 = System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(vector3, vector4);
vector6 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector4.AsUInt16(), vector2.AsUInt16(), 12).AsUInt32();
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector6.AsUInt16(), 15).AsUInt32();
b = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 210);
vector5 = System.Runtime.Intrinsics.X86.Sse2.UnpackLow(vector3, vector);
vector6 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector.AsUInt16(), 240).AsUInt32();
value = System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical128BitLane(vector4, 8);
b2 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector6.AsUInt16(), value.AsUInt16(), 192).AsUInt32();
vector5 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector.AsUInt16(), vector3.AsUInt16(), 60).AsUInt32();
vector6 = System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical128BitLane(vector2, 12);
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector6.AsUInt16(), 3).AsUInt32();
b3 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 57);
vector5 = System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical128BitLane(vector4, 4);
vector6 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector.AsUInt16(), vector2.AsUInt16(), 51).AsUInt32();
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector6.AsUInt16(), vector5.AsUInt16(), 192).AsUInt32();
b4 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 108);
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 4.
vector5 = System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(vector, vector2);
vector6 = System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(vector5, vector3);
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector6.AsUInt16(), vector4.AsUInt16(), 12).AsUInt32();
b = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 210);
vector5 = System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical128BitLane(vector3, 8);
vector6 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector4.AsUInt16(), vector.AsUInt16(), 12).AsUInt32();
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector6.AsUInt16(), vector5.AsUInt16(), 192).AsUInt32();
b2 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 135);
vector5 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector.AsUInt16(), vector2.AsUInt16(), 15).AsUInt32();
vector6 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector4.AsUInt16(), 192).AsUInt32();
b3 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(vector6, 27);
vector5 = System.Runtime.Intrinsics.X86.Ssse3.AlignRight(vector, vector2, 4);
b4 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector3.AsUInt16(), 51).AsUInt32();
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 5.
vector5 = System.Runtime.Intrinsics.X86.Sse2.UnpackLow(vector2.AsUInt64(), vector3.AsUInt64()).AsUInt32();
vector6 = System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(vector.AsUInt64(), vector3.AsUInt64()).AsUInt32();
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector6.AsUInt16(), 51).AsUInt32();
b = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 135);
vector5 = System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(vector2.AsUInt64(), vector4.AsUInt64()).AsUInt32();
vector6 = System.Runtime.Intrinsics.X86.Sse2.UnpackLow(vector.AsUInt64(), vector2.AsUInt64()).AsUInt32();
b2 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector6.AsUInt16(), 51).AsUInt32();
vector5 = System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(vector4.AsUInt64(), vector2.AsUInt64()).AsUInt32();
vector6 = System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(vector3.AsUInt64(), vector.AsUInt64()).AsUInt32();
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector6.AsUInt16(), vector5.AsUInt16(), 51).AsUInt32();
b3 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 147);
vector5 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector.AsUInt16(), vector3.AsUInt16(), 3).AsUInt32();
vector6 = System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical128BitLane(vector5, 8);
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector6.AsUInt16(), vector4.AsUInt16(), 15).AsUInt32();
b4 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 141);
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 6.
vector5 = System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(vector, vector2);
vector6 = System.Runtime.Intrinsics.X86.Sse2.UnpackLow(vector, vector3);
b = System.Runtime.Intrinsics.X86.Sse2.UnpackLow(vector5.AsUInt64(), vector6.AsUInt64()).AsUInt32();
vector5 = System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical128BitLane(vector3, 4);
vector6 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector.AsUInt16(), vector4.AsUInt16(), 3).AsUInt32();
b2 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector6.AsUInt16(), vector5.AsUInt16(), 60).AsUInt32();
vector5 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector2.AsUInt16(), vector.AsUInt16(), 12).AsUInt32();
vector6 = System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical128BitLane(vector4, 4);
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector6.AsUInt16(), 48).AsUInt32();
b3 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 177);
vector5 = System.Runtime.Intrinsics.X86.Sse2.UnpackLow(vector3.AsUInt64(), vector2.AsUInt64()).AsUInt32();
vector6 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(vector4, 132);
value = System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical128BitLane(vector5, 4);
b4 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector6.AsUInt16(), value.AsUInt16(), 51).AsUInt32();
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 7.
vector5 = System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical128BitLane(vector2, 12);
vector6 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector.AsUInt16(), vector4.AsUInt16(), 51).AsUInt32();
b = System.Runtime.Intrinsics.X86.Sse41.Blend(vector6.AsUInt16(), vector5.AsUInt16(), 192).AsUInt32();
vector5 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector4.AsUInt16(), vector3.AsUInt16(), 48).AsUInt32();
vector6 = System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical128BitLane(vector2, 4);
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector6.AsUInt16(), 3).AsUInt32();
b2 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 156);
vector5 = System.Runtime.Intrinsics.X86.Sse2.UnpackLow(vector.AsUInt64(), vector3.AsUInt64()).AsUInt32();
vector6 = System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical128BitLane(vector2, 4);
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector6.AsUInt16(), 12).AsUInt32();
b3 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 210);
vector5 = System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(vector2, vector3);
vector6 = System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(vector.AsUInt64(), vector5.AsUInt64()).AsUInt32();
b4 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(vector6, 27);
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 8.
vector5 = System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(vector, vector2);
vector6 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector4.AsUInt16(), 15).AsUInt32();
b = System.Runtime.Intrinsics.X86.Sse2.Shuffle(vector6, 141);
vector5 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector3.AsUInt16(), vector4.AsUInt16(), 48).AsUInt32();
vector6 = System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical128BitLane(vector, 4);
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector6.AsUInt16(), 3).AsUInt32();
b2 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 75);
vector5 = System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(vector.AsUInt64(), vector4.AsUInt64()).AsUInt32();
vector6 = System.Runtime.Intrinsics.X86.Sse2.UnpackLow(vector2.AsUInt64(), vector3.AsUInt64()).AsUInt32();
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector6.AsUInt16(), 60).AsUInt32();
b3 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 180);
vector5 = System.Runtime.Intrinsics.X86.Sse2.UnpackLow(vector, vector2);
vector6 = System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(vector2, vector3);
value = System.Runtime.Intrinsics.X86.Sse2.UnpackLow(vector5.AsUInt64(), vector6.AsUInt64()).AsUInt32();
b4 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 147);
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 9.
vector5 = System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(vector2, vector4);
vector6 = System.Runtime.Intrinsics.X86.Sse2.UnpackLow(vector5.AsUInt64(), vector.AsUInt64()).AsUInt32();
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector6.AsUInt16(), vector3.AsUInt16(), 192).AsUInt32();
b = System.Runtime.Intrinsics.X86.Sse2.ShuffleHigh(value.AsUInt16(), 78).AsUInt32();
vector5 = System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(vector, vector4);
vector6 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector3.AsUInt16(), vector5.AsUInt16(), 240).AsUInt32();
b2 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(vector6, 39);
vector5 = System.Runtime.Intrinsics.X86.Sse2.UnpackLow(vector.AsUInt64(), vector4.AsUInt64()).AsUInt32();
vector6 = System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical128BitLane(vector3, 8);
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector6.AsUInt16(), 3).AsUInt32();
b3 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 120);
vector5 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector2.AsUInt16(), vector.AsUInt16(), 48).AsUInt32();
b4 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(vector5, 57);
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 10 (final).
vector5 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector.AsUInt16(), vector3.AsUInt16(), 3).AsUInt32();
vector6 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector2.AsUInt16(), vector3.AsUInt16(), 48).AsUInt32();
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector6.AsUInt16(), vector5.AsUInt16(), 15).AsUInt32();
b = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 114);
vector5 = System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical128BitLane(vector, 4);
vector6 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector2.AsUInt16(), vector5.AsUInt16(), 192).AsUInt32();
b2 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(vector6, 99);
vector5 = System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(vector, vector4);
vector6 = System.Runtime.Intrinsics.X86.Sse2.UnpackLow(vector3, vector4);
value = System.Runtime.Intrinsics.X86.Sse2.UnpackHigh(vector5.AsUInt64(), vector6.AsUInt64()).AsUInt32();
b3 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 39);
vector5 = System.Runtime.Intrinsics.X86.Sse41.Blend(vector4.AsUInt16(), vector3.AsUInt16(), 192).AsUInt32();
vector6 = System.Runtime.Intrinsics.X86.Sse2.UnpackLow(vector, vector4);
value = System.Runtime.Intrinsics.X86.Sse41.Blend(vector5.AsUInt16(), vector6.AsUInt16(), 15).AsUInt32();
b4 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(value, 108);
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
}
/// <summary>
/// Performs one full round on the four working rows: a G1/G2 pair on the rows as given,
/// then a G1/G2 pair with rows 1, 3 and 4 lane-rotated ("diagonalized"), after which the
/// lane rotation is undone.
/// </summary>
/// <param name="row1">First working row, updated in place.</param>
/// <param name="row2">Second working row, updated in place.</param>
/// <param name="row3">Third working row, updated in place.</param>
/// <param name="row4">Fourth working row, updated in place.</param>
/// <param name="b1">Message vector added in the first G1 step.</param>
/// <param name="b2">Message vector added in the first G2 step.</param>
/// <param name="b3">Message vector added in the second G1 step.</param>
/// <param name="b4">Message vector added in the second G2 step.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void Round(ref Vector128<uint> row1, ref Vector128<uint> row2, ref Vector128<uint> row3, ref Vector128<uint> row4, Vector128<uint> b1, Vector128<uint> b2, Vector128<uint> b3, Vector128<uint> b4)
{
    // Byte-shuffle mask used by G1: within each 4-byte group, output byte i takes input byte (i + 2) mod 4.
    Vector128<byte> shuffleBy2 = Vector128.Create(
        (byte)2, (byte)3, (byte)0, (byte)1,
        (byte)6, (byte)7, (byte)4, (byte)5,
        (byte)10, (byte)11, (byte)8, (byte)9,
        (byte)14, (byte)15, (byte)12, (byte)13);

    // Byte-shuffle mask used by G2: within each 4-byte group, output byte i takes input byte (i + 1) mod 4.
    Vector128<byte> shuffleBy1 = Vector128.Create(
        (byte)1, (byte)2, (byte)3, (byte)0,
        (byte)5, (byte)6, (byte)7, (byte)4,
        (byte)9, (byte)10, (byte)11, (byte)8,
        (byte)13, (byte)14, (byte)15, (byte)12);

    // First half: rows as given.
    G1(shuffleBy2, ref row1, ref row2, ref row3, ref row4, b1);
    G2(shuffleBy1, ref row1, ref row2, ref row3, ref row4, b2);

    // Second half: same mixing with rows 1, 3 and 4 lane-rotated.
    Diagonalize(ref row1, ref row3, ref row4);
    G1(shuffleBy2, ref row1, ref row2, ref row3, ref row4, b3);
    G2(shuffleBy1, ref row1, ref row2, ref row3, ref row4, b4);
    Undiagonalize(ref row1, ref row3, ref row4);
}
/// <summary>
/// Rotates the 32-bit lanes of three rows in place: <paramref name="row1"/> becomes
/// (r3, r0, r1, r2), <paramref name="row3"/> becomes (r1, r2, r3, r0) and
/// <paramref name="row4"/> becomes (r2, r3, r0, r1).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void Diagonalize(ref Vector128<uint> row1, ref Vector128<uint> row3, ref Vector128<uint> row4)
{
    // Shuffle immediates, two bits per destination lane (lane 0 in the low bits).
    const byte TakeLanes3012 = 0x93;
    const byte TakeLanes1230 = 0x39;
    const byte TakeLanes2301 = 0x4E;

    row1 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(row1, TakeLanes3012);
    row3 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(row3, TakeLanes1230);
    row4 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(row4, TakeLanes2301);
}
/// <summary>
/// First half of the mixing step, applied lane-wise to all four columns at once:
/// row1 += b0 + row2; row4 = byteShuffle(row4 ^ row1, r16); row3 += row4;
/// row2 = rotateRight(row2 ^ row3, 12).
/// </summary>
/// <param name="r16">Byte-shuffle mask applied to row4 (Round passes a mask that rotates each 32-bit lane by 16 bits).</param>
/// <param name="row1">First working row, updated in place.</param>
/// <param name="row2">Second working row, updated in place.</param>
/// <param name="row3">Third working row, updated in place.</param>
/// <param name="row4">Fourth working row, updated in place.</param>
/// <param name="b0">Message vector added into row1.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void G1(Vector128<byte> r16, ref Vector128<uint> row1, ref Vector128<uint> row2, ref Vector128<uint> row3, ref Vector128<uint> row4, Vector128<uint> b0)
{
    Vector128<uint> withMessage = System.Runtime.Intrinsics.X86.Sse2.Add(row1, b0);
    row1 = System.Runtime.Intrinsics.X86.Sse2.Add(withMessage, row2);

    Vector128<byte> mixed4 = System.Runtime.Intrinsics.X86.Sse2.Xor(row4, row1).AsByte();
    row4 = System.Runtime.Intrinsics.X86.Ssse3.Shuffle(mixed4, r16).AsUInt32();

    row3 = System.Runtime.Intrinsics.X86.Sse2.Add(row3, row4);

    // Rotate each 32-bit lane right by 12: the two shifted halves share no bits.
    Vector128<uint> mixed2 = System.Runtime.Intrinsics.X86.Sse2.Xor(row2, row3);
    Vector128<uint> lowPart = System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(mixed2, 12);
    Vector128<uint> highPart = System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(mixed2, 20);
    row2 = System.Runtime.Intrinsics.X86.Sse2.Xor(lowPart, highPart);
}
/// <summary>
/// Second half of the mixing step, applied lane-wise to all four columns at once:
/// row1 += b0 + row2; row4 = byteShuffle(row4 ^ row1, r8); row3 += row4;
/// row2 = rotateRight(row2 ^ row3, 7).
/// </summary>
/// <param name="r8">Byte-shuffle mask applied to row4 (Round passes a mask that rotates each 32-bit lane by 8 bits).</param>
/// <param name="row1">First working row, updated in place.</param>
/// <param name="row2">Second working row, updated in place.</param>
/// <param name="row3">Third working row, updated in place.</param>
/// <param name="row4">Fourth working row, updated in place.</param>
/// <param name="b0">Message vector added into row1.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void G2(Vector128<byte> r8, ref Vector128<uint> row1, ref Vector128<uint> row2, ref Vector128<uint> row3, ref Vector128<uint> row4, Vector128<uint> b0)
{
    Vector128<uint> withMessage = System.Runtime.Intrinsics.X86.Sse2.Add(row1, b0);
    row1 = System.Runtime.Intrinsics.X86.Sse2.Add(withMessage, row2);

    Vector128<byte> mixed4 = System.Runtime.Intrinsics.X86.Sse2.Xor(row4, row1).AsByte();
    row4 = System.Runtime.Intrinsics.X86.Ssse3.Shuffle(mixed4, r8).AsUInt32();

    row3 = System.Runtime.Intrinsics.X86.Sse2.Add(row3, row4);

    // Rotate each 32-bit lane right by 7: the two shifted halves share no bits.
    Vector128<uint> mixed2 = System.Runtime.Intrinsics.X86.Sse2.Xor(row2, row3);
    Vector128<uint> lowPart = System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(mixed2, 7);
    Vector128<uint> highPart = System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(mixed2, 25);
    row2 = System.Runtime.Intrinsics.X86.Sse2.Xor(lowPart, highPart);
}
/// <summary>
/// Rotates the 32-bit lanes of three rows in place: <paramref name="row1"/> becomes
/// (r1, r2, r3, r0), <paramref name="row3"/> becomes (r3, r0, r1, r2) and
/// <paramref name="row4"/> becomes (r2, r3, r0, r1). Uses the opposite rotation to
/// Diagonalize for row1 and row3, and the same half-swap for row4.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void Undiagonalize(ref Vector128<uint> row1, ref Vector128<uint> row3, ref Vector128<uint> row4)
{
    // Shuffle immediates, two bits per destination lane (lane 0 in the low bits).
    const byte TakeLanes1230 = 0x39;
    const byte TakeLanes3012 = 0x93;
    const byte TakeLanes2301 = 0x4E;

    row1 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(row1, TakeLanes1230);
    row3 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(row3, TakeLanes3012);
    row4 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(row4, TakeLanes2301);
}
}
}