// Blake2b_X86
using Org.BouncyCastle.Runtime.Intrinsics;
using Org.BouncyCastle.Runtime.Intrinsics.X86;
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace Org.BouncyCastle.Crypto.Digests
{
internal static class Blake2b_X86
{
/// <summary>
/// Gets a value indicating whether the AVX2 code path of this class may be used.
/// True only when the library's <c>Avx2.IsEnabled</c> switch is on and
/// <c>Vector.IsPackedLittleEndian</c> also reports true; the second check is
/// skipped when AVX2 is not enabled.
/// </summary>
internal static bool IsSupported =>
    Org.BouncyCastle.Runtime.Intrinsics.X86.Avx2.IsEnabled && Vector.IsPackedLittleEndian;
/// <summary>
/// Runs one compression step over <paramref name="message"/> and folds the result
/// back into <paramref name="hashBuffer"/> in place.
/// </summary>
/// <param name="hashBuffer">Chaining state; its first 64 bytes (eight words) are read and overwritten.</param>
/// <param name="blakeIV">Initialisation vector; its first 64 bytes (eight words) are read.</param>
/// <param name="t0">First word XORed into the upper half of the IV.</param>
/// <param name="t1">Second word XORed into the upper half of the IV.</param>
/// <param name="f0">Third word XORed into the upper half of the IV (the fourth word is XORed with zero).</param>
/// <param name="message">Message block handed unchanged to <c>Perform12Rounds</c>.</param>
/// <exception cref="PlatformNotSupportedException">Thrown when <see cref="IsSupported"/> is false.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void Compress(Span<ulong> hashBuffer, ReadOnlySpan<ulong> blakeIV, ulong t0, ulong t1, ulong f0, ReadOnlySpan<byte> message)
{
    if (!IsSupported)
    {
        throw new PlatformNotSupportedException("Blake2b_X86");
    }

    Span<byte> stateBytes = MemoryMarshal.AsBytes(hashBuffer);
    ReadOnlySpan<byte> ivBytes = MemoryMarshal.AsBytes(blakeIV);

    // (t0, t1, f0, 0) is mixed into the last working row only.
    Vector256<ulong> counterAndFlag = Vector256.Create(t0, t1, f0, 0UL);

    // Working rows: two from the chaining state, two from the IV.
    Vector256<ulong> a = MemoryMarshal.Read<Vector256<ulong>>(stateBytes);
    Vector256<ulong> b = MemoryMarshal.Read<Vector256<ulong>>(stateBytes.Slice(32));
    Vector256<ulong> c = MemoryMarshal.Read<Vector256<ulong>>(ivBytes);
    Vector256<ulong> d = System.Runtime.Intrinsics.X86.Avx2.Xor(
        MemoryMarshal.Read<Vector256<ulong>>(ivBytes.Slice(32)),
        counterAndFlag);

    // Keep the incoming state for the final feed-forward.
    Vector256<ulong> savedLower = a;
    Vector256<ulong> savedUpper = b;

    Perform12Rounds(message, ref a, ref b, ref c, ref d);

    // new state = old state ^ row1/2 ^ row3/4
    a = System.Runtime.Intrinsics.X86.Avx2.Xor(
        System.Runtime.Intrinsics.X86.Avx2.Xor(a, c),
        savedLower);
    b = System.Runtime.Intrinsics.X86.Avx2.Xor(
        System.Runtime.Intrinsics.X86.Avx2.Xor(b, d),
        savedUpper);

    MemoryMarshal.Write(stateBytes, ref a);
    MemoryMarshal.Write(stateBytes.Slice(32), ref b);
}
/// <summary>
/// Applies twelve consecutive <c>Round</c> calls to the four working rows, building
/// the four message operand vectors for each round from <paramref name="m"/>.
/// </summary>
/// <remarks>
/// The message is read as eight 16-byte chunks at byte offsets 0, 16, ..., 112, so
/// <paramref name="m"/> must hold at least 128 bytes (shorter spans make the
/// slice/read calls throw). Every chunk is duplicated into both 128-bit halves of a
/// 256-bit vector, which lets each operand be assembled as
/// "lower half of one in-lane shuffle, upper half of another" via a Blend with
/// mask 240 (upper four 32-bit elements taken from the second argument).
/// NOTE(review): the per-round word selection presumably follows the BLAKE2b
/// message schedule (sigma) - confirm against the specification.
/// </remarks>
/// <param name="m">Message block bytes.</param>
/// <param name="row1">First working row, updated in place.</param>
/// <param name="row2">Second working row, updated in place.</param>
/// <param name="row3">Third working row, updated in place.</param>
/// <param name="row4">Fourth working row, updated in place.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void Perform12Rounds(ReadOnlySpan<byte> m, ref Vector256<ulong> row1, ref Vector256<ulong> row2, ref Vector256<ulong> row3, ref Vector256<ulong> row4)
{
// Message bytes 0..63: four 16-byte chunks, each broadcast to both 128-bit lanes.
Vector256<ulong> vector = Broadcast128ToVector256<ulong>(m);
Vector256<ulong> vector2 = Broadcast128ToVector256<ulong>(m.Slice(16, m.Length - 16));
Vector256<ulong> vector3 = Broadcast128ToVector256<ulong>(m.Slice(32, m.Length - 32));
Vector256<ulong> vector4 = Broadcast128ToVector256<ulong>(m.Slice(48, m.Length - 48));
// Round 1. In every round, vector6 is a scratch value holding the shuffle that
// supplies the upper 128 bits of the operand being built.
Vector256<ulong> vector5 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector, vector2);
Vector256<ulong> vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector3, vector4);
Vector256<ulong> b = System.Runtime.Intrinsics.X86.Avx2.Blend(vector5.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector7 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector, vector2);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector3, vector4);
Vector256<ulong> b2 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector7.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
// Message bytes 64..127: the remaining four 16-byte chunks, broadcast the same way.
Vector256<ulong> vector8 = Broadcast128ToVector256<ulong>(m.Slice(64, m.Length - 64));
Vector256<ulong> vector9 = Broadcast128ToVector256<ulong>(m.Slice(80, m.Length - 80));
Vector256<ulong> vector10 = Broadcast128ToVector256<ulong>(m.Slice(96, m.Length - 96));
Vector256<ulong> vector11 = Broadcast128ToVector256<ulong>(m.Slice(112, m.Length - 112));
Vector256<ulong> vector12 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector11, vector8);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector9, vector10);
Vector256<ulong> b3 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector12.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector13 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector11, vector8);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector9, vector10);
Vector256<ulong> b4 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector13.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 2. Blend mask 204 keeps the low 64 bits of each lane from the first
// argument and takes the high 64 bits from the second; AlignRight(x, y, 8)
// yields, per lane, (high word of y, low word of x).
Vector256<ulong> vector14 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector11, vector3);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector8, vector10);
b = System.Runtime.Intrinsics.X86.Avx2.Blend(vector14.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector15 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector9, vector8);
vector6 = System.Runtime.Intrinsics.X86.Avx2.AlignRight(vector4, vector11, 8);
b2 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector15.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector16 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector3, vector);
vector6 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector.AsUInt32(), vector9.AsUInt32(), 204).AsUInt64();
b3 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector16.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector17 = System.Runtime.Intrinsics.X86.Avx2.AlignRight(vector10, vector2, 8);
vector6 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector2.AsUInt32(), vector4.AsUInt32(), 204).AsUInt64();
b4 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector17.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 3.
Vector256<ulong> vector18 = System.Runtime.Intrinsics.X86.Avx2.AlignRight(vector10, vector9, 8);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector3, vector11);
b = System.Runtime.Intrinsics.X86.Avx2.Blend(vector18.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector19 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector8, vector);
vector6 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector2.AsUInt32(), vector10.AsUInt32(), 204).AsUInt64();
b2 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector19.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector20 = System.Runtime.Intrinsics.X86.Avx2.AlignRight(vector9, vector8, 8);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector2, vector4);
b3 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector20.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector21 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector3, vector11);
vector6 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector4.AsUInt32(), vector.AsUInt32(), 204).AsUInt64();
b4 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector21.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 4. Shuffle(x.AsUInt32(), 78) swaps the two 64-bit words inside each lane.
Vector256<ulong> vector22 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector4, vector2);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector10, vector9);
b = System.Runtime.Intrinsics.X86.Avx2.Blend(vector22.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector23 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector8, vector);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector10, vector11);
b2 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector23.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector24 = System.Runtime.Intrinsics.X86.Avx2.AlignRight(vector2, vector11, 8);
vector6 = System.Runtime.Intrinsics.X86.Avx2.Shuffle(vector3.AsUInt32(), 78).AsUInt64();
b3 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector24.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector25 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector8, vector4);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector9, vector);
b4 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector25.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 5.
Vector256<ulong> vector26 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector8, vector3);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector2, vector9);
b = System.Runtime.Intrinsics.X86.Avx2.Blend(vector26.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector27 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector.AsUInt32(), vector4.AsUInt32(), 204).AsUInt64();
vector6 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector3.AsUInt32(), vector11.AsUInt32(), 204).AsUInt64();
b2 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector27.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector28 = System.Runtime.Intrinsics.X86.Avx2.AlignRight(vector11, vector2, 8);
vector6 = System.Runtime.Intrinsics.X86.Avx2.AlignRight(vector4, vector9, 8);
b3 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector28.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector29 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector10, vector);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector10, vector8);
b4 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector29.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 6.
Vector256<ulong> vector30 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector2, vector4);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector, vector8);
b = System.Runtime.Intrinsics.X86.Avx2.Blend(vector30.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector31 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector10, vector9);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector9, vector2);
b2 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector31.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector32 = System.Runtime.Intrinsics.X86.Avx2.AlignRight(vector3, vector, 8);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector4, vector11);
b3 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector32.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector33 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector8, vector10);
vector6 = System.Runtime.Intrinsics.X86.Avx2.AlignRight(vector11, vector3, 8);
b4 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector33.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 7.
Vector256<ulong> vector34 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector10.AsUInt32(), vector.AsUInt32(), 204).AsUInt64();
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector11, vector3);
b = System.Runtime.Intrinsics.X86.Avx2.Blend(vector34.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector35 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector3, vector11);
vector6 = System.Runtime.Intrinsics.X86.Avx2.AlignRight(vector9, vector10, 8);
b2 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector35.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector36 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector8, vector);
vector6 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector4.AsUInt32(), vector8.AsUInt32(), 204).AsUInt64();
b3 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector36.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector37 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector9, vector4);
vector6 = System.Runtime.Intrinsics.X86.Avx2.Shuffle(vector2.AsUInt32(), 78).AsUInt64();
b4 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector37.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 8.
Vector256<ulong> vector38 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector10, vector4);
vector6 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector10.AsUInt32(), vector2.AsUInt32(), 204).AsUInt64();
b = System.Runtime.Intrinsics.X86.Avx2.Blend(vector38.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector39 = System.Runtime.Intrinsics.X86.Avx2.AlignRight(vector11, vector9, 8);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector, vector8);
b2 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector39.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector40 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector2.AsUInt32(), vector3.AsUInt32(), 204).AsUInt64();
vector6 = System.Runtime.Intrinsics.X86.Avx2.AlignRight(vector8, vector11, 8);
b3 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector40.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector41 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector9, vector);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector3, vector4);
b4 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector41.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 9.
Vector256<ulong> vector42 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector4, vector11);
vector6 = System.Runtime.Intrinsics.X86.Avx2.AlignRight(vector, vector9, 8);
b = System.Runtime.Intrinsics.X86.Avx2.Blend(vector42.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector43 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector11, vector8);
vector6 = System.Runtime.Intrinsics.X86.Avx2.AlignRight(vector8, vector2, 8);
b2 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector43.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector44 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector9, vector10);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector10, vector);
b3 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector44.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector45 = System.Runtime.Intrinsics.X86.Avx2.AlignRight(vector2, vector3, 8);
vector6 = System.Runtime.Intrinsics.X86.Avx2.AlignRight(vector3, vector4, 8);
b4 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector45.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 10.
Vector256<ulong> vector46 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector9, vector8);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector4, vector);
b = System.Runtime.Intrinsics.X86.Avx2.Blend(vector46.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector47 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector2, vector3);
vector6 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector4.AsUInt32(), vector3.AsUInt32(), 204).AsUInt64();
b2 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector47.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector48 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector10, vector11);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector8, vector2);
b3 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector48.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector49 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector.AsUInt32(), vector9.AsUInt32(), 204).AsUInt64();
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector11, vector10);
b4 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector49.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 11: rebuilds exactly the same four operands as round 1.
Vector256<ulong> vector50 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector, vector2);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector3, vector4);
b = System.Runtime.Intrinsics.X86.Avx2.Blend(vector50.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector51 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector, vector2);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector3, vector4);
b2 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector51.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector52 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector11, vector8);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector9, vector10);
b3 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector52.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector53 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector11, vector8);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector9, vector10);
b4 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector53.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
// Round 12: rebuilds exactly the same four operands as round 2.
Vector256<ulong> vector54 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector11, vector3);
vector6 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector8, vector10);
b = System.Runtime.Intrinsics.X86.Avx2.Blend(vector54.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector55 = System.Runtime.Intrinsics.X86.Avx2.UnpackLow(vector9, vector8);
vector6 = System.Runtime.Intrinsics.X86.Avx2.AlignRight(vector4, vector11, 8);
b2 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector55.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector56 = System.Runtime.Intrinsics.X86.Avx2.UnpackHigh(vector3, vector);
vector6 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector.AsUInt32(), vector9.AsUInt32(), 204).AsUInt64();
b3 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector56.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Vector256<ulong> vector57 = System.Runtime.Intrinsics.X86.Avx2.AlignRight(vector10, vector2, 8);
vector6 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector2.AsUInt32(), vector4.AsUInt32(), 204).AsUInt64();
b4 = System.Runtime.Intrinsics.X86.Avx2.Blend(vector57.AsUInt32(), vector6.AsUInt32(), 240).AsUInt64();
Round(ref row1, ref row2, ref row3, ref row4, b, b2, b3, b4);
}
/// <summary>
/// Performs one full round on the four working rows: a G1/G2 pair on the rows as
/// given, then the same pair again between <c>Diagonalize</c> and
/// <c>Undiagonalize</c>.
/// </summary>
/// <param name="row1">First working row, updated in place.</param>
/// <param name="row2">Second working row, updated in place.</param>
/// <param name="row3">Third working row, updated in place.</param>
/// <param name="row4">Fourth working row, updated in place.</param>
/// <param name="b1">Message operand for the first G1 step.</param>
/// <param name="b2">Message operand for the first G2 step.</param>
/// <param name="b3">Message operand for the G1 step after diagonalisation.</param>
/// <param name="b4">Message operand for the G2 step after diagonalisation.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void Round(ref Vector256<ulong> row1, ref Vector256<ulong> row2, ref Vector256<ulong> row3, ref Vector256<ulong> row4, Vector256<ulong> b1, Vector256<ulong> b2, Vector256<ulong> b3, Vector256<ulong> b4)
{
    // Byte-shuffle control: within every 64-bit word, output byte i comes from
    // input byte (i + 3) mod 8, i.e. a right rotation by 24 bits.
    Vector256<byte> rotate24Mask = Vector256.Create(
        (byte)3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10,
        3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);

    // Same idea with an offset of two bytes: a right rotation by 16 bits.
    Vector256<byte> rotate16Mask = Vector256.Create(
        (byte)2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9,
        2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);

    // First half: mix the rows as they are laid out.
    G1(rotate24Mask, ref row1, ref row2, ref row3, ref row4, b1);
    G2(rotate16Mask, ref row1, ref row2, ref row3, ref row4, b2);

    // Second half: mix again on the rotated layout, then restore it.
    Diagonalize(ref row1, ref row3, ref row4);
    G1(rotate24Mask, ref row1, ref row2, ref row3, ref row4, b3);
    G2(rotate16Mask, ref row1, ref row2, ref row3, ref row4, b4);
    Undiagonalize(ref row1, ref row3, ref row4);
}
/// <summary>
/// Rotates the 64-bit elements of rows 1, 3 and 4 by different amounts; the second
/// row is not an argument and is left untouched. <c>Undiagonalize</c> applies the
/// inverse rotations.
/// </summary>
/// <param name="row1">Elements (e0,e1,e2,e3) become (e3,e0,e1,e2).</param>
/// <param name="row3">Elements (e0,e1,e2,e3) become (e1,e2,e3,e0).</param>
/// <param name="row4">Elements (e0,e1,e2,e3) become (e2,e3,e0,e1).</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void Diagonalize(ref Vector256<ulong> row1, ref Vector256<ulong> row3, ref Vector256<ulong> row4)
{
    // Permute4x64 control bytes: two bits per destination element select the source element.
    const byte SelectE3E0E1E2 = 0x93;
    const byte SelectE1E2E3E0 = 0x39;
    const byte SelectE2E3E0E1 = 0x4E;

    row1 = System.Runtime.Intrinsics.X86.Avx2.Permute4x64(row1, SelectE3E0E1E2);
    row3 = System.Runtime.Intrinsics.X86.Avx2.Permute4x64(row3, SelectE1E2E3E0);
    row4 = System.Runtime.Intrinsics.X86.Avx2.Permute4x64(row4, SelectE2E3E0E1);
}
/// <summary>
/// First half of the mixing step, applied to all four 64-bit columns at once:
/// row1 += b0 + row2; row4 = (row4 ^ row1) with its 32-bit halves swapped;
/// row3 += row4; row2 = (row2 ^ row3) byte-shuffled by <paramref name="r24"/>.
/// </summary>
/// <param name="r24">Byte-shuffle control applied to row2 at the end of the step.</param>
/// <param name="row1">First working row, updated in place.</param>
/// <param name="row2">Second working row, updated in place.</param>
/// <param name="row3">Third working row, updated in place.</param>
/// <param name="row4">Fourth working row, updated in place.</param>
/// <param name="b0">Message operand added into row1.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void G1(Vector256<byte> r24, ref Vector256<ulong> row1, ref Vector256<ulong> row2, ref Vector256<ulong> row3, ref Vector256<ulong> row4, Vector256<ulong> b0)
{
    Vector256<ulong> withMessage = System.Runtime.Intrinsics.X86.Avx2.Add(row1, b0);
    row1 = System.Runtime.Intrinsics.X86.Avx2.Add(withMessage, row2);

    // Control 0xB1 swaps each adjacent pair of 32-bit elements, which rotates
    // every 64-bit word by 32 bits.
    Vector256<uint> mixed4 = System.Runtime.Intrinsics.X86.Avx2.Xor(row4, row1).AsUInt32();
    row4 = System.Runtime.Intrinsics.X86.Avx2.Shuffle(mixed4, 0xB1).AsUInt64();

    row3 = System.Runtime.Intrinsics.X86.Avx2.Add(row3, row4);

    Vector256<byte> mixed2 = System.Runtime.Intrinsics.X86.Avx2.Xor(row2, row3).AsByte();
    row2 = System.Runtime.Intrinsics.X86.Avx2.Shuffle(mixed2, r24).AsUInt64();
}
/// <summary>
/// Second half of the mixing step, applied to all four 64-bit columns at once:
/// row1 += b0 + row2; row4 = (row4 ^ row1) byte-shuffled by <paramref name="r16"/>;
/// row3 += row4; row2 = (row2 ^ row3) rotated left by one bit.
/// </summary>
/// <param name="r16">Byte-shuffle control applied to row4.</param>
/// <param name="row1">First working row, updated in place.</param>
/// <param name="row2">Second working row, updated in place.</param>
/// <param name="row3">Third working row, updated in place.</param>
/// <param name="row4">Fourth working row, updated in place.</param>
/// <param name="b0">Message operand added into row1.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void G2(Vector256<byte> r16, ref Vector256<ulong> row1, ref Vector256<ulong> row2, ref Vector256<ulong> row3, ref Vector256<ulong> row4, Vector256<ulong> b0)
{
    Vector256<ulong> withMessage = System.Runtime.Intrinsics.X86.Avx2.Add(row1, b0);
    row1 = System.Runtime.Intrinsics.X86.Avx2.Add(withMessage, row2);

    Vector256<byte> mixed4 = System.Runtime.Intrinsics.X86.Avx2.Xor(row4, row1).AsByte();
    row4 = System.Runtime.Intrinsics.X86.Avx2.Shuffle(mixed4, r16).AsUInt64();

    row3 = System.Runtime.Intrinsics.X86.Avx2.Add(row3, row4);
    row2 = System.Runtime.Intrinsics.X86.Avx2.Xor(row2, row3);

    // Rotate left by one: (x >> 63) supplies the wrapped-around top bit and
    // (x + x) is x << 1; the two never overlap, so XOR combines them.
    Vector256<ulong> wrappedTopBit = System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogical(row2, 63);
    Vector256<ulong> doubled = System.Runtime.Intrinsics.X86.Avx2.Add(row2, row2);
    row2 = System.Runtime.Intrinsics.X86.Avx2.Xor(wrappedTopBit, doubled);
}
/// <summary>
/// Reverses the element rotations applied by <c>Diagonalize</c> to rows 1, 3 and 4.
/// </summary>
/// <param name="row1">Elements (e0,e1,e2,e3) become (e1,e2,e3,e0).</param>
/// <param name="row3">Elements (e0,e1,e2,e3) become (e3,e0,e1,e2).</param>
/// <param name="row4">Elements (e0,e1,e2,e3) become (e2,e3,e0,e1).</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void Undiagonalize(ref Vector256<ulong> row1, ref Vector256<ulong> row3, ref Vector256<ulong> row4)
{
    // Permute4x64 control bytes: two bits per destination element select the source element.
    const byte SelectE1E2E3E0 = 0x39;
    const byte SelectE3E0E1E2 = 0x93;
    const byte SelectE2E3E0E1 = 0x4E;

    row1 = System.Runtime.Intrinsics.X86.Avx2.Permute4x64(row1, SelectE1E2E3E0);
    row3 = System.Runtime.Intrinsics.X86.Avx2.Permute4x64(row3, SelectE3E0E1E2);
    row4 = System.Runtime.Intrinsics.X86.Avx2.Permute4x64(row4, SelectE2E3E0E1);
}
/// <summary>
/// Reads one 128-bit vector from the start of <paramref name="source"/> and returns
/// a 256-bit vector whose lower and upper halves both hold that value.
/// </summary>
/// <typeparam name="T">Element type of the vectors.</typeparam>
/// <param name="source">Bytes to read from; only the first 16 are used.</param>
/// <returns>The 128-bit value duplicated into both halves.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<T> Broadcast128ToVector256<T>(ReadOnlySpan<byte> source) where T : struct
{
    Vector128<T> lane = MemoryMarshal.Read<Vector128<T>>(source);

    // The upper half is undefined after the unsafe widening, so it is filled in right away.
    Vector256<T> widened = lane.ToVector256Unsafe();
    return widened.WithUpper(lane);
}
}
}