ChaCha7539Engine
Implementation of Daniel J. Bernstein's ChaCha stream cipher.
using Org.BouncyCastle.Crypto.Utilities;
using Org.BouncyCastle.Runtime.Intrinsics;
using Org.BouncyCastle.Runtime.Intrinsics.X86;
using System;
using System.Buffers.Binary;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace Org.BouncyCastle.Crypto.Engines
{
public class ChaCha7539Engine : Salsa20Engine
{
/// <summary>Name string reported for this engine; also used as the prefix of its error messages.</summary>
public override string AlgorithmName
{
    get { return "ChaCha7539"; }
}
/// <summary>
/// Nonce length expected by this engine: 12 (presumably bytes, matching the three words that
/// SetKey loads from the IV — confirm against the base class).
/// </summary>
protected override int NonceSize
{
    get { return 12; }
}
/// <summary>
/// Increments the block counter held in state word 12.
/// </summary>
/// <exception cref="InvalidOperationException">
/// The increment made the counter wrap around to zero.
/// </exception>
protected override void AdvanceCounter()
{
    engineState[12]++;

    // A value of zero right after the increment means the counter has wrapped.
    if (engineState[12] == 0)
    {
        throw new InvalidOperationException("attempt to increase counter past 2^32.");
    }
}
/// <summary>Sets the block counter (state word 12) back to zero.</summary>
protected override void ResetCounter() => engineState[12] = 0;
/// <summary>
/// Loads key material and the IV into the engine state.
/// </summary>
/// <param name="keyBytes">
/// Key to install, or <c>null</c> to leave the constant and key words of the state untouched.
/// When supplied it must be exactly 32 bytes long.
/// </param>
/// <param name="ivBytes">IV; three words are unpacked from it into state words 13..15.</param>
/// <exception cref="ArgumentException">A key was supplied whose length is not 32 bytes.</exception>
protected override void SetKey(byte[] keyBytes, byte[] ivBytes)
{
    bool hasKey = keyBytes != null;
    if (hasKey)
    {
        int keyLength = keyBytes.Length;
        if (keyLength != 32)
        {
            throw new ArgumentException(AlgorithmName + " requires 256 bit key");
        }

        // State words 0..3: constants chosen by the base-class helper from the key length.
        Salsa20Engine.PackTauOrSigma(keyLength, engineState, 0);

        // State words 4..11: the key, unpacked by the little-endian helper.
        Pack.LE_To_UInt32(keyBytes, 0, engineState, 4, 8);
    }

    // State words 13..15: the IV. Word 12 (the block counter) is not touched here.
    Pack.LE_To_UInt32(ivBytes, 0, engineState, 13, 3);
}
/// <summary>
/// Fills <paramref name="output"/> with keystream derived from the current engine state by
/// delegating to <c>ChaChaEngine.ChachaCore</c>. The counter is not advanced here.
/// </summary>
/// <param name="output">Buffer that receives the keystream block.</param>
protected override void GenerateKeyStream(byte[] output) =>
    ChaChaEngine.ChachaCore(rounds, engineState, output);
/// <summary>
/// XORs <paramref name="inLen"/> bytes of <paramref name="inBuf"/> with keystream into
/// <paramref name="outBuf"/> in one call, then resets the block counter (state word 12) to zero.
/// Data is consumed in 128-byte pairs of blocks first, then at most one 64-byte block, then a
/// final partial block.
/// </summary>
/// <param name="inBuf">Source buffer.</param>
/// <param name="inOff">Offset of the first source byte.</param>
/// <param name="inLen">Number of bytes to process.</param>
/// <param name="outBuf">Destination buffer.</param>
/// <param name="outOff">Offset of the first destination byte.</param>
/// <exception cref="InvalidOperationException">
/// The engine is not initialised, or <c>index</c> is non-zero (not block-aligned).
/// </exception>
/// <remarks>
/// NOTE(review): only the 128-byte path goes through ProcessBlocks2 and its LimitExceeded check;
/// the 64-byte and tail paths call the unchecked helpers directly — confirm this is intended.
/// </remarks>
internal void DoFinal(byte[] inBuf, int inOff, int inLen, byte[] outBuf, int outOff)
{
    if (!initialised)
    {
        throw new InvalidOperationException(AlgorithmName + " not initialised");
    }
    if (index != 0)
    {
        throw new InvalidOperationException(AlgorithmName + " not in block-aligned state");
    }

    Check.DataLength(inBuf, inOff, inLen, "input buffer too short");
    Check.OutputLength(outBuf, outOff, inLen, "output buffer too short");

    // Working cursors so the caller-supplied arguments stay readable as given.
    int remaining = inLen;
    int src = inOff;
    int dst = outOff;

    // Two blocks at a time while at least 128 bytes are left.
    for (; remaining >= 128; remaining -= 128, src += 128, dst += 128)
    {
        ProcessBlocks2(inBuf.AsSpan(src), outBuf.AsSpan(dst));
    }

    // At most one whole 64-byte block can remain.
    if (remaining >= 64)
    {
        ImplProcessBlock(inBuf.AsSpan(src), outBuf.AsSpan(dst));
        remaining -= 64;
        src += 64;
        dst += 64;
    }

    // Partial tail: generate one more keystream block and use only its leading bytes.
    if (remaining > 0)
    {
        GenerateKeyStream(keyStream);
        AdvanceCounter();

        for (int i = 0; i < remaining; i++)
        {
            outBuf[dst + i] = (byte)(inBuf[src + i] ^ keyStream[i]);
        }
    }

    engineState[12] = 0;
}
/// <summary>
/// Checked entry point for a single 64-byte block: verifies the engine is initialised and that
/// the byte limit allows another 64 bytes, then XORs the block via ImplProcessBlock.
/// </summary>
/// <param name="input">Source; the first 64 bytes are read.</param>
/// <param name="output">Destination; the first 64 bytes are written.</param>
/// <exception cref="InvalidOperationException">The engine is not initialised.</exception>
/// <exception cref="MaxBytesExceededException">LimitExceeded(64) reported the limit as reached.</exception>
internal void ProcessBlock(ReadOnlySpan<byte> input, Span<byte> output)
{
    if (!initialised)
    {
        throw new InvalidOperationException(AlgorithmName + " not initialised");
    }

    if (LimitExceeded(64))
    {
        throw new MaxBytesExceededException("2^38 byte limit per IV would be exceeded; Change IV");
    }

    ImplProcessBlock(input, output);
}
/// <summary>
/// Checked entry point for two consecutive 64-byte blocks (128 bytes). Dispatches to the AVX2
/// kernel, else the SSE2 kernel, else two scalar single-block calls.
/// </summary>
/// <param name="input">Source; the first 128 bytes are read.</param>
/// <param name="output">Destination; the first 128 bytes are written.</param>
/// <exception cref="InvalidOperationException">The engine is not initialised.</exception>
/// <exception cref="MaxBytesExceededException">LimitExceeded(128) reported the limit as reached.</exception>
/// <remarks>
/// NOTE(review): the vector kernels bump state word 12 with a plain increment, whereas the scalar
/// path goes through AdvanceCounter and its wrap check — confirm counter wrap is unreachable on the
/// vector paths.
/// </remarks>
internal void ProcessBlocks2(ReadOnlySpan<byte> input, Span<byte> output)
{
    if (!initialised)
    {
        throw new InvalidOperationException(AlgorithmName + " not initialised");
    }

    if (LimitExceeded(128))
    {
        throw new MaxBytesExceededException("2^38 byte limit per IV would be exceeded; Change IV");
    }

    // Widest available vector implementation first.
    if (Org.BouncyCastle.Runtime.Intrinsics.X86.Avx2.IsEnabled)
    {
        ImplProcessBlocks2_X86_Avx2(rounds, engineState, input, output);
        return;
    }

    if (Org.BouncyCastle.Runtime.Intrinsics.X86.Sse2.IsEnabled)
    {
        ImplProcessBlocks2_X86_Sse2(rounds, engineState, input, output);
        return;
    }

    // Scalar fallback: first block, then the block starting 64 bytes in.
    ImplProcessBlock(input, output);
    ImplProcessBlock(input.Slice(64), output.Slice(64));
}
/// <summary>
/// Unchecked single-block worker: generates one keystream block into the <c>keyStream</c> buffer,
/// advances the counter, and XORs 64 bytes of <paramref name="input"/> into
/// <paramref name="output"/>. The counter is advanced before any output byte is written.
/// </summary>
/// <param name="input">Source; bytes 0..63 are read.</param>
/// <param name="output">Destination; bytes 0..63 are written.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal void ImplProcessBlock(ReadOnlySpan<byte> input, Span<byte> output)
{
    ChaChaEngine.ChachaCore(rounds, engineState, keyStream);
    AdvanceCounter();

    int pos = 0;
    while (pos < 64)
    {
        output[pos] = (byte)(keyStream[pos] ^ input[pos]);
        pos++;
    }
}
/// <summary>
/// Produces two consecutive keystream blocks with 256-bit vectors and XORs them with 128 bytes of
/// <paramref name="input"/>, writing 128 bytes to <paramref name="output"/>. Each 128-bit lane of a
/// vector holds one four-word row of one block: the low lane is built from the counter value on
/// entry, the high lane from that value plus one.
/// </summary>
/// <param name="rounds">Round count; the loop consumes two per iteration.</param>
/// <param name="state">State words; indices 0..15 are read and word 12 is incremented twice.</param>
/// <param name="input">Source bytes; the first 128 are read.</param>
/// <param name="output">Destination; the first 128 bytes are written.</param>
/// <exception cref="PlatformNotSupportedException">The project's Avx2.IsEnabled flag is false.</exception>
/// <remarks>
/// NOTE(review): word 12 is advanced with a plain ++ here, so the wrap check in AdvanceCounter is
/// not applied on this path — confirm callers keep the counter from wrapping.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void ImplProcessBlocks2_X86_Avx2(int rounds, uint[] state, ReadOnlySpan<byte> input, Span<byte> output)
{
if (!Org.BouncyCastle.Runtime.Intrinsics.X86.Avx2.IsEnabled)
throw new PlatformNotSupportedException();
// Rows 0..2 (state words 0..11) are shared by both blocks.
Vector128<uint> vector = Load128_UInt32(state.AsSpan());
Vector128<uint> vector2 = Load128_UInt32(state.AsSpan(4));
Vector128<uint> vector3 = Load128_UInt32(state.AsSpan(8));
// Row 3 is captured twice, before and after the first counter increment, giving one
// counter value per block; the second increment leaves the state ready for the next call.
Vector128<uint> lower = Load128_UInt32(state.AsSpan(12));
state[12]++;
Vector128<uint> upper = Load128_UInt32(state.AsSpan(12));
state[12]++;
// vector4..vector7 keep the initial rows for the final feed-forward addition.
Vector256<uint> vector4 = Vector256.Create(vector, vector);
Vector256<uint> vector5 = Vector256.Create(vector2, vector2);
Vector256<uint> vector6 = Vector256.Create(vector3, vector3);
Vector256<uint> vector7 = Vector256.Create(lower, upper);
// Working copies of rows 0..3 (vector8, vector9, left, left2).
Vector256<uint> vector8 = vector4;
Vector256<uint> vector9 = vector5;
Vector256<uint> left = vector6;
Vector256<uint> left2 = vector7;
for (int num = rounds; num > 0; num -= 2) {
// First half: add / xor / rotate-left by 16, 12, 8 and 7 bits. Each rotate is written
// as (x << n) ^ (x >> (32 - n)).
vector8 = System.Runtime.Intrinsics.X86.Avx2.Add(vector8, vector9);
left2 = System.Runtime.Intrinsics.X86.Avx2.Xor(left2, vector8);
left2 = System.Runtime.Intrinsics.X86.Avx2.Xor(System.Runtime.Intrinsics.X86.Avx2.ShiftLeftLogical(left2, 16), System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogical(left2, 16));
left = System.Runtime.Intrinsics.X86.Avx2.Add(left, left2);
vector9 = System.Runtime.Intrinsics.X86.Avx2.Xor(vector9, left);
vector9 = System.Runtime.Intrinsics.X86.Avx2.Xor(System.Runtime.Intrinsics.X86.Avx2.ShiftLeftLogical(vector9, 12), System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogical(vector9, 20));
vector8 = System.Runtime.Intrinsics.X86.Avx2.Add(vector8, vector9);
left2 = System.Runtime.Intrinsics.X86.Avx2.Xor(left2, vector8);
left2 = System.Runtime.Intrinsics.X86.Avx2.Xor(System.Runtime.Intrinsics.X86.Avx2.ShiftLeftLogical(left2, 8), System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogical(left2, 24));
left = System.Runtime.Intrinsics.X86.Avx2.Add(left, left2);
vector9 = System.Runtime.Intrinsics.X86.Avx2.Xor(vector9, left);
vector9 = System.Runtime.Intrinsics.X86.Avx2.Xor(System.Runtime.Intrinsics.X86.Avx2.ShiftLeftLogical(vector9, 7), System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogical(vector9, 25));
// Rotate the words inside each 128-bit lane: 57 (0x39) selects elements 1,2,3,0;
// 78 (0x4E) selects 2,3,0,1; 147 (0x93) selects 3,0,1,2. This re-pairs the words so the
// second half mixes a different grouping (the diagonal step of the ChaCha round pair).
vector9 = System.Runtime.Intrinsics.X86.Avx2.Shuffle(vector9, 57);
left = System.Runtime.Intrinsics.X86.Avx2.Shuffle(left, 78);
left2 = System.Runtime.Intrinsics.X86.Avx2.Shuffle(left2, 147);
// Second half: the same add / xor / rotate sequence on the re-aligned rows.
vector8 = System.Runtime.Intrinsics.X86.Avx2.Add(vector8, vector9);
left2 = System.Runtime.Intrinsics.X86.Avx2.Xor(left2, vector8);
left2 = System.Runtime.Intrinsics.X86.Avx2.Xor(System.Runtime.Intrinsics.X86.Avx2.ShiftLeftLogical(left2, 16), System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogical(left2, 16));
left = System.Runtime.Intrinsics.X86.Avx2.Add(left, left2);
vector9 = System.Runtime.Intrinsics.X86.Avx2.Xor(vector9, left);
vector9 = System.Runtime.Intrinsics.X86.Avx2.Xor(System.Runtime.Intrinsics.X86.Avx2.ShiftLeftLogical(vector9, 12), System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogical(vector9, 20));
vector8 = System.Runtime.Intrinsics.X86.Avx2.Add(vector8, vector9);
left2 = System.Runtime.Intrinsics.X86.Avx2.Xor(left2, vector8);
left2 = System.Runtime.Intrinsics.X86.Avx2.Xor(System.Runtime.Intrinsics.X86.Avx2.ShiftLeftLogical(left2, 8), System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogical(left2, 24));
left = System.Runtime.Intrinsics.X86.Avx2.Add(left, left2);
vector9 = System.Runtime.Intrinsics.X86.Avx2.Xor(vector9, left);
vector9 = System.Runtime.Intrinsics.X86.Avx2.Xor(System.Runtime.Intrinsics.X86.Avx2.ShiftLeftLogical(vector9, 7), System.Runtime.Intrinsics.X86.Avx2.ShiftRightLogical(vector9, 25));
// Inverse lane rotations (147 / 78 / 57) restore the original word alignment.
vector9 = System.Runtime.Intrinsics.X86.Avx2.Shuffle(vector9, 147);
left = System.Runtime.Intrinsics.X86.Avx2.Shuffle(left, 78);
left2 = System.Runtime.Intrinsics.X86.Avx2.Shuffle(left2, 57);
}
// Feed-forward: add the initial rows to the mixed rows.
vector8 = System.Runtime.Intrinsics.X86.Avx2.Add(vector8, vector4);
vector9 = System.Runtime.Intrinsics.X86.Avx2.Add(vector9, vector5);
left = System.Runtime.Intrinsics.X86.Avx2.Add(left, vector6);
left2 = System.Runtime.Intrinsics.X86.Avx2.Add(left2, vector7);
// Regroup from "same row of both blocks" to "consecutive rows of one block": control 32 (0x20)
// joins the two low lanes (first block), control 49 (0x31) joins the two high lanes (second block).
Vector256<byte> left3 = System.Runtime.Intrinsics.X86.Avx2.Permute2x128(vector8, vector9, 32).AsByte();
Vector256<byte> left4 = System.Runtime.Intrinsics.X86.Avx2.Permute2x128(left, left2, 32).AsByte();
Vector256<byte> left5 = System.Runtime.Intrinsics.X86.Avx2.Permute2x128(vector8, vector9, 49).AsByte();
Vector256<byte> left6 = System.Runtime.Intrinsics.X86.Avx2.Permute2x128(left, left2, 49).AsByte();
// XOR the keystream with input bytes 0..31, 32..63, 64..95 and 96..127.
left3 = System.Runtime.Intrinsics.X86.Avx2.Xor(left3, Load256_Byte(input));
left4 = System.Runtime.Intrinsics.X86.Avx2.Xor(left4, Load256_Byte(input.Slice(32, input.Length - 32)));
left5 = System.Runtime.Intrinsics.X86.Avx2.Xor(left5, Load256_Byte(input.Slice(64, input.Length - 64)));
left6 = System.Runtime.Intrinsics.X86.Avx2.Xor(left6, Load256_Byte(input.Slice(96, input.Length - 96)));
// Write the four 32-byte results at the matching output offsets.
Store256_Byte(left3, output);
Store256_Byte(left4, output.Slice(32, output.Length - 32));
Store256_Byte(left5, output.Slice(64, output.Length - 64));
Store256_Byte(left6, output.Slice(96, output.Length - 96));
}
/// <summary>
/// Produces two consecutive keystream blocks with 128-bit vectors, one block after the other, and
/// XORs them with 128 bytes of <paramref name="input"/>, writing 128 bytes to
/// <paramref name="output"/>. The first block's 64 output bytes are stored before the second
/// block is computed.
/// </summary>
/// <param name="rounds">Round count; each loop consumes two per iteration.</param>
/// <param name="state">State words; indices 0..15 are read and word 12 is incremented twice.</param>
/// <param name="input">Source bytes; the first 128 are read.</param>
/// <param name="output">Destination; the first 128 bytes are written.</param>
/// <exception cref="PlatformNotSupportedException">The project's Sse2.IsEnabled flag is false.</exception>
/// <remarks>
/// NOTE(review): word 12 is advanced with a plain ++ here, so the wrap check in AdvanceCounter is
/// not applied on this path — confirm callers keep the counter from wrapping.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void ImplProcessBlocks2_X86_Sse2(int rounds, uint[] state, ReadOnlySpan<byte> input, Span<byte> output)
{
if (!Org.BouncyCastle.Runtime.Intrinsics.X86.Sse2.IsEnabled)
throw new PlatformNotSupportedException();
// Initial rows 0..3 (state words 0..15); rows 0..2 are reused unchanged for the second block.
Vector128<uint> vector = Load128_UInt32(state.AsSpan());
Vector128<uint> vector2 = Load128_UInt32(state.AsSpan(4));
Vector128<uint> vector3 = Load128_UInt32(state.AsSpan(8));
Vector128<uint> vector4 = Load128_UInt32(state.AsSpan(12));
// Counter advanced once the first block's row 3 has been captured.
state[12]++;
// Working copies of rows 0..3 (vector5, vector6, left, left2).
Vector128<uint> vector5 = vector;
Vector128<uint> vector6 = vector2;
Vector128<uint> left = vector3;
Vector128<uint> left2 = vector4;
// ---- First block ----
for (int num = rounds; num > 0; num -= 2) {
// First half: add / xor / rotate-left by 16, 12, 8 and 7 bits. Each rotate is written
// as (x << n) ^ (x >> (32 - n)).
vector5 = System.Runtime.Intrinsics.X86.Sse2.Add(vector5, vector6);
left2 = System.Runtime.Intrinsics.X86.Sse2.Xor(left2, vector5);
left2 = System.Runtime.Intrinsics.X86.Sse2.Xor(System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(left2, 16), System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(left2, 16));
left = System.Runtime.Intrinsics.X86.Sse2.Add(left, left2);
vector6 = System.Runtime.Intrinsics.X86.Sse2.Xor(vector6, left);
vector6 = System.Runtime.Intrinsics.X86.Sse2.Xor(System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(vector6, 12), System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(vector6, 20));
vector5 = System.Runtime.Intrinsics.X86.Sse2.Add(vector5, vector6);
left2 = System.Runtime.Intrinsics.X86.Sse2.Xor(left2, vector5);
left2 = System.Runtime.Intrinsics.X86.Sse2.Xor(System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(left2, 8), System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(left2, 24));
left = System.Runtime.Intrinsics.X86.Sse2.Add(left, left2);
vector6 = System.Runtime.Intrinsics.X86.Sse2.Xor(vector6, left);
vector6 = System.Runtime.Intrinsics.X86.Sse2.Xor(System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(vector6, 7), System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(vector6, 25));
// Rotate the words of rows 1..3: 57 (0x39) selects elements 1,2,3,0; 78 (0x4E) selects
// 2,3,0,1; 147 (0x93) selects 3,0,1,2. This re-pairs the words so the second half mixes a
// different grouping (the diagonal step of the ChaCha round pair).
vector6 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(vector6, 57);
left = System.Runtime.Intrinsics.X86.Sse2.Shuffle(left, 78);
left2 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(left2, 147);
// Second half: the same add / xor / rotate sequence on the re-aligned rows.
vector5 = System.Runtime.Intrinsics.X86.Sse2.Add(vector5, vector6);
left2 = System.Runtime.Intrinsics.X86.Sse2.Xor(left2, vector5);
left2 = System.Runtime.Intrinsics.X86.Sse2.Xor(System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(left2, 16), System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(left2, 16));
left = System.Runtime.Intrinsics.X86.Sse2.Add(left, left2);
vector6 = System.Runtime.Intrinsics.X86.Sse2.Xor(vector6, left);
vector6 = System.Runtime.Intrinsics.X86.Sse2.Xor(System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(vector6, 12), System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(vector6, 20));
vector5 = System.Runtime.Intrinsics.X86.Sse2.Add(vector5, vector6);
left2 = System.Runtime.Intrinsics.X86.Sse2.Xor(left2, vector5);
left2 = System.Runtime.Intrinsics.X86.Sse2.Xor(System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(left2, 8), System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(left2, 24));
left = System.Runtime.Intrinsics.X86.Sse2.Add(left, left2);
vector6 = System.Runtime.Intrinsics.X86.Sse2.Xor(vector6, left);
vector6 = System.Runtime.Intrinsics.X86.Sse2.Xor(System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(vector6, 7), System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(vector6, 25));
// Inverse rotations (147 / 78 / 57) restore the original word alignment.
vector6 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(vector6, 147);
left = System.Runtime.Intrinsics.X86.Sse2.Shuffle(left, 78);
left2 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(left2, 57);
}
// Feed-forward: add the initial rows to the mixed rows.
vector5 = System.Runtime.Intrinsics.X86.Sse2.Add(vector5, vector);
vector6 = System.Runtime.Intrinsics.X86.Sse2.Add(vector6, vector2);
left = System.Runtime.Intrinsics.X86.Sse2.Add(left, vector3);
left2 = System.Runtime.Intrinsics.X86.Sse2.Add(left2, vector4);
// XOR with input bytes 0..63 (four 16-byte chunks) and store to output bytes 0..63.
Vector128<byte> left3 = Load128_Byte(input);
Vector128<byte> left4 = Load128_Byte(input.Slice(16, input.Length - 16));
Vector128<byte> left5 = Load128_Byte(input.Slice(32, input.Length - 32));
Vector128<byte> left6 = Load128_Byte(input.Slice(48, input.Length - 48));
left3 = System.Runtime.Intrinsics.X86.Sse2.Xor(left3, vector5.AsByte());
Vector128<byte> s = System.Runtime.Intrinsics.X86.Sse2.Xor(left4, vector6.AsByte());
left5 = System.Runtime.Intrinsics.X86.Sse2.Xor(left5, left.AsByte());
left6 = System.Runtime.Intrinsics.X86.Sse2.Xor(left6, left2.AsByte());
Store128_Byte(left3, output);
Store128_Byte(s, output.Slice(16, output.Length - 16));
Store128_Byte(left5, output.Slice(32, output.Length - 32));
Store128_Byte(left6, output.Slice(48, output.Length - 48));
// ---- Second block ----
// Reload row 3 so it carries the incremented counter, then advance the counter again so the
// state is ready for the next call.
vector4 = Load128_UInt32(state.AsSpan(12));
state[12]++;
vector5 = vector;
vector6 = vector2;
left = vector3;
left2 = vector4;
for (int num2 = rounds; num2 > 0; num2 -= 2) {
// Same round pair as in the first block.
vector5 = System.Runtime.Intrinsics.X86.Sse2.Add(vector5, vector6);
left2 = System.Runtime.Intrinsics.X86.Sse2.Xor(left2, vector5);
left2 = System.Runtime.Intrinsics.X86.Sse2.Xor(System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(left2, 16), System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(left2, 16));
left = System.Runtime.Intrinsics.X86.Sse2.Add(left, left2);
vector6 = System.Runtime.Intrinsics.X86.Sse2.Xor(vector6, left);
vector6 = System.Runtime.Intrinsics.X86.Sse2.Xor(System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(vector6, 12), System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(vector6, 20));
vector5 = System.Runtime.Intrinsics.X86.Sse2.Add(vector5, vector6);
left2 = System.Runtime.Intrinsics.X86.Sse2.Xor(left2, vector5);
left2 = System.Runtime.Intrinsics.X86.Sse2.Xor(System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(left2, 8), System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(left2, 24));
left = System.Runtime.Intrinsics.X86.Sse2.Add(left, left2);
vector6 = System.Runtime.Intrinsics.X86.Sse2.Xor(vector6, left);
vector6 = System.Runtime.Intrinsics.X86.Sse2.Xor(System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(vector6, 7), System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(vector6, 25));
vector6 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(vector6, 57);
left = System.Runtime.Intrinsics.X86.Sse2.Shuffle(left, 78);
left2 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(left2, 147);
vector5 = System.Runtime.Intrinsics.X86.Sse2.Add(vector5, vector6);
left2 = System.Runtime.Intrinsics.X86.Sse2.Xor(left2, vector5);
left2 = System.Runtime.Intrinsics.X86.Sse2.Xor(System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(left2, 16), System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(left2, 16));
left = System.Runtime.Intrinsics.X86.Sse2.Add(left, left2);
vector6 = System.Runtime.Intrinsics.X86.Sse2.Xor(vector6, left);
vector6 = System.Runtime.Intrinsics.X86.Sse2.Xor(System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(vector6, 12), System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(vector6, 20));
vector5 = System.Runtime.Intrinsics.X86.Sse2.Add(vector5, vector6);
left2 = System.Runtime.Intrinsics.X86.Sse2.Xor(left2, vector5);
left2 = System.Runtime.Intrinsics.X86.Sse2.Xor(System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(left2, 8), System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(left2, 24));
left = System.Runtime.Intrinsics.X86.Sse2.Add(left, left2);
vector6 = System.Runtime.Intrinsics.X86.Sse2.Xor(vector6, left);
vector6 = System.Runtime.Intrinsics.X86.Sse2.Xor(System.Runtime.Intrinsics.X86.Sse2.ShiftLeftLogical(vector6, 7), System.Runtime.Intrinsics.X86.Sse2.ShiftRightLogical(vector6, 25));
vector6 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(vector6, 147);
left = System.Runtime.Intrinsics.X86.Sse2.Shuffle(left, 78);
left2 = System.Runtime.Intrinsics.X86.Sse2.Shuffle(left2, 57);
}
// Feed-forward for the second block (vector4 now holds the second block's row 3).
vector5 = System.Runtime.Intrinsics.X86.Sse2.Add(vector5, vector);
vector6 = System.Runtime.Intrinsics.X86.Sse2.Add(vector6, vector2);
left = System.Runtime.Intrinsics.X86.Sse2.Add(left, vector3);
left2 = System.Runtime.Intrinsics.X86.Sse2.Add(left2, vector4);
// XOR with input bytes 64..127 and store to output bytes 64..127.
left3 = Load128_Byte(input.Slice(64, input.Length - 64));
Vector128<byte> left7 = Load128_Byte(input.Slice(80, input.Length - 80));
left5 = Load128_Byte(input.Slice(96, input.Length - 96));
left6 = Load128_Byte(input.Slice(112, input.Length - 112));
left3 = System.Runtime.Intrinsics.X86.Sse2.Xor(left3, vector5.AsByte());
Vector128<byte> s2 = System.Runtime.Intrinsics.X86.Sse2.Xor(left7, vector6.AsByte());
left5 = System.Runtime.Intrinsics.X86.Sse2.Xor(left5, left.AsByte());
left6 = System.Runtime.Intrinsics.X86.Sse2.Xor(left6, left2.AsByte());
Store128_Byte(left3, output.Slice(64, output.Length - 64));
Store128_Byte(s2, output.Slice(80, output.Length - 80));
Store128_Byte(left5, output.Slice(96, output.Length - 96));
Store128_Byte(left6, output.Slice(112, output.Length - 112));
}
/// <summary>
/// Reads the first 16 bytes of <paramref name="t"/> into a 128-bit byte vector.
/// </summary>
/// <param name="t">Source span; at least 16 bytes are needed.</param>
/// <returns>The vector built from those bytes.</returns>
/// <remarks>
/// Takes a direct memory read when the project's <c>Vector.IsPackedLittleEndian</c> flag is set
/// (presumably meaning vector memory layout matches little-endian order — confirm); otherwise
/// assembles the vector from two little-endian 64-bit reads.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> Load128_Byte(ReadOnlySpan<byte> t)
{
    if (!Vector.IsPackedLittleEndian)
    {
        ulong low = BinaryPrimitives.ReadUInt64LittleEndian(t.Slice(0, 8));
        ulong high = BinaryPrimitives.ReadUInt64LittleEndian(t.Slice(8));
        return Vector128.Create(low, high).AsByte();
    }

    return MemoryMarshal.Read<Vector128<byte>>(t);
}
/// <summary>
/// Reads the first four 32-bit words of <paramref name="t"/> into a 128-bit vector.
/// </summary>
/// <param name="t">Source span; at least four elements are needed.</param>
/// <returns>A vector whose elements 0..3 are <c>t[0]</c>..<c>t[3]</c> on the element-wise path.</returns>
/// <remarks>
/// Reinterprets the span as bytes and reads it directly when the project's
/// <c>Vector.IsPackedLittleEndian</c> flag is set; otherwise builds the vector element by element.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<uint> Load128_UInt32(ReadOnlySpan<uint> t)
{
    if (!Vector.IsPackedLittleEndian)
    {
        return Vector128.Create(t[0], t[1], t[2], t[3]);
    }

    ReadOnlySpan<byte> raw = MemoryMarshal.AsBytes(t);
    return MemoryMarshal.Read<Vector128<uint>>(raw);
}
/// <summary>
/// Reads the first 32 bytes of <paramref name="t"/> into a 256-bit byte vector.
/// </summary>
/// <param name="t">Source span; at least 32 bytes are needed.</param>
/// <returns>The vector built from those bytes.</returns>
/// <remarks>
/// Takes a direct memory read when the project's <c>Vector.IsPackedLittleEndian</c> flag is set;
/// otherwise assembles the vector from four little-endian 64-bit reads at offsets 0, 8, 16 and 24.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<byte> Load256_Byte(ReadOnlySpan<byte> t)
{
    if (!Vector.IsPackedLittleEndian)
    {
        ulong q0 = BinaryPrimitives.ReadUInt64LittleEndian(t.Slice(0, 8));
        ulong q1 = BinaryPrimitives.ReadUInt64LittleEndian(t.Slice(8, 8));
        ulong q2 = BinaryPrimitives.ReadUInt64LittleEndian(t.Slice(16, 8));
        ulong q3 = BinaryPrimitives.ReadUInt64LittleEndian(t.Slice(24, 8));
        return Vector256.Create(q0, q1, q2, q3).AsByte();
    }

    return MemoryMarshal.Read<Vector256<byte>>(t);
}
/// <summary>
/// Writes the 16 bytes of <paramref name="s"/> to the start of <paramref name="t"/>.
/// </summary>
/// <param name="s">Vector to store.</param>
/// <param name="t">Destination span; at least 16 bytes are needed.</param>
/// <remarks>
/// Uses a direct memory write when the project's <c>Vector.IsPackedLittleEndian</c> flag is set;
/// otherwise writes the two 64-bit elements as little-endian values at offsets 0 and 8.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void Store128_Byte(Vector128<byte> s, Span<byte> t)
{
    if (!Vector.IsPackedLittleEndian)
    {
        Vector128<ulong> quads = s.AsUInt64();
        BinaryPrimitives.WriteUInt64LittleEndian(t.Slice(0, 8), quads.GetElement(0));
        BinaryPrimitives.WriteUInt64LittleEndian(t.Slice(8), quads.GetElement(1));
        return;
    }

    MemoryMarshal.Write(t, ref s);
}
/// <summary>
/// Writes the 32 bytes of <paramref name="s"/> to the start of <paramref name="t"/>.
/// </summary>
/// <param name="s">Vector to store.</param>
/// <param name="t">Destination span; at least 32 bytes are needed.</param>
/// <remarks>
/// Uses a direct memory write when the project's <c>Vector.IsPackedLittleEndian</c> flag is set;
/// otherwise writes the four 64-bit elements as little-endian values at offsets 0, 8, 16 and 24.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void Store256_Byte(Vector256<byte> s, Span<byte> t)
{
    if (!Vector.IsPackedLittleEndian)
    {
        Vector256<ulong> quads = s.AsUInt64();
        BinaryPrimitives.WriteUInt64LittleEndian(t.Slice(0, 8), quads.GetElement(0));
        BinaryPrimitives.WriteUInt64LittleEndian(t.Slice(8, 8), quads.GetElement(1));
        BinaryPrimitives.WriteUInt64LittleEndian(t.Slice(16, 8), quads.GetElement(2));
        BinaryPrimitives.WriteUInt64LittleEndian(t.Slice(24, 8), quads.GetElement(3));
        return;
    }

    MemoryMarshal.Write(t, ref s);
}
}
}