<PackageReference Include="BouncyCastle.Cryptography" Version="2.5.0" />

ChaCha7539Engine

Implementation of Daniel J. Bernstein's ChaCha stream cipher in the RFC 7539 variant: 256-bit key, 96-bit (12-byte) nonce and a 32-bit block counter.
using Org.BouncyCastle.Crypto.Utilities;
using Org.BouncyCastle.Runtime.Intrinsics;
using Org.BouncyCastle.Runtime.Intrinsics.X86;
using System;
using System.Buffers.Binary;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
// Both imported X86 namespaces declare Avx2/Sse2 types. These aliases name the
// System.Runtime.Intrinsics instruction classes; the Org.BouncyCastle capability
// probes are always written fully qualified below.
using SysAvx2 = System.Runtime.Intrinsics.X86.Avx2;
using SysSse2 = System.Runtime.Intrinsics.X86.Sse2;

namespace Org.BouncyCastle.Crypto.Engines
{
    /// <summary>
    /// ChaCha stream cipher engine using a 12-byte nonce (state words 13..15) and a single
    /// 32-bit block counter (state word 12). Keys must be exactly 32 bytes.
    /// </summary>
    public class ChaCha7539Engine : Salsa20Engine
    {
        /// <summary>Algorithm name reported by this engine.</summary>
        public override string AlgorithmName
        {
            get { return "ChaCha7539"; }
        }

        /// <summary>Required nonce length in bytes.</summary>
        protected override int NonceSize
        {
            get { return 12; }
        }

        /// <summary>Increments the block counter held in state word 12.</summary>
        /// <exception cref="InvalidOperationException">The counter wrapped around to zero.</exception>
        protected override void AdvanceCounter()
        {
            uint next = engineState[12] + 1;
            engineState[12] = next;
            if (next == 0)
            {
                throw new InvalidOperationException("attempt to increase counter past 2^32.");
            }
        }

        /// <summary>Sets the block counter (state word 12) back to zero.</summary>
        protected override void ResetCounter() => engineState[12] = 0;

        /// <summary>
        /// Loads the key (when supplied) into state words 0..11 and the nonce into state words 13..15.
        /// </summary>
        /// <param name="keyBytes">32-byte key, or <c>null</c> to leave the key-dependent words untouched.</param>
        /// <param name="ivBytes">Nonce bytes; three little-endian 32-bit words are read from offset 0.</param>
        /// <exception cref="ArgumentException">A key was supplied whose length is not 32 bytes.</exception>
        protected override void SetKey(byte[] keyBytes, byte[] ivBytes)
        {
            if (keyBytes != null)
            {
                if (keyBytes.Length != 32)
                {
                    throw new ArgumentException(AlgorithmName + " requires 256 bit key");
                }

                // Words 0..3 receive the constant, words 4..11 the key material.
                Salsa20Engine.PackTauOrSigma(keyBytes.Length, engineState, 0);
                Pack.LE_To_UInt32(keyBytes, 0, engineState, 4, 8);
            }

            // Word 12 is the counter; the nonce occupies the three words after it.
            Pack.LE_To_UInt32(ivBytes, 0, engineState, 13, 3);
        }

        /// <summary>Produces one block of key stream from the current state.</summary>
        /// <param name="output">Destination buffer handed to the ChaCha core.</param>
        protected override void GenerateKeyStream(byte[] output) =>
            ChaChaEngine.ChachaCore(rounds, engineState, output);

        /// <summary>
        /// XORs the remaining input with key stream in 128-byte, then 64-byte, then partial-block
        /// steps, and finally sets the block counter back to zero.
        /// </summary>
        /// <param name="inBuf">Source buffer.</param>
        /// <param name="inOff">Offset of the first source byte.</param>
        /// <param name="inLen">Number of bytes to process.</param>
        /// <param name="outBuf">Destination buffer.</param>
        /// <param name="outOff">Offset of the first destination byte.</param>
        /// <exception cref="InvalidOperationException">
        /// The engine is not initialised, or a partially consumed key stream block is pending.
        /// </exception>
        internal void DoFinal(byte[] inBuf, int inOff, int inLen, byte[] outBuf, int outOff)
        {
            if (!initialised)
            {
                throw new InvalidOperationException(AlgorithmName + " not initialised");
            }
            if (index != 0)
            {
                throw new InvalidOperationException(AlgorithmName + " not in block-aligned state");
            }

            Check.DataLength(inBuf, inOff, inLen, "input buffer too short");
            Check.OutputLength(outBuf, outOff, inLen, "output buffer too short");

            int srcPos = inOff;
            int dstPos = outOff;
            int remaining = inLen;

            // Two blocks at a time while at least 128 bytes are left.
            for (; remaining >= 128; remaining -= 128)
            {
                ProcessBlocks2(inBuf.AsSpan(srcPos), outBuf.AsSpan(dstPos));
                srcPos += 128;
                dstPos += 128;
            }

            // At most one further whole block can remain.
            if (remaining >= 64)
            {
                ImplProcessBlock(inBuf.AsSpan(srcPos), outBuf.AsSpan(dstPos));
                srcPos += 64;
                dstPos += 64;
                remaining -= 64;
            }

            // Trailing partial block: generate a full key stream block, use only its prefix.
            if (remaining > 0)
            {
                GenerateKeyStream(keyStream);
                AdvanceCounter();

                for (int i = 0; i < remaining; i++)
                {
                    outBuf[dstPos + i] = (byte)(inBuf[srcPos + i] ^ keyStream[i]);
                }
            }

            engineState[12] = 0;
        }

        /// <summary>XORs one 64-byte block of <paramref name="input"/> into <paramref name="output"/>.</summary>
        /// <exception cref="InvalidOperationException">The engine is not initialised.</exception>
        /// <exception cref="MaxBytesExceededException">The inherited byte-limit check failed.</exception>
        internal void ProcessBlock(ReadOnlySpan<byte> input, Span<byte> output)
        {
            if (!initialised)
            {
                throw new InvalidOperationException(AlgorithmName + " not initialised");
            }
            if (LimitExceeded(64))
            {
                throw new MaxBytesExceededException("2^38 byte limit per IV would be exceeded; Change IV");
            }

            ImplProcessBlock(input, output);
        }

        /// <summary>
        /// XORs two consecutive 64-byte blocks (128 bytes) of <paramref name="input"/> into
        /// <paramref name="output"/>, using the AVX2 or SSE2 implementation when the corresponding
        /// capability flag is enabled and the scalar block routine otherwise.
        /// </summary>
        /// <exception cref="InvalidOperationException">The engine is not initialised.</exception>
        /// <exception cref="MaxBytesExceededException">The inherited byte-limit check failed.</exception>
        internal void ProcessBlocks2(ReadOnlySpan<byte> input, Span<byte> output)
        {
            if (!initialised)
            {
                throw new InvalidOperationException(AlgorithmName + " not initialised");
            }
            if (LimitExceeded(128))
            {
                throw new MaxBytesExceededException("2^38 byte limit per IV would be exceeded; Change IV");
            }

            if (Org.BouncyCastle.Runtime.Intrinsics.X86.Avx2.IsEnabled)
            {
                ImplProcessBlocks2_X86_Avx2(rounds, engineState, input, output);
                return;
            }

            if (Org.BouncyCastle.Runtime.Intrinsics.X86.Sse2.IsEnabled)
            {
                ImplProcessBlocks2_X86_Sse2(rounds, engineState, input, output);
                return;
            }

            // Scalar fallback: one block after the other.
            ImplProcessBlock(input, output);
            ImplProcessBlock(input.Slice(64), output.Slice(64));
        }

        /// <summary>
        /// Scalar single-block routine: fills the key stream buffer, advances the counter and XORs
        /// 64 bytes. Performs no initialisation or limit checks.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal void ImplProcessBlock(ReadOnlySpan<byte> input, Span<byte> output)
        {
            ChaChaEngine.ChachaCore(rounds, engineState, keyStream);
            AdvanceCounter();

            int pos = 0;
            while (pos < 64)
            {
                output[pos] = (byte)(keyStream[pos] ^ input[pos]);
                pos++;
            }
        }

        /// <summary>
        /// AVX2 two-block routine. Each 256-bit register holds one state row for two blocks: the
        /// low 128-bit lane belongs to the first block and the high lane to the second.
        /// <c>state[12]</c> is incremented twice, without an overflow check.
        /// </summary>
        /// <param name="rounds">Round count; the loop executes two rounds per iteration.</param>
        /// <param name="state">16-word cipher state; only word 12 is modified.</param>
        /// <param name="input">At least 128 bytes of source data.</param>
        /// <param name="output">At least 128 bytes of destination space.</param>
        /// <exception cref="PlatformNotSupportedException">The AVX2 capability flag is not enabled.</exception>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static void ImplProcessBlocks2_X86_Avx2(int rounds, uint[] state, ReadOnlySpan<byte> input, Span<byte> output)
        {
            if (!Org.BouncyCastle.Runtime.Intrinsics.X86.Avx2.IsEnabled)
            {
                throw new PlatformNotSupportedException();
            }

            Vector128<uint> row0 = Load128_UInt32(state.AsSpan());
            Vector128<uint> row1 = Load128_UInt32(state.AsSpan(4));
            Vector128<uint> row2 = Load128_UInt32(state.AsSpan(8));

            // Row 3 carries the counter, so it is sampled once per block.
            Vector128<uint> row3First = Load128_UInt32(state.AsSpan(12));
            state[12]++;
            Vector128<uint> row3Second = Load128_UInt32(state.AsSpan(12));
            state[12]++;

            Vector256<uint> init0 = Vector256.Create(row0, row0);
            Vector256<uint> init1 = Vector256.Create(row1, row1);
            Vector256<uint> init2 = Vector256.Create(row2, row2);
            Vector256<uint> init3 = Vector256.Create(row3First, row3Second);

            Vector256<uint> v0 = init0;
            Vector256<uint> v1 = init1;
            Vector256<uint> v2 = init2;
            Vector256<uint> v3 = init3;

            int pending = rounds;
            while (pending > 0)
            {
                // Column round (rotations by 16, 12, 8 and 7 bits).
                v0 = SysAvx2.Add(v0, v1);
                v3 = SysAvx2.Xor(v3, v0);
                v3 = SysAvx2.Xor(SysAvx2.ShiftLeftLogical(v3, 16), SysAvx2.ShiftRightLogical(v3, 16));
                v2 = SysAvx2.Add(v2, v3);
                v1 = SysAvx2.Xor(v1, v2);
                v1 = SysAvx2.Xor(SysAvx2.ShiftLeftLogical(v1, 12), SysAvx2.ShiftRightLogical(v1, 20));
                v0 = SysAvx2.Add(v0, v1);
                v3 = SysAvx2.Xor(v3, v0);
                v3 = SysAvx2.Xor(SysAvx2.ShiftLeftLogical(v3, 8), SysAvx2.ShiftRightLogical(v3, 24));
                v2 = SysAvx2.Add(v2, v3);
                v1 = SysAvx2.Xor(v1, v2);
                v1 = SysAvx2.Xor(SysAvx2.ShiftLeftLogical(v1, 7), SysAvx2.ShiftRightLogical(v1, 25));

                // Rotate rows 1..3 by one, two and three elements so the diagonals become columns.
                v1 = SysAvx2.Shuffle(v1, 0x39);
                v2 = SysAvx2.Shuffle(v2, 0x4E);
                v3 = SysAvx2.Shuffle(v3, 0x93);

                // Diagonal round (same operations on the realigned rows).
                v0 = SysAvx2.Add(v0, v1);
                v3 = SysAvx2.Xor(v3, v0);
                v3 = SysAvx2.Xor(SysAvx2.ShiftLeftLogical(v3, 16), SysAvx2.ShiftRightLogical(v3, 16));
                v2 = SysAvx2.Add(v2, v3);
                v1 = SysAvx2.Xor(v1, v2);
                v1 = SysAvx2.Xor(SysAvx2.ShiftLeftLogical(v1, 12), SysAvx2.ShiftRightLogical(v1, 20));
                v0 = SysAvx2.Add(v0, v1);
                v3 = SysAvx2.Xor(v3, v0);
                v3 = SysAvx2.Xor(SysAvx2.ShiftLeftLogical(v3, 8), SysAvx2.ShiftRightLogical(v3, 24));
                v2 = SysAvx2.Add(v2, v3);
                v1 = SysAvx2.Xor(v1, v2);
                v1 = SysAvx2.Xor(SysAvx2.ShiftLeftLogical(v1, 7), SysAvx2.ShiftRightLogical(v1, 25));

                // Undo the row rotation.
                v1 = SysAvx2.Shuffle(v1, 0x93);
                v2 = SysAvx2.Shuffle(v2, 0x4E);
                v3 = SysAvx2.Shuffle(v3, 0x39);

                pending -= 2;
            }

            // Feed-forward: add the initial state to the permuted state.
            v0 = SysAvx2.Add(v0, init0);
            v1 = SysAvx2.Add(v1, init1);
            v2 = SysAvx2.Add(v2, init2);
            v3 = SysAvx2.Add(v3, init3);

            // 0x20 pairs the low lanes (first block), 0x31 pairs the high lanes (second block).
            Vector256<byte> firstLo = SysAvx2.Permute2x128(v0, v1, 0x20).AsByte();
            Vector256<byte> firstHi = SysAvx2.Permute2x128(v2, v3, 0x20).AsByte();
            Vector256<byte> secondLo = SysAvx2.Permute2x128(v0, v1, 0x31).AsByte();
            Vector256<byte> secondHi = SysAvx2.Permute2x128(v2, v3, 0x31).AsByte();

            firstLo = SysAvx2.Xor(firstLo, Load256_Byte(input));
            firstHi = SysAvx2.Xor(firstHi, Load256_Byte(input.Slice(32)));
            secondLo = SysAvx2.Xor(secondLo, Load256_Byte(input.Slice(64)));
            secondHi = SysAvx2.Xor(secondHi, Load256_Byte(input.Slice(96)));

            Store256_Byte(firstLo, output);
            Store256_Byte(firstHi, output.Slice(32));
            Store256_Byte(secondLo, output.Slice(64));
            Store256_Byte(secondHi, output.Slice(96));
        }

        /// <summary>
        /// SSE2 two-block routine: processes the first 64 bytes, then the next 64 bytes, one block
        /// at a time. <c>state[12]</c> is incremented once per block, without an overflow check.
        /// </summary>
        /// <param name="rounds">Round count; the loop executes two rounds per iteration.</param>
        /// <param name="state">16-word cipher state; only word 12 is modified.</param>
        /// <param name="input">At least 128 bytes of source data.</param>
        /// <param name="output">At least 128 bytes of destination space.</param>
        /// <exception cref="PlatformNotSupportedException">The SSE2 capability flag is not enabled.</exception>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static void ImplProcessBlocks2_X86_Sse2(int rounds, uint[] state, ReadOnlySpan<byte> input, Span<byte> output)
        {
            if (!Org.BouncyCastle.Runtime.Intrinsics.X86.Sse2.IsEnabled)
            {
                throw new PlatformNotSupportedException();
            }

            // Rows 0..2 do not change between the two blocks, so they are loaded once.
            Vector128<uint> row0 = Load128_UInt32(state.AsSpan());
            Vector128<uint> row1 = Load128_UInt32(state.AsSpan(4));
            Vector128<uint> row2 = Load128_UInt32(state.AsSpan(8));

            ImplXorBlock_X86_Sse2(rounds, state, row0, row1, row2, input, output);
            ImplXorBlock_X86_Sse2(rounds, state, row0, row1, row2, input.Slice(64), output.Slice(64));
        }

        /// <summary>
        /// SSE2 single-block step: samples row 3 from <c>state[12..15]</c>, increments
        /// <c>state[12]</c>, runs the rounds and XORs 64 bytes of <paramref name="input"/> into
        /// <paramref name="output"/>.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void ImplXorBlock_X86_Sse2(int rounds, uint[] state, Vector128<uint> row0, Vector128<uint> row1,
            Vector128<uint> row2, ReadOnlySpan<byte> input, Span<byte> output)
        {
            Vector128<uint> row3 = Load128_UInt32(state.AsSpan(12));
            state[12]++;

            Vector128<uint> v0 = row0;
            Vector128<uint> v1 = row1;
            Vector128<uint> v2 = row2;
            Vector128<uint> v3 = row3;

            int pending = rounds;
            while (pending > 0)
            {
                // Column round (rotations by 16, 12, 8 and 7 bits).
                v0 = SysSse2.Add(v0, v1);
                v3 = SysSse2.Xor(v3, v0);
                v3 = SysSse2.Xor(SysSse2.ShiftLeftLogical(v3, 16), SysSse2.ShiftRightLogical(v3, 16));
                v2 = SysSse2.Add(v2, v3);
                v1 = SysSse2.Xor(v1, v2);
                v1 = SysSse2.Xor(SysSse2.ShiftLeftLogical(v1, 12), SysSse2.ShiftRightLogical(v1, 20));
                v0 = SysSse2.Add(v0, v1);
                v3 = SysSse2.Xor(v3, v0);
                v3 = SysSse2.Xor(SysSse2.ShiftLeftLogical(v3, 8), SysSse2.ShiftRightLogical(v3, 24));
                v2 = SysSse2.Add(v2, v3);
                v1 = SysSse2.Xor(v1, v2);
                v1 = SysSse2.Xor(SysSse2.ShiftLeftLogical(v1, 7), SysSse2.ShiftRightLogical(v1, 25));

                // Rotate rows 1..3 by one, two and three elements so the diagonals become columns.
                v1 = SysSse2.Shuffle(v1, 0x39);
                v2 = SysSse2.Shuffle(v2, 0x4E);
                v3 = SysSse2.Shuffle(v3, 0x93);

                // Diagonal round (same operations on the realigned rows).
                v0 = SysSse2.Add(v0, v1);
                v3 = SysSse2.Xor(v3, v0);
                v3 = SysSse2.Xor(SysSse2.ShiftLeftLogical(v3, 16), SysSse2.ShiftRightLogical(v3, 16));
                v2 = SysSse2.Add(v2, v3);
                v1 = SysSse2.Xor(v1, v2);
                v1 = SysSse2.Xor(SysSse2.ShiftLeftLogical(v1, 12), SysSse2.ShiftRightLogical(v1, 20));
                v0 = SysSse2.Add(v0, v1);
                v3 = SysSse2.Xor(v3, v0);
                v3 = SysSse2.Xor(SysSse2.ShiftLeftLogical(v3, 8), SysSse2.ShiftRightLogical(v3, 24));
                v2 = SysSse2.Add(v2, v3);
                v1 = SysSse2.Xor(v1, v2);
                v1 = SysSse2.Xor(SysSse2.ShiftLeftLogical(v1, 7), SysSse2.ShiftRightLogical(v1, 25));

                // Undo the row rotation.
                v1 = SysSse2.Shuffle(v1, 0x93);
                v2 = SysSse2.Shuffle(v2, 0x4E);
                v3 = SysSse2.Shuffle(v3, 0x39);

                pending -= 2;
            }

            // Feed-forward: add the initial state to the permuted state.
            v0 = SysSse2.Add(v0, row0);
            v1 = SysSse2.Add(v1, row1);
            v2 = SysSse2.Add(v2, row2);
            v3 = SysSse2.Add(v3, row3);

            // All four input chunks are read before anything is written.
            Vector128<byte> chunk0 = Load128_Byte(input);
            Vector128<byte> chunk1 = Load128_Byte(input.Slice(16));
            Vector128<byte> chunk2 = Load128_Byte(input.Slice(32));
            Vector128<byte> chunk3 = Load128_Byte(input.Slice(48));

            chunk0 = SysSse2.Xor(chunk0, v0.AsByte());
            chunk1 = SysSse2.Xor(chunk1, v1.AsByte());
            chunk2 = SysSse2.Xor(chunk2, v2.AsByte());
            chunk3 = SysSse2.Xor(chunk3, v3.AsByte());

            Store128_Byte(chunk0, output);
            Store128_Byte(chunk1, output.Slice(16));
            Store128_Byte(chunk2, output.Slice(32));
            Store128_Byte(chunk3, output.Slice(48));
        }

        /// <summary>Reads 16 bytes from the start of <paramref name="t"/> as a byte vector.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static Vector128<byte> Load128_Byte(ReadOnlySpan<byte> t)
        {
            if (!Vector.IsPackedLittleEndian)
            {
                // Portable path: assemble from two little-endian 64-bit halves.
                ulong lo = BinaryPrimitives.ReadUInt64LittleEndian(t.Slice(0, 8));
                ulong hi = BinaryPrimitives.ReadUInt64LittleEndian(t.Slice(8));
                return Vector128.Create(lo, hi).AsByte();
            }

            return MemoryMarshal.Read<Vector128<byte>>(t);
        }

        /// <summary>Reads the first four words of <paramref name="t"/> as a 32-bit lane vector.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static Vector128<uint> Load128_UInt32(ReadOnlySpan<uint> t)
        {
            if (!Vector.IsPackedLittleEndian)
            {
                return Vector128.Create(t[0], t[1], t[2], t[3]);
            }

            return MemoryMarshal.Read<Vector128<uint>>(MemoryMarshal.AsBytes(t));
        }

        /// <summary>Reads 32 bytes from the start of <paramref name="t"/> as a byte vector.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static Vector256<byte> Load256_Byte(ReadOnlySpan<byte> t)
        {
            if (!Vector.IsPackedLittleEndian)
            {
                // Portable path: assemble from four little-endian 64-bit quarters.
                ulong q0 = BinaryPrimitives.ReadUInt64LittleEndian(t.Slice(0, 8));
                ulong q1 = BinaryPrimitives.ReadUInt64LittleEndian(t.Slice(8, 8));
                ulong q2 = BinaryPrimitives.ReadUInt64LittleEndian(t.Slice(16, 8));
                ulong q3 = BinaryPrimitives.ReadUInt64LittleEndian(t.Slice(24, 8));
                return Vector256.Create(q0, q1, q2, q3).AsByte();
            }

            return MemoryMarshal.Read<Vector256<byte>>(t);
        }

        /// <summary>Writes the 16 bytes of <paramref name="s"/> to the start of <paramref name="t"/>.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void Store128_Byte(Vector128<byte> s, Span<byte> t)
        {
            if (Vector.IsPackedLittleEndian)
            {
                MemoryMarshal.Write(t, ref s);
                return;
            }

            // Portable path: emit two little-endian 64-bit halves.
            Vector128<ulong> halves = s.AsUInt64();
            BinaryPrimitives.WriteUInt64LittleEndian(t.Slice(0, 8), halves.GetElement(0));
            BinaryPrimitives.WriteUInt64LittleEndian(t.Slice(8), halves.GetElement(1));
        }

        /// <summary>Writes the 32 bytes of <paramref name="s"/> to the start of <paramref name="t"/>.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void Store256_Byte(Vector256<byte> s, Span<byte> t)
        {
            if (Vector.IsPackedLittleEndian)
            {
                MemoryMarshal.Write(t, ref s);
                return;
            }

            // Portable path: emit four little-endian 64-bit quarters.
            Vector256<ulong> quarters = s.AsUInt64();
            BinaryPrimitives.WriteUInt64LittleEndian(t.Slice(0, 8), quarters.GetElement(0));
            BinaryPrimitives.WriteUInt64LittleEndian(t.Slice(8, 8), quarters.GetElement(1));
            BinaryPrimitives.WriteUInt64LittleEndian(t.Slice(16, 8), quarters.GetElement(2));
            BinaryPrimitives.WriteUInt64LittleEndian(t.Slice(24, 8), quarters.GetElement(3));
        }
    }
}