4747
4848#include " hwy/highway.h"
4949// After highway.h
50+ #include " compression/int-inl.h"
5051#include " compression/nuq-inl.h"
5152#include " compression/sfp-inl.h"
5253
@@ -416,6 +417,34 @@ struct CompressTraits<SfpStream> {
416417 }
417418};
418419
420+ // Integer quantization: CompressTraits specialization for I8Stream. Each member below is a thin forwarder to the matching IntCodec routine.
421+ template <>
422+ struct CompressTraits <I8Stream> {
423+ using Packed = I8Stream;  // Element type of the PackedSpan arguments below.
424+
425+ template <class DF , HWY_IF_F32_D(DF)>  // Input is `const float*`; DF is constrained via HWY_IF_F32_D.
426+ static HWY_INLINE void Compress (DF df, const float * HWY_RESTRICT raw,  // Encodes `num` values from `raw` by calling IntCodec::Enc.
427+ size_t num, CompressPerThread& tls,  // `tls` is accepted but not used in this body.
428+ const PackedSpan<Packed>& packed,  // Destination span, handed to IntCodec::Enc.
429+ const size_t packed_ofs) {  // Offset within `packed`; units are defined by IntCodec -- TODO confirm.
430+ IntCodec::Enc (df, raw, num, packed, packed_ofs);
431+ }
432+
433+ template <class D > // Caller checks this is f32 or bf16
434+ static HWY_INLINE void Load2 (D d, const PackedSpan<const Packed>& packed,  // Forwards to IntCodec::Dec2 starting at `packed_ofs`.
435+ const size_t packed_ofs, hn::Vec<D>& raw0,  // raw0/raw1 are non-const references handed to Dec2,
436+ hn::Vec<D>& raw1) {  // presumably receiving two consecutive decoded vectors -- verify in IntCodec.
437+ IntCodec::Dec2 (d, packed, packed_ofs, raw0, raw1);
438+ }
439+
440+ template <class D , typename Raw>  // `Raw` is the caller-chosen output element type.
441+ static HWY_INLINE void DecompressAndZeroPad (  // Forwards to the IntCodec routine of the same name.
442+ D d, const PackedSpan<const Packed>& packed, const size_t packed_ofs,
443+ Raw* raw, const size_t num) {  // NOTE(review): the extent of zero padding past `num` is decided inside IntCodec; not visible here.
444+ IntCodec::DecompressAndZeroPad (d, packed, packed_ofs, raw, num);
445+ }
446+ };
447+
419448// Nonuniform quantization, 4.5 bits per element, two separate streams.
420449template <>
421450struct CompressTraits <NuqStream> {
@@ -737,9 +766,10 @@ template <class DF, typename T, typename T1, class Func>
737766HWY_INLINE void Decompress1AndCompressInplace (DF df, T* HWY_RESTRICT inout,
738767 size_t num,
739768 const T1* HWY_RESTRICT p1,
769+ const size_t p1_ofs,
740770 Func&& func) {
741771 const auto packed_inout = MakeSpan (inout, num);
742- const auto packed1 = MakeSpan (p1, num);
772+ const auto packed1 = MakeSpan (p1, p1_ofs + num);
743773
744774 using VF = hn::Vec<decltype (df)>;
745775 HWY_LANES_CONSTEXPR const size_t NF = hn::Lanes (df);
@@ -749,7 +779,7 @@ HWY_INLINE void Decompress1AndCompressInplace(DF df, T* HWY_RESTRICT inout,
749779 VF v0, v1;
750780 Decompress2 (df, packed_inout, i, v0, v1);
751781 VF v10, v11;
752- Decompress2 (df, packed1, i, v10, v11);
782+ Decompress2 (df, packed1, p1_ofs + i, v10, v11);
753783 const VF out0 = func (df, v0, v10);
754784 const VF out1 = func (df, v1, v11);
755785 Compress2 (df, out0, out1, packed_inout, i);
@@ -765,7 +795,7 @@ HWY_INLINE void Decompress1AndCompressInplace(DF df, T* HWY_RESTRICT inout,
765795 hn::Store (hn::Zero (df), df, buf_inout + NF);
766796 hn::Store (hn::Zero (df), df, buf1 + NF);
767797 DecompressAndZeroPad (df, packed_inout, i, buf_inout, remaining);
768- DecompressAndZeroPad (df, packed1, i, buf1, remaining);
798+ DecompressAndZeroPad (df, packed1, p1_ofs + i, buf1, remaining);
769799 const VF v0 = hn::Load (df, buf_inout);
770800 const VF v1 = hn::Load (df, buf_inout + NF);
771801 const VF v10 = hn::Load (df, buf1);
@@ -827,10 +857,10 @@ template <class DF, typename T, typename T1, typename T2, class Func>
827857HWY_INLINE void Decompress2AndCompressTo (DF df, T* HWY_RESTRICT out, size_t num,
828858 const T1* HWY_RESTRICT p1,
829859 const T2* HWY_RESTRICT p2,
830- Func&& func) {
860+ const size_t p2_ofs, Func&& func) {
831861 const auto packed_out = MakeSpan (out, num);
832862 const auto packed1 = MakeSpan (p1, num);
833- const auto packed2 = MakeSpan (p2, num);
863+ const auto packed2 = MakeSpan (p2, p2_ofs + num);
834864
835865 using VF = hn::Vec<decltype (df)>;
836866 HWY_LANES_CONSTEXPR const size_t NF = hn::Lanes (df);
@@ -839,7 +869,7 @@ HWY_INLINE void Decompress2AndCompressTo(DF df, T* HWY_RESTRICT out, size_t num,
839869 for (; i <= num - 2 * NF; i += 2 * NF) {
840870 VF v10, v11, v20, v21;
841871 Decompress2 (df, packed1, i, v10, v11);
842- Decompress2 (df, packed2, i, v20, v21);
872+ Decompress2 (df, packed2, p2_ofs + i, v20, v21);
843873 const VF out0 = func (df, v10, v20);
844874 const VF out1 = func (df, v11, v21);
845875 Compress2 (df, out0, out1, packed_out, i);
@@ -856,7 +886,7 @@ HWY_INLINE void Decompress2AndCompressTo(DF df, T* HWY_RESTRICT out, size_t num,
856886 hn::Store (hn::Zero (df), df, buf1 + NF);
857887 hn::Store (hn::Zero (df), df, buf2 + NF);
858888 DecompressAndZeroPad (df, packed1, i, buf1, remaining);
859- DecompressAndZeroPad (df, packed2, i, buf2, remaining);
889+ DecompressAndZeroPad (df, packed2, p2_ofs + i, buf2, remaining);
860890 const VF v10 = hn::Load (df, buf1);
861891 const VF v11 = hn::Load (df, buf1 + NF);
862892 const VF v20 = hn::Load (df, buf2);
0 commit comments